From daa76e16e05c2b7a3521bf739670903d996d9a33 Mon Sep 17 00:00:00 2001 From: Guozhong Zhuang Date: Fri, 18 May 2018 11:37:53 -0700 Subject: [PATCH 001/598] enhancement with relu primitive reuse --- tensorflow/core/kernels/mkl_conv_ops.cc | 280 +++++----- tensorflow/core/kernels/mkl_relu_op.cc | 661 +++++++++++++++++++----- tensorflow/core/util/mkl_util.h | 32 +- 3 files changed, 702 insertions(+), 271 deletions(-) diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc index f2b14f1278..c032add82e 100644 --- a/tensorflow/core/kernels/mkl_conv_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_ops.cc @@ -59,7 +59,8 @@ namespace tensorflow { #ifndef INTEL_MKL_ML -struct ConvFwdDimensions { +// This structure aggregates multiple inputs to Conv2DFwd* methods. +struct MklConvFwdParams { memory::dims src_dims; memory::dims filter_dims; memory::dims bias_dims; @@ -69,7 +70,7 @@ struct ConvFwdDimensions { memory::dims padding_left; memory::dims padding_right; - ConvFwdDimensions(memory::dims src_dims, + MklConvFwdParams(memory::dims src_dims, memory::dims filter_dims, memory::dims bias_dims, memory::dims dst_dims, memory::dims strides, memory::dims dilations, memory::dims padding_left, @@ -82,35 +83,40 @@ struct ConvFwdDimensions { }; template -class Conv2DFwd : public DnnOp { +class MklConv2DFwdPrimitive: public MklPrimitive { public: - explicit Conv2DFwd(const ConvFwdDimensions& convFwdDims) { - fwd_stream_.reset(new stream(stream::kind::eager)); + explicit MklConv2DFwdPrimitive(const MklConvFwdParams& convFwdDims) { + context_.fwd_stream.reset(new stream(stream::kind::eager)); // create conv primitive - if (conv_fwd_ == nullptr) { + if (context_.conv_fwd == nullptr) { Setup(convFwdDims); } } - ~Conv2DFwd() {} + ~MklConv2DFwdPrimitive() {} // Convolution forward execute with bias // src_data: input data buffer of src // filter_data: input data buffer of filter (weights) // bias_data: input data buffer of bias // dst_data: output data 
buffer of dst - void Execute(T* src_data, T* filter_data, T* bias_data, T* dst_data) { - src_mem_->set_data_handle(static_cast(src_data)); - filter_mem_->set_data_handle(static_cast(filter_data)); - bias_mem_->set_data_handle(static_cast(bias_data)); - dst_mem_->set_data_handle(static_cast(dst_data)); - fwd_stream_->submit(fwd_primitives_); + void Execute(const T* src_data, const T* filter_data, + const T* bias_data, const T* dst_data) { + context_.src_mem->set_data_handle( + static_cast(const_cast(src_data))); + context_.filter_mem->set_data_handle( + static_cast(const_cast(filter_data))); + context_.bias_mem->set_data_handle( + static_cast(const_cast(bias_data))); + context_.dst_mem->set_data_handle( + static_cast(const_cast(dst_data))); + context_.fwd_stream->submit(context_.fwd_primitives); // after exec, set data handle back - src_mem_->set_data_handle(DummyData); - filter_mem_->set_data_handle(DummyData); - bias_mem_->set_data_handle(DummyData); - dst_mem_->set_data_handle(DummyData); + context_.src_mem->set_data_handle(DummyData); + context_.filter_mem->set_data_handle(DummyData); + context_.bias_mem->set_data_handle(DummyData); + context_.dst_mem->set_data_handle(DummyData); return; } @@ -119,139 +125,174 @@ class Conv2DFwd : public DnnOp { // src_data: input data buffer of src // filter_data: input data buffer of filter (weights) // dst_data: output data buffer of dst - void Execute(T* src_data, T* filter_data, T* dst_data) { - src_mem_->set_data_handle(static_cast(src_data)); - filter_mem_->set_data_handle(static_cast(filter_data)); - dst_mem_->set_data_handle(static_cast(dst_data)); - fwd_stream_->submit(fwd_primitives_); - - // after exec, set data handle back - src_mem_->set_data_handle(DummyData); - filter_mem_->set_data_handle(DummyData); - dst_mem_->set_data_handle(DummyData); + void Execute(const T* src_data, const T* filter_data, + const T* dst_data) { + context_.src_mem->set_data_handle( + static_cast(const_cast(src_data))); + 
context_.filter_mem->set_data_handle( + static_cast(const_cast(filter_data))); + context_.dst_mem->set_data_handle( + static_cast(const_cast(dst_data))); + context_.fwd_stream->submit(context_.fwd_primitives); + + // after execution, set data handle back + context_.src_mem->set_data_handle(DummyData); + context_.filter_mem->set_data_handle(DummyData); + context_.dst_mem->set_data_handle(DummyData); return; } - // expected memory format for this primitive instance - memory::format src_fmt_; - memory::format filter_fmt_; + memory::format GetSrcMemoryFormat() const { + return context_.src_fmt; + } + + memory::format GetFilterMemoryFormat() const { + return context_.filter_fmt; + } - // convolution primitive - std::shared_ptr fwd_pd_; - std::shared_ptr conv_fwd_; + std::shared_ptr + GetPrimitiveDesc() const { + return context_.fwd_pd; + } private: - void Setup(const ConvFwdDimensions& convFwdDims) { + // Primitive reuse context for Conv2D Fwd op + struct ConvFwdContext { + // expected memory format for this primitive instance + memory::format src_fmt; + memory::format filter_fmt; + + // MKLDNN memory + std::shared_ptr src_mem; + std::shared_ptr filter_mem; + std::shared_ptr bias_mem; + std::shared_ptr dst_mem; + + // desc & prmitive desc + std::shared_ptr fwd_desc; + + // memory desc + std::shared_ptr src_md; + std::shared_ptr filter_md; + std::shared_ptr bias_md; + std::shared_ptr dst_md; + + // convolution primitive + std::shared_ptr fwd_pd; + std::shared_ptr conv_fwd; + + std::shared_ptr fwd_stream; + std::vector fwd_primitives; + + ConvFwdContext() : + src_fmt(memory::format::any), filter_fmt(memory::format::any), + src_mem(nullptr), filter_mem(nullptr), bias_mem(nullptr), + dst_mem(nullptr), fwd_desc(nullptr), + src_md(nullptr), filter_md(nullptr), bias_md(nullptr), + fwd_pd(nullptr), conv_fwd(nullptr), fwd_stream(nullptr) { + } + } context_; + + engine cpu_engine_ = engine(engine::cpu, 0); + + void Setup(const MklConvFwdParams& convFwdDims) { // create memory 
descriptors for convolution data w/ no specified format - src_md_.reset(new memory::desc({convFwdDims.src_dims}, + context_.src_md.reset(new memory::desc({convFwdDims.src_dims}, MklDnnType(), memory::format::any)); - filter_md_.reset(new memory::desc({convFwdDims.filter_dims}, + context_.filter_md.reset(new memory::desc({convFwdDims.filter_dims}, MklDnnType(), memory::format::any)); - dst_md_.reset(new memory::desc({convFwdDims.dst_dims}, + context_.dst_md.reset(new memory::desc({convFwdDims.dst_dims}, MklDnnType(), memory::format::any)); if (!convFwdDims.bias_dims.empty()) - bias_md_.reset(new memory::desc({convFwdDims.bias_dims}, + context_.bias_md.reset(new memory::desc({convFwdDims.bias_dims}, MklDnnType(), memory::format::any)); // create a convolution if (!convFwdDims.bias_dims.empty()) { - fwd_desc_.reset(new convolution_forward::desc(prop_kind::forward, - convolution_direct, *src_md_, *filter_md_, *bias_md_, *dst_md_, + context_.fwd_desc.reset(new convolution_forward::desc(prop_kind::forward, + convolution_direct, *context_.src_md, *context_.filter_md, + *context_.bias_md, *context_.dst_md, convFwdDims.strides, convFwdDims.dilations, convFwdDims.padding_left, convFwdDims.padding_right, padding_kind::zero)); } else { - fwd_desc_.reset(new convolution_forward::desc(prop_kind::forward, - convolution_direct, *src_md_, *filter_md_, *dst_md_, - convFwdDims.strides, convFwdDims.dilations, convFwdDims.padding_left, - convFwdDims.padding_right, padding_kind::zero)); + context_.fwd_desc.reset(new convolution_forward::desc(prop_kind::forward, + convolution_direct, *context_.src_md, *context_.filter_md, + *context_.dst_md, convFwdDims.strides, convFwdDims.dilations, + convFwdDims.padding_left, convFwdDims.padding_right, + padding_kind::zero)); } - fwd_pd_.reset(new convolution_forward::primitive_desc( - *fwd_desc_, cpu_engine_)); + context_.fwd_pd.reset(new convolution_forward::primitive_desc( + *context_.fwd_desc, cpu_engine_)); // store the expected memory format - 
src_fmt_ = static_cast( - fwd_pd_.get()->src_primitive_desc().desc().data.format); + context_.src_fmt = static_cast( + context_.fwd_pd.get()->src_primitive_desc().desc().data.format); - filter_fmt_ = static_cast( - fwd_pd_.get()->weights_primitive_desc().desc().data.format); + context_.filter_fmt = static_cast( + context_.fwd_pd.get()->weights_primitive_desc().desc().data.format); // create memory primitive based on dummy data - src_mem_.reset(new memory(fwd_pd_.get()->src_primitive_desc(), DummyData)); - filter_mem_.reset(new memory(fwd_pd_.get()->weights_primitive_desc(), - DummyData)); - dst_mem_.reset(new memory(fwd_pd_.get()->dst_primitive_desc(), DummyData)); + context_.src_mem.reset(new memory( + context_.fwd_pd.get()->src_primitive_desc(), DummyData)); + context_.filter_mem.reset(new memory( + context_.fwd_pd.get()->weights_primitive_desc(), DummyData)); + context_.dst_mem.reset(new memory( + context_.fwd_pd.get()->dst_primitive_desc(), DummyData)); // create convolution primitive and add it to net if (!convFwdDims.bias_dims.empty()) { - bias_mem_.reset(new memory({{{convFwdDims.bias_dims}, MklDnnType(), - memory::format::x}, cpu_engine_}, DummyData)); - conv_fwd_.reset(new convolution_forward(*fwd_pd_, *src_mem_, - *filter_mem_, *bias_mem_, *dst_mem_)); + context_.bias_mem.reset(new memory({{{convFwdDims.bias_dims}, + MklDnnType(), memory::format::x}, cpu_engine_}, DummyData)); + context_.conv_fwd.reset(new convolution_forward( + *context_.fwd_pd, *context_.src_mem, *context_.filter_mem, + *context_.bias_mem, *context_.dst_mem)); } else { - conv_fwd_.reset(new convolution_forward(*fwd_pd_, *src_mem_, - *filter_mem_, *dst_mem_)); + context_.conv_fwd.reset(new convolution_forward( + *context_.fwd_pd, *context_.src_mem, + *context_.filter_mem, *context_.dst_mem)); } - fwd_primitives_.push_back(*conv_fwd_); + context_.fwd_primitives.push_back(*context_.conv_fwd); return; } - - // MKLDNN memory - std::shared_ptr src_mem_; - std::shared_ptr filter_mem_; - 
std::shared_ptr bias_mem_; - std::shared_ptr dst_mem_; - - std::shared_ptr fwd_stream_; - std::vector fwd_primitives_; - - // desc & prmitive desc - std::shared_ptr fwd_desc_; - - // memory desc - std::shared_ptr src_md_; - std::shared_ptr filter_md_; - std::shared_ptr bias_md_; - std::shared_ptr dst_md_; - - engine cpu_engine_ = engine(engine::cpu, 0); }; template -class Conv2DFwdFactory : public DnnOpFactory { +class MklConv2DFwdPrimitiveFactory : public MklPrimitiveFactory { public: - static Conv2DFwd* Get(const ConvFwdDimensions& convFwdDims) { - Conv2DFwd* conv2d_fwd = nullptr; + static MklConv2DFwdPrimitive* Get(const MklConvFwdParams& convFwdDims) { + MklConv2DFwdPrimitive* conv2d_fwd = nullptr; // try to find a suitable one in pool - conv2d_fwd = dynamic_cast*> ( - Conv2DFwdFactory::GetInstance().GetConv2DFwd(convFwdDims)); + conv2d_fwd = dynamic_cast*> ( + MklConv2DFwdPrimitiveFactory::GetInstance().GetConv2DFwd( + convFwdDims)); if (conv2d_fwd == nullptr) { - conv2d_fwd = new Conv2DFwd(convFwdDims); - Conv2DFwdFactory::GetInstance().SetConv2DFwd( + conv2d_fwd = new MklConv2DFwdPrimitive(convFwdDims); + MklConv2DFwdPrimitiveFactory::GetInstance().SetConv2DFwd( convFwdDims, conv2d_fwd); } return conv2d_fwd; } private: - Conv2DFwdFactory() {} - ~Conv2DFwdFactory() {} + MklConv2DFwdPrimitiveFactory() {} + ~MklConv2DFwdPrimitiveFactory() {} static const int kDilationH = 0, kDilationW = 1; - static Conv2DFwdFactory& GetInstance() { - static Conv2DFwdFactory instance_; + static MklConv2DFwdPrimitiveFactory& GetInstance() { + static MklConv2DFwdPrimitiveFactory instance_; return instance_; } - static std::string CreateKey(const ConvFwdDimensions& convFwdDims) { + static std::string CreateKey(const MklConvFwdParams& convFwdDims) { std::string prefix = "conv2d_fwd_"; FactoryKeyCreator key_creator; key_creator.AddAsKey(prefix); @@ -266,12 +307,12 @@ class Conv2DFwdFactory : public DnnOpFactory { return key_creator.GetKey(); } - DnnOp* GetConv2DFwd(const 
ConvFwdDimensions& convFwdDims) { + MklPrimitive* GetConv2DFwd(const MklConvFwdParams& convFwdDims) { std::string key = CreateKey(convFwdDims); return this->GetOp(key); } - void SetConv2DFwd(const ConvFwdDimensions& convFwdDims, DnnOp *op) { + void SetConv2DFwd(const MklConvFwdParams& convFwdDims, MklPrimitive *op) { std::string key = CreateKey(convFwdDims); this->SetOp(key, op); } @@ -762,7 +803,6 @@ class MklConv2DOp : public OpKernel { MklDnnData src(&cpu_engine); MklDnnData filter(&cpu_engine); - MklDnnData dst(&cpu_engine); // output memory::dims src_dims, filter_dims, padding_left, padding_right, dilations, strides; @@ -812,7 +852,6 @@ class MklConv2DOp : public OpKernel { auto src_md = src_mkl_shape.IsMklTensor() ? src_mkl_shape.GetMklLayout() : memory::desc(src_dims, MklDnnType(), tf_fmt); - src.SetUsrMem(src_md, &src_tensor); // Although filter shape (filter_dims) required is in MKL-DNN order, // the layout is Tensorflow's layout (HWIO). @@ -820,29 +859,28 @@ class MklConv2DOp : public OpKernel { ? filter_mkl_shape.GetMklLayout() : memory::desc(filter_dims, MklDnnType(), memory::format::hwio); - filter.SetUsrMem(filter_md, &filter_tensor); // MKLDNN dilation starts from 0. 
dilations[kDilationH] -= 1; dilations[kDilationW] -= 1; // get a conv2d fwd from primitive pool - Conv2DFwd *conv2d_fwd = nullptr; + MklConv2DFwdPrimitive *conv2d_fwd = nullptr; if (biasEnabled) { memory::dims bias_dims = {}; conv_utl.GetBiasSizeInMklOrder(kInputIndex_Bias, &bias_dims); - ConvFwdDimensions convFwdDims(src_dims, filter_dims, bias_dims, + MklConvFwdParams convFwdDims(src_dims, filter_dims, bias_dims, dst_dims_mkl_order, strides, dilations, padding_left, padding_right); - conv2d_fwd = Conv2DFwdFactory::Get(convFwdDims); + conv2d_fwd = MklConv2DFwdPrimitiveFactory::Get(convFwdDims); } else { - ConvFwdDimensions convFwdDims(src_dims, filter_dims, NONE_DIMS, + MklConvFwdParams convFwdDims(src_dims, filter_dims, NONE_DIMS, dst_dims_mkl_order, strides, dilations, padding_left, padding_right); - conv2d_fwd = Conv2DFwdFactory::Get(convFwdDims); + conv2d_fwd = MklConv2DFwdPrimitiveFactory::Get(convFwdDims); } // allocate output tensors output_tensor and filter_out_tensor std::shared_ptr - conv_fwd_pd = conv2d_fwd->fwd_pd_; + conv_fwd_pd = conv2d_fwd->GetPrimitiveDesc(); AllocateOutputTensor(context, *conv_fwd_pd, dst_dims_mkl_order, tf_fmt, &dst_tensor); Tensor* filter_out_tensor = nullptr; @@ -854,20 +892,30 @@ class MklConv2DOp : public OpKernel { // check whether src/filter need reorder std::vector net; - if (src_md.data.format != conv2d_fwd->src_fmt_) - src.CheckReorderToOpMem( - conv_fwd_pd.get()->src_primitive_desc(), &net); - - if (filter_md.data.format != conv2d_fwd->filter_fmt_) - filter.CheckReorderToOpMem( - conv_fwd_pd.get()->weights_primitive_desc(), - filter.GetTensorBuffer(filter_out_tensor), &net); + T *src_data = nullptr; + if (src_md.data.format != conv2d_fwd->GetSrcMemoryFormat()) { + src.SetUsrMem(src_md, &src_tensor); + src.CheckReorderToOpMem( + conv_fwd_pd.get()->src_primitive_desc(), &net); + src_data = static_cast(src.GetOpMem().get_data_handle()); + } else { + src_data = static_cast(const_cast( + src_tensor.flat().data())); + } + T 
*filter_data = nullptr; + if (filter_md.data.format != conv2d_fwd->GetFilterMemoryFormat()) { + filter.SetUsrMem(filter_md, &filter_tensor); + filter.CheckReorderToOpMem( + conv_fwd_pd.get()->weights_primitive_desc(), + filter.GetTensorBuffer(filter_out_tensor), &net); + filter_data = static_cast(filter.GetOpMem().get_data_handle()); + } else { + filter_data = static_cast(const_cast( + filter_tensor.flat().data())); + } + stream(stream::kind::eager).submit(net).wait(); - T* src_data = static_cast( - src.GetOpMem().get_data_handle()); - T* filter_data = static_cast( - filter.GetOpMem().get_data_handle()); // execute convolution if (biasEnabled) { diff --git a/tensorflow/core/kernels/mkl_relu_op.cc b/tensorflow/core/kernels/mkl_relu_op.cc index 1ed43834dd..048d4883b2 100644 --- a/tensorflow/core/kernels/mkl_relu_op.cc +++ b/tensorflow/core/kernels/mkl_relu_op.cc @@ -25,6 +25,7 @@ limitations under the License. #include "mkl_dnn.h" #include "mkl_dnn_types.h" +#include "tensorflow/core/platform/default/logging.h" #include "tensorflow/core/util/mkl_util.h" #ifndef INTEL_MKL_ML @@ -38,10 +39,406 @@ using mkldnn::prop_kind; using mkldnn::relu_backward; using mkldnn::relu_forward; using mkldnn::stream; +using mkldnn::memory; #endif namespace tensorflow { +#ifndef INTEL_MKL_ML + +template +class MklEltwiseFwdParams { + public: + memory::dims src_dims; // check if this is needed + memory::desc src_md; + algorithm alg_kind; + T alpha; + T beta; + + MklEltwiseFwdParams(memory::dims src_dims, memory::desc src_md, + algorithm alg_kind, T alpha, T beta) : + src_dims(src_dims), src_md(src_md), + alg_kind(alg_kind), alpha(alpha), beta(beta) { + } +}; + +template +class MklEltwiseFwdPrimitive : public MklPrimitive { + public: + explicit MklEltwiseFwdPrimitive(const MklEltwiseFwdParams& fwdParams) { + // store expected format + context_.src_fmt = static_cast( + fwdParams.src_md.data.format); + context_.fwd_stream.reset(new stream(stream::kind::eager)); + + // create eltwise primitive 
+ if (context_.eltwise_fwd == nullptr) { + Setup(fwdParams); + } + } + + ~MklEltwiseFwdPrimitive() {} + + // Eltwise forward execute + // src_data: input data buffer of src + // dst_data: output data buffer of dst + void Execute(T* src_data, T* dst_data) { + context_.src_mem->set_data_handle(static_cast(src_data)); + context_.dst_mem->set_data_handle(static_cast(dst_data)); + context_.fwd_stream->submit(context_.fwd_primitives); + + // after execution, set data handle back + context_.src_mem->set_data_handle(DummyData); + context_.dst_mem->set_data_handle(DummyData); + return; + } + + std::shared_ptr GetEltwiseFwdPd() { + return context_.fwd_pd; + } + + memory::format GetSrcMemoryFormat() { + return context_.src_fmt; + } + + private: + // Primitive reuse context for eltwise Fwd ops: Relu, Elu, Tanh + struct EltwiseFwdContext { + // expected memory format for this primitive instance + mkldnn::memory::format src_fmt; + + // MKLDNN memory + std::shared_ptr src_mem; + std::shared_ptr dst_mem; + + // desc & prmitive desc + std::shared_ptr fwd_desc; + std::shared_ptr fwd_pd; + + // memory desc + std::shared_ptr src_md; + std::shared_ptr dst_md; + + // memory primitive desc + std::shared_ptr src_mpd; + + // Eltwise primitive + std::shared_ptr eltwise_fwd; + + std::shared_ptr fwd_stream; + std::vector fwd_primitives; + + EltwiseFwdContext() : + src_fmt(memory::format::any), src_mem(nullptr), dst_mem(nullptr), + fwd_desc(nullptr), fwd_pd(nullptr), src_md(nullptr), dst_md(nullptr), + src_mpd(nullptr), eltwise_fwd(nullptr), fwd_stream(nullptr) { + } + } context_; + + // Eltwise forward primitive setup + void Setup(const MklEltwiseFwdParams& fwdParams) { + // create memory descriptors for eltwise data with specified format + context_.src_md.reset(new memory::desc(fwdParams.src_md.data)); + context_.src_mpd.reset(new memory::primitive_desc( + *context_.src_md, cpu_engine_)); + + // create a eltwise + context_.fwd_desc.reset(new mkldnn::eltwise_forward::desc( + 
prop_kind::forward, fwdParams.alg_kind, *context_.src_md, + fwdParams.alpha, fwdParams.beta)); + context_.fwd_pd.reset(new mkldnn::eltwise_forward::primitive_desc( + *context_.fwd_desc, cpu_engine_)); + + // create memory primitive based on dummy data + context_.src_mem.reset(new memory(*context_.src_mpd, DummyData)); + context_.dst_mem.reset(new memory( + context_.fwd_pd.get()->dst_primitive_desc(), DummyData)); + + // create eltwise primitive and add it to net + context_.eltwise_fwd.reset(new mkldnn::eltwise_forward(*context_.fwd_pd, + *context_.src_mem, *context_.dst_mem)); + + context_.fwd_primitives.push_back(*context_.eltwise_fwd); + return; + } + + engine cpu_engine_ = engine(engine::cpu, 0); +}; + +template +class MklEltwiseFwdPrimitiveFactory : public MklPrimitiveFactory { + public: + static MklEltwiseFwdPrimitive* Get( + const MklEltwiseFwdParams& fwdParams) { + MklEltwiseFwdPrimitive* eltwise_forward = nullptr; + + auto src_fmt = static_cast( + fwdParams.src_md.data.format); + + // Get a eltwise fwd primitive from the cached pool + eltwise_forward = static_cast*>( + MklEltwiseFwdPrimitiveFactory::GetInstance().GetEltwiseFwd( + fwdParams, src_fmt)); + if (eltwise_forward == nullptr) { + eltwise_forward = new MklEltwiseFwdPrimitive(fwdParams); + MklEltwiseFwdPrimitiveFactory::GetInstance().SetEltwiseFwd( + fwdParams, src_fmt, eltwise_forward); + } + return eltwise_forward; + } + + static MklEltwiseFwdPrimitiveFactory& GetInstance() { + static MklEltwiseFwdPrimitiveFactory instance_; + return instance_; + } + + private: + MklEltwiseFwdPrimitiveFactory() {} + ~MklEltwiseFwdPrimitiveFactory() {} + + static std::string CreateKey( + const MklEltwiseFwdParams& fwdParams, memory::format src_fmt) { + std::string prefix = "eltwise_fwd"; + FactoryKeyCreator key_creator; + key_creator.AddAsKey(prefix); + key_creator.AddAsKey(fwdParams.src_dims); + key_creator.AddAsKey(static_cast(fwdParams.alg_kind)); + key_creator.AddAsKey(static_cast(fwdParams.alpha)); + 
key_creator.AddAsKey(static_cast(fwdParams.beta)); + key_creator.AddAsKey(static_cast(src_fmt)); + return key_creator.GetKey(); + } + + MklPrimitive* GetEltwiseFwd(const MklEltwiseFwdParams& fwdParams, + memory::format src_fmt) { + std::string key = CreateKey(fwdParams, src_fmt); + return this->GetOp(key); + } + + void SetEltwiseFwd(const MklEltwiseFwdParams& fwdParams, + memory::format src_fmt, MklPrimitive* op) { + std::string key = CreateKey(fwdParams, src_fmt); + this->SetOp(key, op); + } +}; + +template +class MklEltwiseBwdParams { + public: + memory::dims src_dims; + memory::desc common_md; + algorithm alg_kind; + T alpha; + T beta; + + MklEltwiseBwdParams(const memory::dims &src_dims, + const memory::desc &common_md, + algorithm alg_kind, T alpha, T beta) : + src_dims(src_dims), common_md(common_md), + alg_kind(alg_kind), alpha(alpha), beta(beta) { + } +}; + +template +class MklEltwiseBwdPrimitive : public MklPrimitive { + public: + explicit MklEltwiseBwdPrimitive(const MklEltwiseBwdParams& bwdParams) { + context_.src_fmt = static_cast( + bwdParams.common_md.data.format); + context_.diff_dst_fmt = static_cast( + bwdParams.common_md.data.format); + context_.bwd_stream.reset(new stream(stream::kind::eager)); + // create eltwise primitive + if (context_.eltwise_bwd == nullptr) { + Setup(bwdParams); + } + } + + ~MklEltwiseBwdPrimitive() {} + + // Eltwise backward execute + // src_data: input data buffer of src + // diff_dst_data: input data buffer of diff_dst + // diff_src_data: output data buffer of diff_src + + void Execute(T* src_data, T* diff_dst_data, T* diff_src_data) { + context_.src_mem->set_data_handle(static_cast(src_data)); + context_.diff_dst_mem->set_data_handle(static_cast(diff_dst_data)); + context_.diff_src_mem->set_data_handle(static_cast(diff_src_data)); + context_.bwd_stream->submit(context_.bwd_primitives); + + // after execution, set data handle back + context_.src_mem->set_data_handle(DummyData); + 
context_.diff_dst_mem->set_data_handle(DummyData); + context_.diff_src_mem->set_data_handle(DummyData); + return; + } + + std::shared_ptr GetEltwiseBwdPd() { + return context_.bwd_pd; + } + + memory::format GetSrcMemoryFormat() { + return context_.src_fmt; + } + + memory::format GetDiffDstMemoryFormat() { + return context_.diff_dst_fmt; + } + + private: + // Primitive reuse context for eltwise Bwd ops: Relu, Elu, Tanh + struct EltwiseBwdContext { + // expected memory format for this primitive instance + memory::format src_fmt; + memory::format diff_dst_fmt; + + // MKLDNN memory + std::shared_ptr src_mem; + std::shared_ptr diff_dst_mem; + std::shared_ptr diff_src_mem; + + // desc & prmitive desc + std::shared_ptr bwd_desc; + + // memory desc + std::shared_ptr src_md; + std::shared_ptr diff_dst_md; + std::shared_ptr common_md; + + // memory primitive desc + std::shared_ptr src_mpd; + std::shared_ptr diff_dst_mpd; + + // fwd primitive desc + std::shared_ptr fwd_desc; + std::shared_ptr fwd_pd; + std::shared_ptr bwd_pd; + + // Eltwise primitive + std::shared_ptr eltwise_bwd; + + std::shared_ptr bwd_stream; + std::vector bwd_primitives; + + EltwiseBwdContext() : + src_fmt(memory::format::any), diff_dst_fmt(memory::format::any), + src_mem(nullptr), diff_dst_mem(nullptr), diff_src_mem(nullptr), + src_md(nullptr), diff_dst_md(nullptr), common_md(nullptr), + src_mpd(nullptr), diff_dst_mpd(nullptr), + fwd_desc(nullptr), fwd_pd(nullptr), bwd_pd(nullptr), + eltwise_bwd(nullptr), bwd_stream(nullptr) { + } + } context_; + + // Eltwise backward primitive setup + void Setup(const MklEltwiseBwdParams& bwdParams) { + // create memory descriptors for eltwise data w/ no specified format + context_.src_md.reset(new memory::desc(bwdParams.common_md.data)); + context_.diff_dst_md.reset(new memory::desc(bwdParams.common_md.data)); + + context_.src_mpd.reset(new memory::primitive_desc( + *context_.src_md, cpu_engine_)); + context_.diff_dst_mpd.reset(new memory::primitive_desc( + 
*context_.diff_dst_md, cpu_engine_)); + + // create forward eltwise primitive + context_.fwd_desc.reset(new mkldnn::eltwise_forward::desc( + prop_kind::forward_training, bwdParams.alg_kind, + *context_.src_md, bwdParams.alpha, bwdParams.beta)); + context_.fwd_pd.reset(new mkldnn::eltwise_forward::primitive_desc( + *context_.fwd_desc, cpu_engine_)); + context_.bwd_desc.reset(new mkldnn::eltwise_backward::desc( + bwdParams.alg_kind, *context_.diff_dst_md, + *context_.src_md, bwdParams.alpha, bwdParams.beta)); + context_.bwd_pd.reset(new mkldnn::eltwise_backward::primitive_desc( + *context_.bwd_desc, cpu_engine_, *context_.fwd_pd)); + + // create memory primitive based on dummy data + context_.src_mem.reset(new memory(*context_.src_mpd, DummyData)); + context_.diff_dst_mem.reset(new memory(*context_.diff_dst_mpd, DummyData)); + context_.diff_src_mem.reset(new memory( + context_.bwd_pd.get()->diff_src_primitive_desc(), DummyData)); + + // create eltwise primitive and add it to net + context_.eltwise_bwd.reset(new mkldnn::eltwise_backward(*context_.bwd_pd, + *context_.src_mem, *context_.diff_dst_mem, *context_.diff_src_mem)); + + context_.bwd_primitives.push_back(*context_.eltwise_bwd); + return; + } + + engine cpu_engine_ = engine(engine::cpu, 0); +}; + + +template +class MklEltwiseBwdPrimitiveFactory : public MklPrimitiveFactory { + private: + MklEltwiseBwdPrimitiveFactory() {} + ~MklEltwiseBwdPrimitiveFactory() {} + + public: + static MklEltwiseBwdPrimitive* Get( + const MklEltwiseBwdParams& bwdParams) { + MklEltwiseBwdPrimitive* eltwise_backward = nullptr; + + auto src_fmt = static_cast( + bwdParams.common_md.data.format); + auto diff_dst_fmt = static_cast( + bwdParams.common_md.data.format); + + // try to find a suitable one in pool + eltwise_backward = static_cast*> ( + MklEltwiseBwdPrimitiveFactory::GetInstance().GetEltwiseBwd( + bwdParams, src_fmt, diff_dst_fmt)); + + if (eltwise_backward == nullptr) { + eltwise_backward = new MklEltwiseBwdPrimitive(bwdParams); 
+ MklEltwiseBwdPrimitiveFactory::GetInstance().SetEltwiseBwd( + bwdParams, src_fmt, diff_dst_fmt, eltwise_backward); + } + return eltwise_backward; + } + + static MklEltwiseBwdPrimitiveFactory& GetInstance() { + static MklEltwiseBwdPrimitiveFactory instance_; + return instance_; + } + + private: + static std::string CreateKey( + const MklEltwiseBwdParams& bwdParams, + const memory::format &src_fmt, + const memory::format &diff_dst_fmt) { + std::string prefix = "eltwise_bwd"; + FactoryKeyCreator key_creator; + key_creator.AddAsKey(prefix); + key_creator.AddAsKey(bwdParams.src_dims); + key_creator.AddAsKey(static_cast(bwdParams.alg_kind)); + key_creator.AddAsKey(static_cast(bwdParams.alpha)); + key_creator.AddAsKey(static_cast(bwdParams.beta)); + key_creator.AddAsKey(static_cast(src_fmt)); + key_creator.AddAsKey(static_cast(diff_dst_fmt)); + return key_creator.GetKey(); + } + + MklPrimitive* GetEltwiseBwd(const MklEltwiseBwdParams& bwdParams, + const memory::format &src_fmt, const memory::format &diff_dst_fmt) { + std::string key = CreateKey(bwdParams, src_fmt, diff_dst_fmt); + return this->GetOp(key); + } + + void SetEltwiseBwd(const MklEltwiseBwdParams& bwdParams, + const memory::format &src_fmt, + const memory::format &diff_dst_fmt, MklPrimitive *op) { + std::string key = CreateKey(bwdParams, src_fmt, diff_dst_fmt); + this->SetOp(key, op); + } +}; + +#endif + typedef Eigen::ThreadPoolDevice CPUDevice; struct MklReluHelpers { @@ -367,104 +764,111 @@ void MklReluGradOp::Compute(OpKernelContext* context) { mkl_context.MklCleanup(); } - - #else // INTEL_MKL_ML - template class MklReluOpBase : public OpKernel { public: ~MklReluOpBase() {} explicit MklReluOpBase(OpKernelConstruction* context) : OpKernel(context) {} - virtual void Compute_Scalar(OpKernelContext* context) = 0; void Compute(OpKernelContext* context) override { try { - auto cpu_engine = engine(engine::cpu, 0); const size_t src_index = 0; // index of src input tensor const size_t dst_index = 0; // index of 
dst output tensor const Tensor& src_tensor = MklGetInput(context, src_index); MklDnnShape dnn_shape_src; GetMklShape(context, src_index, &dnn_shape_src); - Tensor* dst_tensor = nullptr; if (src_tensor.dims() == 0) { - Compute_Scalar(context); // scalar case doesn't use in-place operation + Compute_Scalar(context); return; } - // Create relu primitive. - MklDnnData src(&cpu_engine); - MklDnnData dst(&cpu_engine); - // Set DNN primitive - src + MklDnnData src(&cpu_engine); + memory::dims src_dims; memory::desc src_md({}, memory::data_undef, memory::format_undef); if (dnn_shape_src.IsMklTensor()) { src_md = dnn_shape_src.GetMklLayout(); + src_dims = dnn_shape_src.GetSizesAsMklDnnDims(); } else { - auto src_dims = TFShapeToMklDnnDims(src_tensor.shape()); + src_dims = TFShapeToMklDnnDims(src_tensor.shape()); auto src_strides = CalculateTFStrides(src_dims); // Create blocked memory descriptor src_md = MklDnnData::CreateBlockedMemDesc(src_dims, src_strides); } - src.SetUsrMem(src_md, &src_tensor); T alpha = 0, beta = 0; - std::shared_ptr relu_fwd_pd; - auto relu_fwd_desc = relu_forward::desc( - prop_kind::forward_training, - // Operator memory descriptor is same as user memory descriptor. 
- alg_kind, src.GetUsrMemDesc(), alpha, beta); - relu_fwd_pd.reset( - new relu_forward::primitive_desc(relu_fwd_desc, cpu_engine)); - - // allocate dst tensor + + // get a eltwise fwd from primitive pool + MklEltwiseFwdParams fwdParams(src_dims, src_md, + alg_kind, alpha, beta); + MklEltwiseFwdPrimitive *eltwise_fwd = + MklEltwiseFwdPrimitiveFactory::Get(fwdParams); + + // prepare for execuation + T* src_data = nullptr; + // check wehther src need to reorder + if (src_md.data.format != eltwise_fwd->GetSrcMemoryFormat()) { + src.SetUsrMem(src_md, &src_tensor); + std::vector net; + auto src_target_pd = memory::primitive_desc({{src_dims}, + MklDnnType(), eltwise_fwd->GetSrcMemoryFormat()}, cpu_engine); + src.CheckReorderToOpMem(src_target_pd, &net); + stream(stream::kind::eager).submit(net).wait(); + src_data = static_cast(src.GetOpMem().get_data_handle()); + } else { + src_data = static_cast( + const_cast(src_tensor.flat().data())); + } + + // allocate dst tensor, always set it as MKL-DNN layout + std::shared_ptr + eltwise_fwd_pd = eltwise_fwd->GetEltwiseFwdPd(); MklDnnShape dnn_shape_dst; TensorShape tf_shape_dst; if (dnn_shape_src.IsMklTensor()) { dnn_shape_dst.SetMklTensor(true); - auto dst_pd = relu_fwd_pd->dst_primitive_desc(); + auto dst_pd = eltwise_fwd_pd->dst_primitive_desc(); dnn_shape_dst.SetMklLayout(&dst_pd); dnn_shape_dst.SetElemType(MklDnnType()); dnn_shape_dst.SetTfLayout(dnn_shape_src.GetDimension(), dnn_shape_src.GetSizesAsMklDnnDims(), dnn_shape_src.GetTfDataFormat()); - tf_shape_dst.AddDim(dst_pd.get_size() / sizeof(T)); + tf_shape_dst.AddDim(dst_pd.get_size()/sizeof(T)); } else { + // TODO(yli135): why relu's input is TF tensor in VGG16?? 
dnn_shape_dst.SetMklTensor(false); tf_shape_dst = src_tensor.shape(); } - - // Allocate output and MklDnnShape tensors separately for possible - // in-place operation + + Tensor* dst_tensor = nullptr; OP_REQUIRES_OK(context, context->forward_input_or_allocate_output( - {static_cast(src_index)}, - static_cast(dst_index), - tf_shape_dst, &dst_tensor)); + {src_index}, dst_index, tf_shape_dst, &dst_tensor)); AllocateOutputSetMklShape(context, dst_index, dnn_shape_dst); - // Destination memory descriptor is same as source memory descriptor. - auto &dst_md = src_md; - dst.SetUsrMem(dst_md, dst_tensor); + T* dst_data = static_cast(const_cast( + dst_tensor->flat().data())); - // execute net - std::vector net; - auto relu_fwd = - relu_forward(*relu_fwd_pd, src.GetOpMem(), dst.GetOpMem()); - net.push_back(relu_fwd); - stream(stream::kind::eager).submit(net).wait(); - } catch (mkldnn::error& e) { + // execute eltwise + eltwise_fwd->Execute(src_data, dst_data); + } catch (mkldnn::error &e) { string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + ", in file " + - string(__FILE__) + ":" + std::to_string(__LINE__); - OP_REQUIRES_OK( - context, - errors::Aborted("Operation received an exception:", error_msg)); + ", message: " + string(e.message) + + ", in file " + string(__FILE__) + ":" + + std::to_string(__LINE__); + OP_REQUIRES_OK(context, + errors::Aborted("Operation received an exception:", + error_msg)); } } + + private: + engine cpu_engine = engine(engine::cpu, 0); + std::shared_ptr relu_fwd_pd; }; template @@ -472,25 +876,25 @@ class MklReluGradOpBase : public OpKernel { public: ~MklReluGradOpBase() {} - explicit MklReluGradOpBase(OpKernelConstruction* context) - : OpKernel(context) {} + explicit MklReluGradOpBase(OpKernelConstruction* context) : + OpKernel(context) { + } virtual void Compute_Scalar(OpKernelContext* context) = 0; - void Compute(OpKernelContext* context) { + void Compute(OpKernelContext* context) { try { - auto 
cpu_engine = engine(engine::cpu, 0); + // auto cpu_engine = engine(engine::cpu, 0); MklDnnData src(&cpu_engine); MklDnnData diff_dst(&cpu_engine); - MklDnnData diff_src(&cpu_engine); const size_t diff_dst_index = 0; // index of diff_dst input tensor const size_t src_index = 1; // index of src input tensor const size_t diff_src_index = 0; // index of diff_src output tensor - const Tensor& src_tensor = MklGetInput(context, src_index); + const Tensor& src_tensor = MklGetInput(context, src_index); const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index); - Tensor* diff_src_tensor = nullptr; + Tensor* diff_src_tensor = nullptr; MklDnnShape dnn_shape_src, dnn_shape_diff_dst; GetMklShape(context, src_index, &dnn_shape_src); @@ -498,37 +902,23 @@ class MklReluGradOpBase : public OpKernel { int src_dims_size = src_tensor.dims(); if (src_dims_size == 0) { - Compute_Scalar(context); // scalar case doesn't use in-place operation + Compute_Scalar(context); return; } - // Set DNN primitives for src & diff_dst + // get a eltwise bwd from primitive pool + memory::dims src_dims = {}; memory::desc src_md({}, memory::data_undef, memory::format_undef); memory::desc diff_dst_md({}, memory::data_undef, memory::format_undef); - - // For creating Sum primitive, we need to ensure that all inputs are in - // same format. What that means is if we have a mixed input case - where - // one input is in Tensorflow format and one input is in MKL format -, - // then we need to ensure that all inputs are in same format for - // primitive construction. For performance reason, we say that all inputs - // are in MKL format in such case, and insert reorder for input that is - // in Tensorflow format into MKL format. On the other hand, if both the - // inputs are in MKL format or both are in Tensorflow format, then we - // dont need reorder. 
if (!dnn_shape_src.IsMklTensor() && !dnn_shape_diff_dst.IsMklTensor()) { - // If both the inputs are in Tensorflow format, we create blocked memory - // descriptor. - auto src_dims = TFShapeToMklDnnDims(src_tensor.shape()); + src_dims = TFShapeToMklDnnDims(src_tensor.shape()); auto src_strides = CalculateTFStrides(src_dims); src_md = MklDnnData::CreateBlockedMemDesc(src_dims, src_strides); diff_dst_md = src_md; } else if (dnn_shape_src.IsMklTensor() && !dnn_shape_diff_dst.IsMklTensor()) { - // If one input is in MKL format and other is in Tensorflow, then - // create respective descriptors describing the actual case. For input - // in Mkl format, we just get Mkl layout from MklDnnShape. For input in - // Tensorflow format, we create memory descriptor using data format. src_md = dnn_shape_src.GetMklLayout(); + src_dims = dnn_shape_src.GetSizesAsMklDnnDims(); memory::format src_mkl_data_format = dnn_shape_src.GetTfDataFormat(); auto src_tf_data_format = @@ -539,26 +929,23 @@ class MklReluGradOpBase : public OpKernel { memory::desc(diff_dst_dims, MklDnnType(), src_mkl_data_format); } else if (!dnn_shape_src.IsMklTensor() && dnn_shape_diff_dst.IsMklTensor()) { - // Same comment as above. diff_dst_md = dnn_shape_diff_dst.GetMklLayout(); memory::format diff_dst_mkl_data_format = dnn_shape_diff_dst.GetTfDataFormat(); auto diff_dst_tf_data_format = MklDnnDataFormatToTFDataFormat(diff_dst_mkl_data_format); - auto src_dims = TFShapeToMklDnnDimsInNCHW(src_tensor.shape(), + src_dims = TFShapeToMklDnnDimsInNCHW(src_tensor.shape(), diff_dst_tf_data_format); src_md = memory::desc(src_dims, MklDnnType(), diff_dst_mkl_data_format); } else { - // If both the inputs are in MKL format, we use Mkl layout of the input - // tensors. 
src_md = dnn_shape_src.GetMklLayout(); diff_dst_md = dnn_shape_diff_dst.GetMklLayout(); + src_dims = dnn_shape_src.GetSizesAsMklDnnDims(); } - src.SetUsrMem(src_md, &src_tensor); - diff_dst.SetUsrMem(diff_dst_md, &diff_dst_tensor); + T alpha = 0, beta = 0; // As per comment above, we tell MKLDNN that both the inputs are in same // format. So we set common memory descriptor in MKL format, if any of the @@ -573,83 +960,79 @@ class MklReluGradOpBase : public OpKernel { common_md = src_md; } - T alpha = 0, beta = 0; - std::shared_ptr relu_fwd_pd; - auto relu_fwd_desc = relu_forward::desc(prop_kind::forward_training, - alg_kind, src_md, alpha, beta); - relu_fwd_pd.reset( - new relu_forward::primitive_desc(relu_fwd_desc, cpu_engine)); - auto relu_bwd_desc = - relu_backward::desc(alg_kind, common_md, common_md, alpha, beta); - auto relu_bwd_pd = relu_backward::primitive_desc( - relu_bwd_desc, cpu_engine, *relu_fwd_pd); + MklEltwiseBwdParams bwdParams(src_dims, common_md, + alg_kind, alpha, beta); + MklEltwiseBwdPrimitive *eltwise_bwd = + MklEltwiseBwdPrimitiveFactory::Get(bwdParams); + auto eltwise_bwd_pd = eltwise_bwd->GetEltwiseBwdPd(); + + // check whether need reorder for src / diff_dst + T* src_data; + T* diff_dst_data; + std::vector net; + if (src_md.data.format != eltwise_bwd->GetSrcMemoryFormat()) { + src.SetUsrMem(src_md, &src_tensor); + src.CheckReorderToOpMem( + eltwise_bwd_pd.get()->diff_src_primitive_desc(), &net); + src_data = static_cast(src.GetOpMem().get_data_handle()); + } else { + src_data = static_cast( + const_cast(src_tensor.flat().data())); + } + + if (diff_dst_md.data.format != eltwise_bwd->GetDiffDstMemoryFormat()) { + diff_dst.SetUsrMem(diff_dst_md, &diff_dst_tensor); + diff_dst.CheckReorderToOpMem( + eltwise_bwd_pd.get()->diff_src_primitive_desc(), &net); + diff_dst_data = static_cast( + diff_dst.GetOpMem().get_data_handle()); + } else { + diff_dst_data = static_cast(const_cast( + diff_dst_tensor.flat().data())); + } + 
stream(stream::kind::eager).submit(net).wait(); // allocate diff_src tensor MklDnnShape dnn_shape_diff_src; TensorShape tf_shape_diff_src; - if (dnn_shape_src.IsMklTensor() || - dnn_shape_diff_dst.IsMklTensor()) { + if (dnn_shape_src.IsMklTensor()) { + auto diff_src_pd = eltwise_bwd_pd->diff_src_primitive_desc(); dnn_shape_diff_src.SetMklTensor(true); - auto diff_src_pd = relu_bwd_pd.diff_src_primitive_desc(); dnn_shape_diff_src.SetMklLayout(&diff_src_pd); dnn_shape_diff_src.SetElemType(MklDnnType()); - if (dnn_shape_src.IsMklTensor()) { - dnn_shape_diff_src.SetTfLayout(dnn_shape_src.GetDimension(), - dnn_shape_src.GetSizesAsMklDnnDims(), - dnn_shape_src.GetTfDataFormat()); - } else { - dnn_shape_diff_src.SetTfLayout(dnn_shape_diff_dst.GetDimension(), - dnn_shape_diff_dst.GetSizesAsMklDnnDims(), - dnn_shape_diff_dst.GetTfDataFormat()); - } - tf_shape_diff_src.AddDim(diff_src_pd.get_size() / sizeof(T)); + dnn_shape_diff_src.SetTfLayout(dnn_shape_src.GetDimension(), + dnn_shape_src.GetSizesAsMklDnnDims(), + dnn_shape_src.GetTfDataFormat()); + tf_shape_diff_src.AddDim(diff_src_pd.get_size()/sizeof(T)); } else { dnn_shape_diff_src.SetMklTensor(false); - // both src and diff_dst are TensorFlow layout, - // so it is ok to get TensorFlow shape. tf_shape_diff_src = src_tensor.shape(); } - // Allocate diff_src and MklDnnShape tensors separately for possible - // in-place operation - OP_REQUIRES_OK(context, context->forward_input_or_allocate_output( - {static_cast(diff_dst_index)}, - static_cast(diff_src_index), - tf_shape_diff_src, - &diff_src_tensor)); - AllocateOutputSetMklShape(context, diff_src_index, dnn_shape_diff_src); - - // diff_src memory descriptor is same as memory descriptor for both - // inputs. 
- diff_src.SetUsrMem(common_md, diff_src_tensor); - - PrepareAndExecuteNet(relu_bwd_pd, &src, &diff_src, &diff_dst); - } catch (mkldnn::error& e) { - string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + ", in file " + - string(__FILE__) + ":" + std::to_string(__LINE__); - OP_REQUIRES_OK( - context, - errors::Aborted("Operation received an exception:", error_msg)); + OP_REQUIRES_OK(context, context->forward_input_or_allocate_output( + {diff_dst_index}, diff_src_index, tf_shape_diff_src, + &diff_src_tensor)); + AllocateOutputSetMklShape(context, diff_src_index, dnn_shape_diff_src); + + T* diff_src_data = static_cast(const_cast( + diff_src_tensor->flat().data())); + + // execute eltwise bwd + eltwise_bwd->Execute(src_data, diff_dst_data, diff_src_data); + } catch (mkldnn::error &e) { + string error_msg = "Status: " + std::to_string(e.status) + + ", message: " + string(e.message) + + ", in file " + string(__FILE__) + ":" + + std::to_string(__LINE__); + OP_REQUIRES_OK(context, + errors::Aborted("Operation received an exception:", + error_msg)); } } - void PrepareAndExecuteNet(const relu_backward::primitive_desc& relu_prim_desc, - MklDnnData* src, MklDnnData* diff_src, - MklDnnData* diff_dst) { - std::vector net; - - // Check if we need to reorder original input tensors into common_md layout - // that we set for primitive creation. diff_src_primitive_desc is same as - // common_md. 
- src->CheckReorderToOpMem(relu_prim_desc.diff_src_primitive_desc(), &net); - diff_dst->CheckReorderToOpMem(relu_prim_desc.diff_src_primitive_desc(), - &net); - - net.push_back(relu_backward(relu_prim_desc, src->GetOpMem(), - diff_dst->GetOpMem(), diff_src->GetOpMem())); - stream(stream::kind::eager).submit(net).wait(); - } + private: + engine cpu_engine = engine(engine::cpu, 0); + std::shared_ptr relu_fwd_pd; }; template diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h index 230b4278ca..c4b5e124fb 100644 --- a/tensorflow/core/util/mkl_util.h +++ b/tensorflow/core/util/mkl_util.h @@ -1794,11 +1794,11 @@ class MklDnnData { } }; -/// Base class for operations with reuse of DNN primitives +/// Base class for operations with reuse of primitives /// -class DnnOp { +class MklPrimitive { public: - virtual ~DnnOp() {} + virtual ~MklPrimitive() {} // Dummy data. Its size, hard-coded as 256 here, does // not matter since MKL should never operate on this buffer. @@ -1806,33 +1806,33 @@ class DnnOp { }; const mkldnn::memory::dims NONE_DIMS = {}; -// This constant is used to declare dummy buffer (size), for MKL primitives + template -class DnnOpFactory { +class MklPrimitiveFactory { public: - DnnOpFactory() {} - ~DnnOpFactory() {} + MklPrimitiveFactory() {} + ~MklPrimitiveFactory() {} - DnnOp* GetOp(const std::string& key) { - auto stream_iter = DnnOpFactory::GetHashMap().find(key); - if (stream_iter == DnnOpFactory::GetHashMap().end()) { + MklPrimitive* GetOp(const std::string& key) { + auto stream_iter = MklPrimitiveFactory::GetHashMap().find(key); + if (stream_iter == MklPrimitiveFactory::GetHashMap().end()) { return nullptr; } else { return stream_iter->second; } } - void SetOp(const std::string& key, DnnOp* op) { - auto stream_iter = DnnOpFactory::GetHashMap().find(key); + void SetOp(const std::string& key, MklPrimitive* op) { + auto stream_iter = MklPrimitiveFactory::GetHashMap().find(key); - CHECK(stream_iter == 
DnnOpFactory::GetHashMap().end()); + CHECK(stream_iter == MklPrimitiveFactory::GetHashMap().end()); - DnnOpFactory::GetHashMap()[key] = op; + MklPrimitiveFactory::GetHashMap()[key] = op; } private: - static inline std::unordered_map &GetHashMap() { - static thread_local std::unordered_map map_; + static inline std::unordered_map &GetHashMap() { + static thread_local std::unordered_map map_; return map_; } }; -- GitLab From 2bcd873e839c66b2405226508286da371dd8afbe Mon Sep 17 00:00:00 2001 From: Guozhong Zhuang Date: Mon, 21 May 2018 13:27:46 -0700 Subject: [PATCH 002/598] revert mkl_conv_ops.cc to avoid PR review confusion --- tensorflow/core/kernels/mkl_conv_ops.cc | 280 ++++++++++-------------- 1 file changed, 116 insertions(+), 164 deletions(-) diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc index c032add82e..f2b14f1278 100644 --- a/tensorflow/core/kernels/mkl_conv_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_ops.cc @@ -59,8 +59,7 @@ namespace tensorflow { #ifndef INTEL_MKL_ML -// This structure aggregates multiple inputs to Conv2DFwd* methods. 
-struct MklConvFwdParams { +struct ConvFwdDimensions { memory::dims src_dims; memory::dims filter_dims; memory::dims bias_dims; @@ -70,7 +69,7 @@ struct MklConvFwdParams { memory::dims padding_left; memory::dims padding_right; - MklConvFwdParams(memory::dims src_dims, + ConvFwdDimensions(memory::dims src_dims, memory::dims filter_dims, memory::dims bias_dims, memory::dims dst_dims, memory::dims strides, memory::dims dilations, memory::dims padding_left, @@ -83,40 +82,35 @@ struct MklConvFwdParams { }; template -class MklConv2DFwdPrimitive: public MklPrimitive { +class Conv2DFwd : public DnnOp { public: - explicit MklConv2DFwdPrimitive(const MklConvFwdParams& convFwdDims) { - context_.fwd_stream.reset(new stream(stream::kind::eager)); + explicit Conv2DFwd(const ConvFwdDimensions& convFwdDims) { + fwd_stream_.reset(new stream(stream::kind::eager)); // create conv primitive - if (context_.conv_fwd == nullptr) { + if (conv_fwd_ == nullptr) { Setup(convFwdDims); } } - ~MklConv2DFwdPrimitive() {} + ~Conv2DFwd() {} // Convolution forward execute with bias // src_data: input data buffer of src // filter_data: input data buffer of filter (weights) // bias_data: input data buffer of bias // dst_data: output data buffer of dst - void Execute(const T* src_data, const T* filter_data, - const T* bias_data, const T* dst_data) { - context_.src_mem->set_data_handle( - static_cast(const_cast(src_data))); - context_.filter_mem->set_data_handle( - static_cast(const_cast(filter_data))); - context_.bias_mem->set_data_handle( - static_cast(const_cast(bias_data))); - context_.dst_mem->set_data_handle( - static_cast(const_cast(dst_data))); - context_.fwd_stream->submit(context_.fwd_primitives); + void Execute(T* src_data, T* filter_data, T* bias_data, T* dst_data) { + src_mem_->set_data_handle(static_cast(src_data)); + filter_mem_->set_data_handle(static_cast(filter_data)); + bias_mem_->set_data_handle(static_cast(bias_data)); + dst_mem_->set_data_handle(static_cast(dst_data)); + 
fwd_stream_->submit(fwd_primitives_); // after exec, set data handle back - context_.src_mem->set_data_handle(DummyData); - context_.filter_mem->set_data_handle(DummyData); - context_.bias_mem->set_data_handle(DummyData); - context_.dst_mem->set_data_handle(DummyData); + src_mem_->set_data_handle(DummyData); + filter_mem_->set_data_handle(DummyData); + bias_mem_->set_data_handle(DummyData); + dst_mem_->set_data_handle(DummyData); return; } @@ -125,174 +119,139 @@ class MklConv2DFwdPrimitive: public MklPrimitive { // src_data: input data buffer of src // filter_data: input data buffer of filter (weights) // dst_data: output data buffer of dst - void Execute(const T* src_data, const T* filter_data, - const T* dst_data) { - context_.src_mem->set_data_handle( - static_cast(const_cast(src_data))); - context_.filter_mem->set_data_handle( - static_cast(const_cast(filter_data))); - context_.dst_mem->set_data_handle( - static_cast(const_cast(dst_data))); - context_.fwd_stream->submit(context_.fwd_primitives); - - // after execution, set data handle back - context_.src_mem->set_data_handle(DummyData); - context_.filter_mem->set_data_handle(DummyData); - context_.dst_mem->set_data_handle(DummyData); + void Execute(T* src_data, T* filter_data, T* dst_data) { + src_mem_->set_data_handle(static_cast(src_data)); + filter_mem_->set_data_handle(static_cast(filter_data)); + dst_mem_->set_data_handle(static_cast(dst_data)); + fwd_stream_->submit(fwd_primitives_); - return; - } + // after exec, set data handle back + src_mem_->set_data_handle(DummyData); + filter_mem_->set_data_handle(DummyData); + dst_mem_->set_data_handle(DummyData); - memory::format GetSrcMemoryFormat() const { - return context_.src_fmt; + return; } - memory::format GetFilterMemoryFormat() const { - return context_.filter_fmt; - } + // expected memory format for this primitive instance + memory::format src_fmt_; + memory::format filter_fmt_; - std::shared_ptr - GetPrimitiveDesc() const { - return context_.fwd_pd; - 
} + // convolution primitive + std::shared_ptr fwd_pd_; + std::shared_ptr conv_fwd_; private: - // Primitive reuse context for Conv2D Fwd op - struct ConvFwdContext { - // expected memory format for this primitive instance - memory::format src_fmt; - memory::format filter_fmt; - - // MKLDNN memory - std::shared_ptr src_mem; - std::shared_ptr filter_mem; - std::shared_ptr bias_mem; - std::shared_ptr dst_mem; - - // desc & prmitive desc - std::shared_ptr fwd_desc; - - // memory desc - std::shared_ptr src_md; - std::shared_ptr filter_md; - std::shared_ptr bias_md; - std::shared_ptr dst_md; - - // convolution primitive - std::shared_ptr fwd_pd; - std::shared_ptr conv_fwd; - - std::shared_ptr fwd_stream; - std::vector fwd_primitives; - - ConvFwdContext() : - src_fmt(memory::format::any), filter_fmt(memory::format::any), - src_mem(nullptr), filter_mem(nullptr), bias_mem(nullptr), - dst_mem(nullptr), fwd_desc(nullptr), - src_md(nullptr), filter_md(nullptr), bias_md(nullptr), - fwd_pd(nullptr), conv_fwd(nullptr), fwd_stream(nullptr) { - } - } context_; - - engine cpu_engine_ = engine(engine::cpu, 0); - - void Setup(const MklConvFwdParams& convFwdDims) { + void Setup(const ConvFwdDimensions& convFwdDims) { // create memory descriptors for convolution data w/ no specified format - context_.src_md.reset(new memory::desc({convFwdDims.src_dims}, + src_md_.reset(new memory::desc({convFwdDims.src_dims}, MklDnnType(), memory::format::any)); - context_.filter_md.reset(new memory::desc({convFwdDims.filter_dims}, + filter_md_.reset(new memory::desc({convFwdDims.filter_dims}, MklDnnType(), memory::format::any)); - context_.dst_md.reset(new memory::desc({convFwdDims.dst_dims}, + dst_md_.reset(new memory::desc({convFwdDims.dst_dims}, MklDnnType(), memory::format::any)); if (!convFwdDims.bias_dims.empty()) - context_.bias_md.reset(new memory::desc({convFwdDims.bias_dims}, + bias_md_.reset(new memory::desc({convFwdDims.bias_dims}, MklDnnType(), memory::format::any)); // create a 
convolution if (!convFwdDims.bias_dims.empty()) { - context_.fwd_desc.reset(new convolution_forward::desc(prop_kind::forward, - convolution_direct, *context_.src_md, *context_.filter_md, - *context_.bias_md, *context_.dst_md, + fwd_desc_.reset(new convolution_forward::desc(prop_kind::forward, + convolution_direct, *src_md_, *filter_md_, *bias_md_, *dst_md_, convFwdDims.strides, convFwdDims.dilations, convFwdDims.padding_left, convFwdDims.padding_right, padding_kind::zero)); } else { - context_.fwd_desc.reset(new convolution_forward::desc(prop_kind::forward, - convolution_direct, *context_.src_md, *context_.filter_md, - *context_.dst_md, convFwdDims.strides, convFwdDims.dilations, - convFwdDims.padding_left, convFwdDims.padding_right, - padding_kind::zero)); + fwd_desc_.reset(new convolution_forward::desc(prop_kind::forward, + convolution_direct, *src_md_, *filter_md_, *dst_md_, + convFwdDims.strides, convFwdDims.dilations, convFwdDims.padding_left, + convFwdDims.padding_right, padding_kind::zero)); } - context_.fwd_pd.reset(new convolution_forward::primitive_desc( - *context_.fwd_desc, cpu_engine_)); + fwd_pd_.reset(new convolution_forward::primitive_desc( + *fwd_desc_, cpu_engine_)); // store the expected memory format - context_.src_fmt = static_cast( - context_.fwd_pd.get()->src_primitive_desc().desc().data.format); + src_fmt_ = static_cast( + fwd_pd_.get()->src_primitive_desc().desc().data.format); - context_.filter_fmt = static_cast( - context_.fwd_pd.get()->weights_primitive_desc().desc().data.format); + filter_fmt_ = static_cast( + fwd_pd_.get()->weights_primitive_desc().desc().data.format); // create memory primitive based on dummy data - context_.src_mem.reset(new memory( - context_.fwd_pd.get()->src_primitive_desc(), DummyData)); - context_.filter_mem.reset(new memory( - context_.fwd_pd.get()->weights_primitive_desc(), DummyData)); - context_.dst_mem.reset(new memory( - context_.fwd_pd.get()->dst_primitive_desc(), DummyData)); + src_mem_.reset(new 
memory(fwd_pd_.get()->src_primitive_desc(), DummyData)); + filter_mem_.reset(new memory(fwd_pd_.get()->weights_primitive_desc(), + DummyData)); + dst_mem_.reset(new memory(fwd_pd_.get()->dst_primitive_desc(), DummyData)); // create convolution primitive and add it to net if (!convFwdDims.bias_dims.empty()) { - context_.bias_mem.reset(new memory({{{convFwdDims.bias_dims}, - MklDnnType(), memory::format::x}, cpu_engine_}, DummyData)); - context_.conv_fwd.reset(new convolution_forward( - *context_.fwd_pd, *context_.src_mem, *context_.filter_mem, - *context_.bias_mem, *context_.dst_mem)); + bias_mem_.reset(new memory({{{convFwdDims.bias_dims}, MklDnnType(), + memory::format::x}, cpu_engine_}, DummyData)); + conv_fwd_.reset(new convolution_forward(*fwd_pd_, *src_mem_, + *filter_mem_, *bias_mem_, *dst_mem_)); } else { - context_.conv_fwd.reset(new convolution_forward( - *context_.fwd_pd, *context_.src_mem, - *context_.filter_mem, *context_.dst_mem)); + conv_fwd_.reset(new convolution_forward(*fwd_pd_, *src_mem_, + *filter_mem_, *dst_mem_)); } - context_.fwd_primitives.push_back(*context_.conv_fwd); + fwd_primitives_.push_back(*conv_fwd_); return; } + + // MKLDNN memory + std::shared_ptr src_mem_; + std::shared_ptr filter_mem_; + std::shared_ptr bias_mem_; + std::shared_ptr dst_mem_; + + std::shared_ptr fwd_stream_; + std::vector fwd_primitives_; + + // desc & prmitive desc + std::shared_ptr fwd_desc_; + + // memory desc + std::shared_ptr src_md_; + std::shared_ptr filter_md_; + std::shared_ptr bias_md_; + std::shared_ptr dst_md_; + + engine cpu_engine_ = engine(engine::cpu, 0); }; template -class MklConv2DFwdPrimitiveFactory : public MklPrimitiveFactory { +class Conv2DFwdFactory : public DnnOpFactory { public: - static MklConv2DFwdPrimitive* Get(const MklConvFwdParams& convFwdDims) { - MklConv2DFwdPrimitive* conv2d_fwd = nullptr; + static Conv2DFwd* Get(const ConvFwdDimensions& convFwdDims) { + Conv2DFwd* conv2d_fwd = nullptr; // try to find a suitable one in pool - 
conv2d_fwd = dynamic_cast*> ( - MklConv2DFwdPrimitiveFactory::GetInstance().GetConv2DFwd( - convFwdDims)); + conv2d_fwd = dynamic_cast*> ( + Conv2DFwdFactory::GetInstance().GetConv2DFwd(convFwdDims)); if (conv2d_fwd == nullptr) { - conv2d_fwd = new MklConv2DFwdPrimitive(convFwdDims); - MklConv2DFwdPrimitiveFactory::GetInstance().SetConv2DFwd( + conv2d_fwd = new Conv2DFwd(convFwdDims); + Conv2DFwdFactory::GetInstance().SetConv2DFwd( convFwdDims, conv2d_fwd); } return conv2d_fwd; } private: - MklConv2DFwdPrimitiveFactory() {} - ~MklConv2DFwdPrimitiveFactory() {} + Conv2DFwdFactory() {} + ~Conv2DFwdFactory() {} static const int kDilationH = 0, kDilationW = 1; - static MklConv2DFwdPrimitiveFactory& GetInstance() { - static MklConv2DFwdPrimitiveFactory instance_; + static Conv2DFwdFactory& GetInstance() { + static Conv2DFwdFactory instance_; return instance_; } - static std::string CreateKey(const MklConvFwdParams& convFwdDims) { + static std::string CreateKey(const ConvFwdDimensions& convFwdDims) { std::string prefix = "conv2d_fwd_"; FactoryKeyCreator key_creator; key_creator.AddAsKey(prefix); @@ -307,12 +266,12 @@ class MklConv2DFwdPrimitiveFactory : public MklPrimitiveFactory { return key_creator.GetKey(); } - MklPrimitive* GetConv2DFwd(const MklConvFwdParams& convFwdDims) { + DnnOp* GetConv2DFwd(const ConvFwdDimensions& convFwdDims) { std::string key = CreateKey(convFwdDims); return this->GetOp(key); } - void SetConv2DFwd(const MklConvFwdParams& convFwdDims, MklPrimitive *op) { + void SetConv2DFwd(const ConvFwdDimensions& convFwdDims, DnnOp *op) { std::string key = CreateKey(convFwdDims); this->SetOp(key, op); } @@ -803,6 +762,7 @@ class MklConv2DOp : public OpKernel { MklDnnData src(&cpu_engine); MklDnnData filter(&cpu_engine); + MklDnnData dst(&cpu_engine); // output memory::dims src_dims, filter_dims, padding_left, padding_right, dilations, strides; @@ -852,6 +812,7 @@ class MklConv2DOp : public OpKernel { auto src_md = src_mkl_shape.IsMklTensor() ? 
src_mkl_shape.GetMklLayout() : memory::desc(src_dims, MklDnnType(), tf_fmt); + src.SetUsrMem(src_md, &src_tensor); // Although filter shape (filter_dims) required is in MKL-DNN order, // the layout is Tensorflow's layout (HWIO). @@ -859,28 +820,29 @@ class MklConv2DOp : public OpKernel { ? filter_mkl_shape.GetMklLayout() : memory::desc(filter_dims, MklDnnType(), memory::format::hwio); + filter.SetUsrMem(filter_md, &filter_tensor); // MKLDNN dilation starts from 0. dilations[kDilationH] -= 1; dilations[kDilationW] -= 1; // get a conv2d fwd from primitive pool - MklConv2DFwdPrimitive *conv2d_fwd = nullptr; + Conv2DFwd *conv2d_fwd = nullptr; if (biasEnabled) { memory::dims bias_dims = {}; conv_utl.GetBiasSizeInMklOrder(kInputIndex_Bias, &bias_dims); - MklConvFwdParams convFwdDims(src_dims, filter_dims, bias_dims, + ConvFwdDimensions convFwdDims(src_dims, filter_dims, bias_dims, dst_dims_mkl_order, strides, dilations, padding_left, padding_right); - conv2d_fwd = MklConv2DFwdPrimitiveFactory::Get(convFwdDims); + conv2d_fwd = Conv2DFwdFactory::Get(convFwdDims); } else { - MklConvFwdParams convFwdDims(src_dims, filter_dims, NONE_DIMS, + ConvFwdDimensions convFwdDims(src_dims, filter_dims, NONE_DIMS, dst_dims_mkl_order, strides, dilations, padding_left, padding_right); - conv2d_fwd = MklConv2DFwdPrimitiveFactory::Get(convFwdDims); + conv2d_fwd = Conv2DFwdFactory::Get(convFwdDims); } // allocate output tensors output_tensor and filter_out_tensor std::shared_ptr - conv_fwd_pd = conv2d_fwd->GetPrimitiveDesc(); + conv_fwd_pd = conv2d_fwd->fwd_pd_; AllocateOutputTensor(context, *conv_fwd_pd, dst_dims_mkl_order, tf_fmt, &dst_tensor); Tensor* filter_out_tensor = nullptr; @@ -892,30 +854,20 @@ class MklConv2DOp : public OpKernel { // check whether src/filter need reorder std::vector net; - T *src_data = nullptr; - if (src_md.data.format != conv2d_fwd->GetSrcMemoryFormat()) { - src.SetUsrMem(src_md, &src_tensor); - src.CheckReorderToOpMem( - conv_fwd_pd.get()->src_primitive_desc(), 
&net); - src_data = static_cast(src.GetOpMem().get_data_handle()); - } else { - src_data = static_cast(const_cast( - src_tensor.flat().data())); - } - T *filter_data = nullptr; - if (filter_md.data.format != conv2d_fwd->GetFilterMemoryFormat()) { - filter.SetUsrMem(filter_md, &filter_tensor); - filter.CheckReorderToOpMem( - conv_fwd_pd.get()->weights_primitive_desc(), - filter.GetTensorBuffer(filter_out_tensor), &net); - filter_data = static_cast(filter.GetOpMem().get_data_handle()); - } else { - filter_data = static_cast(const_cast( - filter_tensor.flat().data())); - } - + if (src_md.data.format != conv2d_fwd->src_fmt_) + src.CheckReorderToOpMem( + conv_fwd_pd.get()->src_primitive_desc(), &net); + + if (filter_md.data.format != conv2d_fwd->filter_fmt_) + filter.CheckReorderToOpMem( + conv_fwd_pd.get()->weights_primitive_desc(), + filter.GetTensorBuffer(filter_out_tensor), &net); stream(stream::kind::eager).submit(net).wait(); + T* src_data = static_cast( + src.GetOpMem().get_data_handle()); + T* filter_data = static_cast( + filter.GetOpMem().get_data_handle()); // execute convolution if (biasEnabled) { -- GitLab From e298fae53bee33eaed6ab152d029db5c6fac34c3 Mon Sep 17 00:00:00 2001 From: JxKing Date: Thu, 31 May 2018 12:55:35 +0800 Subject: [PATCH 003/598] fix multiple values for keyword argument error --- .../contrib/opt/python/training/model_average_optimizer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tensorflow/contrib/opt/python/training/model_average_optimizer.py b/tensorflow/contrib/opt/python/training/model_average_optimizer.py index b6b10e500b..e4d1ae5d63 100644 --- a/tensorflow/contrib/opt/python/training/model_average_optimizer.py +++ b/tensorflow/contrib/opt/python/training/model_average_optimizer.py @@ -89,7 +89,9 @@ class ModelAverageCustomGetter(object): self._local_2_global[local_var] = global_variable return local_var else: - return getter(name, trainable, collections, *args, **kwargs) + kwargs['trainable'] = trainable 
+ kwargs['collections'] = collections + return getter(name, *args, **kwargs) class ModelAverageOptimizer(optimizer.Optimizer): -- GitLab From 7004927328cd8166c6858984ec649e4eea0ceab0 Mon Sep 17 00:00:00 2001 From: JxKing Date: Thu, 31 May 2018 12:57:52 +0800 Subject: [PATCH 004/598] fix multiple values for keyword argument for easgd --- .../contrib/opt/python/training/elastic_average_optimizer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py b/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py index 5763593b81..545c3477bf 100644 --- a/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py +++ b/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py @@ -100,7 +100,9 @@ class ElasticAverageCustomGetter(object): self._global_map[local_var] = global_center_variable return local_var else: - return getter(name, trainable, collections, *args, **kwargs) + kwargs['trainable'] = trainable + kwargs['collections'] = collections + return getter(name, *args, **kwargs) class ElasticAverageOptimizer(optimizer.Optimizer): -- GitLab From bdc37544a98cd777e71f83fd1c46a42038004476 Mon Sep 17 00:00:00 2001 From: JxKing Date: Thu, 31 May 2018 12:59:45 +0800 Subject: [PATCH 005/598] place easgd in ea_coustom_getter scope --- .../elastic_average_optimizer_test.py | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/tensorflow/contrib/opt/python/training/elastic_average_optimizer_test.py b/tensorflow/contrib/opt/python/training/elastic_average_optimizer_test.py index 5ed8057b86..9d57dc08f6 100644 --- a/tensorflow/contrib/opt/python/training/elastic_average_optimizer_test.py +++ b/tensorflow/contrib/opt/python/training/elastic_average_optimizer_test.py @@ -79,21 +79,21 @@ def _get_workers(num_workers, period, workers, moving_rate): var_0 = variable_scope.get_variable(initializer=0.0, name="v0") var_1 = 
variable_scope.get_variable(initializer=1.0, name="v1") - with ops.device("/job:worker/task:" + str(worker_id)): - grads_0 = constant_op.constant(-1.0) - grads_1 = constant_op.constant(-1.0) - - sgd_opt = gradient_descent.GradientDescentOptimizer(1.0) - opt = ElasticAverageOptimizer( - opt=sgd_opt, - num_worker=num_workers, - moving_rate=moving_rate, - communication_period=period, - ea_custom_getter=ea_coustom) - train_op = [ - opt.apply_gradients(([grads_0, var_0], [grads_1, var_1]), - global_step) - ] + with ops.device("/job:worker/task:" + str(worker_id)): + grads_0 = constant_op.constant(-1.0) + grads_1 = constant_op.constant(-1.0) + + sgd_opt = gradient_descent.GradientDescentOptimizer(1.0) + opt = ElasticAverageOptimizer( + opt=sgd_opt, + num_worker=num_workers, + moving_rate=moving_rate, + communication_period=period, + ea_custom_getter=ea_coustom) + train_op = [ + opt.apply_gradients(([grads_0, var_0], [grads_1, var_1]), + global_step) + ] easgd_hook = opt.make_session_run_hook(is_chief, worker_id) # Creates MonitoredSession sess = training.MonitoredTrainingSession( -- GitLab From f4020cfc79582aa689f7a575445b95e60974071f Mon Sep 17 00:00:00 2001 From: JxKing Date: Thu, 31 May 2018 13:01:25 +0800 Subject: [PATCH 006/598] place ma_opt in ma_coustom_getter scope --- .../training/model_average_optimizer_test.py | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/tensorflow/contrib/opt/python/training/model_average_optimizer_test.py b/tensorflow/contrib/opt/python/training/model_average_optimizer_test.py index 3acd940268..b1fc50a21f 100644 --- a/tensorflow/contrib/opt/python/training/model_average_optimizer_test.py +++ b/tensorflow/contrib/opt/python/training/model_average_optimizer_test.py @@ -80,28 +80,28 @@ def _get_workers(num_workers, steps, workers): var_0 = variable_scope.get_variable(initializer=0.0, name="v0") var_1 = variable_scope.get_variable(initializer=1.0, name="v1") - with ops.device("/job:worker/task:" + 
str(worker_id)): - if worker_id == 0: - grads_0 = constant_op.constant(-1.0) - grads_1 = constant_op.constant(-1.0) - else: - grads_0 = constant_op.constant(-2.0) - grads_1 = constant_op.constant(-2.0) - sgd_opt = gradient_descent.GradientDescentOptimizer(1.0) - opt = model_average_optimizer.ModelAverageOptimizer( - opt=sgd_opt, - num_worker=num_workers, - ma_custom_getter=ma_coustom, - is_chief=is_chief, - interval_steps=steps) - train_op = [ - opt.apply_gradients([[grads_0, var_0], [grads_1, var_1]], - global_step) - ] - easgd_hook = opt.make_session_run_hook() + with ops.device("/job:worker/task:" + str(worker_id)): + if worker_id == 0: + grads_0 = constant_op.constant(-1.0) + grads_1 = constant_op.constant(-1.0) + else: + grads_0 = constant_op.constant(-2.0) + grads_1 = constant_op.constant(-2.0) + sgd_opt = gradient_descent.GradientDescentOptimizer(1.0) + opt = model_average_optimizer.ModelAverageOptimizer( + opt=sgd_opt, + num_worker=num_workers, + ma_custom_getter=ma_coustom, + is_chief=is_chief, + interval_steps=steps) + train_op = [ + opt.apply_gradients([[grads_0, var_0], [grads_1, var_1]], + global_step) + ] + ma_hook = opt.make_session_run_hook() # Creates MonitoredSession sess = training.MonitoredTrainingSession( - workers[worker_id].target, hooks=[easgd_hook]) + workers[worker_id].target, hooks=[ma_hook]) sessions.append(sess) graphs.append(graph) -- GitLab From 6c279ad4055a2d568977a02a2eb3b1303117ac15 Mon Sep 17 00:00:00 2001 From: JxKing Date: Thu, 31 May 2018 19:23:32 +0800 Subject: [PATCH 007/598] fix "workers share local variables" error --- .../contrib/opt/python/training/model_average_optimizer.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tensorflow/contrib/opt/python/training/model_average_optimizer.py b/tensorflow/contrib/opt/python/training/model_average_optimizer.py index e4d1ae5d63..746df77ba2 100644 --- a/tensorflow/contrib/opt/python/training/model_average_optimizer.py +++ 
b/tensorflow/contrib/opt/python/training/model_average_optimizer.py @@ -91,7 +91,11 @@ class ModelAverageCustomGetter(object): else: kwargs['trainable'] = trainable kwargs['collections'] = collections - return getter(name, *args, **kwargs) + if ops.GraphKeys.LOCAL_VARIABLES in collections: + with ops.device(self._worker_device): + return getter(name, *args, **kwargs) + else: + return getter(name, *args, **kwargs) class ModelAverageOptimizer(optimizer.Optimizer): -- GitLab From 16c42f0d4826b12a5359281997ee3f8e27fd5a87 Mon Sep 17 00:00:00 2001 From: JxKing Date: Thu, 31 May 2018 19:24:19 +0800 Subject: [PATCH 008/598] fix "workers share local variables" error --- .../opt/python/training/elastic_average_optimizer.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py b/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py index 545c3477bf..209c4611f3 100644 --- a/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py +++ b/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py @@ -102,7 +102,12 @@ class ElasticAverageCustomGetter(object): else: kwargs['trainable'] = trainable kwargs['collections'] = collections - return getter(name, *args, **kwargs) + if ops.GraphKeys.LOCAL_VARIABLES in collections: + with ops.device(self._worker_device): + return getter(name, *args, **kwargs) + else: + return getter(name, *args, **kwargs) + class ElasticAverageOptimizer(optimizer.Optimizer): -- GitLab From f369de2bb9f28c36b8b654db3dbd4dd187482c22 Mon Sep 17 00:00:00 2001 From: Guozhong Zhuang Date: Tue, 12 Jun 2018 15:54:37 -0700 Subject: [PATCH 009/598] code refactoring per Rasmus's suggestions on PR 19754 --- tensorflow/core/kernels/mkl_relu_op.cc | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tensorflow/core/kernels/mkl_relu_op.cc b/tensorflow/core/kernels/mkl_relu_op.cc index 048d4883b2..a52c879721 100644 --- 
a/tensorflow/core/kernels/mkl_relu_op.cc +++ b/tensorflow/core/kernels/mkl_relu_op.cc @@ -65,7 +65,8 @@ class MklEltwiseFwdParams { template class MklEltwiseFwdPrimitive : public MklPrimitive { public: - explicit MklEltwiseFwdPrimitive(const MklEltwiseFwdParams& fwdParams) { + explicit MklEltwiseFwdPrimitive(const MklEltwiseFwdParams& fwdParams) : + cpu_engine_(engine::cpu, 0) { // store expected format context_.src_fmt = static_cast( fwdParams.src_md.data.format); @@ -90,7 +91,6 @@ class MklEltwiseFwdPrimitive : public MklPrimitive { // after execution, set data handle back context_.src_mem->set_data_handle(DummyData); context_.dst_mem->set_data_handle(DummyData); - return; } std::shared_ptr GetEltwiseFwdPd() { @@ -133,7 +133,7 @@ class MklEltwiseFwdPrimitive : public MklPrimitive { fwd_desc(nullptr), fwd_pd(nullptr), src_md(nullptr), dst_md(nullptr), src_mpd(nullptr), eltwise_fwd(nullptr), fwd_stream(nullptr) { } - } context_; + }; // Eltwise forward primitive setup void Setup(const MklEltwiseFwdParams& fwdParams) { @@ -159,10 +159,10 @@ class MklEltwiseFwdPrimitive : public MklPrimitive { *context_.src_mem, *context_.dst_mem)); context_.fwd_primitives.push_back(*context_.eltwise_fwd); - return; } - engine cpu_engine_ = engine(engine::cpu, 0); + struct EltwiseFwdContext context_; + engine cpu_engine_; }; template @@ -242,7 +242,8 @@ class MklEltwiseBwdParams { template class MklEltwiseBwdPrimitive : public MklPrimitive { public: - explicit MklEltwiseBwdPrimitive(const MklEltwiseBwdParams& bwdParams) { + explicit MklEltwiseBwdPrimitive(const MklEltwiseBwdParams& bwdParams) : + cpu_engine_(engine::cpu, 0) { context_.src_fmt = static_cast( bwdParams.common_md.data.format); context_.diff_dst_fmt = static_cast( @@ -271,7 +272,6 @@ class MklEltwiseBwdPrimitive : public MklPrimitive { context_.src_mem->set_data_handle(DummyData); context_.diff_dst_mem->set_data_handle(DummyData); context_.diff_src_mem->set_data_handle(DummyData); - return; } std::shared_ptr 
GetEltwiseBwdPd() { @@ -329,7 +329,7 @@ class MklEltwiseBwdPrimitive : public MklPrimitive { fwd_desc(nullptr), fwd_pd(nullptr), bwd_pd(nullptr), eltwise_bwd(nullptr), bwd_stream(nullptr) { } - } context_; + }; // Eltwise backward primitive setup void Setup(const MklEltwiseBwdParams& bwdParams) { @@ -365,10 +365,10 @@ class MklEltwiseBwdPrimitive : public MklPrimitive { *context_.src_mem, *context_.diff_dst_mem, *context_.diff_src_mem)); context_.bwd_primitives.push_back(*context_.eltwise_bwd); - return; } - engine cpu_engine_ = engine(engine::cpu, 0); + struct EltwiseBwdContext context_; + engine cpu_engine_; }; -- GitLab From 0059fe57ce7f6b8397b72acfb0ef30013d748116 Mon Sep 17 00:00:00 2001 From: PENGWA Date: Tue, 19 Jun 2018 20:37:58 +0800 Subject: [PATCH 010/598] consider gpu memory fraction option for memory optimizer (cherry picked from commit d7b2a4030d4b6d57f7453f986fdea346e8a76b7c) --- .../core/common_runtime/graph_execution_state.cc | 4 +++- .../core/grappler/optimizers/memory_optimizer.cc | 14 ++++++++------ .../core/grappler/optimizers/memory_optimizer.h | 3 +++ .../core/grappler/optimizers/meta_optimizer.cc | 12 +++++++----- .../core/grappler/optimizers/meta_optimizer.h | 9 ++++++++- 5 files changed, 29 insertions(+), 13 deletions(-) diff --git a/tensorflow/core/common_runtime/graph_execution_state.cc b/tensorflow/core/common_runtime/graph_execution_state.cc index eb710bdbc5..d76f7b49b1 100644 --- a/tensorflow/core/common_runtime/graph_execution_state.cc +++ b/tensorflow/core/common_runtime/graph_execution_state.cc @@ -407,6 +407,8 @@ Status GraphExecutionState::OptimizeGraph( const RewriterConfig& rewrite_options = session_options_->config.graph_options().rewrite_options(); + const GPUOptions& gpu_options = + session_options_->config.gpu_options(); if (grappler::MetaOptimizerEnabled(rewrite_options)) { // Adding this functionality in steps. 
The first step is to make sure @@ -493,7 +495,7 @@ Status GraphExecutionState::OptimizeGraph( grappler::VirtualCluster cluster(device_map, device_set_); GraphDef new_graph; TF_RETURN_IF_ERROR(grappler::RunMetaOptimizer( - item, rewrite_options, cpu_device, &cluster, &new_graph)); + item, rewrite_options, cpu_device, &cluster, &new_graph, gpu_options)); // Merge optimized graph function library with an original library. // Optimized graph might have new functions specialized for it's diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer.cc b/tensorflow/core/grappler/optimizers/memory_optimizer.cc index 1be5f8dcc2..5a2cec4358 100644 --- a/tensorflow/core/grappler/optimizers/memory_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/memory_optimizer.cc @@ -943,7 +943,7 @@ struct MemInfo { static bool IdentifySwappingCandidates( Cluster* cluster, GrapplerItem* item, std::unordered_set* skip_list, - std::unordered_map* nodes_to_swap) { + std::unordered_map* nodes_to_swap, double memory_fraction) { GraphMemory memory(*item); const std::unordered_map& devices = cluster->GetDevices(); @@ -966,10 +966,10 @@ static bool IdentifySwappingCandidates( } const GraphMemory::MemoryUsage& mem_usage = memory.GetPeakMemoryUsage(name); - if (mem_usage.used_memory <= prop.memory_size()) { + if (mem_usage.used_memory <= memory_fraction * prop.memory_size()) { continue; } - int64 required_savings = mem_usage.used_memory - prop.memory_size(); + int64 required_savings = mem_usage.used_memory - memory_fraction * prop.memory_size(); std::unordered_map op_completion_times; { @@ -1105,13 +1105,14 @@ static bool IdentifySwappingCandidates( bool SwappingPass(RewriterConfig::MemOptType optimization_level, Cluster* cluster, GrapplerItem* item, - std::unordered_set* skip_list) { + std::unordered_set* skip_list, + double memory_fraction) { std::unordered_map nodes_to_swap; if (optimization_level == RewriterConfig::DEFAULT_MEM_OPT || optimization_level == 
RewriterConfig::SWAPPING_HEURISTICS || optimization_level == RewriterConfig::HEURISTICS) { // Use heuristics to figure out what needs to be swapped; - IdentifySwappingCandidates(cluster, item, skip_list, &nodes_to_swap); + IdentifySwappingCandidates(cluster, item, skip_list, &nodes_to_swap, memory_fraction); } // Look for manual annotatations in the graph. for (auto& node : *item->graph.mutable_node()) { @@ -1324,7 +1325,8 @@ Status MemoryOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item, optimization_level_ == RewriterConfig::MANUAL) && cluster != nullptr) { updated_graph |= SwappingPass(optimization_level_, cluster, - &optimized_item, &skip_list); + &optimized_item, &skip_list, + per_process_gpu_memory_fraction_); } } diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer.h b/tensorflow/core/grappler/optimizers/memory_optimizer.h index 653ffaec4c..6e03f442d6 100644 --- a/tensorflow/core/grappler/optimizers/memory_optimizer.h +++ b/tensorflow/core/grappler/optimizers/memory_optimizer.h @@ -32,8 +32,10 @@ class MemoryOptimizer : public GraphOptimizer { // RewriterConfig::memory_optimizer_target_node_name_scope. 
explicit MemoryOptimizer( RewriterConfig::MemOptType optimization_level, + double per_process_gpu_memory_fraction = 1.0, const string& recomputation_targets_name_scope = "gradients/") : optimization_level_(optimization_level), + per_process_gpu_memory_fraction_(per_process_gpu_memory_fraction), recomputation_targets_name_scope_(recomputation_targets_name_scope) {} ~MemoryOptimizer() override {} @@ -47,6 +49,7 @@ class MemoryOptimizer : public GraphOptimizer { private: RewriterConfig::MemOptType optimization_level_; + double per_process_gpu_memory_fraction_; string recomputation_targets_name_scope_; }; diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc index 143d9dc1c6..e0ab7e00e9 100644 --- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc @@ -83,7 +83,7 @@ std::unique_ptr MetaOptimizer::MakeNewOptimizer( MK_OPT("shape", new ShapeOptimizer()); MK_OPT("remap", new Remapper(cfg_.remapping())); MK_OPT("layout", new LayoutOptimizer()); - MK_OPT("memory", new MemoryOptimizer(RewriterConfig::MANUAL)); + MK_OPT("memory", new MemoryOptimizer(RewriterConfig::MANUAL, gpu_options_.per_process_gpu_memory_fraction())); MK_OPT("arithmetic", new ArithmeticOptimizer(cfg_.arithmetic_optimization())); MK_OPT("autoparallel", new AutoParallel(cfg_.auto_parallel().num_replicas())); MK_OPT("loop", new LoopOptimizer(cfg_.loop_optimization())); @@ -134,13 +134,14 @@ Status MetaOptimizer::InitializeOptimizers( optimizers->emplace_back(new LayoutOptimizer()); } if (cfg_.memory_optimization() != RewriterConfig::NO_MEM_OPT) { + double mem_fraction = gpu_options_.per_process_gpu_memory_fraction(); if (cfg_.memory_optimizer_target_node_name_scope().empty()) { optimizers->emplace_back( // Use the default target node name prefix "gradients/" - new MemoryOptimizer(cfg_.memory_optimization())); + new MemoryOptimizer(cfg_.memory_optimization(), mem_fraction)); } 
else { optimizers->emplace_back( - new MemoryOptimizer(cfg_.memory_optimization(), + new MemoryOptimizer(cfg_.memory_optimization(), mem_fraction, cfg_.memory_optimizer_target_node_name_scope())); } } @@ -412,8 +413,9 @@ bool MetaOptimizerEnabled(const RewriterConfig& cfg) { Status RunMetaOptimizer(const GrapplerItem& item, const RewriterConfig& cfg, DeviceBase* cpu_device, Cluster* cluster, - GraphDef* optimized_graph) { - MetaOptimizer optimizer(cpu_device, cfg); + GraphDef* optimized_graph, + const GPUOptions& gpu_options) { + MetaOptimizer optimizer(cpu_device, cfg, gpu_options); return optimizer.Optimize(cluster, item, optimized_graph); } diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.h b/tensorflow/core/grappler/optimizers/meta_optimizer.h index 151a54cbdf..74b6bb7f74 100644 --- a/tensorflow/core/grappler/optimizers/meta_optimizer.h +++ b/tensorflow/core/grappler/optimizers/meta_optimizer.h @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/core/grappler/optimizers/graph_optimizer.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/protobuf/rewriter_config.pb.h" +#include "tensorflow/core/protobuf/config.pb.h" namespace tensorflow { namespace grappler { @@ -30,6 +31,10 @@ class MetaOptimizer : public GraphOptimizer { public: MetaOptimizer(DeviceBase* cpu_device, const RewriterConfig& cfg) : cpu_device_(cpu_device), cfg_(cfg) {} + + MetaOptimizer(DeviceBase* cpu_device, const RewriterConfig& cfg, const GPUOptions& gpu_options) + : cpu_device_(cpu_device), cfg_(cfg), gpu_options_(gpu_options) {} + ~MetaOptimizer() override = default; string name() const override { return "meta_optimizer"; }; @@ -77,6 +82,7 @@ class MetaOptimizer : public GraphOptimizer { GraphOptimizationResult* optimization_result); std::vector optimization_results_; + GPUOptions gpu_options_; }; bool MetaOptimizerEnabled(const RewriterConfig& cfg); @@ -89,7 +95,8 @@ bool MetaOptimizerEnabled(const RewriterConfig& cfg); // when 
possible. Status RunMetaOptimizer(const GrapplerItem& item, const RewriterConfig& cfg, DeviceBase* cpu_device, Cluster* cluster, - GraphDef* optimized_graph); + GraphDef* optimized_graph, + const GPUOptions& gpu_options); } // namespace grappler } // namespace tensorflow -- GitLab From c299b9c74bf52471613fbc743e0dceb6a55f630c Mon Sep 17 00:00:00 2001 From: Niranjan Hasabnis Date: Fri, 22 Jun 2018 16:04:16 -0700 Subject: [PATCH 011/598] [Intel MKL] Optimized implementation of GatherND using OpenMP --- tensorflow/core/kernels/gather_nd_op_cpu_impl.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tensorflow/core/kernels/gather_nd_op_cpu_impl.h b/tensorflow/core/kernels/gather_nd_op_cpu_impl.h index dc028c2f1e..22203e242a 100644 --- a/tensorflow/core/kernels/gather_nd_op_cpu_impl.h +++ b/tensorflow/core/kernels/gather_nd_op_cpu_impl.h @@ -113,10 +113,25 @@ struct GatherNdSlice { #endif generator::GatherNdSliceGenerator gather_nd_generator( slice_size, Tindices, Tparams, Tout, &error_loc); + +#ifdef INTEL_MKL + // Eigen implementation below is not highly performant. gather_nd_generator + // does not seem to be called in parallel, leading to very poor performance. + // Additionally, since it uses scalar (Tscratch) to invoke 'generate', it + // needs to go through redundant operations like 'reshape', 'broadcast' and + // 'sum'. OpenMP loop below essentially does same thing as Eigen code, but + // is considerably more efficient. + #pragma omp parallel for + for (Eigen::DenseIndex i = 0; i < batch_size; i++) { + const Eigen::array loc = i; + gather_nd_generator(loc); + } +#else Tscratch.device(d) = Tscratch.reshape(reshape_dims) .broadcast(broadcast_dims) .generate(gather_nd_generator) .sum(); +#endif // error_loc() returns -1 if there's no out-of-bounds index, // otherwise it returns the location of an OOB index in Tindices. 
-- GitLab From afbe36c5126cf118c60cbf22454d99d429425334 Mon Sep 17 00:00:00 2001 From: "Peng Wang(SIMPENG)" Date: Sat, 23 Jun 2018 06:03:41 +0000 Subject: [PATCH 012/598] Merge master change --- tensorflow/core/grappler/optimizers/meta_optimizer.cc | 7 +++++++ tensorflow/core/grappler/optimizers/meta_optimizer.h | 4 ++++ 2 files changed, 11 insertions(+) diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc index e0ab7e00e9..0d2b9a5763 100644 --- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc @@ -411,6 +411,13 @@ bool MetaOptimizerEnabled(const RewriterConfig& cfg) { !cfg.optimizers().empty() || !cfg.custom_optimizers().empty(); } +Status RunMetaOptimizer(const GrapplerItem& item, const RewriterConfig& cfg, + DeviceBase* cpu_device, Cluster* cluster, + GraphDef* optimized_graph) { + MetaOptimizer optimizer(cpu_device, cfg); + return optimizer.Optimize(cluster, item, optimized_graph); +} + Status RunMetaOptimizer(const GrapplerItem& item, const RewriterConfig& cfg, DeviceBase* cpu_device, Cluster* cluster, GraphDef* optimized_graph, diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.h b/tensorflow/core/grappler/optimizers/meta_optimizer.h index 74b6bb7f74..c267b5fd8e 100644 --- a/tensorflow/core/grappler/optimizers/meta_optimizer.h +++ b/tensorflow/core/grappler/optimizers/meta_optimizer.h @@ -93,6 +93,10 @@ bool MetaOptimizerEnabled(const RewriterConfig& cfg); // during constant folding; if NULL, a new device is created for doing constant // folding. For performance, it is recommended to pass in an existing cpu_device // when possible. 
+Status RunMetaOptimizer(const GrapplerItem& item, const RewriterConfig& cfg, + DeviceBase* cpu_device, Cluster* cluster, + GraphDef* optimized_graph); + Status RunMetaOptimizer(const GrapplerItem& item, const RewriterConfig& cfg, DeviceBase* cpu_device, Cluster* cluster, GraphDef* optimized_graph, -- GitLab From f814e242d16997dba8b9bbded3ef6e2540e2d044 Mon Sep 17 00:00:00 2001 From: "Li, Yiqiang" Date: Sun, 15 Jul 2018 20:13:09 +0800 Subject: [PATCH 013/598] Replace to use fast reorder path in MklRelu op. --- tensorflow/core/kernels/mkl_relu_op.cc | 17 ++++++----------- tensorflow/core/util/mkl_util.h | 12 +++++++----- 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/tensorflow/core/kernels/mkl_relu_op.cc b/tensorflow/core/kernels/mkl_relu_op.cc index f73d3d81f9..3d5a05be73 100644 --- a/tensorflow/core/kernels/mkl_relu_op.cc +++ b/tensorflow/core/kernels/mkl_relu_op.cc @@ -814,11 +814,9 @@ class MklReluOpBase : public OpKernel { // check wehther src need to reorder if (src_md.data.format != eltwise_fwd->GetSrcMemoryFormat()) { src.SetUsrMem(src_md, &src_tensor); - std::vector net; auto src_target_pd = memory::primitive_desc({{src_dims}, MklDnnType(), eltwise_fwd->GetSrcMemoryFormat()}, cpu_engine); - src.CheckReorderToOpMem(src_target_pd, &net); - stream(stream::kind::eager).submit(net).wait(); + src.CheckReorderToOpMem(src_target_pd); src_data = static_cast(src.GetOpMem().get_data_handle()); } else { src_data = static_cast( @@ -882,9 +880,8 @@ class MklReluGradOpBase : public OpKernel { virtual void Compute_Scalar(OpKernelContext* context) = 0; - void Compute(OpKernelContext* context) { + void Compute(OpKernelContext* context) { try { - // auto cpu_engine = engine(engine::cpu, 0); MklDnnData src(&cpu_engine); MklDnnData diff_dst(&cpu_engine); @@ -892,9 +889,9 @@ class MklReluGradOpBase : public OpKernel { const size_t src_index = 1; // index of src input tensor const size_t diff_src_index = 0; // index of diff_src output tensor - const Tensor& 
src_tensor = MklGetInput(context, src_index); + const Tensor& src_tensor = MklGetInput(context, src_index); const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index); - Tensor* diff_src_tensor = nullptr; + Tensor* diff_src_tensor = nullptr; MklDnnShape dnn_shape_src, dnn_shape_diff_dst; GetMklShape(context, src_index, &dnn_shape_src); @@ -969,11 +966,10 @@ class MklReluGradOpBase : public OpKernel { // check whether need reorder for src / diff_dst T* src_data; T* diff_dst_data; - std::vector net; if (src_md.data.format != eltwise_bwd->GetSrcMemoryFormat()) { src.SetUsrMem(src_md, &src_tensor); src.CheckReorderToOpMem( - eltwise_bwd_pd.get()->diff_src_primitive_desc(), &net); + eltwise_bwd_pd.get()->diff_src_primitive_desc()); src_data = static_cast(src.GetOpMem().get_data_handle()); } else { src_data = static_cast( @@ -983,14 +979,13 @@ class MklReluGradOpBase : public OpKernel { if (diff_dst_md.data.format != eltwise_bwd->GetDiffDstMemoryFormat()) { diff_dst.SetUsrMem(diff_dst_md, &diff_dst_tensor); diff_dst.CheckReorderToOpMem( - eltwise_bwd_pd.get()->diff_src_primitive_desc(), &net); + eltwise_bwd_pd.get()->diff_src_primitive_desc()); diff_dst_data = static_cast( diff_dst.GetOpMem().get_data_handle()); } else { diff_dst_data = static_cast(const_cast( diff_dst_tensor.flat().data())); } - stream(stream::kind::eager).submit(net).wait(); // allocate diff_src tensor MklDnnShape dnn_shape_diff_src; diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h index bb447e0393..b2c93a508d 100644 --- a/tensorflow/core/util/mkl_util.h +++ b/tensorflow/core/util/mkl_util.h @@ -1897,8 +1897,9 @@ class MklPrimitiveFactory { ~MklPrimitiveFactory() {} MklPrimitive* GetOp(const std::string& key) { - auto stream_iter = MklPrimitiveFactory::GetHashMap().find(key); - if (stream_iter == MklPrimitiveFactory::GetHashMap().end()) { + auto &map = MklPrimitiveFactory::GetHashMap(); + auto stream_iter = map.find(key); + if (stream_iter == map.end()) { return 
nullptr; } else { return stream_iter->second; @@ -1906,11 +1907,12 @@ class MklPrimitiveFactory { } void SetOp(const std::string& key, MklPrimitive* op) { - auto stream_iter = MklPrimitiveFactory::GetHashMap().find(key); + auto &map = MklPrimitiveFactory::GetHashMap(); + auto stream_iter = map.find(key); - CHECK(stream_iter == MklPrimitiveFactory::GetHashMap().end()); + CHECK(stream_iter == map.end()); - MklPrimitiveFactory::GetHashMap()[key] = op; + map[key] = op; } private: -- GitLab From 2fcfb4abde9d847cff5a344cf06b2704cb6f9545 Mon Sep 17 00:00:00 2001 From: "Peng Wang (SIMPENG)" Date: Fri, 20 Jul 2018 16:25:56 +0800 Subject: [PATCH 014/598] fix build error --- tensorflow/core/grappler/optimizers/memory_optimizer_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc b/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc index a3f0e07861..49543645f6 100644 --- a/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc +++ b/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc @@ -114,7 +114,7 @@ TEST_F(RecomputeSubgraphTest, TwoInputSubgraphs) { (*pre_transform_node_map.GetNode("b")->mutable_attr())["_recompute_hint"] .set_i(0); - MemoryOptimizer optimizer(RewriterConfig::MANUAL, + MemoryOptimizer optimizer(RewriterConfig::MANUAL,1.0, "some_name_scope/gradients"); GraphDef output; Status status = optimizer.Optimize(nullptr, item, &output); -- GitLab From d76aaad2ea9ee4df8c32b382db758854315d230e Mon Sep 17 00:00:00 2001 From: "Peng Wang (SIMPENG)" Date: Fri, 20 Jul 2018 17:50:51 +0800 Subject: [PATCH 015/598] change format a bit --- tensorflow/core/grappler/optimizers/memory_optimizer_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc b/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc index 49543645f6..1473e26cbd 100644 --- 
a/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc +++ b/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc @@ -114,7 +114,7 @@ TEST_F(RecomputeSubgraphTest, TwoInputSubgraphs) { (*pre_transform_node_map.GetNode("b")->mutable_attr())["_recompute_hint"] .set_i(0); - MemoryOptimizer optimizer(RewriterConfig::MANUAL,1.0, + MemoryOptimizer optimizer(RewriterConfig::MANUAL, 1.0, "some_name_scope/gradients"); GraphDef output; Status status = optimizer.Optimize(nullptr, item, &output); -- GitLab From 35c81bb208622589abaebccb35c44da9148e2d14 Mon Sep 17 00:00:00 2001 From: Stefan Dyulgerov Date: Sun, 22 Jul 2018 17:07:34 +0300 Subject: [PATCH 016/598] ignore cmake build artifacts --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 5afe375f46..4e526261c7 100644 --- a/.gitignore +++ b/.gitignore @@ -14,6 +14,7 @@ __pycache__ *.swp .vscode/ cmake_build/ +tensorflow/contrib/cmake/_build/ .idea/** /build/ [Bb]uild/ -- GitLab From 171b34a519ea2c888d0f9fd754ca8a8c5ed02587 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?= Date: Sun, 29 Jul 2018 09:21:29 +0800 Subject: [PATCH 017/598] PREP: use np.array to avoid copy behavior of index tensor --- tensorflow/python/ops/array_grad.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/tensorflow/python/ops/array_grad.py b/tensorflow/python/ops/array_grad.py index a2b5f77f91..d709f6b36b 100644 --- a/tensorflow/python/ops/array_grad.py +++ b/tensorflow/python/ops/array_grad.py @@ -19,6 +19,7 @@ from __future__ import division from __future__ import print_function from math import ceil +import numpy as np from tensorflow.python import pywrap_tensorflow from tensorflow.python.eager import context @@ -774,17 +775,25 @@ def _ExtractImagePatchesGrad(op, grad): row_steps = range(0, rows_out * stride_r, stride_r) col_steps = range(0, cols_out * stride_h, stride_h) - idx = [] + idx = 
np.zeros((rows_out * cols_out * ksize_r * ksize_c, 2), + dtype=np.int64) + idx_len = 0 for i in range(rows_out): + r_low = row_steps[i] - pad_rows + r_high = r_low + ksize_r_eff + for j in range(cols_out): - r_low, c_low = row_steps[i] - pad_rows, col_steps[j] - pad_cols - r_high, c_high = r_low + ksize_r_eff, c_low + ksize_c_eff - - idx.extend([(r * (cols_in) + c, i * (cols_out * ksize_r * ksize_c) + j * - (ksize_r * ksize_c) + ri * (ksize_c) + ci) - for (ri, r) in enumerate(range(r_low, r_high, rate_r)) - for (ci, c) in enumerate(range(c_low, c_high, rate_c)) - if 0 <= r and r < rows_in and 0 <= c and c < cols_in]) + c_low = col_steps[j] - pad_cols + c_high = c_low + ksize_c_eff + + for (ri, r) in enumerate(range(r_low, r_high, rate_r)): + for (ci, c) in enumerate(range(c_low, c_high, rate_c)): + if 0 <= r and r < rows_in and 0 <= c and c < cols_in: + idx[idx_len][0] = r * (cols_in) + c + idx[idx_len][1] = (i * (cols_out * ksize_r * ksize_c) + + j * (ksize_r * ksize_c) + ri * (ksize_c) + ci) + idx_len += 1 + idx = idx[:idx_len] sp_shape = (rows_in * cols_in, rows_out * cols_out * ksize_r * ksize_c) -- GitLab From 8e761899a7a8102334fc688b6b0fb69a23e93f92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?= Date: Sun, 29 Jul 2018 17:11:45 +0800 Subject: [PATCH 018/598] PREP: faster method for construction idx array --- tensorflow/python/ops/array_grad.py | 86 ++++++++++++----------------- 1 file changed, 36 insertions(+), 50 deletions(-) diff --git a/tensorflow/python/ops/array_grad.py b/tensorflow/python/ops/array_grad.py index d709f6b36b..4578639649 100644 --- a/tensorflow/python/ops/array_grad.py +++ b/tensorflow/python/ops/array_grad.py @@ -735,7 +735,6 @@ def _QuantizeAndDequantizeV3Grad(_, grad): @ops.RegisterGradient("ExtractImagePatches") def _ExtractImagePatchesGrad(op, grad): - batch_size, rows_in, cols_in, channels = [ dim.value for dim in op.inputs[0].get_shape() ] @@ -743,28 +742,44 @@ def 
_ExtractImagePatchesGrad(op, grad): batch_size = input_bhwc[0] channels = input_bhwc[3] + # Create indices matrix for input tensor. + # Note that 0 is preserved for padding location, + # so indice for input starts from 1 to 1 + rows_in * cols_in. + input_indices_num = 1 + rows_in * cols_in + input_idx = array_ops.reshape(math_ops.range(1, input_indices_num, + dtype=ops.dtypes.int64), + (1, rows_in, cols_in, 1)) + input_idx_patched = gen_array_ops.extract_image_patches( + input_idx, + op.get_attr("ksizes"), + op.get_attr("strides"), + op.get_attr("rates"), + op.get_attr("padding")) + + # Create indices matrix for output tensor. _, rows_out, cols_out, _ = [dim.value for dim in op.outputs[0].get_shape()] _, ksize_r, ksize_c, _ = op.get_attr("ksizes") - _, stride_r, stride_h, _ = op.get_attr("strides") - _, rate_r, rate_c, _ = op.get_attr("rates") - padding = op.get_attr("padding") - - ksize_r_eff = ksize_r + (ksize_r - 1) * (rate_r - 1) - ksize_c_eff = ksize_c + (ksize_c - 1) * (rate_c - 1) - - if padding == b"SAME": - rows_out = int(ceil(rows_in / stride_r)) - cols_out = int(ceil(cols_in / stride_h)) - pad_rows = ((rows_out - 1) * stride_r + ksize_r_eff - rows_in) // 2 - pad_cols = ((cols_out - 1) * stride_h + ksize_c_eff - cols_in) // 2 - - elif padding == b"VALID": - rows_out = int(ceil((rows_in - ksize_r_eff + 1) / stride_r)) - cols_out = int(ceil((cols_in - ksize_c_eff + 1) / stride_h)) - pad_rows = (rows_out - 1) * stride_r + ksize_r_eff - rows_in - pad_cols = (cols_out - 1) * stride_h + ksize_c_eff - cols_in - - pad_rows, pad_cols = max(0, pad_rows), max(0, pad_cols) + # Indice for output starts from 0. + output_indices_num = rows_out * cols_out * ksize_r * ksize_c + output_idx = array_ops.reshape(math_ops.range(output_indices_num, + dtype=ops.dtypes.int64), + (1, rows_out, cols_out, ksize_r * ksize_c)) + + # Construct mapping table for indices: input -> output. 
+ idx_matrix = array_ops.concat([array_ops.expand_dims(input_idx_patched, axis=-1), + array_ops.expand_dims(output_idx, axis=-1)], + axis=-1) + idx_map = array_ops.reshape(idx_matrix, (-1, 2)) + + sp_shape = (input_indices_num, output_indices_num) + sp_mat = sparse_tensor.SparseTensor( + idx_map, + array_ops.ones_like(idx_map[:, 0], dtype=grad.dtype), + sp_shape) + # Remove all padding locations: [0, :]. + sp_mat = sparse_ops.sparse_slice(sp_mat, + (1, 0), + (input_indices_num - 1, output_indices_num)) grad_expanded = array_ops.transpose( array_ops.reshape( @@ -772,35 +787,6 @@ def _ExtractImagePatchesGrad(op, grad): (1, 2, 3, 4, 0, 5)) grad_flat = array_ops.reshape(grad_expanded, (-1, batch_size * channels)) - row_steps = range(0, rows_out * stride_r, stride_r) - col_steps = range(0, cols_out * stride_h, stride_h) - - idx = np.zeros((rows_out * cols_out * ksize_r * ksize_c, 2), - dtype=np.int64) - idx_len = 0 - for i in range(rows_out): - r_low = row_steps[i] - pad_rows - r_high = r_low + ksize_r_eff - - for j in range(cols_out): - c_low = col_steps[j] - pad_cols - c_high = c_low + ksize_c_eff - - for (ri, r) in enumerate(range(r_low, r_high, rate_r)): - for (ci, c) in enumerate(range(c_low, c_high, rate_c)): - if 0 <= r and r < rows_in and 0 <= c and c < cols_in: - idx[idx_len][0] = r * (cols_in) + c - idx[idx_len][1] = (i * (cols_out * ksize_r * ksize_c) + - j * (ksize_r * ksize_c) + ri * (ksize_c) + ci) - idx_len += 1 - idx = idx[:idx_len] - - sp_shape = (rows_in * cols_in, rows_out * cols_out * ksize_r * ksize_c) - - sp_mat = sparse_tensor.SparseTensor( - array_ops.constant(idx, dtype=ops.dtypes.int64), - array_ops.ones((len(idx),), dtype=grad.dtype), sp_shape) - jac = sparse_ops.sparse_tensor_dense_matmul(sp_mat, grad_flat) grad_out = array_ops.reshape(jac, (rows_in, cols_in, batch_size, channels)) -- GitLab From c22b5c678a42474fbc9aab59345ac09eeb685c37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?= Date: Sun, 29 
Jul 2018 17:20:45 +0800 Subject: [PATCH 019/598] CLN: remove unused import --- tensorflow/python/ops/array_grad.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tensorflow/python/ops/array_grad.py b/tensorflow/python/ops/array_grad.py index 4578639649..33c960e0dc 100644 --- a/tensorflow/python/ops/array_grad.py +++ b/tensorflow/python/ops/array_grad.py @@ -18,9 +18,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from math import ceil -import numpy as np - from tensorflow.python import pywrap_tensorflow from tensorflow.python.eager import context from tensorflow.python.framework import constant_op -- GitLab From 0d49774a0487b26737b950b510605833671775d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?= Date: Sun, 29 Jul 2018 17:23:23 +0800 Subject: [PATCH 020/598] CLN: typo: indices --- tensorflow/python/ops/array_grad.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/ops/array_grad.py b/tensorflow/python/ops/array_grad.py index 33c960e0dc..b6f03144b1 100644 --- a/tensorflow/python/ops/array_grad.py +++ b/tensorflow/python/ops/array_grad.py @@ -741,7 +741,7 @@ def _ExtractImagePatchesGrad(op, grad): # Create indices matrix for input tensor. # Note that 0 is preserved for padding location, - # so indice for input starts from 1 to 1 + rows_in * cols_in. + # so indices for input start from 1 to 1 + rows_in * cols_in. input_indices_num = 1 + rows_in * cols_in input_idx = array_ops.reshape(math_ops.range(1, input_indices_num, dtype=ops.dtypes.int64), @@ -756,7 +756,7 @@ def _ExtractImagePatchesGrad(op, grad): # Create indices matrix for output tensor. _, rows_out, cols_out, _ = [dim.value for dim in op.outputs[0].get_shape()] _, ksize_r, ksize_c, _ = op.get_attr("ksizes") - # Indice for output starts from 0. + # Indices for output start from 0. 
output_indices_num = rows_out * cols_out * ksize_r * ksize_c output_idx = array_ops.reshape(math_ops.range(output_indices_num, dtype=ops.dtypes.int64), -- GitLab From 4f456bc6f19d667a6d32a7459742b3139e8fe617 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?= Date: Sun, 29 Jul 2018 22:52:44 +0800 Subject: [PATCH 021/598] CLN: clean codes --- tensorflow/python/ops/array_grad.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/ops/array_grad.py b/tensorflow/python/ops/array_grad.py index b6f03144b1..328b4f7d53 100644 --- a/tensorflow/python/ops/array_grad.py +++ b/tensorflow/python/ops/array_grad.py @@ -762,19 +762,19 @@ def _ExtractImagePatchesGrad(op, grad): dtype=ops.dtypes.int64), (1, rows_out, cols_out, ksize_r * ksize_c)) - # Construct mapping table for indices: input -> output. + # Construct mapping table for indices: (input -> output). idx_matrix = array_ops.concat([array_ops.expand_dims(input_idx_patched, axis=-1), array_ops.expand_dims(output_idx, axis=-1)], axis=-1) idx_map = array_ops.reshape(idx_matrix, (-1, 2)) sp_shape = (input_indices_num, output_indices_num) - sp_mat = sparse_tensor.SparseTensor( + sp_mat_full = sparse_tensor.SparseTensor( idx_map, array_ops.ones_like(idx_map[:, 0], dtype=grad.dtype), sp_shape) - # Remove all padding locations: [0, :]. - sp_mat = sparse_ops.sparse_slice(sp_mat, + # Remove all padding locations [0, :]. 
+ sp_mat = sparse_ops.sparse_slice(sp_mat_full, (1, 0), (input_indices_num - 1, output_indices_num)) -- GitLab From e6ae2664c5f72f09c9a6d102a89963c4a9bbf8f1 Mon Sep 17 00:00:00 2001 From: Johannes Schmitz Date: Tue, 31 Jul 2018 20:05:29 +0200 Subject: [PATCH 022/598] Improve readability of Tensor::CheckType error output --- tensorflow/core/framework/tensor.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/framework/tensor.cc b/tensorflow/core/framework/tensor.cc index 384a42fc11..2e5426712b 100644 --- a/tensorflow/core/framework/tensor.cc +++ b/tensorflow/core/framework/tensor.cc @@ -613,13 +613,13 @@ bool Tensor::IsInitialized() const { } void Tensor::CheckType(DataType expected_dtype) const { - CHECK_EQ(dtype(), expected_dtype) + CHECK_EQ(dtype(), expected_dtype) << " " << DataTypeString(expected_dtype) << " expected, got " << DataTypeString(dtype()); } void Tensor::CheckTypeAndIsAligned(DataType expected_dtype) const { - CHECK_EQ(dtype(), expected_dtype) + CHECK_EQ(dtype(), expected_dtype) << " " << DataTypeString(expected_dtype) << " expected, got " << DataTypeString(dtype()); CHECK(IsAligned()) << "ptr = " << base(); -- GitLab From 94e0c6bb67b82eb1a43135eb5edff6c6fe4ab638 Mon Sep 17 00:00:00 2001 From: Austin Anderson Date: Wed, 18 Jul 2018 16:59:07 -0700 Subject: [PATCH 023/598] Add new Dockerfile assembler based on partials This change adds a new suite of TensorFlow dockerfiles. The dockerfiles come from an assembler controlled by a yaml spec, and are based on a set of re-usable partial dockerfiles. The assembler and spec include conveniences like spec validation, references to other images and specs for minimizing repetition, and arg expansion. 
--- tensorflow/tools/docker/README.md | 7 + tensorflow/tools/dockerfiles/Dockerfile | 11 + tensorflow/tools/dockerfiles/README.md | 38 ++ tensorflow/tools/dockerfiles/assembler.py | 528 ++++++++++++++++++ tensorflow/tools/dockerfiles/bashrc | 33 ++ .../dockerfiles/cpu-devel-jupyter.Dockerfile | 85 +++ .../dockerfiles/cpu-devel.Dockerfile | 74 +++ .../dockerfiles/cpu-jupyter.Dockerfile | 54 ++ .../dockerfiles/dockerfiles/cpu.Dockerfile | 43 ++ .../nvidia-devel-jupyter.Dockerfile | 105 ++++ .../dockerfiles/nvidia-devel.Dockerfile | 94 ++++ .../dockerfiles/nvidia-jupyter.Dockerfile | 75 +++ .../dockerfiles/dockerfiles/nvidia.Dockerfile | 64 +++ .../partials/bazel.partial.Dockerfile | 13 + .../partials/jupyter.partial.Dockerfile | 8 + .../partials/nvidia-devel.partial.Dockerfile | 43 ++ .../partials/nvidia.partial.Dockerfile | 23 + .../partials/python.partial.Dockerfile | 12 + .../partials/shell.partial.Dockerfile | 2 + .../partials/tensorflow.partial.Dockerfile | 2 + .../partials/ubuntu-devel.partial.Dockerfile | 24 + .../partials/ubuntu.partial.Dockerfile | 2 + tensorflow/tools/dockerfiles/spec.yml | 177 ++++++ 23 files changed, 1517 insertions(+) create mode 100644 tensorflow/tools/dockerfiles/Dockerfile create mode 100644 tensorflow/tools/dockerfiles/README.md create mode 100644 tensorflow/tools/dockerfiles/assembler.py create mode 100644 tensorflow/tools/dockerfiles/bashrc create mode 100644 tensorflow/tools/dockerfiles/dockerfiles/cpu-devel-jupyter.Dockerfile create mode 100644 tensorflow/tools/dockerfiles/dockerfiles/cpu-devel.Dockerfile create mode 100644 tensorflow/tools/dockerfiles/dockerfiles/cpu-jupyter.Dockerfile create mode 100644 tensorflow/tools/dockerfiles/dockerfiles/cpu.Dockerfile create mode 100644 tensorflow/tools/dockerfiles/dockerfiles/nvidia-devel-jupyter.Dockerfile create mode 100644 tensorflow/tools/dockerfiles/dockerfiles/nvidia-devel.Dockerfile create mode 100644 tensorflow/tools/dockerfiles/dockerfiles/nvidia-jupyter.Dockerfile create mode 
100644 tensorflow/tools/dockerfiles/dockerfiles/nvidia.Dockerfile create mode 100644 tensorflow/tools/dockerfiles/partials/bazel.partial.Dockerfile create mode 100644 tensorflow/tools/dockerfiles/partials/jupyter.partial.Dockerfile create mode 100644 tensorflow/tools/dockerfiles/partials/nvidia-devel.partial.Dockerfile create mode 100644 tensorflow/tools/dockerfiles/partials/nvidia.partial.Dockerfile create mode 100644 tensorflow/tools/dockerfiles/partials/python.partial.Dockerfile create mode 100644 tensorflow/tools/dockerfiles/partials/shell.partial.Dockerfile create mode 100644 tensorflow/tools/dockerfiles/partials/tensorflow.partial.Dockerfile create mode 100644 tensorflow/tools/dockerfiles/partials/ubuntu-devel.partial.Dockerfile create mode 100644 tensorflow/tools/dockerfiles/partials/ubuntu.partial.Dockerfile create mode 100644 tensorflow/tools/dockerfiles/spec.yml diff --git a/tensorflow/tools/docker/README.md b/tensorflow/tools/docker/README.md index 525f2995ce..41b8ffdf72 100644 --- a/tensorflow/tools/docker/README.md +++ b/tensorflow/tools/docker/README.md @@ -1,3 +1,10 @@ +# WARNING: THESE IMAGES ARE DEPRECATED. + +TensorFlow's Dockerfiles are now located in +[`tensorflow/tools/dockerfiles/`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/dockerfiles). + +This directory will eventually be removed. 
+ # Using TensorFlow via Docker This directory contains `Dockerfile`s to make it easy to get up and running with diff --git a/tensorflow/tools/dockerfiles/Dockerfile b/tensorflow/tools/dockerfiles/Dockerfile new file mode 100644 index 0000000000..e8ca012298 --- /dev/null +++ b/tensorflow/tools/dockerfiles/Dockerfile @@ -0,0 +1,11 @@ +FROM hadolint/hadolint:latest-debian +LABEL maintainer="Austin Anderson " + +RUN apt-get update && apt-get install -y python3 python3-pip bash +RUN pip3 install --upgrade pip setuptools pyyaml absl-py cerberus + +WORKDIR /tf +VOLUME ["/tf"] + +COPY bashrc /etc/bash.bashrc +RUN chmod 777 /etc/bash.bashrc diff --git a/tensorflow/tools/dockerfiles/README.md b/tensorflow/tools/dockerfiles/README.md new file mode 100644 index 0000000000..1fe51adb4a --- /dev/null +++ b/tensorflow/tools/dockerfiles/README.md @@ -0,0 +1,38 @@ +# TensorFlow Dockerfiles + +This directory houses TensorFlow's Dockerfiles. **DO NOT EDIT THE DOCKERFILES +MANUALLY!** They are maintained by `assembler.py`, which builds Dockerfiles from +the files in `partials/` and the rules in `spec.yml`. See [the Maintaining +section](#maintaining) for more information. + +## Building + +The Dockerfiles in the `dockerfiles` directory must have their build context set +to **the directory with this README.md** to copy in helper files. For example: + +```bash +$ docker build -f ./dockerfiles/cpu.Dockerfile -t tf-cpu . +``` + +Each Dockerfile has its own set of available `--build-arg`s which are documented +in the Dockerfile itself. + +## Maintaining + +To make changes to TensorFlow's Dockerfiles, you'll update `spec.yml` and the +`*.partial.Dockerfile` files in the `partials` directory, then run +`assembler.py` to re-generate the full Dockerfiles before creating a pull +request. + +You can use the `Dockerfile` in this directory to build an editing environment +that has all of the Python dependencies you'll need: + +```bash +$ docker build -t tf-assembler . 
+ +# Set --user to set correct permissions on generated files +$ docker run --user $(id -u):$(id -g) -it -v $(pwd):/tf tf-assembler bash + +# In the container... +/tf $ python3 ./assembler.py -o dockerfiles -s spec.yml --validate +``` diff --git a/tensorflow/tools/dockerfiles/assembler.py b/tensorflow/tools/dockerfiles/assembler.py new file mode 100644 index 0000000000..a33c42ace6 --- /dev/null +++ b/tensorflow/tools/dockerfiles/assembler.py @@ -0,0 +1,528 @@ +"""Assemble common TF Dockerfiles from many parts. + +TODO(angerson): DO NOT SUBMIT without a detailed description of assembler. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy +import errno +import os +import os.path +import re +import shutil +import textwrap + +from absl import app +from absl import flags +import cerberus +import yaml + +FLAGS = flags.FLAGS + +flags.DEFINE_boolean( + 'dry_run', False, 'Do not actually generate Dockerfiles', short_name='n') + +flags.DEFINE_string( + 'spec_file', + './spec.yml', + 'Path to a YAML specification file', + short_name='s') + +flags.DEFINE_string( + 'output_dir', + '.', ('Path to an output directory for Dockerfiles. ' + 'Will be created if it doesn\'t exist.'), + short_name='o') + +flags.DEFINE_string( + 'partial_dir', + './partials', + 'Path to a directory containing foo.partial.Dockerfile partial files.', + short_name='p') + +flags.DEFINE_boolean( + 'quiet_dry_run', + True, + 'Do not print contents of dry run Dockerfiles.', + short_name='q') + +flags.DEFINE_boolean( + 'validate', True, 'Validate generated Dockerfiles', short_name='c') + +# Schema to verify the contents of spec.yml with Cerberus. +# Must be converted to a dict from yaml to work. +# Note: can add python references with e.g. 
+# !!python/name:builtins.str +# !!python/name:__main__.funcname +SCHEMA_TEXT = """ +header: + type: string + +partials: + type: dict + keyschema: + type: string + valueschema: + type: dict + schema: + desc: + type: string + args: + type: dict + keyschema: + type: string + valueschema: + anyof: + - type: [ boolean, number, string ] + - type: dict + schema: + default: + type: [ boolean, number, string ] + desc: + type: string + options: + type: list + schema: + type: string + +images: + keyschema: + type: string + valueschema: + type: dict + schema: + desc: + type: string + arg-defaults: + type: list + schema: + anyof: + - type: dict + keyschema: + type: string + arg_in_use: true + valueschema: + type: string + - type: string + isimage: true + create-dockerfile: + type: boolean + partials: + type: list + schema: + anyof: + - type: dict + keyschema: + type: string + regex: image + valueschema: + type: string + isimage: true + - type: string + ispartial: true +""" + + +class TfDockerValidator(cerberus.Validator): + """Custom Cerberus validator for TF dockerfile spec. + + Note that each custom validator's docstring must end with a segment describing + its own validation schema. + """ + + def _validate_ispartial(self, ispartial, field, value): + """Validate that a partial references an existing partial spec. + + Args: + ispartial: Value of the rule, a bool + field: The field being validated + value: The field's value + + The rule's arguments are validated against this schema: + {'type': 'boolean'} + """ + if ispartial and value not in self.root_document.get('partials', dict()): + self._error(field, '{} is not an existing partial.'.format(value)) + + def _validate_isimage(self, isimage, field, value): + """Validate that an image references an existing image spec.
+ + Args: + isimage: Value of the rule, a bool + field: The field being validated + value: The field's value + + The rule's arguments are validated against this schema: + {'type': 'boolean'} + """ + if isimage and value not in self.root_document.get('images', dict()): + self._error(field, '{} is not an existing image.'.format(value)) + + def _validate_arg_in_use(self, arg_in_use, field, value): + """Validate that an arg references an existing partial spec's args. + + Args: + arg_in_use: Value of the rule, a bool + field: The field being validated + value: The field's value + + The rule's arguments are validated against this schema: + {'type': 'boolean'} + """ + if arg_in_use: + for partial in self.root_document.get('partials', dict()).values(): + if value in partial.get('args', tuple()): + return + self._error(field, '{} is not an arg used in any partial.'.format(value)) + + +def build_partial_description(partial_spec): + """Create the documentation lines for a specific partial. + + Generates something like this: + + # This is the partial's description, from spec.yml. + # --build-arg ARG_NAME=argdefault + # this is one of the args. + # --build-arg ANOTHER_ARG=(some|choices) + # another arg. + + Args: + partial_spec: A dict representing one of the partials from spec.yml. Doesn't + include the name of the partial; is a dict like { desc: ..., args: ... }. + + Returns: + A commented string describing this partial. 
+ """ + + # Start from linewrapped desc field + lines = [] + wrapper = textwrap.TextWrapper( + initial_indent='# ', subsequent_indent='# ', width=80) + description = wrapper.fill(partial_spec.get('desc', '( no comments )')) + lines.extend(['#', description]) + + # Document each arg + for arg, arg_data in partial_spec.get('args', dict()).items(): + + # Wrap arg description with comment lines + desc = arg_data.get('desc', '( no description )') + desc = textwrap.fill( + desc, + initial_indent='# ', + subsequent_indent='# ', + width=80, + drop_whitespace=False) + + # Document (each|option|like|this) + if 'options' in arg_data: + arg_options = ' ({})'.format('|'.join(arg_data['options'])) + else: + arg_options = '' + + # Add usage sample + arg_use = '# --build-arg {}={}{}'.format(arg, + arg_data.get('default', '(unset)'), + arg_options) + lines.extend([arg_use, desc]) + return '\n'.join(lines) + + +def construct_contents(partial_specs, image_spec): + """Assemble the dockerfile contents for an image spec. + + It assembles a concrete list of partial references into a single, large + string. + Also expands argument defaults, so that the resulting Dockerfile doesn't have + to be configured with --build-arg=... every time. That is, any ARG directive + will be updated with a new default value. + + Args: + partial_specs: The dict from spec.yml["partials"]. + image_spec: One of the dict values from spec.yml["images"]. + + Returns: + A string containing a valid Dockerfile based on the partials listed in + image_spec. 
+ """ + processed_partial_strings = [] + for partial_name in image_spec['partials']: + + # Apply image arg-defaults to existing arg defaults + partial_spec = copy.deepcopy(partial_specs[partial_name]) + args = partial_spec.get('args', dict()) + for k_v in image_spec.get('arg-defaults', []): + arg, value = list(k_v.items())[0] + if arg in args: + args[arg]['default'] = value + + # Read partial file contents + filename = partial_spec.get('file', partial_name) + partial_path = os.path.join(FLAGS.partial_dir, + '{}.partial.Dockerfile'.format(filename)) + with open(partial_path, 'r') as f_partial: + partial_contents = f_partial.read() + + # Replace ARG FOO=BAR with ARG FOO=[new-default] + for arg, arg_data in args.items(): + if 'default' in arg_data and arg_data['default']: + default = '={}'.format(arg_data['default']) + else: + default = '' + partial_contents = re.sub(r'ARG {}.*'.format(arg), 'ARG {}{}'.format( + arg, default), partial_contents) + processed_partial_strings.append(partial_contents) + return '\n'.join(processed_partial_strings) + + +# Create a directory and its parents, even if it already exists +def mkdir_p(path): + try: + os.makedirs(path) + except OSError as e: + if e.errno != errno.EEXIST: + raise + + +def construct_documentation(header, partial_specs, image_spec): + """Assemble all of the documentation for a single dockerfile. + + Builds explanations of included partials and available build args. + + Args: + header: The string from spec.yml["header"]; will be commented and wrapped. + partial_specs: The dict from spec.yml["partials"]. + image_spec: The spec for the dockerfile being built. + + Returns: + A string containing a commented header that documents the contents of the + dockerfile. 
+ + """ + # Comment and wrap header and image description + commented_header = '\n'.join(['# ' + l for l in header.splitlines()]) + commented_desc = '\n'.join( + ['# ' + l for l in image_spec.get('desc', '').splitlines()]) + partial_descriptions = [] + + # Build documentation for each partial in the image + for partial in image_spec['partials']: + + # Copy partial data for default args unique to this image + partial_spec = copy.deepcopy(partial_specs[partial]) + args = partial_spec.get('args', dict()) + + # Overwrite any existing arg defaults + for k_v in image_spec.get('arg-defaults', []): + arg, value = list(k_v.items())[0] + if arg in args: + args[arg]['default'] = value + + # Build the description from new args + partial_description = build_partial_description(partial_spec) + partial_descriptions.append(partial_description) + + contents = [commented_header, '#', commented_desc] + partial_descriptions + return '\n'.join(contents) + '\n' + + +def normalize_partial_args(partial_specs): + """Normalize the shorthand form of a partial's args specification. + + Turns this: + + partial: + args: + SOME_ARG: arg_value + + Into this: + + partial: + args: + SOME_ARG: + default: arg_value + + Args: + partial_specs: The dict from spec.yml["partials"]. This dict is modified in + place. + + Returns: + The modified contents of partial_specs. + + """ + for _, partial in partial_specs.items(): + args = partial.get('args', dict()) + for arg, value in args.items(): + if not isinstance(value, dict): + new_value = {'default': value} + args[arg] = new_value + return partial_specs + + +def flatten_args_references(image_specs): + """Resolve all default-args in each image spec to a concrete dict. 
+ + Turns this: + + example-image: + arg-defaults: + - MY_ARG: ARG_VALUE + + another-example: + arg-defaults: + - ANOTHER_ARG: ANOTHER_VALUE + - example-image + + Into this: + + example-image: + arg-defaults: + - MY_ARG: ARG_VALUE + + another-example: + arg-defaults: + - ANOTHER_ARG: ANOTHER_VALUE + - MY_ARG: ARG_VALUE + + Args: + image_specs: A dict of image_spec dicts; should be the contents of the + "images" key in the global spec.yml. This dict is modified in place and + then returned. + + Returns: + The modified contents of image_specs. + """ + for _, image_spec in image_specs.items(): + too_deep = 0 + while str in map(type, image_spec.get('arg-defaults', [])) and too_deep < 5: + new_args = [] + for arg in image_spec['arg-defaults']: + if isinstance(arg, str): + new_args.extend(image_specs[arg]['arg-defaults']) + else: + new_args.append(arg) + image_spec['arg-defaults'] = new_args + too_deep += 1 + return image_specs + + +def flatten_partial_references(image_specs): + """Resolve all partial references in each image spec to a concrete list. + + Turns this: + + example-image: + partials: + - foo + + another-example: + partials: + - bar + - image: example-image + - bat + + Into this: + + example-image: + partials: + - foo + + another-example: + partials: + - bar + - foo + - bat + Args: + image_specs: A dict of image_spec dicts; should be the contents of the + "images" key in the global spec.yml. This dict is modified in place and + then returned. + + Returns: + The modified contents of image_specs.
+ """ + for _, image_spec in image_specs.items(): + too_deep = 0 + while dict in map(type, image_spec['partials']) and too_deep < 5: + new_partials = [] + for partial in image_spec['partials']: + if isinstance(partial, str): + new_partials.append(partial) + else: + new_partials.extend(image_specs[partial['image']]['partials']) + image_spec['partials'] = new_partials + too_deep += 1 + return image_specs + + +def construct_dockerfiles(tf_spec): + """Generate a mapping of {"cpu": <cpu dockerfile contents>, ...}. + + Args: + tf_spec: The full spec.yml loaded as a python object. + + Returns: + A string:string dict of short names ("cpu-devel") to Dockerfile contents. + """ + names_to_contents = dict() + image_specs = tf_spec['images'] + image_specs = flatten_partial_references(image_specs) + image_specs = flatten_args_references(image_specs) + partial_specs = tf_spec['partials'] + partial_specs = normalize_partial_args(partial_specs) + + for name, image_spec in image_specs.items(): + if not image_spec.get('create-dockerfile', True): + continue + documentation = construct_documentation(tf_spec['header'], partial_specs, + image_spec) + contents = construct_contents(partial_specs, image_spec) + names_to_contents[name] = '\n'.join([documentation, contents]) + return names_to_contents + + +def main(argv): + if len(argv) > 1: + raise app.UsageError('Too many command-line arguments.') + + with open(FLAGS.spec_file, 'r') as spec_file: + tf_spec = yaml.load(spec_file) + + # Abort if spec.yml is invalid + if FLAGS.validate: + schema = yaml.load(SCHEMA_TEXT) + v = TfDockerValidator(schema) + if not v.validate(tf_spec): + print('>> ERROR: {} is an invalid spec! The errors are:'.format( + FLAGS.spec_file)) + print(yaml.dump(v.errors, indent=2)) + exit(1) + else: + print('>> WARNING: Not validating {}'.format(FLAGS.spec_file)) + + # Generate mapping of { "cpu-devel": "<cpu-devel dockerfile contents>", ...
} + names_to_contents = construct_dockerfiles(tf_spec) + + # Write each completed Dockerfile + if not FLAGS.dry_run: + print('>> Emptying destination dir "{}"'.format(FLAGS.output_dir)) + shutil.rmtree(FLAGS.output_dir, ignore_errors=True) + mkdir_p(FLAGS.output_dir) + else: + print('>> Skipping creation of {} (dry run)'.format(FLAGS.output_dir)) + for name, contents in names_to_contents.items(): + path = os.path.join(FLAGS.output_dir, name + '.Dockerfile') + if FLAGS.dry_run: + print('>> Skipping writing contents of {} (dry run)'.format(path)) + print(contents) + else: + mkdir_p(FLAGS.output_dir) + print('>> Writing {}'.format(path)) + with open(path, 'w') as f: + f.write(contents) + + +if __name__ == '__main__': + app.run(main) diff --git a/tensorflow/tools/dockerfiles/bashrc b/tensorflow/tools/dockerfiles/bashrc new file mode 100644 index 0000000000..7f54609e78 --- /dev/null +++ b/tensorflow/tools/dockerfiles/bashrc @@ -0,0 +1,33 @@ +export PS1="\[\e[31m\]tf-docker\[\e[m\] \[\e[33m\]\w\[\e[m\] > " +export TERM=xterm-256color +alias grep="grep --color=auto" +alias ls="ls --color=auto" + +echo -e "\e[1;31m" +cat< + Start from Nvidia's Ubuntu base image with CUDA and CuDNN, with TF + development packages. + args: + UBUNTU_VERSION: 16.04 + + python: + desc: Python is required for TensorFlow and other libraries. + args: + USE_PYTHON_3_NOT_2: + default: true + desc: Install python 3 over Python 2 + + tensorflow: + desc: Install the TensorFlow Python package. + args: + TF_PACKAGE: + default: tensorflow + options: + - tensorflow + - tensorflow-gpu + - tf-nightly + - tf-nightly-gpu + desc: The specific TensorFlow Python package to install + shell: + desc: Configure TensorFlow's shell prompt and login tools. + jupyter: + desc: Launch Jupyter on execution instead of a bash prompt. + +# =========== +# DOCKERFILES +# =========== +# Represent dockerfiles. 
+# Spec: +# +# name: the name of the image, referenced from other sections +# desc: A description, inserted later into the Dockerfile +# create-dockerfile: Create a dockerfile based on this. Useful for creating +# base images. Default is true +# partials: List of VALUEs, where a VALUE is either: +# - the name of a partial, which inserts that partial into this file +# - image: [name of another image], which inserts the partials from that +# image into this file +# arg-defaults: List of VALUEs, where a VALUE is either: +# - the name of another image, which loads the default args from that image +# - ARG_NAME: VALUE, which is exactly what you'd expect +images: + + nodev: + create-dockerfile: false + partials: + - python + - tensorflow + - shell + + dev: + create-dockerfile: false + partials: + - python + - bazel + - shell + + cpu: + desc: Ubuntu-based, CPU-only environment for using TensorFlow + partials: + - ubuntu + - image: nodev + + cpu-devel: + desc: > + Ubuntu-based, CPU-only environment for developing changes for + TensorFlow. + partials: + - ubuntu-devel + - image: dev + + nvidia: + desc: Ubuntu-based, Nvidia-GPU-enabled environment for using TensorFlow. + arg-defaults: + - TF_PACKAGE: tensorflow-gpu + partials: + - nvidia + - image: nodev + + nvidia-devel: + desc: > + Ubuntu-based, Nvidia-GPU-enabled environment for developing changes + for TensorFlow. + arg-defaults: + - TF_PACKAGE: tensorflow-gpu + partials: + - nvidia-devel + - image: dev + + cpu-jupyter: + desc: > + Ubuntu-based, CPU-only environment for using TensorFlow, with Jupyter + included. + partials: + - image: cpu + - jupyter + + cpu-devel-jupyter: + desc: > + Ubuntu-based, CPU-only environment for developing changes for + TensorFlow, with Jupyter included. + partials: + - image: cpu-devel + - jupyter + + nvidia-jupyter: + desc: > + Ubuntu-based, Nvidia-GPU-enabled environment for using TensorFlow, with + Jupyter included. 
+ arg-defaults: + - nvidia + partials: + - image: nvidia + - jupyter + + nvidia-devel-jupyter: + desc: > + Ubuntu-based, Nvidia-GPU-enabled environment for developing changes for + TensorFlow, with Jupyter included. + arg-defaults: + - nvidia-devel + partials: + - image: nvidia-devel + - jupyter -- GitLab From 478c4161f2524f9e9a6b78f7de297dc7d194d37a Mon Sep 17 00:00:00 2001 From: Guozhong Zhuang Date: Wed, 1 Aug 2018 09:35:31 -0700 Subject: [PATCH 024/598] Code changes based on Rasmus's code review suggestions on PR19403 and enhancing MklInputConversion for MKL-DNN v0.15 integration --- .../core/kernels/mkl_input_conversion_op.cc | 17 +++-- tensorflow/core/kernels/mkl_relu_op.cc | 73 ++++++++++--------- 2 files changed, 48 insertions(+), 42 deletions(-) diff --git a/tensorflow/core/kernels/mkl_input_conversion_op.cc b/tensorflow/core/kernels/mkl_input_conversion_op.cc index dc4da33a06..fee6c44cfe 100644 --- a/tensorflow/core/kernels/mkl_input_conversion_op.cc +++ b/tensorflow/core/kernels/mkl_input_conversion_op.cc @@ -296,7 +296,9 @@ class MklInputConversionOp : public OpKernel { // implementation. 
TensorShape tf_shape0 = input_shape_0.GetTfShape(); TensorShape tf_shape1 = input_shape_1.GetTfShape(); - if (tf_shape0 == tf_shape1) { + TensorShape tensor_shape0 = input_tensor_0.shape(); + TensorShape tensor_shape1 = input_tensor_1.shape(); + if (tf_shape0 == tf_shape1 && tensor_shape0 == tensor_shape1) { auto input0_md = input_shape_0.GetMklLayout(); auto input1_md = input_shape_1.GetMklLayout(); @@ -350,7 +352,8 @@ class MklInputConversionOp : public OpKernel { } // Sanity check - bool mkl_shapes_are_same = input_shape_0 == input_shape_1; + bool mkl_shapes_are_same = ((input_shape_0 == input_shape_1) && + (tensor_shape0 == tensor_shape1)); if (mkl_shapes_are_same) { CHECK(false) << "MklInputConversionOp: Unexpected: TF shapes are " "different but MKL shapes are same"; @@ -403,7 +406,8 @@ class MklInputConversionOp : public OpKernel { } // Broadcast is needed if the shapes are not the same - if (mkl_shape->GetTfShape().num_elements() == tf_tensor->shape().num_elements() ) { + if (mkl_shape->GetTfShape().num_elements() + == tf_tensor->shape().num_elements() ) { // Both shapes are same, convert the TF input to MKL VLOG(1) << "MklInputConversionOp: No broadcast needed."; VLOG(1) << "MklInputConversionOp: Converting input " << tf_tensor_index @@ -437,16 +441,17 @@ class MklInputConversionOp : public OpKernel { bool reordered = tf_input.CheckReorderToOpMem( memory::primitive_desc(output_mkl_md, cpu_engine), tensor_out, &net); - if(!reordered) { + + if (!reordered) { // This is the case that the TF tensor has the same shape and format of // mkl tensor. However, tf_tensor can not be simply forwarded to the // output tensor since mkl data tensor is always one dimensional tensor. // Tensor::CopyFrom shares the buffer of the other tensor while set its // shape to the other tensor. 
CHECK(tensor_out->CopyFrom(*tf_tensor, tensor_out->shape())); - } - else + } else { stream(stream::kind::eager).submit(net).wait(); + } // -- The tensor in MKL format passes through -- ForwardMklTensorInToOut(context, mkl_tensor_index, mkl_tensor_index); diff --git a/tensorflow/core/kernels/mkl_relu_op.cc b/tensorflow/core/kernels/mkl_relu_op.cc index 3d5a05be73..69f2e37b61 100644 --- a/tensorflow/core/kernels/mkl_relu_op.cc +++ b/tensorflow/core/kernels/mkl_relu_op.cc @@ -83,8 +83,9 @@ class MklEltwiseFwdPrimitive : public MklPrimitive { // Eltwise forward execute // src_data: input data buffer of src // dst_data: output data buffer of dst - void Execute(T* src_data, T* dst_data) { - context_.src_mem->set_data_handle(static_cast(src_data)); + void Execute(const T* src_data, T* dst_data) { + context_.src_mem->set_data_handle( + static_cast(const_cast(src_data))); context_.dst_mem->set_data_handle(static_cast(dst_data)); context_.fwd_stream->submit(context_.fwd_primitives); @@ -261,10 +262,11 @@ class MklEltwiseBwdPrimitive : public MklPrimitive { // src_data: input data buffer of src // diff_dst_data: input data buffer of diff_dst // diff_src_data: output data buffer of diff_src - - void Execute(T* src_data, T* diff_dst_data, T* diff_src_data) { - context_.src_mem->set_data_handle(static_cast(src_data)); - context_.diff_dst_mem->set_data_handle(static_cast(diff_dst_data)); + void Execute(const T* src_data, const T* diff_dst_data, T* diff_src_data) { + context_.src_mem->set_data_handle( + static_cast(const_cast(src_data))); + context_.diff_dst_mem->set_data_handle( + static_cast(const_cast(diff_dst_data))); context_.diff_src_mem->set_data_handle(static_cast(diff_src_data)); context_.bwd_stream->submit(context_.bwd_primitives); @@ -810,17 +812,15 @@ class MklReluOpBase : public OpKernel { MklEltwiseFwdPrimitiveFactory::Get(fwdParams); // prepare for execuation - T* src_data = nullptr; + const T* src_data = src_tensor.flat().data(); // check wehther src need to 
reorder if (src_md.data.format != eltwise_fwd->GetSrcMemoryFormat()) { src.SetUsrMem(src_md, &src_tensor); auto src_target_pd = memory::primitive_desc({{src_dims}, MklDnnType(), eltwise_fwd->GetSrcMemoryFormat()}, cpu_engine); src.CheckReorderToOpMem(src_target_pd); - src_data = static_cast(src.GetOpMem().get_data_handle()); - } else { - src_data = static_cast( - const_cast(src_tensor.flat().data())); + src_data = const_cast( + reinterpret_cast(src.GetOpMem().get_data_handle())); } // allocate dst tensor, always set it as MKL-DNN layout @@ -836,20 +836,20 @@ class MklReluOpBase : public OpKernel { dnn_shape_dst.SetTfLayout(dnn_shape_src.GetDimension(), dnn_shape_src.GetSizesAsMklDnnDims(), dnn_shape_src.GetTfDataFormat()); - tf_shape_dst.AddDim(dst_pd.get_size()/sizeof(T)); + tf_shape_dst.AddDim(dst_pd.get_size() / sizeof(T)); } else { - // TODO(yli135): why relu's input is TF tensor in VGG16?? dnn_shape_dst.SetMklTensor(false); tf_shape_dst = src_tensor.shape(); } Tensor* dst_tensor = nullptr; OP_REQUIRES_OK(context, context->forward_input_or_allocate_output( - {src_index}, dst_index, tf_shape_dst, &dst_tensor)); + {static_cast(src_index)}, + static_cast(dst_index), + tf_shape_dst, &dst_tensor)); AllocateOutputSetMklShape(context, dst_index, dnn_shape_dst); - T* dst_data = static_cast(const_cast( - dst_tensor->flat().data())); + T* dst_data = dst_tensor->flat().data(); // execute eltwise eltwise_fwd->Execute(src_data, dst_data); @@ -874,8 +874,8 @@ class MklReluGradOpBase : public OpKernel { public: ~MklReluGradOpBase() {} - explicit MklReluGradOpBase(OpKernelConstruction* context) : - OpKernel(context) { + explicit MklReluGradOpBase(OpKernelConstruction* context) + : OpKernel(context) { } virtual void Compute_Scalar(OpKernelContext* context) = 0; @@ -964,41 +964,43 @@ class MklReluGradOpBase : public OpKernel { auto eltwise_bwd_pd = eltwise_bwd->GetEltwiseBwdPd(); // check whether need reorder for src / diff_dst - T* src_data; - T* diff_dst_data; + const T* 
src_data = src_tensor.flat().data(); if (src_md.data.format != eltwise_bwd->GetSrcMemoryFormat()) { src.SetUsrMem(src_md, &src_tensor); src.CheckReorderToOpMem( eltwise_bwd_pd.get()->diff_src_primitive_desc()); - src_data = static_cast(src.GetOpMem().get_data_handle()); - } else { - src_data = static_cast( - const_cast(src_tensor.flat().data())); + src_data = const_cast( + reinterpret_cast(src.GetOpMem().get_data_handle())); } + const T* diff_dst_data = diff_dst_tensor.flat().data(); if (diff_dst_md.data.format != eltwise_bwd->GetDiffDstMemoryFormat()) { diff_dst.SetUsrMem(diff_dst_md, &diff_dst_tensor); diff_dst.CheckReorderToOpMem( eltwise_bwd_pd.get()->diff_src_primitive_desc()); - diff_dst_data = static_cast( - diff_dst.GetOpMem().get_data_handle()); - } else { - diff_dst_data = static_cast(const_cast( - diff_dst_tensor.flat().data())); + diff_dst_data = const_cast( + reinterpret_cast(diff_dst.GetOpMem().get_data_handle())); } // allocate diff_src tensor MklDnnShape dnn_shape_diff_src; TensorShape tf_shape_diff_src; - if (dnn_shape_src.IsMklTensor()) { + if (dnn_shape_src.IsMklTensor() || + dnn_shape_diff_dst.IsMklTensor()) { auto diff_src_pd = eltwise_bwd_pd->diff_src_primitive_desc(); dnn_shape_diff_src.SetMklTensor(true); dnn_shape_diff_src.SetMklLayout(&diff_src_pd); dnn_shape_diff_src.SetElemType(MklDnnType()); - dnn_shape_diff_src.SetTfLayout(dnn_shape_src.GetDimension(), - dnn_shape_src.GetSizesAsMklDnnDims(), - dnn_shape_src.GetTfDataFormat()); - tf_shape_diff_src.AddDim(diff_src_pd.get_size()/sizeof(T)); + if (dnn_shape_src.IsMklTensor()) { + dnn_shape_diff_src.SetTfLayout(dnn_shape_src.GetDimension(), + dnn_shape_src.GetSizesAsMklDnnDims(), + dnn_shape_src.GetTfDataFormat()); + } else { + dnn_shape_diff_src.SetTfLayout(dnn_shape_diff_dst.GetDimension(), + dnn_shape_diff_dst.GetSizesAsMklDnnDims(), + dnn_shape_diff_dst.GetTfDataFormat()); + } + tf_shape_diff_src.AddDim(diff_src_pd.get_size() / sizeof(T)); } else { 
dnn_shape_diff_src.SetMklTensor(false); tf_shape_diff_src = src_tensor.shape(); @@ -1009,8 +1011,7 @@ class MklReluGradOpBase : public OpKernel { &diff_src_tensor)); AllocateOutputSetMklShape(context, diff_src_index, dnn_shape_diff_src); - T* diff_src_data = static_cast(const_cast( - diff_src_tensor->flat().data())); + T* diff_src_data = diff_src_tensor->flat().data(); // execute eltwise bwd eltwise_bwd->Execute(src_data, diff_dst_data, diff_src_data); -- GitLab From 04fb295a409b426ea44463934c4cec251990bc37 Mon Sep 17 00:00:00 2001 From: Austin Anderson Date: Wed, 1 Aug 2018 15:23:10 -0700 Subject: [PATCH 025/598] Update readme --- tensorflow/tools/dockerfiles/README.md | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/tensorflow/tools/dockerfiles/README.md b/tensorflow/tools/dockerfiles/README.md index 1fe51adb4a..4786f8ec81 100644 --- a/tensorflow/tools/dockerfiles/README.md +++ b/tensorflow/tools/dockerfiles/README.md @@ -11,12 +11,33 @@ The Dockerfiles in the `dockerfiles` directory must have their build context set to **the directory with this README.md** to copy in helper files. For example: ```bash -$ docker build -f ./dockerfiles/cpu.Dockerfile -t tf-cpu . +$ docker build -f ./dockerfiles/cpu.Dockerfile -t tf . ``` Each Dockerfile has its own set of available `--build-arg`s which are documented in the Dockerfile itself. 
+## Running + +After building the image with the tag `tf` (for example): + +```bash +# A volume mount is optional but highly recommended, especially for Jupyter + +# CPU-based images +$ docker run -u $(id -u):$(id -g) -v $(PWD):/my-devel -it tf + +# GPU-based images (set up nvidia-docker2 first) +$ docker run --runtime=nvidia -u $(id -u):$(id -g) -v $(PWD):/my-devel -it tf + +# Images with Jupyter run on port 8888, and needs a volume for notebooks +$ docker run --user $(id -u):$(id -g) -p 8888:8888 -v $(PWD):/notebooks -it tf + +# Development images +$ docker run --user $(id -u):$(id -g) -it tf +docker$ git clone https://github.com/tensorflow/tensorflow +``` + ## Maintaining To make changes to TensorFlow's Dockerfiles, you'll update `spec.yml` and the -- GitLab From 00869fc36a952418ffa75fd4fd5763b993251dd2 Mon Sep 17 00:00:00 2001 From: Austin Anderson Date: Wed, 1 Aug 2018 15:32:36 -0700 Subject: [PATCH 026/598] Clean up some documentation --- tensorflow/tools/dockerfiles/assembler.py | 3 +- tensorflow/tools/dockerfiles/spec.yml | 43 ++++++++++++----------- 2 files changed, 25 insertions(+), 21 deletions(-) diff --git a/tensorflow/tools/dockerfiles/assembler.py b/tensorflow/tools/dockerfiles/assembler.py index a33c42ace6..8e0e5923d6 100644 --- a/tensorflow/tools/dockerfiles/assembler.py +++ b/tensorflow/tools/dockerfiles/assembler.py @@ -1,6 +1,7 @@ """Assemble common TF Dockerfiles from many parts. -TODO(angerson): DO NOT SUBMIT without a detailed description of assembler. +This script constructs TF's Dockerfiles by aggregating partial +Dockerfiles. See README.md for usage examples. """ from __future__ import absolute_import diff --git a/tensorflow/tools/dockerfiles/spec.yml b/tensorflow/tools/dockerfiles/spec.yml index 48a0cb772e..4d622c53d2 100644 --- a/tensorflow/tools/dockerfiles/spec.yml +++ b/tensorflow/tools/dockerfiles/spec.yml @@ -1,6 +1,7 @@ # ====== # HEADER # ====== +# # This is commented-out and prepended to each generated Dockerfile. 
header: | THIS IS A GENERATED DOCKERFILE. @@ -12,19 +13,20 @@ header: | # ======== # PARTIALS # ======== -# Represent and document pieces of a Dockerfile. -# Spec: +# +# Represent and document pieces of a Dockerfile. Spec: # -# name: the name of the partial, referenced from other sections +# name: the name of the partial, is referenced from the images section # desc: A description, inserted later into the Dockerfile -# file: Alternative file prefix, e.g. file.partial.Dockerfile (default = name) +# file: Alternative file prefix, e.g. file.partial.Dockerfile. The default is +# the name of the partial. # args: A dict of ARGs in the Dockerfile; each entry has the format -# ARG_NAME: VALUE where VALUE is -# - a concrete value: becomes the default +# ARG_NAME: VALUE where VALUE is one of: # - a dict: -# desc: Arg description -# default: Default value for the arg; is written to the Dockerfile -# options: List of strings, part of documentation +# desc: Documentation for the arg +# default: Default value for the arg; is written to the Dockerfile +# options: List of strings, part of documentation +# - a concrete value: the same as a dictionary with default: [value]. partials: ubuntu: @@ -75,23 +77,24 @@ partials: jupyter: desc: Launch Jupyter on execution instead of a bash prompt. -# =========== -# DOCKERFILES -# =========== -# Represent dockerfiles. -# Spec: +# ====== +# IMAGES +# ====== +# +# Represent Dockerfiles. Spec: # -# name: the name of the image, referenced from other sections +# name: the name of the image, possibly referenced by other images # desc: A description, inserted later into the Dockerfile # create-dockerfile: Create a dockerfile based on this. Useful for creating -# base images. Default is true +# extensible base images that don't need a file. Default is true. 
# partials: List of VALUEs, where a VALUE is either: -# - the name of a partial, which inserts that partial into this file +# - the name of a partial, which inserts that partial into this image # - image: [name of another image], which inserts the partials from that -# image into this file +# image into this image # arg-defaults: List of VALUEs, where a VALUE is either: -# - the name of another image, which loads the default args from that image -# - ARG_NAME: VALUE, which is exactly what you'd expect +# - ARG_NAME: VALUE, which sets the ARG_NAME to VALUE wherever it appears +# in this image's partials +# - [name of another image], which loads the default args from that image images: nodev: -- GitLab From ffc12e18fbf7acbaf67a11f1470dc54786e2cc17 Mon Sep 17 00:00:00 2001 From: Guozhong Zhuang Date: Tue, 7 Aug 2018 14:53:26 -0700 Subject: [PATCH 027/598] rebase mkl_util.h with master branch --- tensorflow/core/util/mkl_util.h | 107 ++++++++++++++++++++------------ 1 file changed, 66 insertions(+), 41 deletions(-) diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h index 21868fa2c0..a66b1215bd 100644 --- a/tensorflow/core/util/mkl_util.h +++ b/tensorflow/core/util/mkl_util.h @@ -17,9 +17,10 @@ limitations under the License. #define TENSORFLOW_CORE_UTIL_MKL_UTIL_H_ #ifdef INTEL_MKL -#include +#include #include #include +#include #ifdef INTEL_MKL_ML #include "mkl_dnn.h" @@ -34,11 +35,11 @@ limitations under the License. 
#include "tensorflow/core/graph/mkl_graph_util.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/platform/cpu_info.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/util/padding.h" #include "tensorflow/core/util/tensor_format.h" - #ifndef INTEL_MKL_ML #include "mkldnn.hpp" #include "tensorflow/core/lib/core/stringpiece.h" @@ -1503,7 +1504,8 @@ class MklDnnData { /// Operations memory descriptor memory::desc* op_md_; - + /// Operations temp buffer + void* allocated_buffer_; /// CPU engine on which operation will be executed const engine* cpu_engine_; @@ -1512,6 +1514,7 @@ class MklDnnData { : user_memory_(nullptr), reorder_memory_(nullptr), op_md_(nullptr), + allocated_buffer_(nullptr), cpu_engine_(e) {} ~MklDnnData() { @@ -1652,6 +1655,14 @@ class MklDnnData { user_memory_->set_data_handle(GetTensorBuffer(tensor)); } + /// allocate function for data buffer + inline void AllocateBuffer(size_t size) { + const int64 kMemoryAlginment = 64; // For AVX512 memory alignment. + allocated_buffer_ = cpu_allocator()->AllocateRaw(kMemoryAlginment, size); + } + + inline void* GetAllocatedBuffer() { return allocated_buffer_; } + /// Get the memory primitive for input and output of an op. If inputs /// to an op require reorders, then this function returns memory primitive /// for reorder. Otherwise, it will return memory primitive for user memory. @@ -1873,7 +1884,6 @@ class MklDnnData { net.push_back(FindOrCreateReorder(reorder_memory_, user_memory_)); stream(stream::kind::eager).submit(net).wait(); } - }; /// Base class for operations with reuse of primitives @@ -1882,9 +1892,8 @@ class MklPrimitive { public: virtual ~MklPrimitive() {} - // Dummy data. Its size, hard-coded as 256 here, does - // not matter since MKL should never operate on this buffer. 
- unsigned char DummyData[256]; + // Dummy data which MKL DNN never operates on + unsigned char* DummyData = nullptr; }; const mkldnn::memory::dims NONE_DIMS = {}; @@ -1896,8 +1905,9 @@ class MklPrimitiveFactory { ~MklPrimitiveFactory() {} MklPrimitive* GetOp(const string& key) { - auto stream_iter = MklPrimitiveFactory::GetHashMap().find(key); - if (stream_iter == MklPrimitiveFactory::GetHashMap().end()) { + auto& map = MklPrimitiveFactory::GetHashMap(); + auto stream_iter = map.find(key); + if (stream_iter == map.end()) { return nullptr; } else { CHECK(stream_iter->second != nullptr) << "nullptr present in map"; @@ -1906,7 +1916,8 @@ class MklPrimitiveFactory { } void SetOp(const string& key, MklPrimitive* op) { - auto stream_iter = MklPrimitiveFactory::GetHashMap().find(key); + auto& map = MklPrimitiveFactory::GetHashMap(); + auto stream_iter = map.find(key); CHECK(stream_iter == map.end()); @@ -1955,11 +1966,25 @@ class FactoryKeyCreator { } }; +static inline memory::format get_desired_format(int channel) { + memory::format fmt_desired = memory::format::any; + + if (port::TestCPUFeature(port::CPUFeature::AVX512F) && (channel % 16) == 0) { + fmt_desired = memory::format::nChw16c; + } else if (port::TestCPUFeature(port::CPUFeature::AVX2) && + (channel % 8) == 0) { + fmt_desired = memory::format::nChw8c; + } else { + fmt_desired = memory::format::nchw; + } + return fmt_desired; +} + class MklReorderPrimitive : public MklPrimitive { - public: - explicit MklReorderPrimitive(const memory* from, const memory* to) { - Setup(from, to); - } + public: + explicit MklReorderPrimitive(const memory* from, const memory* to) { + Setup(from, to); + } ~MklReorderPrimitive() {} std::shared_ptr GetPrimitive() { @@ -1971,7 +1996,7 @@ class MklReorderPrimitive : public MklPrimitive { context_.dst_mem->set_data_handle(to->get_data_handle()); } - private: + private: struct ReorderContext { std::shared_ptr src_mem; std::shared_ptr dst_mem; @@ -1995,28 +2020,27 @@ class 
MklReorderPrimitive : public MklPrimitive { template class MklReorderPrimitiveFactory : public MklPrimitiveFactory { - public: - static MklReorderPrimitive* Get(const memory* from, - const memory* to) { - auto reorderPrim = static_cast( + public: + static MklReorderPrimitive* Get(const memory* from, const memory* to) { + auto reorderPrim = static_cast( MklReorderPrimitiveFactory::GetInstance().GetReorder(from, to)); - if (reorderPrim == nullptr) { - reorderPrim = new MklReorderPrimitive(from, to); - MklReorderPrimitiveFactory::GetInstance().SetReorder( - from, to, reorderPrim); - } - reorderPrim->SetMemory(from, to); - return reorderPrim; + if (reorderPrim == nullptr) { + reorderPrim = new MklReorderPrimitive(from, to); + MklReorderPrimitiveFactory::GetInstance().SetReorder(from, to, + reorderPrim); } + reorderPrim->SetMemory(from, to); + return reorderPrim; + } static MklReorderPrimitiveFactory & GetInstance() { static MklReorderPrimitiveFactory instance_; return instance_; } - private: - MklReorderPrimitiveFactory() {}; - ~MklReorderPrimitiveFactory() {}; + private: + MklReorderPrimitiveFactory() {} + ~MklReorderPrimitiveFactory() {} static string CreateKey(const memory* from, const memory* to) { string prefix = "reorder"; @@ -2046,18 +2070,19 @@ class MklReorderPrimitiveFactory : public MklPrimitiveFactory { } }; - /// Fuction to find(or create) a reorder from memory pointed by from to memory pointed - /// by to, it will created primitive or get primitive from pool if it is cached. - /// Returns the primitive. - template - inline primitive FindOrCreateReorder(const memory* from, const memory* to) { - CHECK_NOTNULL(from); - CHECK_NOTNULL(to); - MklReorderPrimitive *reorder_prim = - MklReorderPrimitiveFactory::Get(from, to); - return *reorder_prim->GetPrimitive(); - } - +/// Fuction to find(or create) a reorder from memory pointed by +/// from to memory pointed by to, it will created primitive or +/// get primitive from pool if it is cached. 
+/// Returns the primitive. +template +inline primitive FindOrCreateReorder(const memory* from, const memory* to) { + CHECK_NOTNULL(from); + CHECK_NOTNULL(to); + MklReorderPrimitive* reorder_prim = + MklReorderPrimitiveFactory::Get(from, to); + return *reorder_prim->GetPrimitive(); +} + #endif // INTEL_MKL_DNN } // namespace tensorflow -- GitLab From 10ca9a8fb215e66d25a8469c5eeb5b8d6c02e05e Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Tue, 7 Aug 2018 15:29:24 -0700 Subject: [PATCH 028/598] RNN.call should get initial state from full input spec --- tensorflow/python/keras/layers/recurrent.py | 8 ++++++ tensorflow/python/keras/layers/wrappers.py | 26 +++++++++++++++---- .../keras/utils/multi_gpu_utils_test.py | 17 ++++++++++++ 3 files changed, 46 insertions(+), 5 deletions(-) diff --git a/tensorflow/python/keras/layers/recurrent.py b/tensorflow/python/keras/layers/recurrent.py index a8bfdf25f2..85d0a70203 100644 --- a/tensorflow/python/keras/layers/recurrent.py +++ b/tensorflow/python/keras/layers/recurrent.py @@ -583,6 +583,14 @@ class RNN(Layer): # note that the .build() method of subclasses MUST define # self.input_spec and self.state_spec with complete input shapes. if isinstance(inputs, list): + # get initial_state from full input spec + # as they could be copied to multiple GPU. 
+ if self._num_constants is None: + initial_state = inputs[1:] + else: + initial_state = inputs[1:-self._num_constants] + if len(initial_state) == 0: + initial_state = None inputs = inputs[0] if initial_state is not None: pass diff --git a/tensorflow/python/keras/layers/wrappers.py b/tensorflow/python/keras/layers/wrappers.py index f0c1e76156..cf2e139fad 100644 --- a/tensorflow/python/keras/layers/wrappers.py +++ b/tensorflow/python/keras/layers/wrappers.py @@ -545,11 +545,27 @@ class Bidirectional(Wrapper): if initial_state is not None and generic_utils.has_arg( self.layer.call, 'initial_state'): - forward_state = initial_state[:len(initial_state) // 2] - backward_state = initial_state[len(initial_state) // 2:] - y = self.forward_layer.call(inputs, initial_state=forward_state, **kwargs) - y_rev = self.backward_layer.call( - inputs, initial_state=backward_state, **kwargs) + forward_inputs = [inputs[0]] + backward_inputs = [inputs[0]] + pivot = len(initial_state) // 2 + 1 + # add forward initial state + forward_state = inputs[1:pivot] + forward_inputs += forward_state + if self._num_constants is None: + # add backward initial state + backward_state = inputs[pivot:] + backward_inputs += backward_state + else: + # add backward initial state + backward_state = inputs[pivot:-self._num_constants] + backward_inputs += backward_state + # add constants for forward and backward layers + forward_inputs += inputs[-self._num_constants:] + backward_inputs += inputs[-self._num_constants:] + y = self.forward_layer.call(forward_inputs, + initial_state=forward_state, **kwargs) + y_rev = self.backward_layer.call(backward_inputs, + initial_state=backward_state, **kwargs) else: y = self.forward_layer.call(inputs, **kwargs) y_rev = self.backward_layer.call(inputs, **kwargs) diff --git a/tensorflow/python/keras/utils/multi_gpu_utils_test.py b/tensorflow/python/keras/utils/multi_gpu_utils_test.py index 77792d14f5..c7e94998b4 100644 --- 
a/tensorflow/python/keras/utils/multi_gpu_utils_test.py +++ b/tensorflow/python/keras/utils/multi_gpu_utils_test.py @@ -180,6 +180,23 @@ class TestMultiGPUModel(test.TestCase): target_tensors=[targets]) parallel_model.fit(epochs=1, steps_per_epoch=3) + def test_multi_gpu_with_multi_input_layers(self): + gpus = 2 + + if not check_if_compatible_devices(gpus=gpus): + return + + with self.test_session(): + inputs = keras.Input((4, 3)) + init_state = keras.Input((3,)) + outputs = keras.layers.SimpleRNN( + 3, return_sequences=True)(inputs, initial_state=init_state) + x = [np.random.randn(2, 4, 3), np.random.randn(2, 3)] + y = np.random.randn(2, 4, 3) + model = keras.Model([inputs, init_state], outputs) + parallel_model = keras.utils.multi_gpu_model(model, gpus=gpus) + parallel_model.compile(loss='mean_squared_error', optimizer='adam') + parallel_model.train_on_batch(x, y) if __name__ == '__main__': test.main() -- GitLab From b2470ca8a93a7a4bd960ba7dff65be74779c4f62 Mon Sep 17 00:00:00 2001 From: Jonathan Date: Thu, 26 Jul 2018 23:03:33 +0800 Subject: [PATCH 029/598] modify _TopKGrad so that all operations can run on GPU for better performance --- tensorflow/python/ops/nn_grad.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/tensorflow/python/ops/nn_grad.py b/tensorflow/python/ops/nn_grad.py index df23ac55ce..025ce7ce88 100644 --- a/tensorflow/python/ops/nn_grad.py +++ b/tensorflow/python/ops/nn_grad.py @@ -975,25 +975,31 @@ def _TopKGrad(op, grad, _): in_shape = array_ops.shape(op.inputs[0]) ind_shape = array_ops.shape(op.outputs[1]) - ind_lastdim = array_ops.gather(ind_shape, array_ops.size(ind_shape) - 1) + # int32 is not supported on GPU hence up-casting + ind_lastdim = array_ops.gather(math_ops.cast( + ind_shape, dtypes.int64), array_ops.size(ind_shape) - 1) # Flatten indices to 2D. 
ind_2d = array_ops.reshape(op.outputs[1], array_ops.stack([-1, ind_lastdim])) - in_lastdim = array_ops.gather(in_shape, array_ops.size(in_shape) - 1) + in_lastdim = array_ops.gather(math_ops.cast( + in_shape, dtypes.int64), array_ops.size(in_shape) - 1) outerdim = array_ops.shape(ind_2d)[0] # Compute linear indices (flattened to 1D). - ind = array_ops.reshape(ind_2d + array_ops.expand_dims( - math_ops.range(0, outerdim * in_lastdim, in_lastdim), -1), [-1]) + ind = array_ops.reshape(ind_2d + math_ops.cast(array_ops.expand_dims( + math_ops.range(0, math_ops.cast(outerdim, dtypes.int64) + * in_lastdim, in_lastdim), -1 + ), dtypes.int32), [-1]) # Substitute grad to appropriate locations and fill the rest with zeros, # finally reshaping it to the original input shape. return [ array_ops.reshape( - sparse_ops.sparse_to_dense( - ind, - array_ops.reshape(math_ops.reduce_prod(in_shape), [1]), + array_ops.scatter_nd( + array_ops.expand_dims(ind, -1), array_ops.reshape(grad, [-1]), - validate_indices=False), in_shape), + [math_ops.reduce_prod(in_shape)] + ), + in_shape), array_ops.zeros([], dtype=dtypes.int32) ] -- GitLab From b1aa509588abdf97bd45f51fbce85169fba51b3b Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sun, 17 Jun 2018 20:32:54 +0000 Subject: [PATCH 030/598] Update glorot_uniform_initializer to match other Initializers This fix tries to address the issue raised in 19910 where the signature of glorot_uniform_initializer does not match other Initializers (e.g., random_uniform_initializer). This fix update the glorot_uniform_initializer so that it matches the signature in a similiar way as other Initializers. This fix fixes 19910. 
Signed-off-by: Yong Tang --- tensorflow/python/ops/init_ops.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py index 4d75ee3974..957a5b32b7 100644 --- a/tensorflow/python/ops/init_ops.py +++ b/tensorflow/python/ops/init_ops.py @@ -1116,6 +1116,36 @@ class Identity(Initializer): def get_config(self): return {"gain": self.gain, "dtype": self.dtype.name} + +@tf_export("glorot_uniform_initializer") +class GlorotUniform(VarianceScaling): + """The Glorot uniform initializer, also called Xavier uniform initializer. + + It draws samples from a uniform distribution within [-limit, limit] + where `limit` is `sqrt(6 / (fan_in + fan_out))` + where `fan_in` is the number of input units in the weight tensor + and `fan_out` is the number of output units in the weight tensor. + + Reference: http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf + + Args: + seed: A Python integer. Used to create random seeds. See + @{tf.set_random_seed} + for behavior. + dtype: The data type. Only floating point types are supported. + """ + def __init__(self, + seed=None, + dtype=dtypes.float32): + super(GlorotUniform, self).__init__(scale=1.0, mode="fan_avg", distribution="uniform", seed=seed, dtype=dtype) + + def get_config(self): + return { + "seed": self.seed, + "dtype": self.dtype.name + } + + # Aliases. 
# pylint: disable=invalid-name @@ -1127,6 +1157,7 @@ random_normal_initializer = RandomNormal truncated_normal_initializer = TruncatedNormal uniform_unit_scaling_initializer = UniformUnitScaling variance_scaling_initializer = VarianceScaling +glorot_uniform_initializer = GlorotUniform orthogonal_initializer = Orthogonal identity_initializer = Identity convolutional_delta_orthogonal = ConvolutionDeltaOrthogonal -- GitLab From 3fdd83cb9ff5a8d1118c4f59fd87ea22cef23615 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sun, 17 Jun 2018 20:40:32 +0000 Subject: [PATCH 031/598] Also convert glorot_normal_initializer to Initializer Signed-off-by: Yong Tang --- tensorflow/python/ops/init_ops.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py index 957a5b32b7..ae625318b8 100644 --- a/tensorflow/python/ops/init_ops.py +++ b/tensorflow/python/ops/init_ops.py @@ -1146,6 +1146,35 @@ class GlorotUniform(VarianceScaling): } +@tf_export("glorot_normal_initializer") +class GlorotNormal(VarianceScaling): + """The Glorot normal initializer, also called Xavier normal initializer. + + It draws samples from a truncated normal distribution centered on 0 + with `stddev = sqrt(2 / (fan_in + fan_out))` + where `fan_in` is the number of input units in the weight tensor + and `fan_out` is the number of output units in the weight tensor. + + Reference: http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf + + Args: + seed: A Python integer. Used to create random seeds. See + @{tf.set_random_seed} + for behavior. + dtype: The data type. Only floating point types are supported. + """ + def __init__(self, + seed=None, + dtype=dtypes.float32): + super(GlorotNormal, self).__init__(scale=1.0, mode="fan_avg", distribution="normal", seed=seed, dtype=dtype) + + def get_config(self): + return { + "seed": self.seed, + "dtype": self.dtype.name + } + + # Aliases. 
# pylint: disable=invalid-name @@ -1158,6 +1187,7 @@ truncated_normal_initializer = TruncatedNormal uniform_unit_scaling_initializer = UniformUnitScaling variance_scaling_initializer = VarianceScaling glorot_uniform_initializer = GlorotUniform +glorot_normal_initializer = GlorotNormal orthogonal_initializer = Orthogonal identity_initializer = Identity convolutional_delta_orthogonal = ConvolutionDeltaOrthogonal -- GitLab From 046b3711653206dba9190c50540adf8b0cfd5842 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sun, 17 Jun 2018 20:43:44 +0000 Subject: [PATCH 032/598] Pylint fix Signed-off-by: Yong Tang --- tensorflow/python/ops/init_ops.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py index ae625318b8..b7b4a2da9a 100644 --- a/tensorflow/python/ops/init_ops.py +++ b/tensorflow/python/ops/init_ops.py @@ -1137,7 +1137,12 @@ class GlorotUniform(VarianceScaling): def __init__(self, seed=None, dtype=dtypes.float32): - super(GlorotUniform, self).__init__(scale=1.0, mode="fan_avg", distribution="uniform", seed=seed, dtype=dtype) + super(GlorotUniform, self).__init__( + scale=1.0, + mode="fan_avg", + distribution="uniform", + seed=seed, + dtype=dtype) def get_config(self): return { @@ -1166,7 +1171,12 @@ class GlorotNormal(VarianceScaling): def __init__(self, seed=None, dtype=dtypes.float32): - super(GlorotNormal, self).__init__(scale=1.0, mode="fan_avg", distribution="normal", seed=seed, dtype=dtype) + super(GlorotNormal, self).__init__( + scale=1.0, + mode="fan_avg", + distribution="normal", + seed=seed, + dtype=dtype) def get_config(self): return { -- GitLab From 120126d23e02edfb8817ff3dcf4d7fb8aa1e65f8 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Thu, 9 Aug 2018 17:36:06 +0000 Subject: [PATCH 033/598] Fix merge conflicts Signed-off-by: Yong Tang --- tensorflow/python/keras/initializers.py | 8 +-- tensorflow/python/ops/init_ops.py | 68 +++---------------------- 2 
files changed, 12 insertions(+), 64 deletions(-) diff --git a/tensorflow/python/keras/initializers.py b/tensorflow/python/keras/initializers.py index b9d856efa8..ed3d563687 100644 --- a/tensorflow/python/keras/initializers.py +++ b/tensorflow/python/keras/initializers.py @@ -26,8 +26,8 @@ from tensorflow.python.keras.utils.generic_utils import serialize_keras_object # These imports are brought in so that keras.initializers.deserialize # has them available in module_objects. from tensorflow.python.ops.init_ops import Constant -from tensorflow.python.ops.init_ops import glorot_normal_initializer -from tensorflow.python.ops.init_ops import glorot_uniform_initializer +from tensorflow.python.ops.init_ops import GlorotNormal +from tensorflow.python.ops.init_ops import GlorotUniform from tensorflow.python.ops.init_ops import he_normal # pylint: disable=unused-import from tensorflow.python.ops.init_ops import he_uniform # pylint: disable=unused-import from tensorflow.python.ops.init_ops import Identity @@ -56,8 +56,8 @@ normal = random_normal = RandomNormal truncated_normal = TruncatedNormal identity = Identity orthogonal = Orthogonal -glorot_normal = glorot_normal_initializer -glorot_uniform = glorot_uniform_initializer +glorot_normal = GlorotNormal +glorot_uniform = GlorotUniform # pylint: enable=invalid-name diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py index b7b4a2da9a..aef6677f8b 100644 --- a/tensorflow/python/ops/init_ops.py +++ b/tensorflow/python/ops/init_ops.py @@ -1116,8 +1116,8 @@ class Identity(Initializer): def get_config(self): return {"gain": self.gain, "dtype": self.dtype.name} - -@tf_export("glorot_uniform_initializer") +@tf_export("glorot_uniform_initializer", "keras.initializers.glorot_uniform", + "initializers.glorot_uniform") class GlorotUniform(VarianceScaling): """The Glorot uniform initializer, also called Xavier uniform initializer. 
@@ -1130,7 +1130,7 @@ class GlorotUniform(VarianceScaling): Args: seed: A Python integer. Used to create random seeds. See - @{tf.set_random_seed} + `tf.set_random_seed` for behavior. dtype: The data type. Only floating point types are supported. """ @@ -1151,7 +1151,8 @@ class GlorotUniform(VarianceScaling): } -@tf_export("glorot_normal_initializer") +@tf_export("glorot_normal_initializer", "keras.initializers.glorot_normal", + "initializers.glorot_normal") class GlorotNormal(VarianceScaling): """The Glorot normal initializer, also called Xavier normal initializer. @@ -1164,7 +1165,7 @@ class GlorotNormal(VarianceScaling): Args: seed: A Python integer. Used to create random seeds. See - @{tf.set_random_seed} + `tf.set_random_seed` for behavior. dtype: The data type. Only floating point types are supported. """ @@ -1174,7 +1175,7 @@ class GlorotNormal(VarianceScaling): super(GlorotNormal, self).__init__( scale=1.0, mode="fan_avg", - distribution="normal", + distribution="truncated_normal", seed=seed, dtype=dtype) @@ -1185,6 +1186,7 @@ class GlorotNormal(VarianceScaling): } + # Aliases. # pylint: disable=invalid-name @@ -1207,60 +1209,6 @@ convolutional_orthogonal_3d = ConvolutionOrthogonal3D # pylint: enable=invalid-name -@tf_export("glorot_uniform_initializer", "keras.initializers.glorot_uniform", - "initializers.glorot_uniform") -def glorot_uniform_initializer(seed=None, dtype=dtypes.float32): - """The Glorot uniform initializer, also called Xavier uniform initializer. - - It draws samples from a uniform distribution within [-limit, limit] - where `limit` is `sqrt(6 / (fan_in + fan_out))` - where `fan_in` is the number of input units in the weight tensor - and `fan_out` is the number of output units in the weight tensor. - - Reference: http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf - - Args: - seed: A Python integer. Used to create random seeds. See - `tf.set_random_seed` - for behavior. - dtype: The data type. 
Only floating point types are supported. - - Returns: - An initializer. - """ - return variance_scaling_initializer( - scale=1.0, mode="fan_avg", distribution="uniform", seed=seed, dtype=dtype) - - -@tf_export("glorot_normal_initializer", "keras.initializers.glorot_normal", - "initializers.glorot_normal") -def glorot_normal_initializer(seed=None, dtype=dtypes.float32): - """The Glorot normal initializer, also called Xavier normal initializer. - - It draws samples from a truncated normal distribution centered on 0 - with `stddev = sqrt(2 / (fan_in + fan_out))` - where `fan_in` is the number of input units in the weight tensor - and `fan_out` is the number of output units in the weight tensor. - - Reference: http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf - - Args: - seed: A Python integer. Used to create random seeds. See - `tf.set_random_seed` - for behavior. - dtype: The data type. Only floating point types are supported. - - Returns: - An initializer. - """ - return variance_scaling_initializer( - scale=1.0, - mode="fan_avg", - distribution="truncated_normal", - seed=seed, - dtype=dtype) - - @tf_export("keras.initializers.lecun_normal", "initializers.lecun_normal") def lecun_normal(seed=None): """LeCun normal initializer. 
-- GitLab From 02633101d60fe801fbd6261bb080f6c56392afdf Mon Sep 17 00:00:00 2001 From: Niranjan Hasabnis Date: Fri, 10 Aug 2018 11:35:04 -0700 Subject: [PATCH 034/598] Change to match clang formatting --- tensorflow/core/kernels/gather_nd_op_cpu_impl.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tensorflow/core/kernels/gather_nd_op_cpu_impl.h b/tensorflow/core/kernels/gather_nd_op_cpu_impl.h index 22203e242a..64bf2516b7 100644 --- a/tensorflow/core/kernels/gather_nd_op_cpu_impl.h +++ b/tensorflow/core/kernels/gather_nd_op_cpu_impl.h @@ -115,13 +115,13 @@ struct GatherNdSlice { slice_size, Tindices, Tparams, Tout, &error_loc); #ifdef INTEL_MKL - // Eigen implementation below is not highly performant. gather_nd_generator - // does not seem to be called in parallel, leading to very poor performance. - // Additionally, since it uses scalar (Tscratch) to invoke 'generate', it - // needs to go through redundant operations like 'reshape', 'broadcast' and - // 'sum'. OpenMP loop below essentially does same thing as Eigen code, but - // is considerably more efficient. - #pragma omp parallel for +// Eigen implementation below is not highly performant. gather_nd_generator +// does not seem to be called in parallel, leading to very poor performance. +// Additionally, since it uses scalar (Tscratch) to invoke 'generate', it +// needs to go through redundant operations like 'reshape', 'broadcast' and +// 'sum'. OpenMP loop below essentially does same thing as Eigen code, but +// is considerably more efficient. 
+#pragma omp parallel for for (Eigen::DenseIndex i = 0; i < batch_size; i++) { const Eigen::array loc = i; gather_nd_generator(loc); -- GitLab From dbfa330c963f9e015cc66ad4aebdd7985651c024 Mon Sep 17 00:00:00 2001 From: drpngx Date: Fri, 10 Aug 2018 13:19:23 -0700 Subject: [PATCH 035/598] Address ebrevdo --- tensorflow/python/ops/nn_grad.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tensorflow/python/ops/nn_grad.py b/tensorflow/python/ops/nn_grad.py index 025ce7ce88..59ba0091c8 100644 --- a/tensorflow/python/ops/nn_grad.py +++ b/tensorflow/python/ops/nn_grad.py @@ -987,8 +987,7 @@ def _TopKGrad(op, grad, _): # Compute linear indices (flattened to 1D). ind = array_ops.reshape(ind_2d + math_ops.cast(array_ops.expand_dims( math_ops.range(0, math_ops.cast(outerdim, dtypes.int64) - * in_lastdim, in_lastdim), -1 - ), dtypes.int32), [-1]) + * in_lastdim, in_lastdim), -1), dtypes.int32), [-1]) # Substitute grad to appropriate locations and fill the rest with zeros, # finally reshaping it to the original input shape. 
-- GitLab From 65f544792950a8d60d5aa8e19f3239bcae66a99f Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Thu, 9 Aug 2018 17:50:39 +0000 Subject: [PATCH 036/598] Update API goldens with ``` bazel-bin/tensorflow/tools/api/tests/api_compatibility_test --update_goldens True ``` Signed-off-by: Yong Tang --- ...tensorflow.glorot_normal_initializer.pbtxt | 19 +++++++++++++++++++ ...ensorflow.glorot_uniform_initializer.pbtxt | 19 +++++++++++++++++++ ...ensorflow.initializers.glorot_normal.pbtxt | 19 +++++++++++++++++++ ...nsorflow.initializers.glorot_uniform.pbtxt | 19 +++++++++++++++++++ .../golden/v1/tensorflow.initializers.pbtxt | 16 ++++++++-------- ...low.keras.initializers.glorot_normal.pbtxt | 19 +++++++++++++++++++ ...ow.keras.initializers.glorot_uniform.pbtxt | 19 +++++++++++++++++++ .../v1/tensorflow.keras.initializers.pbtxt | 16 ++++++++-------- .../tools/api/golden/v1/tensorflow.pbtxt | 16 ++++++++-------- 9 files changed, 138 insertions(+), 24 deletions(-) create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.glorot_normal_initializer.pbtxt create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.glorot_uniform_initializer.pbtxt create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.initializers.glorot_normal.pbtxt create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.initializers.glorot_uniform.pbtxt create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.glorot_normal.pbtxt create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.glorot_uniform.pbtxt diff --git a/tensorflow/tools/api/golden/v1/tensorflow.glorot_normal_initializer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.glorot_normal_initializer.pbtxt new file mode 100644 index 0000000000..483d1f8ba0 --- /dev/null +++ b/tensorflow/tools/api/golden/v1/tensorflow.glorot_normal_initializer.pbtxt @@ -0,0 +1,19 @@ +path: "tensorflow.glorot_normal_initializer" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + 
is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \"\"], " + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v1/tensorflow.glorot_uniform_initializer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.glorot_uniform_initializer.pbtxt new file mode 100644 index 0000000000..bb8540d0fd --- /dev/null +++ b/tensorflow/tools/api/golden/v1/tensorflow.glorot_uniform_initializer.pbtxt @@ -0,0 +1,19 @@ +path: "tensorflow.glorot_uniform_initializer" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \"\"], " + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v1/tensorflow.initializers.glorot_normal.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.initializers.glorot_normal.pbtxt new file mode 100644 index 0000000000..4a81e52df9 --- /dev/null +++ b/tensorflow/tools/api/golden/v1/tensorflow.initializers.glorot_normal.pbtxt @@ -0,0 +1,19 @@ +path: "tensorflow.initializers.glorot_normal" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \"\"], " + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + 
} + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v1/tensorflow.initializers.glorot_uniform.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.initializers.glorot_uniform.pbtxt new file mode 100644 index 0000000000..815dc81dff --- /dev/null +++ b/tensorflow/tools/api/golden/v1/tensorflow.initializers.glorot_uniform.pbtxt @@ -0,0 +1,19 @@ +path: "tensorflow.initializers.glorot_uniform" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \"\"], " + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v1/tensorflow.initializers.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.initializers.pbtxt index bc0426f2f1..d499c67d89 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.initializers.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.initializers.pbtxt @@ -4,6 +4,14 @@ tf_module { name: "constant" mtype: "" } + member { + name: "glorot_normal" + mtype: "" + } + member { + name: "glorot_uniform" + mtype: "" + } member { name: "identity" mtype: "" @@ -44,14 +52,6 @@ tf_module { name: "global_variables" argspec: "args=[], varargs=None, keywords=None, defaults=None" } - member_method { - name: "glorot_normal" - argspec: "args=[\'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \"\"], " - } - member_method { - name: "glorot_uniform" - argspec: "args=[\'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \"\"], " - } member_method { name: "he_normal" argspec: "args=[\'seed\'], varargs=None, keywords=None, 
defaults=[\'None\'], " diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.glorot_normal.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.glorot_normal.pbtxt new file mode 100644 index 0000000000..ef0815972d --- /dev/null +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.glorot_normal.pbtxt @@ -0,0 +1,19 @@ +path: "tensorflow.keras.initializers.glorot_normal" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \"\"], " + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.glorot_uniform.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.glorot_uniform.pbtxt new file mode 100644 index 0000000000..439b5ada9b --- /dev/null +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.glorot_uniform.pbtxt @@ -0,0 +1,19 @@ +path: "tensorflow.keras.initializers.glorot_uniform" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \"\"], " + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.pbtxt index 8645e54302..1540c2915b 100644 --- 
a/tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.pbtxt @@ -44,6 +44,14 @@ tf_module { name: "constant" mtype: "" } + member { + name: "glorot_normal" + mtype: "" + } + member { + name: "glorot_uniform" + mtype: "" + } member { name: "identity" mtype: "" @@ -88,14 +96,6 @@ tf_module { name: "get" argspec: "args=[\'identifier\'], varargs=None, keywords=None, defaults=None" } - member_method { - name: "glorot_normal" - argspec: "args=[\'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \"\"], " - } - member_method { - name: "glorot_uniform" - argspec: "args=[\'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \"\"], " - } member_method { name: "he_normal" argspec: "args=[\'seed\'], varargs=None, keywords=None, defaults=[\'None\'], " diff --git a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt index 5eb42b4db3..18a5793bb6 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt @@ -364,6 +364,14 @@ tf_module { name: "gfile" mtype: "" } + member { + name: "glorot_normal_initializer" + mtype: "" + } + member { + name: "glorot_uniform_initializer" + mtype: "" + } member { name: "graph_util" mtype: "" @@ -1192,14 +1200,6 @@ tf_module { name: "global_variables_initializer" argspec: "args=[], varargs=None, keywords=None, defaults=None" } - member_method { - name: "glorot_normal_initializer" - argspec: "args=[\'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \"\"], " - } - member_method { - name: "glorot_uniform_initializer" - argspec: "args=[\'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \"\"], " - } member_method { name: "gradients" argspec: "args=[\'ys\', \'xs\', \'grad_ys\', \'name\', \'colocate_gradients_with_ops\', \'gate_gradients\', \'aggregation_method\', \'stop_gradients\'], varargs=None, 
keywords=None, defaults=[\'None\', \'gradients\', \'False\', \'False\', \'None\', \'None\'], " -- GitLab From 026004e05dc172d1639840055462013f95e56bbe Mon Sep 17 00:00:00 2001 From: Seb Bro Date: Sat, 11 Aug 2018 22:04:08 +0200 Subject: [PATCH 037/598] Fix latex text (docs). Escape special character and add text section for comments. --- tensorflow/python/training/adam.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/training/adam.py b/tensorflow/python/training/adam.py index bcbe5907d6..86b854c0de 100644 --- a/tensorflow/python/training/adam.py +++ b/tensorflow/python/training/adam.py @@ -43,15 +43,15 @@ class AdamOptimizer(optimizer.Optimizer): Initialization: - $$m_0 := 0 (Initialize initial 1st moment vector)$$ - $$v_0 := 0 (Initialize initial 2nd moment vector)$$ - $$t := 0 (Initialize timestep)$$ + $$m_0 := 0 \text{(Initialize initial 1st moment vector)}$$ + $$v_0 := 0 \text{(Initialize initial 2nd moment vector)}$$ + $$t := 0 \text{(Initialize timestep)}$$ The update rule for `variable` with gradient `g` uses an optimization described at the end of section2 of the paper: $$t := t + 1$$ - $$lr_t := \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$ + $$lr_t := \text{learning\_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$ $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$ $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$ -- GitLab From 62049835ce3064c191a8054ec1056b4701afb744 Mon Sep 17 00:00:00 2001 From: Seb Bro Date: Sat, 11 Aug 2018 22:19:44 +0200 Subject: [PATCH 038/598] Fix sqrt in lr formula. 
--- tensorflow/python/training/adam.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/training/adam.py b/tensorflow/python/training/adam.py index 86b854c0de..704ad6d3fe 100644 --- a/tensorflow/python/training/adam.py +++ b/tensorflow/python/training/adam.py @@ -51,7 +51,7 @@ class AdamOptimizer(optimizer.Optimizer): described at the end of section2 of the paper: $$t := t + 1$$ - $$lr_t := \text{learning\_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$ + $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$ $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$ $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$ -- GitLab From a4667873d90cf2c8530e8a8058e7d1c065639ce8 Mon Sep 17 00:00:00 2001 From: Seb Bro Date: Sat, 11 Aug 2018 22:23:21 +0200 Subject: [PATCH 039/598] Fix formula. --- tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt index b90f5473c8..6341eeda32 100644 --- a/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt @@ -82,7 +82,7 @@ END } summary: "Update \'*var\' according to the Adam algorithm." description: < Date: Sat, 11 Aug 2018 22:23:52 +0200 Subject: [PATCH 040/598] Fix formula. --- .../core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt index ad0aeac004..2dcd136ae3 100644 --- a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt @@ -76,7 +76,7 @@ END } summary: "Update \'*var\' according to the Adam algorithm." 
description: < Date: Sat, 11 Aug 2018 22:26:40 +0200 Subject: [PATCH 041/598] Fix formula and text rendering. --- tensorflow/contrib/optimizer_v2/adam.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tensorflow/contrib/optimizer_v2/adam.py b/tensorflow/contrib/optimizer_v2/adam.py index 631d4f44df..04b1552b61 100644 --- a/tensorflow/contrib/optimizer_v2/adam.py +++ b/tensorflow/contrib/optimizer_v2/adam.py @@ -40,15 +40,14 @@ class AdamOptimizer(optimizer_v2.OptimizerV2): Initialization: - $$m_0 := 0 (Initialize initial 1st moment vector)$$ - $$v_0 := 0 (Initialize initial 2nd moment vector)$$ - $$t := 0 (Initialize timestep)$$ - + $$m_0 := 0 \text{(Initialize initial 1st moment vector)}$$ + $$v_0 := 0 \text{(Initialize initial 2nd moment vector)}$$ + $$t := 0 \text{(Initialize timestep)}$$ The update rule for `variable` with gradient `g` uses an optimization described at the end of section2 of the paper: $$t := t + 1$$ - $$lr_t := \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$ + $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$ $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$ $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$ -- GitLab From 135ac89cae38464a9c6ea21af244e4a1bda255ed Mon Sep 17 00:00:00 2001 From: Guozhong Zhuang Date: Mon, 13 Aug 2018 15:52:43 -0700 Subject: [PATCH 042/598] enable pooling3D op --- tensorflow/core/graph/mkl_layout_pass.cc | 28 +++- .../core/graph/mkl_tfconversion_pass.cc | 12 +- tensorflow/core/kernels/mkl_aggregate_ops.cc | 20 ++- tensorflow/core/kernels/mkl_avgpooling_op.cc | 51 ++++--- tensorflow/core/kernels/mkl_maxpooling_op.cc | 59 +++++--- .../core/kernels/mkl_pooling_ops_common.cc | 129 ++++++++++++----- .../core/kernels/mkl_pooling_ops_common.h | 132 +++++++++++++----- tensorflow/core/ops/nn_ops.cc | 98 +++++++++++++ tensorflow/core/util/mkl_util.h | 114 +++++++++++++-- 9 files changed, 519 insertions(+), 124 deletions(-) diff --git 
a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc index 5683944e46..30e48d3860 100644 --- a/tensorflow/core/graph/mkl_layout_pass.cc +++ b/tensorflow/core/graph/mkl_layout_pass.cc @@ -334,6 +334,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass { rinfo_.push_back({csinfo_.conv2d_grad_input, mkl_op_registry::GetMklOpName(csinfo_.conv2d_grad_input), CopyAttrsConv2D, AlwaysRewrite, nullptr}); + rinfo_.push_back({csinfo_.fused_batch_norm, mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm), CopyAttrsFusedBatchNorm, AlwaysRewrite, nullptr}); @@ -546,14 +547,14 @@ class MklLayoutRewritePass : public GraphOptimizationPass { // If Op has been specifically assigned to a non-CPU device, then No. if (!n->assigned_device_name().empty() && - !str_util::StrContains(n->assigned_device_name(),kCPUDeviceSubStr)) { + !str_util::StrContains(n->assigned_device_name(), kCPUDeviceSubStr)) { result = false; reason = "Op has been assigned a runtime device that is not CPU."; } // If user has specifically assigned this op to a non-CPU device, then No. 
if (!n->def().device().empty() && - !str_util::StrContains(n->def().device(),kCPUDeviceSubStr)) { + !str_util::StrContains(n->def().device(), kCPUDeviceSubStr)) { result = false; reason = "User has assigned a device that is not CPU."; } @@ -2408,6 +2409,8 @@ class MklLayoutRewritePass : public GraphOptimizationPass { csinfo_.addn = "AddN"; csinfo_.avg_pool = "AvgPool"; csinfo_.avg_pool_grad = "AvgPoolGrad"; + csinfo_.avg_pool3d = "AvgPool3D"; + csinfo_.avg_pool3d_grad = "AvgPool3DGrad"; csinfo_.bias_add = "BiasAdd"; csinfo_.bias_add_grad = "BiasAddGrad"; csinfo_.concat = "Concat"; @@ -2426,6 +2429,8 @@ class MklLayoutRewritePass : public GraphOptimizationPass { csinfo_.matmul = "MatMul"; csinfo_.max_pool = "MaxPool"; csinfo_.max_pool_grad = "MaxPoolGrad"; + csinfo_.max_pool3d = "MaxPool3D"; + csinfo_.max_pool3d_grad = "MaxPool3DGrad"; csinfo_.mkl_conv2d = "_MklConv2D"; csinfo_.mkl_conv2d_grad_input = "_MklConv2DBackpropInput"; csinfo_.mkl_conv2d_grad_filter = "_MklConv2DBackpropFilter"; @@ -2460,6 +2465,12 @@ class MklLayoutRewritePass : public GraphOptimizationPass { rinfo_.push_back({csinfo_.avg_pool_grad, mkl_op_registry::GetMklOpName(csinfo_.avg_pool_grad), CopyAttrsPooling, AlwaysRewrite}); + rinfo_.push_back({csinfo_.avg_pool3d, + mkl_op_registry::GetMklOpName(csinfo_.avg_pool3d), + CopyAttrsPooling, AlwaysRewrite}); + rinfo_.push_back({csinfo_.avg_pool3d_grad, + mkl_op_registry::GetMklOpName(csinfo_.avg_pool3d_grad), + CopyAttrsPooling, AlwaysRewrite}); rinfo_.push_back({csinfo_.concat, mkl_op_registry::GetMklOpName(csinfo_.concat), CopyAttrsConcat, AlwaysRewrite}); @@ -2501,7 +2512,12 @@ class MklLayoutRewritePass : public GraphOptimizationPass { rinfo_.push_back({csinfo_.max_pool_grad, mkl_op_registry::GetMklOpName(csinfo_.max_pool_grad), CopyAttrsPooling, MaxpoolGradRewrite}); - + rinfo_.push_back({csinfo_.max_pool3d, + mkl_op_registry::GetMklOpName(csinfo_.max_pool3d), + CopyAttrsPooling, NonDepthBatchWisePoolRewrite}); + 
rinfo_.push_back({csinfo_.max_pool3d_grad, + mkl_op_registry::GetMklOpName(csinfo_.max_pool3d_grad), + CopyAttrsPooling, AlwaysRewrite}); rinfo_.push_back({csinfo_.maximum, mkl_op_registry::GetMklOpName(csinfo_.maximum), CopyAttrsDataType, AlwaysRewrite}); @@ -2538,6 +2554,8 @@ class MklLayoutRewritePass : public GraphOptimizationPass { // Add info about which ops to add workspace edge to and the slots. wsinfo_.push_back({csinfo_.lrn, csinfo_.lrn_grad, 0, 2, 1, 3}); wsinfo_.push_back({csinfo_.max_pool, csinfo_.max_pool_grad, 0, 1, 1, 3}); + wsinfo_.push_back + ({csinfo_.max_pool3d, csinfo_.max_pool3d_grad, 0, 1, 1, 3}); // Add a rule for merging nodes minfo_.push_back({csinfo_.conv2d, csinfo_.bias_add, @@ -2605,6 +2623,8 @@ class MklLayoutRewritePass : public GraphOptimizationPass { string add; string avg_pool; string avg_pool_grad; + string avg_pool3d; + string avg_pool3d_grad; string bias_add; string bias_add_grad; string concat; @@ -2622,6 +2642,8 @@ class MklLayoutRewritePass : public GraphOptimizationPass { string matmul; string max_pool; string max_pool_grad; + string max_pool3d; + string max_pool3d_grad; string maximum; string mkl_conv2d; string mkl_conv2d_grad_input; diff --git a/tensorflow/core/graph/mkl_tfconversion_pass.cc b/tensorflow/core/graph/mkl_tfconversion_pass.cc index aa39af637f..b67a321fc1 100644 --- a/tensorflow/core/graph/mkl_tfconversion_pass.cc +++ b/tensorflow/core/graph/mkl_tfconversion_pass.cc @@ -175,7 +175,11 @@ Status MklToTfConversionPass::InsertConversionNodeOnEdge( .Finalize(&**g, &conversion_node)); CHECK_NOTNULL(conversion_node); - if (GetNodeAttr(src->def(), "data_format", &data_format) == Status::OK()) { + // TODO(Intel-tf) MklToTf accepts only NHWC or NCHW, but doesn't seem to be + // using data_format. This code might be redundant. 
+ if (GetNodeAttr(src->def(), "data_format", &data_format) == Status::OK() && + (data_format == ToString(FORMAT_NHWC) || + data_format == ToString(FORMAT_NCHW))) { conversion_node->AddAttr("data_format", data_format); } @@ -254,9 +258,13 @@ Status MklToTfConversionPass::InsertInputConversionNode( } } + // TODO(Intel-tf) MklInputConversion accepts only NHWC or NCHW, but doesn't + // seem to be using data_format. This code might be redundant. string data_format; if (GetNodeAttr(edges[0]->src()->def(), "data_format", &data_format) == - Status::OK()) { + Status::OK() && + (data_format == ToString(FORMAT_NHWC) || + data_format == ToString(FORMAT_NCHW))) { conversion_node->AddAttr("data_format", data_format); } diff --git a/tensorflow/core/kernels/mkl_aggregate_ops.cc b/tensorflow/core/kernels/mkl_aggregate_ops.cc index 28edf51546..20aa1f7ea1 100644 --- a/tensorflow/core/kernels/mkl_aggregate_ops.cc +++ b/tensorflow/core/kernels/mkl_aggregate_ops.cc @@ -392,16 +392,28 @@ class MklAddNOp : public OpKernel { memory::format src1_mkl_data_format = src1_mkl_shape.GetTfDataFormat(); auto src1_tf_data_format = MklDnnDataFormatToTFDataFormat(src1_mkl_data_format); - auto src2_dims = - TFShapeToMklDnnDimsInNCHW(src2_tensor.shape(), src1_tf_data_format); + memory::dims src2_dims; + if (src2_tensor.dims() == 4) { + src2_dims = TFShapeToMklDnnDimsInNCHW(src2_tensor.shape(), + src1_tf_data_format); + } else { + src2_dims = TFShapeToMklDnnDimsInNCDHW(src2_tensor.shape(), + src1_tf_data_format); + } md2 = memory::desc(src2_dims, MklDnnType(), src1_mkl_data_format); } else if (input2_in_mkl_format && !input1_in_mkl_format) { // Same comment as above. 
memory::format src2_mkl_data_format = src2_mkl_shape.GetTfDataFormat(); auto src2_tf_data_format = MklDnnDataFormatToTFDataFormat(src2_mkl_data_format); - auto src1_dims = - TFShapeToMklDnnDimsInNCHW(src1_tensor.shape(), src2_tf_data_format); + memory::dims src1_dims; + if (src1_tensor.dims() == 4) { + src1_dims = TFShapeToMklDnnDimsInNCHW(src1_tensor.shape(), + src2_tf_data_format); + } else { + src1_dims = TFShapeToMklDnnDimsInNCDHW(src1_tensor.shape(), + src2_tf_data_format); + } md1 = memory::desc(src1_dims, MklDnnType(), src2_mkl_data_format); md2 = src2_mkl_shape.GetMklLayout(); diff --git a/tensorflow/core/kernels/mkl_avgpooling_op.cc b/tensorflow/core/kernels/mkl_avgpooling_op.cc index 969baecc51..749b2a1838 100644 --- a/tensorflow/core/kernels/mkl_avgpooling_op.cc +++ b/tensorflow/core/kernels/mkl_avgpooling_op.cc @@ -453,6 +453,8 @@ class MklAvgPoolingOp : public MklPoolingForwardOpBase { // initialize variables for the pooling op MklPoolParameters pool_params; + // check whether pooling is 2D or 3D + bool isPool2D = (this->ksize_.size() == 4); // Get the input tensor and initialize the pooling parameters TensorShape input_tensor_shape = input_tensor.shape(); this->InitMklPoolParameters(context, &pool_params, dnn_shape_input, @@ -473,23 +475,22 @@ class MklAvgPoolingOp : public MklPoolingForwardOpBase { } memory::dims filter_dims, strides, padding_left, padding_right; + // Get src/filter/stride/padding information this->PoolParamsToDims(&pool_params, &filter_dims, &strides, - &padding_left, &padding_right); + &padding_left, &padding_right, isPool2D); // Get the input memory descriptor - memory::desc input_md = - dnn_shape_input.IsMklTensor() - ? dnn_shape_input.GetMklLayout() - : memory::desc(TFShapeToMklDnnDimsInNCHW(input_tensor_shape, - this->data_format_tf_), - MklDnnType(), this->data_format_mkldnn_); - - // Get src/filter/stride/padding information memory::dims src_dims = dnn_shape_input.IsMklTensor() ? 
dnn_shape_input.GetSizesAsMklDnnDims() - : TFShapeToMklDnnDimsInNCHW(input_tensor.shape(), - this->data_format_tf_); + : isPool2D ? TFShapeToMklDnnDimsInNCHW(input_tensor.shape(), + this->data_format_tf_) + : TFShapeToMklDnnDimsInNCDHW(input_tensor.shape(), + this->data_format_tf_); + memory::desc input_md = dnn_shape_input.IsMklTensor() + ? dnn_shape_input.GetMklLayout() + : memory::desc(src_dims, MklDnnType(), + this->data_format_mkldnn_); // Get an average pooling primitive from the op pool MklPoolingFwdPrimitive* pooling_fwd = nullptr; @@ -562,24 +563,30 @@ class MklAvgPoolingGradOp : public MklPoolingBackwardOpBase { for (int i = 0; i < orig_input_tensor.NumElements(); i++) { orig_input_shape.AddDim(shape_vec(i)); } + + bool isPool2D = (this->ksize_.size() == 4); this->InitMklPoolParameters(context, &pool_params, orig_input_mkl_shape, orig_input_shape); memory::dims filter_dims, strides, padding_left, padding_right; this->PoolParamsToDims(&pool_params, &filter_dims, &strides, - &padding_left, &padding_right); + &padding_left, &padding_right, isPool2D); memory::dims orig_input_dims_mkl_order = orig_input_mkl_shape.IsMklTensor() ? orig_input_mkl_shape.GetSizesAsMklDnnDims() - : TFShapeToMklDnnDimsInNCHW(orig_input_shape, - this->data_format_tf_); + : isPool2D ? TFShapeToMklDnnDimsInNCHW(orig_input_shape, + this->data_format_tf_) + : TFShapeToMklDnnDimsInNCDHW(orig_input_shape, + this->data_format_tf_); memory::dims diff_dst_dims = grad_mkl_shape.IsMklTensor() ? grad_mkl_shape.GetSizesAsMklDnnDims() - : TFShapeToMklDnnDimsInNCHW(grad_tensor.shape(), - this->data_format_tf_); + : isPool2D ? 
TFShapeToMklDnnDimsInNCHW(grad_tensor.shape(), + this->data_format_tf_) + : TFShapeToMklDnnDimsInNCDHW(grad_tensor.shape(), + this->data_format_tf_); memory::dims output_dims_mkl_order; this->GetOutputDims(pool_params, &output_dims_mkl_order); @@ -664,6 +671,18 @@ class MklAvgPoolingGradOp : public MklPoolingBackwardOpBase { } }; // MklAvgPoolingGradOp +REGISTER_KERNEL_BUILDER(Name("_MklAvgPool3D") + .Device(DEVICE_CPU) + .TypeConstraint("T") + .Label(mkl_op_registry::kMklOpLabel), + MklAvgPoolingOp); + +REGISTER_KERNEL_BUILDER(Name("_MklAvgPool3DGrad") + .Device(DEVICE_CPU) + .TypeConstraint("T") + .Label(mkl_op_registry::kMklOpLabel), + MklAvgPoolingGradOp); + #endif // INTEL_MKL_ML_ONLY REGISTER_KERNEL_BUILDER(Name("_MklAvgPool") diff --git a/tensorflow/core/kernels/mkl_maxpooling_op.cc b/tensorflow/core/kernels/mkl_maxpooling_op.cc index e149f003e5..aa7c0d9b7f 100644 --- a/tensorflow/core/kernels/mkl_maxpooling_op.cc +++ b/tensorflow/core/kernels/mkl_maxpooling_op.cc @@ -524,6 +524,8 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase { // initialize variables for the pooling op MklPoolParameters pool_params; + // check whether pooling is 2D or 3D + bool isPool2D = (this->ksize_.size() == 4); // Get the input tensor and initialize the pooling parameters TensorShape input_tensor_shape = input_tensor.shape(); this->InitMklPoolParameters(context, &pool_params, dnn_shape_input, @@ -547,20 +549,26 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase { memory::desc input_md = dnn_shape_input.IsMklTensor() ? dnn_shape_input.GetMklLayout() - : memory::desc(TFShapeToMklDnnDimsInNCHW(input_tensor_shape, - this->data_format_tf_), - MklDnnType(), this->data_format_mkldnn_); + : isPool2D ? 
memory::desc( + TFShapeToMklDnnDimsInNCHW(input_tensor_shape, + this->data_format_tf_), + MklDnnType(), this->data_format_mkldnn_) + : memory::desc( + TFShapeToMklDnnDimsInNCDHW( + input_tensor_shape, this->data_format_tf_), + MklDnnType(), this->data_format_mkldnn_); // Get src/filter/stride/padding information memory::dims src_dims = dnn_shape_input.IsMklTensor() ? dnn_shape_input.GetSizesAsMklDnnDims() - : TFShapeToMklDnnDimsInNCHW(input_tensor.shape(), - this->data_format_tf_); - + : isPool2D ? TFShapeToMklDnnDimsInNCHW(input_tensor.shape(), + this->data_format_tf_) + : TFShapeToMklDnnDimsInNCDHW(input_tensor.shape(), + this->data_format_tf_); memory::dims filter_dims, strides, padding_left, padding_right; this->PoolParamsToDims(&pool_params, &filter_dims, &strides, - &padding_left, &padding_right); + &padding_left, &padding_right, isPool2D); // Get a pooling op from the cached pool MklPoolingFwdPrimitive* pooling_fwd = nullptr; @@ -663,23 +671,30 @@ class MklMaxPoolingGradOp : public MklPoolingBackwardOpBase { MklPoolParameters pool_params; TensorShape orig_input_shape = orig_input_tensor.shape(); + + bool isPool2D = (this->ksize_.size() == 4); this->InitMklPoolParameters(context, &pool_params, orig_input_mkl_shape, orig_input_shape); memory::dims filter_dims, strides, padding_left, padding_right; this->PoolParamsToDims(&pool_params, &filter_dims, &strides, - &padding_left, &padding_right); + &padding_left, &padding_right, isPool2D); - memory::dims diff_dst_dims = - grad_mkl_shape.IsMklTensor() - ? grad_mkl_shape.GetSizesAsMklDnnDims() - : TFShapeToMklDnnDimsInNCHW(grad_tensor.shape(), - this->data_format_tf_); memory::dims orig_input_dims_mkl_order = orig_input_mkl_shape.IsMklTensor() ? orig_input_mkl_shape.GetSizesAsMklDnnDims() - : TFShapeToMklDnnDimsInNCHW(orig_input_shape, - this->data_format_tf_); + : isPool2D ? 
TFShapeToMklDnnDimsInNCHW(orig_input_shape, + this->data_format_tf_) + : TFShapeToMklDnnDimsInNCDHW(orig_input_shape, + this->data_format_tf_); + + memory::dims diff_dst_dims = + grad_mkl_shape.IsMklTensor() + ? grad_mkl_shape.GetSizesAsMklDnnDims() + : isPool2D ? TFShapeToMklDnnDimsInNCHW(grad_tensor.shape(), + this->data_format_tf_) + : TFShapeToMklDnnDimsInNCDHW(grad_tensor.shape(), + this->data_format_tf_); memory::dims output_dims_mkl_order; this->GetOutputDims(pool_params, &output_dims_mkl_order); @@ -715,7 +730,7 @@ class MklMaxPoolingGradOp : public MklPoolingBackwardOpBase { void* ws_data = static_cast( const_cast(workspace_tensor.flat().data())); - ; + auto ws_md = pooling_bwd->GetPoolingFwdPd()->workspace_primitive_desc().desc(); if (ws_md.data.format != pooling_bwd->GetWorkspaceFormat()) { @@ -817,6 +832,18 @@ class MklMaxPoolingGradOp : public MklPoolingBackwardOpBase { } }; // MklMaxPoolingGradOp +REGISTER_KERNEL_BUILDER(Name("_MklMaxPool3D") + .Device(DEVICE_CPU) + .TypeConstraint("T") + .Label(mkl_op_registry::kMklOpLabel), + MklMaxPoolingOp); + +REGISTER_KERNEL_BUILDER(Name("_MklMaxPool3DGrad") + .Device(DEVICE_CPU) + .TypeConstraint("T") + .Label(mkl_op_registry::kMklOpLabel), + MklMaxPoolingGradOp); + #endif // INTEL_MKL_ML_ONLY REGISTER_KERNEL_BUILDER(Name("_MklMaxPool") diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.cc b/tensorflow/core/kernels/mkl_pooling_ops_common.cc index d7ad3f9dcd..5d02ceea12 100644 --- a/tensorflow/core/kernels/mkl_pooling_ops_common.cc +++ b/tensorflow/core/kernels/mkl_pooling_ops_common.cc @@ -24,7 +24,7 @@ limitations under the License. namespace tensorflow { -#ifndef INTEL_MKL_ML +#ifndef INTEL_MKL_ML_ONLY using mkldnn::pooling_avg; using mkldnn::pooling_avg_exclude_padding; @@ -46,9 +46,10 @@ void MklPoolingFwdPrimitive::Setup(const MklPoolingParams& fwdParams) { // so src format is currently hard-coded. 
// A utility function is used to do this, // which may be broken with future CPU architectures + bool is_2d = (fwdParams.src_dims.size() == 4); context_.src_md.reset( new memory::desc({fwdParams.src_dims}, MklDnnType(), - get_desired_format(fwdParams.src_dims[1]))); + get_desired_format(fwdParams.src_dims[1], is_2d))); context_.dst_md.reset(new memory::desc({fwdParams.dst_dims}, MklDnnType(), memory::format::any)); @@ -61,7 +62,7 @@ void MklPoolingFwdPrimitive::Setup(const MklPoolingParams& fwdParams) { new pooling_forward::primitive_desc(*context_.fwd_desc, cpu_engine_)); // store expected primitive format - context_.src_fmt = get_desired_format(fwdParams.src_dims[1]); + context_.src_fmt = get_desired_format(fwdParams.src_dims[1], is_2d); context_.dst_fmt = static_cast( context_.fwd_pd.get()->dst_primitive_desc().desc().data.format); @@ -126,12 +127,14 @@ void MklPoolingBwdPrimitive::Setup(const MklPoolingParams& bwdParams) { } context_.alg_kind = bwdParams.alg_kind; + // check whether it is 2d or 3d + bool is_2d = (bwdParams.dst_dims.size() == 4); // Create memory desc context_.diff_src_md.reset(new memory::desc( {bwdParams.src_dims}, MklDnnType(), memory::format::any)); context_.diff_dst_md.reset( new memory::desc({bwdParams.dst_dims}, MklDnnType(), - get_desired_format(bwdParams.dst_dims[1]))); + get_desired_format(bwdParams.dst_dims[1], is_2d))); context_.bwd_desc.reset(new pooling_backward::desc( bwdParams.alg_kind, *context_.diff_src_md, *context_.diff_dst_md, bwdParams.strides, bwdParams.filter_dims, bwdParams.padding_left, @@ -151,7 +154,7 @@ void MklPoolingBwdPrimitive::Setup(const MklPoolingParams& bwdParams) { // store expected primitive format context_.diff_src_fmt = static_cast( context_.bwd_pd.get()->diff_src_primitive_desc().desc().data.format); - context_.diff_dst_fmt = get_desired_format(bwdParams.dst_dims[1]); + context_.diff_dst_fmt = get_desired_format(bwdParams.dst_dims[1], is_2d); // create MKL-DNN internal memory object with dummy data 
context_.diff_src_mem.reset( @@ -165,7 +168,7 @@ void MklPoolingBwdPrimitive::Setup(const MklPoolingParams& bwdParams) { if (bwdParams.alg_kind == pooling_max) { auto ws_pd = context_.fwd_pd.get()->workspace_primitive_desc().desc().data; context_.ws_dims.assign(ws_pd.dims, ws_pd.dims + ws_pd.ndims); - context_.ws_fmt = get_desired_format(context_.ws_dims[1]); + context_.ws_fmt = get_desired_format(context_.ws_dims[1], is_2d); context_.ws_dt = static_cast(ws_pd.data_type); context_.ws_mem.reset(new memory( {{{context_.ws_dims}, context_.ws_dt, context_.ws_fmt}, cpu_engine}, @@ -211,13 +214,22 @@ void MklPoolParameters::Init(OpKernelContext* context, const std::vector& stride, Padding padding, TensorFormat data_format, const TensorShape& tensor_in_shape) { - // For maxpooling, tensor_in should have 4 dimensions. - OP_REQUIRES(context, tensor_in_shape.dims() == 4, - errors::InvalidArgument("tensor_in must be 4-dimensional")); + // For maxpooling, tensor_in should have 4 or 5 dimensions. + OP_REQUIRES(context, + tensor_in_shape.dims() == 4 || tensor_in_shape.dims() == 5, + errors::InvalidArgument("tensor_in must be 4 or 5-dimensional")); depth = GetTensorDim(tensor_in_shape, data_format, 'C'); - tensor_in_cols = GetTensorDim(tensor_in_shape, data_format, 'W'); - tensor_in_rows = GetTensorDim(tensor_in_shape, data_format, 'H'); + if (tensor_in_shape.dims() == 4) { + // Pool2D + tensor_in_cols = GetTensorDim(tensor_in_shape, data_format, 'W'); + tensor_in_rows = GetTensorDim(tensor_in_shape, data_format, 'H'); + } else { + // Pool3D + tensor_in_planes = GetTensorDim(tensor_in_shape, data_format, '0'); + tensor_in_rows = GetTensorDim(tensor_in_shape, data_format, '1'); + tensor_in_cols = GetTensorDim(tensor_in_shape, data_format, '2'); + } tensor_in_batch = GetTensorDim(tensor_in_shape, data_format, 'N'); Init(context, ksize, stride, padding, data_format); @@ -246,10 +258,20 @@ void MklPoolParameters::Init(OpKernelContext* context, TensorFormat data_format, const 
MklDnnShape* mklInputShape) { // Get the input sizes - depth = mklInputShape->GetDimension('C'); - tensor_in_cols = mklInputShape->GetDimension('W'); - tensor_in_rows = mklInputShape->GetDimension('H'); - tensor_in_batch = mklInputShape->GetDimension('N'); + if (ksize.size() == 4) { + // Pool2D + depth = mklInputShape->GetDimension('C'); + tensor_in_cols = mklInputShape->GetDimension('W'); + tensor_in_rows = mklInputShape->GetDimension('H'); + tensor_in_batch = mklInputShape->GetDimension('N'); + } else { + // Pool3D + depth = mklInputShape->GetDimension3D('C'); + tensor_in_cols = mklInputShape->GetDimension3D('W'); + tensor_in_rows = mklInputShape->GetDimension3D('H'); + tensor_in_planes = mklInputShape->GetDimension3D('D'); + tensor_in_batch = mklInputShape->GetDimension3D('N'); + } Init(context, ksize, stride, padding, data_format); } @@ -262,25 +284,58 @@ void MklPoolParameters::Init(OpKernelContext* context, // Get the data format this->data_format = data_format; - // Get the output sizes - window_rows = GetTensorDim(ksize, data_format, 'H'); - window_cols = GetTensorDim(ksize, data_format, 'W'); - depth_window = GetTensorDim(ksize, data_format, 'C'); - - // Get the strides - row_stride = GetTensorDim(stride, data_format, 'H'); - col_stride = GetTensorDim(stride, data_format, 'W'); - depth_stride = GetTensorDim(stride, data_format, 'C'); + bool isPool2D = (ksize.size() == 4); + if (isPool2D) { + // Pool2D + // Get the output sizes + window_rows = GetTensorDim(ksize, data_format, 'H'); + window_cols = GetTensorDim(ksize, data_format, 'W'); + depth_window = GetTensorDim(ksize, data_format, 'C'); + + // Get the strides + row_stride = GetTensorDim(stride, data_format, 'H'); + col_stride = GetTensorDim(stride, data_format, 'W'); + depth_stride = GetTensorDim(stride, data_format, 'C'); + + // We only support 2D pooling across width/height and depthwise + // pooling, not a combination. 
+ OP_REQUIRES(context, + (depth_window == 1 || (window_rows == 1 && window_cols == 1)), + errors::Unimplemented( + "MaxPooling supports exactly one of pooling across depth " + "or pooling across width/height.")); + } else { + // Pool3D + // Get the output sizes + window_planes = GetTensorDim(ksize, data_format, '0'); + window_rows = GetTensorDim(ksize, data_format, '1'); + window_cols = GetTensorDim(ksize, data_format, '2'); + depth_window = GetTensorDim(ksize, data_format, 'C'); + + // Get the strides + planes_stride = GetTensorDim(stride, data_format, '0'); + row_stride = GetTensorDim(stride, data_format, '1'); + col_stride = GetTensorDim(stride, data_format, '2'); + depth_stride = GetTensorDim(stride, data_format, 'C'); + + // We only support 3D pooling across depth/width/height and depthwise + // pooling, not a combination. + OP_REQUIRES(context, + (depth_window == 1 || + (window_rows == 1 && window_cols == 1 && window_planes == 1)), + errors::Unimplemented( + "AvgPooling3D supports exactly one of pooling across depth " + "or pooling across depth/width/height.")); + } - // We only support 2D pooling across width/height and depthwise - // pooling, not a combination. 
- OP_REQUIRES(context, - (depth_window == 1 || (window_rows == 1 && window_cols == 1)), - errors::Unimplemented( - "MaxPooling supports exactly one of pooling across depth " - "or pooling across width/height.")); + if (depth_window == 1) { // we are pooling in the D (Pool3D only), H and W + if (!isPool2D) { + OP_REQUIRES_OK( + context, GetWindowedOutputSizeVerbose(tensor_in_planes, window_planes, + planes_stride, padding, + &out_planes, &pad_P1, &pad_P2)); + } - if (depth_window == 1) { // we are pooling in the H and W OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose( tensor_in_rows, window_rows, row_stride, padding, &out_height, &pad_top, &pad_bottom)); @@ -290,7 +345,14 @@ void MklPoolParameters::Init(OpKernelContext* context, padding, &out_width, &pad_left, &pad_right)); #ifndef INTEL_MKL_ML_ONLY // TF can work with int64, but mkldnn only supports int32 - // Fail if the height or width are greater than MAX_INT + // Fail if the depth, height or width are greater than MAX_INT + // We check depth only for 3D pooling case + + if (!isPool2D) { + OP_REQUIRES(context, + FastBoundsCheck(out_planes, std::numeric_limits::max()), + errors::InvalidArgument("output depth/planes is too large")); + } OP_REQUIRES(context, FastBoundsCheck(out_height, std::numeric_limits::max()), @@ -299,7 +361,6 @@ void MklPoolParameters::Init(OpKernelContext* context, OP_REQUIRES(context, FastBoundsCheck(out_width, std::numeric_limits::max()), errors::InvalidArgument("output width is too large")); - #endif out_depth = depth; // output will have the same depth as the input } else { // we are pooling in the depth dimension diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.h b/tensorflow/core/kernels/mkl_pooling_ops_common.h index ec7af5092d..ea7458062c 100644 --- a/tensorflow/core/kernels/mkl_pooling_ops_common.h +++ b/tensorflow/core/kernels/mkl_pooling_ops_common.h @@ -19,6 +19,7 @@ limitations under the License. 
#ifdef INTEL_MKL #include #include +#include #include "tensorflow/core/util/mkl_util.h" #include "tensorflow/core/util/padding.h" @@ -32,7 +33,7 @@ using mkldnn::stream; namespace tensorflow { -#ifndef INTEL_MKL_ML +#ifndef INTEL_MKL_ML_ONLY using mkldnn::memory; using mkldnn::pooling_avg; @@ -357,22 +358,28 @@ typedef Eigen::ThreadPoolDevice CPUDevice; struct MklPoolParameters { int depth; + int tensor_in_planes; // Pool3D int tensor_in_cols; int tensor_in_rows; int tensor_in_batch; + int window_planes; // Pool3D int window_rows; int window_cols; int depth_window; + int planes_stride; // Pool3D int row_stride; int col_stride; int depth_stride; + int64 out_planes; // Pool3D int64 out_height; int64 out_width; int out_depth; + int64 pad_P1; // Pool3D + int64 pad_P2; // Pool3D int64 pad_left; int64 pad_right; int64 pad_top; @@ -382,18 +389,24 @@ struct MklPoolParameters { TensorFormat data_format; MklPoolParameters() : depth(0), + tensor_in_planes(0), tensor_in_cols(0), tensor_in_rows(0), tensor_in_batch(0), + window_planes(0), window_rows(0), window_cols(0), depth_window(0), + planes_stride(0), row_stride(0), col_stride(0), depth_stride(0), + out_planes(0), out_height(0), out_width(0), out_depth(0), + pad_P1(0), + pad_P2(0), pad_left(0), pad_right(0), pad_top(0), @@ -433,20 +446,22 @@ class MklPoolingOpBase : public OpKernel { OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); OP_REQUIRES(context, FormatFromString(data_format, &this->data_format_tf_), errors::InvalidArgument("Invalid data format")); - this->data_format_mkldnn_ = - TFDataFormatToMklDnnDataFormat(this->data_format_tf_); OP_REQUIRES_OK(context, context->GetAttr("ksize", &this->ksize_)); - OP_REQUIRES(context, this->ksize_.size() == 4, + OP_REQUIRES(context, this->ksize_.size() == 4 || this->ksize_.size() == 5, errors::InvalidArgument("Sliding window ksize field must " - "specify 4 dimensions")); + "specify 4 or 5 dimensions")); OP_REQUIRES_OK(context, context->GetAttr("strides", 
&this->stride_)); - OP_REQUIRES(context, this->stride_.size() == 4, + OP_REQUIRES(context, this->stride_.size() == 4 || this->stride_.size() == 5, errors::InvalidArgument("Sliding window strides field must " - "specify 4 dimensions")); + "specify 4 or 5 dimensions")); OP_REQUIRES_OK(context, context->GetAttr("padding", &this->padding_)); OP_REQUIRES(context, this->ksize_[0] == 1 && this->stride_[0] == 1, errors::Unimplemented("Pooling is not yet supported on the " "batch dimension.")); + bool isPool2D = (this->ksize_.size() == 4); + this->data_format_mkldnn_ = + isPool2D ? TFDataFormatToMklDnnDataFormat(this->data_format_tf_) + : TFDataFormatToMklDnn3DDataFormat(this->data_format_tf_); // We may not get this attribute for this node if it does not go through // graph rewrite pass. So we do not check for error while retrieving this @@ -457,17 +472,26 @@ class MklPoolingOpBase : public OpKernel { protected: // Calculate output shape of pooling op in MKL-DNN and TensorFlow order. - // MKL-DNN uses NCHW for output order. But TensorFlow output will be in - // NHWC or NCHW format depending on data format. Function expects - // output height and output width to have already been int32 - // bounds-checked + // MKL-DNN uses NCHW(Pool2D) or NCDHW(Pool3D) for output order. + // But TensorFlow output will be in NHWC/NCHW(Pool2D) or + // NDHWC/NCDHW(Pool3D) format depending on data format. Function expects + // output height and width to have already been int32 bounds-checked. void GetOutputDims(const MklPoolParameters& mkl_pool_params, memory::dims* output_dims_mkl_order) { - // MKL-DNN always needs output in NCHW format. - *output_dims_mkl_order = {mkl_pool_params.tensor_in_batch, - mkl_pool_params.out_depth, - static_cast(mkl_pool_params.out_height), - static_cast(mkl_pool_params.out_width)}; + if (this->ksize_.size() == 4) { + // Pooling2D: MKL-DNN always needs output in NCHW format. 
+ *output_dims_mkl_order = {mkl_pool_params.tensor_in_batch, + mkl_pool_params.out_depth, + static_cast(mkl_pool_params.out_height), + static_cast(mkl_pool_params.out_width)}; + } else { + // Pooling3D: MKL-DNN always needs output in NCDHW format. + *output_dims_mkl_order = {mkl_pool_params.tensor_in_batch, + mkl_pool_params.out_depth, + static_cast(mkl_pool_params.out_planes), + static_cast(mkl_pool_params.out_height), + static_cast(mkl_pool_params.out_width)}; + } } void InitMklPoolParameters(OpKernelContext* context, @@ -485,14 +509,34 @@ class MklPoolingOpBase : public OpKernel { void PoolParamsToDims(const MklPoolParameters* pool_params, memory::dims* filter_dims, memory::dims* strides, - memory::dims* padding_left, - memory::dims* padding_right) { - *filter_dims = {pool_params->window_rows, pool_params->window_cols}; - *strides = {pool_params->row_stride, pool_params->col_stride}; - *padding_left = {static_cast(pool_params->pad_top), - static_cast(pool_params->pad_left)}; - *padding_right = {static_cast(pool_params->pad_bottom), - static_cast(pool_params->pad_right)}; + memory::dims* padding_left, memory::dims* padding_right, + bool isPool2D) { + if (isPool2D) { + // Pool2D + *filter_dims = + memory::dims({pool_params->window_rows, pool_params->window_cols}); + *strides = + memory::dims({pool_params->row_stride, pool_params->col_stride}); + *padding_left = memory::dims({static_cast(pool_params->pad_top), + static_cast(pool_params->pad_left)}); + *padding_right = memory::dims({static_cast(pool_params->pad_bottom), + static_cast(pool_params->pad_right)}); + } else { + // Pool3D + *filter_dims = + memory::dims({pool_params->window_planes, pool_params->window_rows, + pool_params->window_cols}); + *strides = + memory::dims({pool_params->planes_stride, pool_params->row_stride, + pool_params->col_stride}); + + *padding_left = memory::dims({static_cast(pool_params->pad_P1), + static_cast(pool_params->pad_top), + static_cast(pool_params->pad_left)}); + *padding_right 
= memory::dims({static_cast(pool_params->pad_P2), + static_cast(pool_params->pad_bottom), + static_cast(pool_params->pad_right)}); + } } void AllocateEmptyOutputTensor(OpKernelContext* context, @@ -556,12 +600,27 @@ class MklPoolingForwardOpBase : public MklPoolingOpBase { TensorShape input_tensor_shape = input_tensor.shape(); if (input_tensor.NumElements() != 0) { memory::desc input_md = - input_mkl_shape.IsMklTensor() - ? input_mkl_shape.GetMklLayout() - : memory::desc(TFShapeToMklDnnDimsInNCHW(input_tensor_shape, + input_mkl_shape.IsMklTensor() + ? input_mkl_shape.GetMklLayout() + : memory::desc( + (this->ksize_.size() == 4) + ? TFShapeToMklDnnDimsInNCHW(input_tensor_shape, + this->data_format_tf_) + : TFShapeToMklDnnDimsInNCDHW(input_tensor_shape, this->data_format_tf_), - MklDnnType(), this->data_format_mkldnn_); + MklDnnType(), this->data_format_mkldnn_); dnn_data_input->SetUsrMem(input_md, &input_tensor); + + if (this->ksize_.size() == 5) { + // Pool3D + std::vector mkldnn_sizes(5, -1); + mkldnn_sizes[MklDnnDims3D::Dim3d_N] = input_md.data.dims[0]; + mkldnn_sizes[MklDnnDims3D::Dim3d_C] = input_md.data.dims[1]; + mkldnn_sizes[MklDnnDims3D::Dim3d_D] = input_md.data.dims[2]; + mkldnn_sizes[MklDnnDims3D::Dim3d_H] = input_md.data.dims[3]; + mkldnn_sizes[MklDnnDims3D::Dim3d_W] = input_md.data.dims[4]; + dnn_data_input->SetOpMemDesc(mkldnn_sizes, this->data_format_mkldnn_); + } } this->InitMklPoolParameters(context, pool_params, input_mkl_shape, input_tensor_shape); @@ -593,12 +652,13 @@ class MklPoolingForwardOpBase : public MklPoolingOpBase { void SanityCheckInput(OpKernelContext* context, const Tensor& input_tensor, const MklDnnShape& input_mkl_shape) { if (!input_mkl_shape.IsMklTensor()) { - OP_REQUIRES(context, input_tensor.dims() == 4, - errors::InvalidArgument("Input must be 4-dimensional")); + OP_REQUIRES(context, input_tensor.dims() == 4 || input_tensor.dims() == 5, + errors::InvalidArgument("Input must be 4 or 5-dimensional")); } else { - 
OP_REQUIRES(context, input_mkl_shape.GetDimension() == 4, + OP_REQUIRES(context, input_mkl_shape.GetDimension() == 4 || + input_mkl_shape.GetDimension() == 5, errors::InvalidArgument("Input shape must be " - "4-dimensional")); + "4 or 5-dimensional")); } } // .Input("value: T") @@ -649,8 +709,12 @@ class MklPoolingBackwardOpBase : public MklPoolingOpBase { input_gradient_mkl_shape.IsMklTensor() ? input_gradient_mkl_shape.GetMklLayout() : memory::desc( - TFShapeToMklDnnDimsInNCHW(input_gradient_tensor.shape(), - this->data_format_tf_), + (this->ksize_.size() == 4) + ? TFShapeToMklDnnDimsInNCHW(input_gradient_tensor.shape(), + this->data_format_tf_) + : TFShapeToMklDnnDimsInNCDHW( + input_gradient_tensor.shape(), + this->data_format_tf_), MklDnnType(), this->data_format_mkldnn_); input_gradient_dnn_data->SetUsrMem(original_input_grad_md, diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc index e0f25fb4ef..a01413f2a7 100644 --- a/tensorflow/core/ops/nn_ops.cc +++ b/tensorflow/core/ops/nn_ops.cc @@ -1943,6 +1943,104 @@ NOTE Do not invoke this operator directly in Python. Graph rewrite pass is expected to invoke these operators. )doc"); +REGISTER_OP("_MklAvgPool3D") + .Input("value: T") + .Input("mkl_input: uint8") + .Output("output: T") + .Output("mkl_output: uint8") + .Attr("ksize: list(int) >= 5") + .Attr("strides: list(int) >= 5") + .Attr(GetPaddingAttrString()) + .Attr(GetConvnet3dDataFormatAttrString()) + .Attr("T: {float, half, double}") + .SetShapeFn(shape_inference::Pool3DShape) + .Doc(R"doc( +MKL version of AvgPool3D operator. Uses MKL DNN APIs to perform average pooling +on the input. + +NOTE Do not invoke this operator directly in Python. Graph rewrite pass is +expected to invoke these operators. 
+)doc"); + + +REGISTER_OP("_MklAvgPool3DGrad") + .Input("orig_input_shape: int32") + .Input("grad: T") + .Input("mkl_orig_input: uint8") + .Input("mkl_grad: uint8") + .Output("output: T") + .Output("mkl_output: uint8") + .Attr("ksize: list(int) >= 5") + .Attr("strides: list(int) >= 5") + .Attr(GetPaddingAttrString()) + .Attr(GetConvnet3dDataFormatAttrString()) + .Attr("T: {float, half, double}") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle s; + TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s)); + TF_RETURN_IF_ERROR(c->WithRank(s, 5, &s)); + c->set_output(0, s); + return Status::OK(); + }) + .Doc(R"doc( +MKL version of AvgPool3DGrad operator. Uses MKL DNN APIs to compute gradients +of AvgPool function. + +NOTE Do not invoke this operator directly in Python. Graph rewrite pass is +expected to invoke these operators. +)doc"); + +REGISTER_OP("_MklMaxPool3D") + .Input("input: T") + .Input("mkl_input: uint8") + .Output("output: T") + .Output("workspace: uint8") + .Output("mkl_output: uint8") + .Output("mkl_workspace: uint8") + .Attr("ksize: list(int) >= 5") + .Attr("strides: list(int) >= 5") + .Attr(GetPaddingAttrString()) + .Attr(GetConvnet3dDataFormatAttrString()) + .Attr("T: {half, bfloat16, float}") + .Attr("workspace_enabled: bool = false") + .SetShapeFn(shape_inference::Pool3DShape) + .Doc(R"doc( +MKL version of MaxPool3D operator. Uses MKL DNN APIs to perform average pooling +on the input. + +NOTE Do not invoke this operator directly in Python. Graph rewrite pass is +expected to invoke these operators. 
+)doc"); + +REGISTER_OP("_MklMaxPool3DGrad") + .Input("orig_input: TInput") + .Input("orig_output: TInput") + .Input("grad: T") + .Input("workspace: uint8") + .Input("mkl_orig_input: uint8") + .Input("mkl_orig_output: uint8") + .Input("mkl_grad: uint8") + .Input("mkl_workspace: uint8") + .Output("output: T") + .Output("mkl_output: uint8") + .Attr("ksize: list(int) >= 5") + .Attr("strides: list(int) >= 5") + .Attr(GetPaddingAttrString()) + .Attr(GetConvnet3dDataFormatAttrString()) + .Attr("T: {half, bfloat16, float} = DT_FLOAT") + .Attr("TInput: {half, bfloat16, float} = DT_FLOAT") + .Attr("workspace_enabled: bool = false") + .SetShapeFn([](InferenceContext* c) { + return UnchangedShapeWithRank(c, 5); + }) + .Doc(R"doc( +MKL version of MklPool3DGrad operator. Uses MKL DNN APIs to compute gradients +of MklPool function. + +NOTE Do not invoke this operator directly in Python. Graph rewrite pass is +expected to invoke these operators. +)doc"); + REGISTER_OP("_MklLRN") .Input("input: T") .Input("mkl_input: uint8") diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h index 159a787d05..79fc7500fc 100644 --- a/tensorflow/core/util/mkl_util.h +++ b/tensorflow/core/util/mkl_util.h @@ -66,7 +66,6 @@ using mkldnn::reorder; typedef unsigned int uint; #endif - namespace tensorflow { // The file contains a number of utility classes and functions used by MKL @@ -87,6 +86,17 @@ typedef enum { Dim_I = 1 } MklDnnDims; +typedef enum { + Dim3d_N = 0, + Dim3d_C = 1, + Dim3d_D = 2, + Dim3d_H = 3, + Dim3d_W = 4, + Dim3d_O = 0, + Dim3d_I = 1 +} MklDnnDims3D; + + #ifdef INTEL_MKL_ML_ONLY class MklShape { public: @@ -453,6 +463,14 @@ class MklDnnShape { return this->DimSize(index); } + inline size_t GetDimension3D(char dimension) const { + int index = GetMklDnnTensor3DDimIndex(dimension); + CHECK(index >= 0 && index < this->GetDimension()) + << "Invalid index from the dimension: " << index << ", " << dimension; + return this->DimSize(index); + } + + inline int32 
GetMklDnnTensorDimIndex(char dimension) const { switch (dimension) { case 'N': @@ -469,6 +487,24 @@ class MklDnnShape { } } + inline int32 GetMklDnnTensor3DDimIndex(char dimension) const { + switch (dimension) { + case 'N': + return MklDnnDims3D::Dim3d_N; + case 'C': + return MklDnnDims3D::Dim3d_C; + case 'D': + return MklDnnDims3D::Dim3d_D; + case 'H': + return MklDnnDims3D::Dim3d_H; + case 'W': + return MklDnnDims3D::Dim3d_W; + default: + LOG(FATAL) << "Invalid dimension: " << dimension; + return -1; // Avoid compiler warning about missing return value + } + } + inline size_t GetDimension() const { return data_.dimension_; } inline const int* GetSizes() const { return reinterpret_cast(&data_.sizes_[0]); @@ -587,15 +623,29 @@ class MklDnnShape { } inline void SetTfDimOrder(const size_t dimension, TensorFormat data_format) { - // TODO(nhasabni): Why do we restrict this to 4D? - CHECK_EQ(dimension, 4); - CHECK(dimension == data_.dimension_); - data_.map_[GetTensorDimIndex<2>(data_format, 'W')] = MklDnnDims::Dim_W; - data_.map_[GetTensorDimIndex<2>(data_format, 'H')] = MklDnnDims::Dim_H; - data_.map_[GetTensorDimIndex<2>(data_format, 'C')] = MklDnnDims::Dim_C; - data_.map_[GetTensorDimIndex<2>(data_format, 'N')] = MklDnnDims::Dim_N; + if (dimension == 5) { + CHECK(dimension == data_.dimension_); + data_.map_[GetTensorDimIndex<3>(data_format, '0')] = + MklDnnDims3D::Dim3d_D; + data_.map_[GetTensorDimIndex<3>(data_format, '1')] = + MklDnnDims3D::Dim3d_H; + data_.map_[GetTensorDimIndex<3>(data_format, '2')] = + MklDnnDims3D::Dim3d_W; + data_.map_[GetTensorDimIndex<3>(data_format, 'C')] = + MklDnnDims3D::Dim3d_C; + data_.map_[GetTensorDimIndex<3>(data_format, 'N')] = + MklDnnDims3D::Dim3d_N; + } else { + CHECK_EQ(dimension, 4); + CHECK(dimension == data_.dimension_); + data_.map_[GetTensorDimIndex<2>(data_format, 'W')] = MklDnnDims::Dim_W; + data_.map_[GetTensorDimIndex<2>(data_format, 'H')] = MklDnnDims::Dim_H; + data_.map_[GetTensorDimIndex<2>(data_format, 'C')] = 
MklDnnDims::Dim_C; + data_.map_[GetTensorDimIndex<2>(data_format, 'N')] = MklDnnDims::Dim_N; + } } + inline void SetTfDimOrder(const size_t dimension, memory::format format) { TensorFormat data_format = MklDnnDataFormatToTFDataFormat(format); SetTfDimOrder(dimension, data_format); @@ -1329,6 +1379,19 @@ memory::data_type MklDnnType() { return memory::data_type::f32; } +/// Map TensorFlow's data format into MKL-DNN 3D data format +/// @input: TensorFlow data format +/// @return: memory::format corresponding to TensorFlow data format; +/// Fails with an error if invalid data format. +inline memory::format TFDataFormatToMklDnn3DDataFormat(TensorFormat format) { + if (format == FORMAT_NHWC) + return memory::format::ndhwc; + else if (format == FORMAT_NCHW) + return memory::format::ncdhw; + TF_CHECK_OK(Status(error::Code::INVALID_ARGUMENT, "Unsupported data format")); + return memory::format::format_undef; +} + /// Map TensorFlow's data format into MKL-DNN data format /// /// @input: TensorFlow data format @@ -1350,9 +1413,9 @@ inline memory::format TFDataFormatToMklDnnDataFormat(TensorFormat format) { /// @return: Tensorflow data format corresponding to memory::format /// Fails with an error if invalid data format. inline TensorFormat MklDnnDataFormatToTFDataFormat(memory::format format) { - if (format == memory::format::nhwc) + if (format == memory::format::nhwc || format == memory::format::ndhwc) return FORMAT_NHWC; - else if (format == memory::format::nchw) + else if (format == memory::format::nchw || format == memory::format::ncdhw) return FORMAT_NCHW; TF_CHECK_OK(Status(error::Code::INVALID_ARGUMENT, "Unsupported data format")); @@ -1402,6 +1465,23 @@ inline memory::dims TFShapeToMklDnnDimsInNCHW(const TensorShape& shape, return memory::dims({n, c, h, w}); } +inline memory::dims TFShapeToMklDnnDimsInNCDHW(const TensorShape& shape, + TensorFormat format) { + // Check validity of format. 
+ CHECK_NE(TFDataFormatToMklDnn3DDataFormat(format), + memory::format::format_undef); + + int n = shape.dim_size(GetTensorDimIndex<3>(format, 'N')); + int c = shape.dim_size(GetTensorDimIndex<3>(format, 'C')); + int d = shape.dim_size(GetTensorDimIndex<3>(format, '0')); + int h = shape.dim_size(GetTensorDimIndex<3>(format, '1')); + int w = shape.dim_size(GetTensorDimIndex<3>(format, '2')); + + // MKL-DNN requires dimensions in NCDHW format. + return memory::dims({n, c, d, h, w}); +} + + /// Overloaded version of function above. Input parameters are /// self-explanatory. inline memory::dims MklDnnDimsInNCHW(const memory::dims& in_dims, @@ -1976,16 +2056,20 @@ class FactoryKeyCreator { } }; -static inline memory::format get_desired_format(int channel) { + +static inline memory::format get_desired_format(int channel, + bool is_2d = true) { memory::format fmt_desired = memory::format::any; - if (port::TestCPUFeature(port::CPUFeature::AVX512F) && (channel % 16) == 0) { - fmt_desired = memory::format::nChw16c; + if (port::TestCPUFeature(port::CPUFeature::AVX512F)) { + fmt_desired = is_2d ? memory::format::nChw16c : memory::format::nCdhw16c; } else if (port::TestCPUFeature(port::CPUFeature::AVX2) && (channel % 8) == 0) { - fmt_desired = memory::format::nChw8c; + fmt_desired = is_2d + ? memory::format::nChw8c + : memory::format::ncdhw; //not support avx2 for 3d yet. } else { - fmt_desired = memory::format::nchw; + fmt_desired = is_2d ? 
memory::format::nchw : memory::format::ncdhw; } return fmt_desired; } -- GitLab From 7b35aac2924b2dbd744ff5db9a24d8b05eb90f58 Mon Sep 17 00:00:00 2001 From: AG Ramesh Date: Mon, 13 Aug 2018 18:00:58 -0700 Subject: [PATCH 043/598] Replaced INTEL_MKL_ML with new macro INTEL_MKL_ML_ONLY --- tensorflow/core/kernels/mkl_relu_op.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/kernels/mkl_relu_op.cc b/tensorflow/core/kernels/mkl_relu_op.cc index 8db43b2a8d..99f8136f41 100644 --- a/tensorflow/core/kernels/mkl_relu_op.cc +++ b/tensorflow/core/kernels/mkl_relu_op.cc @@ -44,7 +44,7 @@ using mkldnn::memory; namespace tensorflow { -#ifndef INTEL_MKL_ML +#ifndef INTEL_MKL_ML_ONLY template class MklEltwiseFwdParams { -- GitLab From d680982f4cdf009ad3ef68abe5853448c37e1a83 Mon Sep 17 00:00:00 2001 From: AG Ramesh Date: Mon, 13 Aug 2018 18:10:11 -0700 Subject: [PATCH 044/598] Renamed INTEL_MKL_ML and added deprecated warning for INTEL_MKL_ML_ONLY --- tensorflow/contrib/cmake/CMakeLists.txt | 2 +- tensorflow/core/kernels/mkl_pooling_ops_common.cc | 2 +- tensorflow/core/kernels/mkl_pooling_ops_common.h | 2 +- tensorflow/core/util/mkl_util.h | 6 ++++++ 4 files changed, 9 insertions(+), 3 deletions(-) diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt index f6c928e2be..ebcabb4223 100644 --- a/tensorflow/contrib/cmake/CMakeLists.txt +++ b/tensorflow/contrib/cmake/CMakeLists.txt @@ -364,7 +364,7 @@ if (tensorflow_ENABLE_MKL_SUPPORT) list(APPEND tensorflow_EXTERNAL_DEPENDENCIES mkldnn_copy_shared_to_destination) include_directories(${mkldnn_INCLUDE_DIRS}) else (tensorflow_ENABLE_MKLDNN_SUPPORT) - add_definitions(-DINTEL_MKL_ML) + add_definitions(-DINTEL_MKL_ML_ONLY) endif() endif (tensorflow_ENABLE_MKL_SUPPORT) diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.cc b/tensorflow/core/kernels/mkl_pooling_ops_common.cc index d7ad3f9dcd..ab393ee5f8 100644 --- 
a/tensorflow/core/kernels/mkl_pooling_ops_common.cc +++ b/tensorflow/core/kernels/mkl_pooling_ops_common.cc @@ -24,7 +24,7 @@ limitations under the License. namespace tensorflow { -#ifndef INTEL_MKL_ML +#ifndef INTEL_MKL_ML_ONLY using mkldnn::pooling_avg; using mkldnn::pooling_avg_exclude_padding; diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.h b/tensorflow/core/kernels/mkl_pooling_ops_common.h index ec7af5092d..47a8afc081 100644 --- a/tensorflow/core/kernels/mkl_pooling_ops_common.h +++ b/tensorflow/core/kernels/mkl_pooling_ops_common.h @@ -32,7 +32,7 @@ using mkldnn::stream; namespace tensorflow { -#ifndef INTEL_MKL_ML +#ifndef INTEL_MKL_ML_ONLY using mkldnn::memory; using mkldnn::pooling_avg; diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h index 159a787d05..af0d0f3a32 100644 --- a/tensorflow/core/util/mkl_util.h +++ b/tensorflow/core/util/mkl_util.h @@ -32,6 +32,12 @@ limitations under the License. #error "at most one of INTEL_MKL_ML_ONLY and INTEL_MKL_DNN_ONLY may be defined" #endif +#ifdef INTEL_MKL_ML_ONLY +// Using pragma as #warning doesn't work with all compilers +#pragma message("Compiling for INTEL MKL ML only will be deprecated soon.") +#pragma message("Please use MKL DNN (the default option for --config=mkl)") +#endif + #ifdef INTEL_MKL_ML_ONLY #include "mkl_dnn.h" #include "mkl_dnn_types.h" -- GitLab From 285273717d17c0609c49e020b4cc9220913d3558 Mon Sep 17 00:00:00 2001 From: Ben Date: Mon, 13 Aug 2018 21:50:54 -0400 Subject: [PATCH 045/598] py37 --- tensorflow/c/eager/c_api.cc | 8 +++--- tensorflow/c/eager/c_api.h | 4 +-- tensorflow/python/eager/pywrap_tfe_src.cc | 4 +-- tensorflow/workspace.bzl | 34 +++++++++++++++-------- 4 files changed, 30 insertions(+), 20 deletions(-) diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc index dfb1c9a376..ce5a3f29a4 100644 --- a/tensorflow/c/eager/c_api.cc +++ b/tensorflow/c/eager/c_api.cc @@ -244,8 +244,8 @@ void 
TFE_ContextOptionsSetConfig(TFE_ContextOptions* options, const void* proto, } void TFE_ContextOptionsSetAsync(TFE_ContextOptions* options, - unsigned char async) { - options->async = async; + unsigned char async_) { + options->async = async_; } void TFE_ContextOptionsSetDevicePlacementPolicy( TFE_ContextOptions* options, TFE_ContextDevicePlacementPolicy policy) { @@ -253,9 +253,9 @@ void TFE_ContextOptionsSetDevicePlacementPolicy( } TF_CAPI_EXPORT extern void TFE_ContextSetAsyncForThread(TFE_Context* ctx, - unsigned char async, + unsigned char async_, TF_Status* status) { - status->status = ctx->context.SetAsyncForThread(async); + status->status = ctx->context.SetAsyncForThread(async_); } void TFE_DeleteContextOptions(TFE_ContextOptions* options) { delete options; } diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h index a0ebc6fa0a..db0079b0de 100644 --- a/tensorflow/c/eager/c_api.h +++ b/tensorflow/c/eager/c_api.h @@ -76,7 +76,7 @@ typedef enum TFE_ContextDevicePlacementPolicy { // Sets the default execution mode (sync/async). Note that this can be // overridden per thread using TFE_ContextSetAsyncForThread. TF_CAPI_EXPORT extern void TFE_ContextOptionsSetAsync(TFE_ContextOptions*, - unsigned char async); + unsigned char async_); TF_CAPI_EXPORT extern void TFE_ContextOptionsSetDevicePlacementPolicy( TFE_ContextOptions*, TFE_ContextDevicePlacementPolicy); @@ -114,7 +114,7 @@ TFE_ContextGetDevicePlacementPolicy(TFE_Context*); // Overrides the execution mode (sync/async) for the current thread. 
TF_CAPI_EXPORT extern void TFE_ContextSetAsyncForThread(TFE_Context*, - unsigned char async, + unsigned char async_, TF_Status* status); // A tensorflow.ServerDef specifies remote workers (in addition to the current diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc index 2d54555cd3..64cf36d079 100644 --- a/tensorflow/python/eager/pywrap_tfe_src.cc +++ b/tensorflow/python/eager/pywrap_tfe_src.cc @@ -216,7 +216,7 @@ bool ParseStringValue(const string& key, PyObject* py_value, TF_Status* status, #if PY_MAJOR_VERSION >= 3 if (PyUnicode_Check(py_value)) { Py_ssize_t size = 0; - char* buf = PyUnicode_AsUTF8AndSize(py_value, &size); + const char* buf = PyUnicode_AsUTF8AndSize(py_value, &size); if (buf == nullptr) return false; *value = tensorflow::StringPiece(buf, size); return true; @@ -825,7 +825,7 @@ int MaybeRaiseExceptionFromStatus(const tensorflow::Status& status, return -1; } -char* TFE_GetPythonString(PyObject* o) { +const char* TFE_GetPythonString(PyObject* o) { if (PyBytes_Check(o)) { return PyBytes_AsString(o); } diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index c21e5ebc9e..7138c0a452 100644 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -362,11 +362,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): tf_http_archive( name = "protobuf_archive", urls = [ - "https://mirror.bazel.build/github.com/google/protobuf/archive/v3.6.0.tar.gz", - "https://github.com/google/protobuf/archive/v3.6.0.tar.gz", + "https://mirror.bazel.build/github.com/google/protobuf/archive/0a59054c30e4f0ba10f10acfc1d7f3814c63e1a7.tar.gz", + "https://github.com/google/protobuf/archive/0a59054c30e4f0ba10f10acfc1d7f3814c63e1a7.tar.gz", ], - sha256 = "50a5753995b3142627ac55cfd496cebc418a2e575ca0236e29033c67bd5665f4", - strip_prefix = "protobuf-3.6.0", + sha256 = "4bb48bcc972ee9d40a8bb7e481522030b6e6771a7283ae83c896872115180d25", + strip_prefix = "protobuf-0a59054c30e4f0ba10f10acfc1d7f3814c63e1a7", ) # 
We need to import the protobuf library under the names com_google_protobuf @@ -375,21 +375,31 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): tf_http_archive( name = "com_google_protobuf", urls = [ - "https://mirror.bazel.build/github.com/google/protobuf/archive/v3.6.0.tar.gz", - "https://github.com/google/protobuf/archive/v3.6.0.tar.gz", + "https://mirror.bazel.build/github.com/google/protobuf/archive/0a59054c30e4f0ba10f10acfc1d7f3814c63e1a7.tar.gz", + "https://github.com/google/protobuf/archive/0a59054c30e4f0ba10f10acfc1d7f3814c63e1a7.tar.gz", ], - sha256 = "50a5753995b3142627ac55cfd496cebc418a2e575ca0236e29033c67bd5665f4", - strip_prefix = "protobuf-3.6.0", + sha256 = "4bb48bcc972ee9d40a8bb7e481522030b6e6771a7283ae83c896872115180d25", + strip_prefix = "protobuf-0a59054c30e4f0ba10f10acfc1d7f3814c63e1a7", ) tf_http_archive( name = "com_google_protobuf_cc", urls = [ - "https://mirror.bazel.build/github.com/google/protobuf/archive/v3.6.0.tar.gz", - "https://github.com/google/protobuf/archive/v3.6.0.tar.gz", + "https://mirror.bazel.build/github.com/google/protobuf/archive/0a59054c30e4f0ba10f10acfc1d7f3814c63e1a7.tar.gz", + "https://github.com/google/protobuf/archive/0a59054c30e4f0ba10f10acfc1d7f3814c63e1a7.tar.gz", ], - sha256 = "50a5753995b3142627ac55cfd496cebc418a2e575ca0236e29033c67bd5665f4", - strip_prefix = "protobuf-3.6.0", + sha256 = "4bb48bcc972ee9d40a8bb7e481522030b6e6771a7283ae83c896872115180d25", + strip_prefix = "protobuf-0a59054c30e4f0ba10f10acfc1d7f3814c63e1a7", + ) + + tf_http_archive( + name = "bazel_skylib", + urls = [ + "https://mirror.bazel.build/github.com/bazelbuild/bazel-skylib/archive/2169ae1c374aab4a09aa90e65efe1a3aad4e279b.tar.gz", + "https://github.com/bazelbuild/bazel-skylib/archive/2169ae1c374aab4a09aa90e65efe1a3aad4e279b.tar.gz" + ], + sha256 = "bbccf674aa441c266df9894182d80de104cabd19be98be002f6d478aaa31574d", + strip_prefix = "bazel-skylib-2169ae1c374aab4a09aa90e65efe1a3aad4e279b", ) tf_http_archive( -- GitLab From 
4aaab50552a3cdb4b785653f071ae6c7193992ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?= Date: Tue, 14 Aug 2018 12:25:18 +0800 Subject: [PATCH 046/598] CLN: fix coding style --- tensorflow/python/ops/array_grad.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/ops/array_grad.py b/tensorflow/python/ops/array_grad.py index 328b4f7d53..2beb58d534 100644 --- a/tensorflow/python/ops/array_grad.py +++ b/tensorflow/python/ops/array_grad.py @@ -763,9 +763,10 @@ def _ExtractImagePatchesGrad(op, grad): (1, rows_out, cols_out, ksize_r * ksize_c)) # Construct mapping table for indices: (input -> output). - idx_matrix = array_ops.concat([array_ops.expand_dims(input_idx_patched, axis=-1), - array_ops.expand_dims(output_idx, axis=-1)], - axis=-1) + idx_matrix = array_ops.concat( + [array_ops.expand_dims(input_idx_patched, axis=-1), + array_ops.expand_dims(output_idx, axis=-1)], + axis=-1) idx_map = array_ops.reshape(idx_matrix, (-1, 2)) sp_shape = (input_indices_num, output_indices_num) -- GitLab From f982cfe9f943c9920cafeefff7818ea298d5b509 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?= Date: Tue, 14 Aug 2018 12:41:36 +0800 Subject: [PATCH 047/598] TST: add benchmark --- .../extract_image_patches_grad_test.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tensorflow/python/kernel_tests/extract_image_patches_grad_test.py b/tensorflow/python/kernel_tests/extract_image_patches_grad_test.py index 60090a1510..e1f5a6b620 100644 --- a/tensorflow/python/kernel_tests/extract_image_patches_grad_test.py +++ b/tensorflow/python/kernel_tests/extract_image_patches_grad_test.py @@ -25,6 +25,8 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import random_seed as random_seed_lib from tensorflow.python.ops import array_ops from tensorflow.python.ops import gradient_checker +from tensorflow.python.ops import 
gradients_impl +from tensorflow.python.ops import variable_scope from tensorflow.python.platform import test @@ -100,6 +102,24 @@ class ExtractImagePatchesGradTest(test.TestCase): print('extract_image_patches gradient err: %.4e' % err) self.assertLess(err, 1e-4) + def testConstructGradientWithLargeImages(self): + batch_size = 4 + height = 1024 + width = 1024 + ksize = 5 + images = variable_scope.get_variable('inputs', + (batch_size, height, width, 1)) + patches = array_ops.extract_image_patches(images, + ksizes=[1, ksize, ksize, 1], + strides=[1, 1, 1, 1], + rates=[1, 1, 1, 1], + padding='SAME') + # Github issue: #20146 + # tf.extract_image_patches() gradient very slow at graph construction time + gradients = gradients_impl.gradients(patches, images) + # Won't time out. + self.assertIsNotNone(gradients) + if __name__ == '__main__': test.main() -- GitLab From 26e7d51fee4ecfaeffbfad7beaf6952b3132b444 Mon Sep 17 00:00:00 2001 From: bstriner Date: Tue, 14 Aug 2018 04:48:11 -0400 Subject: [PATCH 048/598] py37 --- tensorflow/c/eager/c_api.cc | 8 ++++---- tensorflow/python/eager/pywrap_tfe.h | 2 +- tensorflow/python/pywrap_tfe.i | 6 +++--- tensorflow/workspace.bzl | 24 ++++++++++++------------ 4 files changed, 20 insertions(+), 20 deletions(-) mode change 100644 => 100755 tensorflow/c/eager/c_api.cc mode change 100644 => 100755 tensorflow/python/eager/pywrap_tfe.h mode change 100644 => 100755 tensorflow/python/pywrap_tfe.i mode change 100644 => 100755 tensorflow/workspace.bzl diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc old mode 100644 new mode 100755 index ce5a3f29a4..1ccae3f138 --- a/tensorflow/c/eager/c_api.cc +++ b/tensorflow/c/eager/c_api.cc @@ -244,8 +244,8 @@ void TFE_ContextOptionsSetConfig(TFE_ContextOptions* options, const void* proto, } void TFE_ContextOptionsSetAsync(TFE_ContextOptions* options, - unsigned char async_) { - options->async = async_; + unsigned char enable) { + options->async = enable; } void 
TFE_ContextOptionsSetDevicePlacementPolicy( TFE_ContextOptions* options, TFE_ContextDevicePlacementPolicy policy) { @@ -253,9 +253,9 @@ void TFE_ContextOptionsSetDevicePlacementPolicy( } TF_CAPI_EXPORT extern void TFE_ContextSetAsyncForThread(TFE_Context* ctx, - unsigned char async_, + unsigned char enable, TF_Status* status) { - status->status = ctx->context.SetAsyncForThread(async_); + status->status = ctx->context.SetAsyncForThread(enable); } void TFE_DeleteContextOptions(TFE_ContextOptions* options) { delete options; } diff --git a/tensorflow/python/eager/pywrap_tfe.h b/tensorflow/python/eager/pywrap_tfe.h old mode 100644 new mode 100755 index a916a75f00..823c4078b8 --- a/tensorflow/python/eager/pywrap_tfe.h +++ b/tensorflow/python/eager/pywrap_tfe.h @@ -89,7 +89,7 @@ int MaybeRaiseExceptionFromStatus(const tensorflow::Status& status, PyObject* exception); // Returns the string associated with the passed-in python object. -char* TFE_GetPythonString(PyObject* o); +const char* TFE_GetPythonString(PyObject* o); // Returns a unique id on each call. int64_t get_uid(); diff --git a/tensorflow/python/pywrap_tfe.i b/tensorflow/python/pywrap_tfe.i old mode 100644 new mode 100755 index 157f2341e0..bc02e9a35c --- a/tensorflow/python/pywrap_tfe.i +++ b/tensorflow/python/pywrap_tfe.i @@ -106,19 +106,19 @@ limitations under the License. 
} %typemap(in) const char* serialized_function_def { - $1 = TFE_GetPythonString($input); + $1 = const_cast(TFE_GetPythonString($input)); } %typemap(in) const char* device_name { if ($input == Py_None) { $1 = nullptr; } else { - $1 = TFE_GetPythonString($input); + $1 = const_cast(TFE_GetPythonString($input)); } } %typemap(in) const char* op_name { - $1 = TFE_GetPythonString($input); + $1 = const_cast(TFE_GetPythonString($input)); } %typemap(in) (TFE_Context*) { diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl old mode 100644 new mode 100755 index 7138c0a452..769e74d5a5 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -362,11 +362,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): tf_http_archive( name = "protobuf_archive", urls = [ - "https://mirror.bazel.build/github.com/google/protobuf/archive/0a59054c30e4f0ba10f10acfc1d7f3814c63e1a7.tar.gz", - "https://github.com/google/protobuf/archive/0a59054c30e4f0ba10f10acfc1d7f3814c63e1a7.tar.gz", + "https://mirror.bazel.build/github.com/google/protobuf/archive/fe2eef4bf414ebb352cf11bcec633f1fd46ec876.tar.gz", + "https://github.com/google/protobuf/archive/fe2eef4bf414ebb352cf11bcec633f1fd46ec876.tar.gz", ], - sha256 = "4bb48bcc972ee9d40a8bb7e481522030b6e6771a7283ae83c896872115180d25", - strip_prefix = "protobuf-0a59054c30e4f0ba10f10acfc1d7f3814c63e1a7", + sha256 = "b91b0ac9907af983877c960809dcad7a6dc8e4b06e34d32b7d66a12b9ea1fa17", + strip_prefix = "protobuf-fe2eef4bf414ebb352cf11bcec633f1fd46ec876", ) # We need to import the protobuf library under the names com_google_protobuf @@ -375,21 +375,21 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): tf_http_archive( name = "com_google_protobuf", urls = [ - "https://mirror.bazel.build/github.com/google/protobuf/archive/0a59054c30e4f0ba10f10acfc1d7f3814c63e1a7.tar.gz", - "https://github.com/google/protobuf/archive/0a59054c30e4f0ba10f10acfc1d7f3814c63e1a7.tar.gz", + 
"https://mirror.bazel.build/github.com/google/protobuf/archive/fe2eef4bf414ebb352cf11bcec633f1fd46ec876.tar.gz", + "https://github.com/google/protobuf/archive/fe2eef4bf414ebb352cf11bcec633f1fd46ec876.tar.gz", ], - sha256 = "4bb48bcc972ee9d40a8bb7e481522030b6e6771a7283ae83c896872115180d25", - strip_prefix = "protobuf-0a59054c30e4f0ba10f10acfc1d7f3814c63e1a7", + sha256 = "b91b0ac9907af983877c960809dcad7a6dc8e4b06e34d32b7d66a12b9ea1fa17", + strip_prefix = "protobuf-fe2eef4bf414ebb352cf11bcec633f1fd46ec876", ) tf_http_archive( name = "com_google_protobuf_cc", urls = [ - "https://mirror.bazel.build/github.com/google/protobuf/archive/0a59054c30e4f0ba10f10acfc1d7f3814c63e1a7.tar.gz", - "https://github.com/google/protobuf/archive/0a59054c30e4f0ba10f10acfc1d7f3814c63e1a7.tar.gz", + "https://mirror.bazel.build/github.com/google/protobuf/archive/fe2eef4bf414ebb352cf11bcec633f1fd46ec876.tar.gz", + "https://github.com/google/protobuf/archive/fe2eef4bf414ebb352cf11bcec633f1fd46ec876.tar.gz", ], - sha256 = "4bb48bcc972ee9d40a8bb7e481522030b6e6771a7283ae83c896872115180d25", - strip_prefix = "protobuf-0a59054c30e4f0ba10f10acfc1d7f3814c63e1a7", + sha256 = "b91b0ac9907af983877c960809dcad7a6dc8e4b06e34d32b7d66a12b9ea1fa17", + strip_prefix = "protobuf-fe2eef4bf414ebb352cf11bcec633f1fd46ec876", ) tf_http_archive( -- GitLab From 60ea4be9ac3bdbee55dee9b011b151971dfae5ad Mon Sep 17 00:00:00 2001 From: bstriner Date: Tue, 14 Aug 2018 05:18:43 -0400 Subject: [PATCH 049/598] rename enable --- tensorflow/c/eager/c_api.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) mode change 100644 => 100755 tensorflow/c/eager/c_api.h diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h old mode 100644 new mode 100755 index db0079b0de..eec2750d6e --- a/tensorflow/c/eager/c_api.h +++ b/tensorflow/c/eager/c_api.h @@ -76,7 +76,7 @@ typedef enum TFE_ContextDevicePlacementPolicy { // Sets the default execution mode (sync/async). 
Note that this can be // overridden per thread using TFE_ContextSetAsyncForThread. TF_CAPI_EXPORT extern void TFE_ContextOptionsSetAsync(TFE_ContextOptions*, - unsigned char async_); + unsigned char enable); TF_CAPI_EXPORT extern void TFE_ContextOptionsSetDevicePlacementPolicy( TFE_ContextOptions*, TFE_ContextDevicePlacementPolicy); @@ -114,7 +114,7 @@ TFE_ContextGetDevicePlacementPolicy(TFE_Context*); // Overrides the execution mode (sync/async) for the current thread. TF_CAPI_EXPORT extern void TFE_ContextSetAsyncForThread(TFE_Context*, - unsigned char async_, + unsigned char enable, TF_Status* status); // A tensorflow.ServerDef specifies remote workers (in addition to the current -- GitLab From c521738635ed5c50e31be2e87305e49c7dfeb601 Mon Sep 17 00:00:00 2001 From: bstriner Date: Tue, 14 Aug 2018 05:31:58 -0400 Subject: [PATCH 050/598] workspace --- tensorflow/workspace.bzl | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 67beb17978..2cf1c86395 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -359,14 +359,18 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): }, ) + PROTOBUF_urls =[ + "https://mirror.bazel.build/github.com/google/protobuf/archive/fe2eef4bf414ebb352cf11bcec633f1fd46ec876.tar.gz", + "https://github.com/google/protobuf/archive/fe2eef4bf414ebb352cf11bcec633f1fd46ec876.tar.gz", + ] + PROTOBUF_sha256 = "b91b0ac9907af983877c960809dcad7a6dc8e4b06e34d32b7d66a12b9ea1fa17" + PROTOBUF_strip_prefix = "protobuf-fe2eef4bf414ebb352cf11bcec633f1fd46ec876" + tf_http_archive( name = "protobuf_archive", - urls = [ - "https://mirror.bazel.build/github.com/google/protobuf/archive/fe2eef4bf414ebb352cf11bcec633f1fd46ec876.tar.gz", - "https://github.com/google/protobuf/archive/fe2eef4bf414ebb352cf11bcec633f1fd46ec876.tar.gz", - ], - sha256 = "b91b0ac9907af983877c960809dcad7a6dc8e4b06e34d32b7d66a12b9ea1fa17", - strip_prefix = 
"protobuf-fe2eef4bf414ebb352cf11bcec633f1fd46ec876", + urls = PROTOBUF_urls, + sha256 = PROTOBUF_sha256, + strip_prefix = PROTOBUF_strip_prefix, ) # We need to import the protobuf library under the names com_google_protobuf @@ -374,22 +378,16 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): # Unfortunately there is no way to alias http_archives at the moment. tf_http_archive( name = "com_google_protobuf", - urls = [ - "https://mirror.bazel.build/github.com/google/protobuf/archive/fe2eef4bf414ebb352cf11bcec633f1fd46ec876.tar.gz", - "https://github.com/google/protobuf/archive/fe2eef4bf414ebb352cf11bcec633f1fd46ec876.tar.gz", - ], - sha256 = "b91b0ac9907af983877c960809dcad7a6dc8e4b06e34d32b7d66a12b9ea1fa17", - strip_prefix = "protobuf-fe2eef4bf414ebb352cf11bcec633f1fd46ec876", + urls = PROTOBUF_urls, + sha256 = PROTOBUF_sha256, + strip_prefix = PROTOBUF_strip_prefix, ) tf_http_archive( name = "com_google_protobuf_cc", - urls = [ - "https://mirror.bazel.build/github.com/google/protobuf/archive/fe2eef4bf414ebb352cf11bcec633f1fd46ec876.tar.gz", - "https://github.com/google/protobuf/archive/fe2eef4bf414ebb352cf11bcec633f1fd46ec876.tar.gz", - ], - sha256 = "b91b0ac9907af983877c960809dcad7a6dc8e4b06e34d32b7d66a12b9ea1fa17", - strip_prefix = "protobuf-fe2eef4bf414ebb352cf11bcec633f1fd46ec876", + urls = PROTOBUF_urls, + sha256 = PROTOBUF_sha256, + strip_prefix = PROTOBUF_strip_prefix, ) tf_http_archive( -- GitLab From c2687096e60f443d445c8871ab54ce095137018e Mon Sep 17 00:00:00 2001 From: "karl@kubx.ca" Date: Tue, 7 Aug 2018 22:40:25 -0400 Subject: [PATCH 051/598] Render secondary factory for default output types --- tensorflow/java/src/gen/cc/java_defs.h | 30 +++++++++ tensorflow/java/src/gen/cc/op_generator.cc | 74 +++++++++++++++++++++ tensorflow/java/src/gen/cc/op_specs.cc | 45 ++----------- tensorflow/java/src/gen/cc/op_specs.h | 11 +-- tensorflow/java/src/gen/cc/source_writer.cc | 1 - 5 files changed, 116 insertions(+), 45 deletions(-) diff --git 
a/tensorflow/java/src/gen/cc/java_defs.h b/tensorflow/java/src/gen/cc/java_defs.h index d9d6f8adc8..d39653ef41 100644 --- a/tensorflow/java/src/gen/cc/java_defs.h +++ b/tensorflow/java/src/gen/cc/java_defs.h @@ -21,6 +21,8 @@ limitations under the License. #include #include +#include "tensorflow/core/framework/types.h" + namespace tensorflow { namespace java { @@ -95,6 +97,34 @@ class Type { static Type IterableOf(const Type& type) { return Interface("Iterable").add_parameter(type); } + static Type ForDataType(DataType data_type) { + switch (data_type) { + case DataType::DT_BOOL: + return Class("Boolean"); + case DataType::DT_STRING: + return Class("String"); + case DataType::DT_FLOAT: + return Class("Float"); + case DataType::DT_DOUBLE: + return Class("Double"); + case DataType::DT_UINT8: + return Class("UInt8", "org.tensorflow.types"); + case DataType::DT_INT32: + return Class("Integer"); + case DataType::DT_INT64: + return Class("Long"); + case DataType::DT_RESOURCE: + // TODO(karllessard) create a Resource utility class that could be + // used to store a resource and its type (passed in a second argument). + // For now, we need to force a wildcard and we will unfortunately lose + // track of the resource type. + // Falling through... + default: + // Any other datatypes does not have a equivalent in Java and must + // remain a wildcard (e.g. DT_COMPLEX64, DT_QINT8, ...) + return Wildcard(); + } + } const Kind& kind() const { return kind_; } const string& name() const { return name_; } const string& package() const { return package_; } diff --git a/tensorflow/java/src/gen/cc/op_generator.cc b/tensorflow/java/src/gen/cc/op_generator.cc index d5bd99bdd9..8587d4dc30 100644 --- a/tensorflow/java/src/gen/cc/op_generator.cc +++ b/tensorflow/java/src/gen/cc/op_generator.cc @@ -19,6 +19,7 @@ limitations under the License. 
#include #include #include +#include #include "tensorflow/core/framework/op_gen_lib.h" #include "tensorflow/core/lib/core/errors.h" @@ -100,6 +101,10 @@ void CollectOpDependencies(const OpSpec& op, RenderMode mode, for (const AttributeSpec& attribute : op.attributes()) { out->push_back(attribute.var().type()); out->push_back(attribute.jni_type()); + if (attribute.has_default_value() + && attribute.type().kind() == Type::GENERIC) { + out->push_back(Type::ForDataType(attribute.default_value()->type())); + } } for (const AttributeSpec& optional_attribute : op.optional_attributes()) { out->push_back(optional_attribute.var().type()); @@ -139,6 +144,60 @@ void WriteSetAttrDirective(const AttributeSpec& attr, bool optional, } } +void RenderSecondaryFactoryMethod(const OpSpec& op, const Type& op_class, + std::map default_types, + SourceWriter* writer) { + // Build the return type for the secondary factory, replacing generic + // parameters with their default value if any + Type return_type = Type::Class(op_class.name(), op_class.package()); + for (const Type& parameter : op_class.parameters()) { + if (parameter.kind() == Type::GENERIC + && default_types.find(parameter.name()) != default_types.end()) { + return_type.add_parameter(default_types.at(parameter.name())); + } else { + return_type.add_parameter(parameter); + } + } + Method factory = Method::Create("create", return_type); + Javadoc factory_doc = + Javadoc::Create("Factory method to create a class to wrap a new " + + op_class.name() + " operation to the graph, using " + "default output types."); + Variable scope = + Variable::Create("scope", Type::Class("Scope", "org.tensorflow.op")); + AddArgument(scope, "current graph scope", &factory, &factory_doc); + std::stringstream factory_statement; + factory_statement << "return create(scope"; + for (const ArgumentSpec& input : op.inputs()) { + AddArgument(input.var(), input.description(), &factory, &factory_doc); + factory_statement << ", " << input.var().name(); + } + for 
(const AttributeSpec& attr : op.attributes()) { + // Only add attributes that are not types or have no default value to the + // signature of the secondary factory + factory_statement << ", "; + if (attr.type().kind() == Type::GENERIC + && default_types.find(attr.type().name()) != default_types.end()) { + factory_statement << default_types.at(attr.type().name()).name() + << ".class"; + } else { + AddArgument(attr.var(), attr.description(), &factory, &factory_doc); + factory_statement << attr.var().name(); + } + } + if (!op.optional_attributes().empty()) { + Variable options_var = Variable::Varargs("options", Type::Class("Options")); + AddArgument(options_var, "carries optional attributes values", &factory, + &factory_doc); + factory_statement << ", " << options_var.name(); + } + factory_doc.add_tag("return", "a new instance of " + op_class.name()); + + writer->BeginMethod(factory, PUBLIC | STATIC, &factory_doc); + writer->Append(factory_statement.str()).Append(");").EndLine(); + writer->EndMethod(); +} + void RenderFactoryMethods(const OpSpec& op, const Type& op_class, SourceWriter* writer) { Method factory = Method::Create("create", op_class); @@ -151,8 +210,17 @@ void RenderFactoryMethods(const OpSpec& op, const Type& op_class, for (const ArgumentSpec& input : op.inputs()) { AddArgument(input.var(), input.description(), &factory, &factory_doc); } + std::map default_types; for (const AttributeSpec& attr : op.attributes()) { AddArgument(attr.var(), attr.description(), &factory, &factory_doc); + // If this attribute is a type with a default value, save its value + // for passing it implicitly in a secondary factory method + if (attr.has_default_value() && attr.type().kind() == Type::GENERIC) { + Type default_type = Type::ForDataType(attr.default_value()->type()); + if (!default_type.wildcard()) { + default_types.insert(std::make_pair(attr.type().name(), default_type)); + } + } } if (!op.optional_attributes().empty()) { AddArgument(Variable::Varargs("options", 
Type::Class("Options")), @@ -194,6 +262,12 @@ void RenderFactoryMethods(const OpSpec& op, const Type& op_class, .Append("(opBuilder.build());") .EndLine(); writer->EndMethod(); + + // If this operation has type attributes with a default value, create a + // second factory method that infers those values implicitly + if (!default_types.empty()) { + RenderSecondaryFactoryMethod(op, op_class, default_types, writer); + } } void RenderConstructor(const OpSpec& op, const Type& op_class, diff --git a/tensorflow/java/src/gen/cc/op_specs.cc b/tensorflow/java/src/gen/cc/op_specs.cc index 941ab2699c..b2f2fb18a9 100644 --- a/tensorflow/java/src/gen/cc/op_specs.cc +++ b/tensorflow/java/src/gen/cc/op_specs.cc @@ -96,43 +96,10 @@ Type TypeResolver::TypeOf(const OpDef_ArgDef& arg_def, bool* iterable_out) { *iterable_out = true; visited_attrs_.insert(std::make_pair(arg_def.number_attr(), Type::Int())); } - Type type = Type::Wildcard(); if (arg_def.type() != DataType::DT_INVALID) { - // resolve type from DataType - switch (arg_def.type()) { - case DataType::DT_BOOL: - type = Type::Class("Boolean"); - break; - case DataType::DT_STRING: - type = Type::Class("String"); - break; - case DataType::DT_FLOAT: - type = Type::Class("Float"); - break; - case DataType::DT_DOUBLE: - type = Type::Class("Double"); - break; - case DataType::DT_UINT8: - type = Type::Class("UInt8", "org.tensorflow.types"); - break; - case DataType::DT_INT32: - type = Type::Class("Integer"); - break; - case DataType::DT_INT64: - type = Type::Class("Long"); - break; - case DataType::DT_RESOURCE: - // TODO(karllessard) create a Resource utility class that could be - // used to store a resource and its type (passed in a second argument). - // For now, we need to force a wildcard and we will unfortunately lose - // track of the resource type. - break; - default: - // Any other datatypes does not have a equivalent in Java and must - // remain a wildcard (e.g. DT_COMPLEX64, DT_QINT8, ...) 
- break; - } + type = Type::ForDataType(arg_def.type()); + } else if (!arg_def.type_attr().empty()) { // resolve type from attribute (if already visited, retrieve its type) if (IsAttributeVisited(arg_def.type_attr())) { @@ -337,16 +304,16 @@ AttributeSpec CreateAttribute(const OpDef_AttrDef& attr_def, bool iterable = false; std::pair types = type_resolver->TypesOf(attr_def, &iterable); Type var_type = types.first.kind() == Type::GENERIC - ? Type::Class("Class").add_parameter(types.first) - : types.first; + ? Type::ClassOf(types.first) : types.first; if (iterable) { var_type = Type::ListOf(var_type); } return AttributeSpec( attr_api_def.name(), Variable::Create(SnakeToCamelCase(attr_api_def.rename_to()), var_type), - types.first, types.second, ParseDocumentation(attr_api_def.description()), - iterable, attr_api_def.has_default_value()); + types.first, types.second, + ParseDocumentation(attr_api_def.description()), iterable, + attr_def.has_default_value() ? &attr_def.default_value() : nullptr); } ArgumentSpec CreateOutput(const OpDef_ArgDef& output_def, diff --git a/tensorflow/java/src/gen/cc/op_specs.h b/tensorflow/java/src/gen/cc/op_specs.h index 30ecb8ce53..7ad19af562 100644 --- a/tensorflow/java/src/gen/cc/op_specs.h +++ b/tensorflow/java/src/gen/cc/op_specs.h @@ -94,18 +94,18 @@ class AttributeSpec { // jni_type: the type of this attribute in JNI layer (see OperationBuilder) // description: a description of this attribute, in javadoc // iterable: true if this attribute is a list - // has_default_value: true if this attribute has a default value if not set + // default_value: default value for this attribute or nullptr if none AttributeSpec(const string& op_def_name, const Variable& var, const Type& type, const Type& jni_type, const string& description, bool iterable, - bool has_default_value) + const AttrValue* default_value) : op_def_name_(op_def_name), var_(var), type_(type), description_(description), iterable_(iterable), jni_type_(jni_type), - 
has_default_value_(has_default_value) {} + default_value_(default_value) {} const string& op_def_name() const { return op_def_name_; } const Variable& var() const { return var_; } @@ -113,7 +113,8 @@ class AttributeSpec { const string& description() const { return description_; } bool iterable() const { return iterable_; } const Type& jni_type() const { return jni_type_; } - bool has_default_value() const { return has_default_value_; } + bool has_default_value() const { return default_value_ != nullptr; } + const AttrValue* default_value() const { return default_value_; } private: const string op_def_name_; @@ -122,7 +123,7 @@ class AttributeSpec { const string description_; const bool iterable_; const Type jni_type_; - const bool has_default_value_; + const AttrValue* default_value_; }; class OpSpec { diff --git a/tensorflow/java/src/gen/cc/source_writer.cc b/tensorflow/java/src/gen/cc/source_writer.cc index 8e5fba7e32..a71b367691 100644 --- a/tensorflow/java/src/gen/cc/source_writer.cc +++ b/tensorflow/java/src/gen/cc/source_writer.cc @@ -16,7 +16,6 @@ limitations under the License. #include #include #include -#include #include "tensorflow/java/src/gen/cc/source_writer.h" -- GitLab From 8077ae1d1e8bfe6a5cc55df07ad82ae91f431d2e Mon Sep 17 00:00:00 2001 From: AG Ramesh Date: Tue, 14 Aug 2018 17:07:09 -0700 Subject: [PATCH 052/598] Minor changes in comments --- tensorflow/core/util/mkl_util.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h index af0d0f3a32..907059febc 100644 --- a/tensorflow/core/util/mkl_util.h +++ b/tensorflow/core/util/mkl_util.h @@ -33,7 +33,7 @@ limitations under the License. 
#endif #ifdef INTEL_MKL_ML_ONLY -// Using pragma as #warning doesn't work with all compilers +// Using pragma message since #warning doesn't work with all compilers #pragma message("Compiling for INTEL MKL ML only will be deprecated soon.") #pragma message("Please use MKL DNN (the default option for --config=mkl)") #endif -- GitLab From 66a6473283aae36889fd80419b407a34c763e1d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?= Date: Wed, 15 Aug 2018 12:55:26 +0800 Subject: [PATCH 053/598] CLN: use ones op --- tensorflow/python/ops/array_grad.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/ops/array_grad.py b/tensorflow/python/ops/array_grad.py index 2beb58d534..6ae869b89e 100644 --- a/tensorflow/python/ops/array_grad.py +++ b/tensorflow/python/ops/array_grad.py @@ -772,7 +772,7 @@ def _ExtractImagePatchesGrad(op, grad): sp_shape = (input_indices_num, output_indices_num) sp_mat_full = sparse_tensor.SparseTensor( idx_map, - array_ops.ones_like(idx_map[:, 0], dtype=grad.dtype), + array_ops.ones([output_indices_num], dtype=grad.dtype), sp_shape) # Remove all padding locations [0, :]. 
sp_mat = sparse_ops.sparse_slice(sp_mat_full, -- GitLab From 1a00709a05115560a54d8c8a4d045353151708a8 Mon Sep 17 00:00:00 2001 From: Clayne Robison Date: Thu, 16 Aug 2018 16:11:12 -0700 Subject: [PATCH 054/598] [Intel MKL] Static code analysis tool fixes - mkl_cpu_allocator.h: disallowing copy constructor and assignement operator; returning nullptr from non-void functions even though they generate Unimplemented Status code - mkl_graph_util.h: making kTensorOrdering const because it never gets changed anyway - mkl_layout_pass.cc: adding checks for nullptr before dereferencing --- tensorflow/core/common_runtime/mkl_cpu_allocator.h | 5 +++++ tensorflow/core/graph/mkl_graph_util.h | 2 +- tensorflow/core/graph/mkl_layout_pass.cc | 3 +++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/common_runtime/mkl_cpu_allocator.h b/tensorflow/core/common_runtime/mkl_cpu_allocator.h index 99bd43e090..6b76e7e0e7 100644 --- a/tensorflow/core/common_runtime/mkl_cpu_allocator.h +++ b/tensorflow/core/common_runtime/mkl_cpu_allocator.h @@ -148,12 +148,14 @@ class MklCPUAllocator : public VisitableAllocator { Status s = Status(error::Code::UNIMPLEMENTED, "Unimplemented case for hooking MKL function."); TF_CHECK_OK(s); // way to assert with an error message + return nullptr; // return a value and make static code analyzers happy } static inline void* ReallocHook(void* ptr, size_t size) { Status s = Status(error::Code::UNIMPLEMENTED, "Unimplemented case for hooking MKL function."); TF_CHECK_OK(s); // way to assert with an error message + return nullptr; // return a value and make static code analyzers happy } /// Do we allow growth in BFC Allocator @@ -166,6 +168,9 @@ class MklCPUAllocator : public VisitableAllocator { static constexpr const size_t kAlignment = 64; VisitableAllocator* allocator_; // owned by this class + + // Prevent copying and assignment + TF_DISALLOW_COPY_AND_ASSIGN(MklCPUAllocator); }; } // namespace tensorflow diff --git 
a/tensorflow/core/graph/mkl_graph_util.h b/tensorflow/core/graph/mkl_graph_util.h index 333bf761b0..bab1df87a4 100644 --- a/tensorflow/core/graph/mkl_graph_util.h +++ b/tensorflow/core/graph/mkl_graph_util.h @@ -41,7 +41,7 @@ namespace tensorflow { typedef enum { TENSORS_INTERLEAVED, TENSORS_CONTIGUOUS } MklTfTensorOrdering; // NOTE: Currently, we use contiguous ordering. If you change this, then you // would need to change Mkl op definitions in nn_ops.cc. -static MklTfTensorOrdering kTensorOrdering = TENSORS_CONTIGUOUS; +static const MklTfTensorOrdering kTensorOrdering = TENSORS_CONTIGUOUS; // Get index of MetaData tensor from index 'n' of Data tensor. inline int DataIndexToMetaDataIndex(int n, int total_tensors) { diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc index 5683944e46..82a3a31e79 100644 --- a/tensorflow/core/graph/mkl_layout_pass.cc +++ b/tensorflow/core/graph/mkl_layout_pass.cc @@ -1042,6 +1042,7 @@ void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr* g, // device of the original // node. .Finalize(&**g, out)); + CHECK_NOTNULL(*out); // Make sure we got a valid object before using it // If number of inputs to the original node is > 0, then we add // control dependency between 1st input (index 0) of the original node and @@ -1335,6 +1336,7 @@ void MklLayoutRewritePass::GetDummyWorkspaceTensorNode( // device of the original // node. .Finalize(&**g, out)); + CHECK_NOTNULL(*out); // Make sure we got a valid object before using it // If number of inputs to the original node is > 0, then we add // control dependency between 1st input (index 0) of the original node and @@ -3177,6 +3179,7 @@ void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr* g, // device of the original // node. 
.Finalize(&**g, out)); + CHECK_NOTNULL(*out); // Make sure we got a valid object before using it // If number of inputs to the original node is > 0, then we add // control dependency between 1st input (index 0) of the original node and -- GitLab From 00ddbca932f40c50aab40489981304ff4ed590e2 Mon Sep 17 00:00:00 2001 From: Guozhong Zhuang Date: Fri, 17 Aug 2018 12:31:01 -0700 Subject: [PATCH 055/598] variable renaming per code review suggestions --- tensorflow/core/kernels/mkl_avgpooling_op.cc | 14 +++++++------- tensorflow/core/kernels/mkl_maxpooling_op.cc | 16 ++++++++-------- .../core/kernels/mkl_pooling_ops_common.cc | 8 ++++---- tensorflow/core/kernels/mkl_pooling_ops_common.h | 8 ++++---- 4 files changed, 23 insertions(+), 23 deletions(-) diff --git a/tensorflow/core/kernels/mkl_avgpooling_op.cc b/tensorflow/core/kernels/mkl_avgpooling_op.cc index 749b2a1838..2409f7e9dc 100644 --- a/tensorflow/core/kernels/mkl_avgpooling_op.cc +++ b/tensorflow/core/kernels/mkl_avgpooling_op.cc @@ -454,7 +454,7 @@ class MklAvgPoolingOp : public MklPoolingForwardOpBase { // initialize variables for the pooling op MklPoolParameters pool_params; // check whether pooling is 2D or 3D - bool isPool2D = (this->ksize_.size() == 4); + bool is_pool2d = (this->ksize_.size() == 4); // Get the input tensor and initialize the pooling parameters TensorShape input_tensor_shape = input_tensor.shape(); this->InitMklPoolParameters(context, &pool_params, dnn_shape_input, @@ -477,13 +477,13 @@ class MklAvgPoolingOp : public MklPoolingForwardOpBase { memory::dims filter_dims, strides, padding_left, padding_right; // Get src/filter/stride/padding information this->PoolParamsToDims(&pool_params, &filter_dims, &strides, - &padding_left, &padding_right, isPool2D); + &padding_left, &padding_right, is_pool2d); // Get the input memory descriptor memory::dims src_dims = dnn_shape_input.IsMklTensor() ? dnn_shape_input.GetSizesAsMklDnnDims() - : isPool2D ? 
TFShapeToMklDnnDimsInNCHW(input_tensor.shape(), + : is_pool2d ? TFShapeToMklDnnDimsInNCHW(input_tensor.shape(), this->data_format_tf_) : TFShapeToMklDnnDimsInNCDHW(input_tensor.shape(), this->data_format_tf_); @@ -564,18 +564,18 @@ class MklAvgPoolingGradOp : public MklPoolingBackwardOpBase { orig_input_shape.AddDim(shape_vec(i)); } - bool isPool2D = (this->ksize_.size() == 4); + bool is_pool2d = (this->ksize_.size() == 4); this->InitMklPoolParameters(context, &pool_params, orig_input_mkl_shape, orig_input_shape); memory::dims filter_dims, strides, padding_left, padding_right; this->PoolParamsToDims(&pool_params, &filter_dims, &strides, - &padding_left, &padding_right, isPool2D); + &padding_left, &padding_right, is_pool2d); memory::dims orig_input_dims_mkl_order = orig_input_mkl_shape.IsMklTensor() ? orig_input_mkl_shape.GetSizesAsMklDnnDims() - : isPool2D ? TFShapeToMklDnnDimsInNCHW(orig_input_shape, + : is_pool2d ? TFShapeToMklDnnDimsInNCHW(orig_input_shape, this->data_format_tf_) : TFShapeToMklDnnDimsInNCDHW(orig_input_shape, this->data_format_tf_); @@ -583,7 +583,7 @@ class MklAvgPoolingGradOp : public MklPoolingBackwardOpBase { memory::dims diff_dst_dims = grad_mkl_shape.IsMklTensor() ? grad_mkl_shape.GetSizesAsMklDnnDims() - : isPool2D ? TFShapeToMklDnnDimsInNCHW(grad_tensor.shape(), + : is_pool2d ? 
TFShapeToMklDnnDimsInNCHW(grad_tensor.shape(), this->data_format_tf_) : TFShapeToMklDnnDimsInNCDHW(grad_tensor.shape(), this->data_format_tf_); diff --git a/tensorflow/core/kernels/mkl_maxpooling_op.cc b/tensorflow/core/kernels/mkl_maxpooling_op.cc index aa7c0d9b7f..256d48f4d5 100644 --- a/tensorflow/core/kernels/mkl_maxpooling_op.cc +++ b/tensorflow/core/kernels/mkl_maxpooling_op.cc @@ -525,7 +525,7 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase { // initialize variables for the pooling op MklPoolParameters pool_params; // check whether pooling is 2D or 3D - bool isPool2D = (this->ksize_.size() == 4); + bool is_pool2d = (this->ksize_.size() == 4); // Get the input tensor and initialize the pooling parameters TensorShape input_tensor_shape = input_tensor.shape(); this->InitMklPoolParameters(context, &pool_params, dnn_shape_input, @@ -549,7 +549,7 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase { memory::desc input_md = dnn_shape_input.IsMklTensor() ? dnn_shape_input.GetMklLayout() - : isPool2D ? memory::desc( + : is_pool2d ? memory::desc( TFShapeToMklDnnDimsInNCHW(input_tensor_shape, this->data_format_tf_), MklDnnType(), this->data_format_mkldnn_) @@ -562,13 +562,13 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase { memory::dims src_dims = dnn_shape_input.IsMklTensor() ? dnn_shape_input.GetSizesAsMklDnnDims() - : isPool2D ? TFShapeToMklDnnDimsInNCHW(input_tensor.shape(), + : is_pool2d ? 
TFShapeToMklDnnDimsInNCHW(input_tensor.shape(), this->data_format_tf_) : TFShapeToMklDnnDimsInNCDHW(input_tensor.shape(), this->data_format_tf_); memory::dims filter_dims, strides, padding_left, padding_right; this->PoolParamsToDims(&pool_params, &filter_dims, &strides, - &padding_left, &padding_right, isPool2D); + &padding_left, &padding_right, is_pool2d); // Get a pooling op from the cached pool MklPoolingFwdPrimitive* pooling_fwd = nullptr; @@ -672,18 +672,18 @@ class MklMaxPoolingGradOp : public MklPoolingBackwardOpBase { MklPoolParameters pool_params; TensorShape orig_input_shape = orig_input_tensor.shape(); - bool isPool2D = (this->ksize_.size() == 4); + bool is_pool2d = (this->ksize_.size() == 4); this->InitMklPoolParameters(context, &pool_params, orig_input_mkl_shape, orig_input_shape); memory::dims filter_dims, strides, padding_left, padding_right; this->PoolParamsToDims(&pool_params, &filter_dims, &strides, - &padding_left, &padding_right, isPool2D); + &padding_left, &padding_right, is_pool2d); memory::dims orig_input_dims_mkl_order = orig_input_mkl_shape.IsMklTensor() ? orig_input_mkl_shape.GetSizesAsMklDnnDims() - : isPool2D ? TFShapeToMklDnnDimsInNCHW(orig_input_shape, + : is_pool2d ? TFShapeToMklDnnDimsInNCHW(orig_input_shape, this->data_format_tf_) : TFShapeToMklDnnDimsInNCDHW(orig_input_shape, this->data_format_tf_); @@ -691,7 +691,7 @@ class MklMaxPoolingGradOp : public MklPoolingBackwardOpBase { memory::dims diff_dst_dims = grad_mkl_shape.IsMklTensor() ? grad_mkl_shape.GetSizesAsMklDnnDims() - : isPool2D ? TFShapeToMklDnnDimsInNCHW(grad_tensor.shape(), + : is_pool2d ? 
TFShapeToMklDnnDimsInNCHW(grad_tensor.shape(), this->data_format_tf_) : TFShapeToMklDnnDimsInNCDHW(grad_tensor.shape(), this->data_format_tf_); diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.cc b/tensorflow/core/kernels/mkl_pooling_ops_common.cc index 5d02ceea12..ec6d241e17 100644 --- a/tensorflow/core/kernels/mkl_pooling_ops_common.cc +++ b/tensorflow/core/kernels/mkl_pooling_ops_common.cc @@ -284,8 +284,8 @@ void MklPoolParameters::Init(OpKernelContext* context, // Get the data format this->data_format = data_format; - bool isPool2D = (ksize.size() == 4); - if (isPool2D) { + bool is_pool2d = (ksize.size() == 4); + if (is_pool2d) { // Pool2D // Get the output sizes window_rows = GetTensorDim(ksize, data_format, 'H'); @@ -329,7 +329,7 @@ void MklPoolParameters::Init(OpKernelContext* context, } if (depth_window == 1) { // we are pooling in the D (Pool3D only), H and W - if (!isPool2D) { + if (!is_pool2d) { OP_REQUIRES_OK( context, GetWindowedOutputSizeVerbose(tensor_in_planes, window_planes, planes_stride, padding, @@ -348,7 +348,7 @@ void MklPoolParameters::Init(OpKernelContext* context, // Fail if the depth, height or width are greater than MAX_INT // We check depth only for 3D pooling case - if (!isPool2D) { + if (!is_pool2d) { OP_REQUIRES(context, FastBoundsCheck(out_planes, std::numeric_limits::max()), errors::InvalidArgument("output depth/planes is too large")); diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.h b/tensorflow/core/kernels/mkl_pooling_ops_common.h index ea7458062c..49f799d7ba 100644 --- a/tensorflow/core/kernels/mkl_pooling_ops_common.h +++ b/tensorflow/core/kernels/mkl_pooling_ops_common.h @@ -458,9 +458,9 @@ class MklPoolingOpBase : public OpKernel { OP_REQUIRES(context, this->ksize_[0] == 1 && this->stride_[0] == 1, errors::Unimplemented("Pooling is not yet supported on the " "batch dimension.")); - bool isPool2D = (this->ksize_.size() == 4); + bool is_pool2d = (this->ksize_.size() == 4); this->data_format_mkldnn_ = - 
isPool2D ? TFDataFormatToMklDnnDataFormat(this->data_format_tf_) + is_pool2d ? TFDataFormatToMklDnnDataFormat(this->data_format_tf_) : TFDataFormatToMklDnn3DDataFormat(this->data_format_tf_); // We may not get this attribute for this node if it does not go through @@ -510,8 +510,8 @@ class MklPoolingOpBase : public OpKernel { void PoolParamsToDims(const MklPoolParameters* pool_params, memory::dims* filter_dims, memory::dims* strides, memory::dims* padding_left, memory::dims* padding_right, - bool isPool2D) { - if (isPool2D) { + bool is_pool2d) { + if (is_pool2d) { // Pool2D *filter_dims = memory::dims({pool_params->window_rows, pool_params->window_cols}); -- GitLab From f5ef2477b6337fd30cf3c1348d35e8296c349b30 Mon Sep 17 00:00:00 2001 From: Guozhong Zhuang Date: Fri, 17 Aug 2018 13:41:50 -0700 Subject: [PATCH 056/598] enable Relu 3D --- .../core/kernels/mkl_input_conversion_op.cc | 17 +- tensorflow/core/kernels/mkl_relu_op.cc | 622 ++++++++++++++---- 2 files changed, 515 insertions(+), 124 deletions(-) diff --git a/tensorflow/core/kernels/mkl_input_conversion_op.cc b/tensorflow/core/kernels/mkl_input_conversion_op.cc index 06ce820ae9..c89b8048ee 100644 --- a/tensorflow/core/kernels/mkl_input_conversion_op.cc +++ b/tensorflow/core/kernels/mkl_input_conversion_op.cc @@ -296,7 +296,9 @@ class MklInputConversionOp : public OpKernel { // implementation. 
TensorShape tf_shape0 = input_shape_0.GetTfShape(); TensorShape tf_shape1 = input_shape_1.GetTfShape(); - if (tf_shape0 == tf_shape1) { + TensorShape tensor_shape0 = input_tensor_0.shape(); + TensorShape tensor_shape1 = input_tensor_1.shape(); + if (tf_shape0 == tf_shape1 && tensor_shape0 == tensor_shape1) { auto input0_md = input_shape_0.GetMklLayout(); auto input1_md = input_shape_1.GetMklLayout(); @@ -350,7 +352,8 @@ class MklInputConversionOp : public OpKernel { } // Sanity check - bool mkl_shapes_are_same = input_shape_0 == input_shape_1; + bool mkl_shapes_are_same = ((input_shape_0 == input_shape_1) && + (tensor_shape0 == tensor_shape1)); if (mkl_shapes_are_same) { CHECK(false) << "MklInputConversionOp: Unexpected: TF shapes are " "different but MKL shapes are same"; @@ -403,7 +406,8 @@ class MklInputConversionOp : public OpKernel { } // Broadcast is needed if the shapes are not the same - if (mkl_shape->GetTfShape().num_elements() == tf_tensor->shape().num_elements() ) { + if (mkl_shape->GetTfShape().num_elements() + == tf_tensor->shape().num_elements() ) { // Both shapes are same, convert the TF input to MKL VLOG(1) << "MklInputConversionOp: No broadcast needed."; VLOG(1) << "MklInputConversionOp: Converting input " << tf_tensor_index @@ -437,16 +441,17 @@ class MklInputConversionOp : public OpKernel { bool reordered = tf_input.CheckReorderToOpMem( memory::primitive_desc(output_mkl_md, cpu_engine), tensor_out, &net); - if(!reordered) { + + if (!reordered) { // This is the case that the TF tensor has the same shape and format of // mkl tensor. However, tf_tensor can not be simply forwarded to the // output tensor since mkl data tensor is always one dimensional tensor. // Tensor::CopyFrom shares the buffer of the other tensor while set its // shape to the other tensor. 
CHECK(tensor_out->CopyFrom(*tf_tensor, tensor_out->shape())); - } - else + } else { stream(stream::kind::eager).submit(net).wait(); + } // -- The tensor in MKL format passes through -- ForwardMklTensorInToOut(context, mkl_tensor_index, mkl_tensor_index); diff --git a/tensorflow/core/kernels/mkl_relu_op.cc b/tensorflow/core/kernels/mkl_relu_op.cc index 05034894e5..bea6fd6d3c 100644 --- a/tensorflow/core/kernels/mkl_relu_op.cc +++ b/tensorflow/core/kernels/mkl_relu_op.cc @@ -34,14 +34,413 @@ using mkldnn::prop_kind; using mkldnn::relu_backward; using mkldnn::relu_forward; using mkldnn::stream; +using mkldnn::memory; #else #include "mkl_dnn.h" #include "mkl_dnn_types.h" #endif +#include "tensorflow/core/platform/default/logging.h" #include "tensorflow/core/util/mkl_util.h" namespace tensorflow { +#ifndef INTEL_MKL_ML_ONLY + +template +class MklEltwiseFwdParams { + public: + memory::dims src_dims; // check if this is needed + memory::desc src_md; + algorithm alg_kind; + T alpha; + T beta; + + MklEltwiseFwdParams(memory::dims src_dims, memory::desc src_md, + algorithm alg_kind, T alpha, T beta) : + src_dims(src_dims), src_md(src_md), + alg_kind(alg_kind), alpha(alpha), beta(beta) { + } +}; + +template +class MklEltwiseFwdPrimitive : public MklPrimitive { + public: + explicit MklEltwiseFwdPrimitive(const MklEltwiseFwdParams& fwdParams) : + cpu_engine_(engine::cpu, 0) { + // store expected format + context_.src_fmt = static_cast( + fwdParams.src_md.data.format); + context_.fwd_stream.reset(new stream(stream::kind::eager)); + + // create eltwise primitive + if (context_.eltwise_fwd == nullptr) { + Setup(fwdParams); + } + } + + ~MklEltwiseFwdPrimitive() {} + + // Eltwise forward execute + // src_data: input data buffer of src + // dst_data: output data buffer of dst + void Execute(const T* src_data, T* dst_data) { + context_.src_mem->set_data_handle( + static_cast(const_cast(src_data))); + context_.dst_mem->set_data_handle(static_cast(dst_data)); + 
context_.fwd_stream->submit(context_.fwd_primitives); + + // after execution, set data handle back + context_.src_mem->set_data_handle(DummyData); + context_.dst_mem->set_data_handle(DummyData); + } + + std::shared_ptr GetEltwiseFwdPd() { + return context_.fwd_pd; + } + + memory::format GetSrcMemoryFormat() { + return context_.src_fmt; + } + + private: + // Primitive reuse context for eltwise Fwd ops: Relu, Elu, Tanh + struct EltwiseFwdContext { + // expected memory format for this primitive instance + mkldnn::memory::format src_fmt; + + // MKLDNN memory + std::shared_ptr src_mem; + std::shared_ptr dst_mem; + + // desc & prmitive desc + std::shared_ptr fwd_desc; + std::shared_ptr fwd_pd; + + // memory desc + std::shared_ptr src_md; + std::shared_ptr dst_md; + + // memory primitive desc + std::shared_ptr src_mpd; + + // Eltwise primitive + std::shared_ptr eltwise_fwd; + + std::shared_ptr fwd_stream; + std::vector fwd_primitives; + + EltwiseFwdContext() : + src_fmt(memory::format::any), src_mem(nullptr), dst_mem(nullptr), + fwd_desc(nullptr), fwd_pd(nullptr), src_md(nullptr), dst_md(nullptr), + src_mpd(nullptr), eltwise_fwd(nullptr), fwd_stream(nullptr) { + } + }; + + // Eltwise forward primitive setup + void Setup(const MklEltwiseFwdParams& fwdParams) { + // create memory descriptors for eltwise data with specified format + context_.src_md.reset(new memory::desc(fwdParams.src_md.data)); + context_.src_mpd.reset(new memory::primitive_desc( + *context_.src_md, cpu_engine_)); + + // create a eltwise + context_.fwd_desc.reset(new mkldnn::eltwise_forward::desc( + prop_kind::forward, fwdParams.alg_kind, *context_.src_md, + fwdParams.alpha, fwdParams.beta)); + context_.fwd_pd.reset(new mkldnn::eltwise_forward::primitive_desc( + *context_.fwd_desc, cpu_engine_)); + + // create memory primitive based on dummy data + context_.src_mem.reset(new memory(*context_.src_mpd, DummyData)); + context_.dst_mem.reset(new memory( + context_.fwd_pd.get()->dst_primitive_desc(), 
DummyData)); + + // create eltwise primitive and add it to net + context_.eltwise_fwd.reset(new mkldnn::eltwise_forward(*context_.fwd_pd, + *context_.src_mem, *context_.dst_mem)); + + context_.fwd_primitives.push_back(*context_.eltwise_fwd); + } + + struct EltwiseFwdContext context_; + engine cpu_engine_; +}; + +template +class MklEltwiseFwdPrimitiveFactory : public MklPrimitiveFactory { + public: + static MklEltwiseFwdPrimitive* Get( + const MklEltwiseFwdParams& fwdParams) { + MklEltwiseFwdPrimitive* eltwise_forward = nullptr; + + auto src_fmt = static_cast( + fwdParams.src_md.data.format); + + // Get a eltwise fwd primitive from the cached pool + eltwise_forward = static_cast*>( + MklEltwiseFwdPrimitiveFactory::GetInstance().GetEltwiseFwd( + fwdParams, src_fmt)); + if (eltwise_forward == nullptr) { + eltwise_forward = new MklEltwiseFwdPrimitive(fwdParams); + MklEltwiseFwdPrimitiveFactory::GetInstance().SetEltwiseFwd( + fwdParams, src_fmt, eltwise_forward); + } + return eltwise_forward; + } + + static MklEltwiseFwdPrimitiveFactory& GetInstance() { + static MklEltwiseFwdPrimitiveFactory instance_; + return instance_; + } + + private: + MklEltwiseFwdPrimitiveFactory() {} + ~MklEltwiseFwdPrimitiveFactory() {} + + static std::string CreateKey( + const MklEltwiseFwdParams& fwdParams, memory::format src_fmt) { + std::string prefix = "eltwise_fwd"; + FactoryKeyCreator key_creator; + key_creator.AddAsKey(prefix); + key_creator.AddAsKey(fwdParams.src_dims); + key_creator.AddAsKey(static_cast(fwdParams.alg_kind)); + key_creator.AddAsKey(static_cast(fwdParams.alpha)); + key_creator.AddAsKey(static_cast(fwdParams.beta)); + key_creator.AddAsKey(static_cast(src_fmt)); + return key_creator.GetKey(); + } + + MklPrimitive* GetEltwiseFwd(const MklEltwiseFwdParams& fwdParams, + memory::format src_fmt) { + std::string key = CreateKey(fwdParams, src_fmt); + return this->GetOp(key); + } + + void SetEltwiseFwd(const MklEltwiseFwdParams& fwdParams, + memory::format src_fmt, MklPrimitive* 
op) { + std::string key = CreateKey(fwdParams, src_fmt); + this->SetOp(key, op); + } +}; + +template +class MklEltwiseBwdParams { + public: + memory::dims src_dims; + memory::desc common_md; + algorithm alg_kind; + T alpha; + T beta; + + MklEltwiseBwdParams(const memory::dims &src_dims, + const memory::desc &common_md, + algorithm alg_kind, T alpha, T beta) : + src_dims(src_dims), common_md(common_md), + alg_kind(alg_kind), alpha(alpha), beta(beta) { + } +}; + +template +class MklEltwiseBwdPrimitive : public MklPrimitive { + public: + explicit MklEltwiseBwdPrimitive(const MklEltwiseBwdParams& bwdParams) : + cpu_engine_(engine::cpu, 0) { + context_.src_fmt = static_cast( + bwdParams.common_md.data.format); + context_.diff_dst_fmt = static_cast( + bwdParams.common_md.data.format); + context_.bwd_stream.reset(new stream(stream::kind::eager)); + // create eltwise primitive + if (context_.eltwise_bwd == nullptr) { + Setup(bwdParams); + } + } + + ~MklEltwiseBwdPrimitive() {} + + // Eltwise backward execute + // src_data: input data buffer of src + // diff_dst_data: input data buffer of diff_dst + // diff_src_data: output data buffer of diff_src + void Execute(const T* src_data, const T* diff_dst_data, T* diff_src_data) { + context_.src_mem->set_data_handle( + static_cast(const_cast(src_data))); + context_.diff_dst_mem->set_data_handle( + static_cast(const_cast(diff_dst_data))); + context_.diff_src_mem->set_data_handle(static_cast(diff_src_data)); + context_.bwd_stream->submit(context_.bwd_primitives); + + // after execution, set data handle back + context_.src_mem->set_data_handle(DummyData); + context_.diff_dst_mem->set_data_handle(DummyData); + context_.diff_src_mem->set_data_handle(DummyData); + } + + std::shared_ptr GetEltwiseBwdPd() { + return context_.bwd_pd; + } + + memory::format GetSrcMemoryFormat() { + return context_.src_fmt; + } + + memory::format GetDiffDstMemoryFormat() { + return context_.diff_dst_fmt; + } + + private: + // Primitive reuse context for 
eltwise Bwd ops: Relu, Elu, Tanh + struct EltwiseBwdContext { + // expected memory format for this primitive instance + memory::format src_fmt; + memory::format diff_dst_fmt; + + // MKLDNN memory + std::shared_ptr src_mem; + std::shared_ptr diff_dst_mem; + std::shared_ptr diff_src_mem; + + // desc & prmitive desc + std::shared_ptr bwd_desc; + + // memory desc + std::shared_ptr src_md; + std::shared_ptr diff_dst_md; + std::shared_ptr common_md; + + // memory primitive desc + std::shared_ptr src_mpd; + std::shared_ptr diff_dst_mpd; + + // fwd primitive desc + std::shared_ptr fwd_desc; + std::shared_ptr fwd_pd; + std::shared_ptr bwd_pd; + + // Eltwise primitive + std::shared_ptr eltwise_bwd; + + std::shared_ptr bwd_stream; + std::vector bwd_primitives; + + EltwiseBwdContext() : + src_fmt(memory::format::any), diff_dst_fmt(memory::format::any), + src_mem(nullptr), diff_dst_mem(nullptr), diff_src_mem(nullptr), + src_md(nullptr), diff_dst_md(nullptr), common_md(nullptr), + src_mpd(nullptr), diff_dst_mpd(nullptr), + fwd_desc(nullptr), fwd_pd(nullptr), bwd_pd(nullptr), + eltwise_bwd(nullptr), bwd_stream(nullptr) { + } + }; + + // Eltwise backward primitive setup + void Setup(const MklEltwiseBwdParams& bwdParams) { + // create memory descriptors for eltwise data w/ no specified format + context_.src_md.reset(new memory::desc(bwdParams.common_md.data)); + context_.diff_dst_md.reset(new memory::desc(bwdParams.common_md.data)); + + context_.src_mpd.reset(new memory::primitive_desc( + *context_.src_md, cpu_engine_)); + context_.diff_dst_mpd.reset(new memory::primitive_desc( + *context_.diff_dst_md, cpu_engine_)); + + // create forward eltwise primitive + context_.fwd_desc.reset(new mkldnn::eltwise_forward::desc( + prop_kind::forward_training, bwdParams.alg_kind, + *context_.src_md, bwdParams.alpha, bwdParams.beta)); + context_.fwd_pd.reset(new mkldnn::eltwise_forward::primitive_desc( + *context_.fwd_desc, cpu_engine_)); + context_.bwd_desc.reset(new 
mkldnn::eltwise_backward::desc( + bwdParams.alg_kind, *context_.diff_dst_md, + *context_.src_md, bwdParams.alpha, bwdParams.beta)); + context_.bwd_pd.reset(new mkldnn::eltwise_backward::primitive_desc( + *context_.bwd_desc, cpu_engine_, *context_.fwd_pd)); + + // create memory primitive based on dummy data + context_.src_mem.reset(new memory(*context_.src_mpd, DummyData)); + context_.diff_dst_mem.reset(new memory(*context_.diff_dst_mpd, DummyData)); + context_.diff_src_mem.reset(new memory( + context_.bwd_pd.get()->diff_src_primitive_desc(), DummyData)); + + // create eltwise primitive and add it to net + context_.eltwise_bwd.reset(new mkldnn::eltwise_backward(*context_.bwd_pd, + *context_.src_mem, *context_.diff_dst_mem, *context_.diff_src_mem)); + + context_.bwd_primitives.push_back(*context_.eltwise_bwd); + } + + struct EltwiseBwdContext context_; + engine cpu_engine_; +}; + + +template +class MklEltwiseBwdPrimitiveFactory : public MklPrimitiveFactory { + private: + MklEltwiseBwdPrimitiveFactory() {} + ~MklEltwiseBwdPrimitiveFactory() {} + + public: + static MklEltwiseBwdPrimitive* Get( + const MklEltwiseBwdParams& bwdParams) { + MklEltwiseBwdPrimitive* eltwise_backward = nullptr; + + auto src_fmt = static_cast( + bwdParams.common_md.data.format); + auto diff_dst_fmt = static_cast( + bwdParams.common_md.data.format); + + // try to find a suitable one in pool + eltwise_backward = static_cast*> ( + MklEltwiseBwdPrimitiveFactory::GetInstance().GetEltwiseBwd( + bwdParams, src_fmt, diff_dst_fmt)); + + if (eltwise_backward == nullptr) { + eltwise_backward = new MklEltwiseBwdPrimitive(bwdParams); + MklEltwiseBwdPrimitiveFactory::GetInstance().SetEltwiseBwd( + bwdParams, src_fmt, diff_dst_fmt, eltwise_backward); + } + return eltwise_backward; + } + + static MklEltwiseBwdPrimitiveFactory& GetInstance() { + static MklEltwiseBwdPrimitiveFactory instance_; + return instance_; + } + + private: + static std::string CreateKey( + const MklEltwiseBwdParams& bwdParams, + const 
memory::format &src_fmt, + const memory::format &diff_dst_fmt) { + std::string prefix = "eltwise_bwd"; + FactoryKeyCreator key_creator; + key_creator.AddAsKey(prefix); + key_creator.AddAsKey(bwdParams.src_dims); + key_creator.AddAsKey(static_cast(bwdParams.alg_kind)); + key_creator.AddAsKey(static_cast(bwdParams.alpha)); + key_creator.AddAsKey(static_cast(bwdParams.beta)); + key_creator.AddAsKey(static_cast(src_fmt)); + key_creator.AddAsKey(static_cast(diff_dst_fmt)); + return key_creator.GetKey(); + } + + MklPrimitive* GetEltwiseBwd(const MklEltwiseBwdParams& bwdParams, + const memory::format &src_fmt, const memory::format &diff_dst_fmt) { + std::string key = CreateKey(bwdParams, src_fmt, diff_dst_fmt); + return this->GetOp(key); + } + + void SetEltwiseBwd(const MklEltwiseBwdParams& bwdParams, + const memory::format &src_fmt, + const memory::format &diff_dst_fmt, MklPrimitive *op) { + std::string key = CreateKey(bwdParams, src_fmt, diff_dst_fmt); + this->SetOp(key, op); + } +}; + +#endif + typedef Eigen::ThreadPoolDevice CPUDevice; struct MklReluHelpers { @@ -375,55 +774,63 @@ class MklReluOpBase : public OpKernel { ~MklReluOpBase() {} explicit MklReluOpBase(OpKernelConstruction* context) : OpKernel(context) {} - virtual void Compute_Scalar(OpKernelContext* context) = 0; void Compute(OpKernelContext* context) override { try { - auto cpu_engine = engine(engine::cpu, 0); const size_t src_index = 0; // index of src input tensor const size_t dst_index = 0; // index of dst output tensor const Tensor& src_tensor = MklGetInput(context, src_index); MklDnnShape dnn_shape_src; GetMklShape(context, src_index, &dnn_shape_src); - Tensor* dst_tensor = nullptr; if (src_tensor.dims() == 0) { - Compute_Scalar(context); // scalar case doesn't use in-place operation + Compute_Scalar(context); return; } - // Create relu primitive. 
- MklDnnData src(&cpu_engine); - MklDnnData dst(&cpu_engine); - // Set DNN primitive - src + MklDnnData src(&cpu_engine); + memory::dims src_dims; memory::desc src_md({}, memory::data_undef, memory::format_undef); if (dnn_shape_src.IsMklTensor()) { src_md = dnn_shape_src.GetMklLayout(); + src_dims = dnn_shape_src.GetSizesAsMklDnnDims(); } else { - auto src_dims = TFShapeToMklDnnDims(src_tensor.shape()); + src_dims = TFShapeToMklDnnDims(src_tensor.shape()); auto src_strides = CalculateTFStrides(src_dims); // Create blocked memory descriptor src_md = MklDnnData::CreateBlockedMemDesc(src_dims, src_strides); } - src.SetUsrMem(src_md, &src_tensor); T alpha = 0, beta = 0; - std::shared_ptr relu_fwd_pd; - auto relu_fwd_desc = relu_forward::desc( - prop_kind::forward_training, - // Operator memory descriptor is same as user memory descriptor. - alg_kind, src.GetUsrMemDesc(), alpha, beta); - relu_fwd_pd.reset( - new relu_forward::primitive_desc(relu_fwd_desc, cpu_engine)); - - // allocate dst tensor + + // get a eltwise fwd from primitive pool + MklEltwiseFwdParams fwdParams(src_dims, src_md, + alg_kind, alpha, beta); + MklEltwiseFwdPrimitive *eltwise_fwd = + MklEltwiseFwdPrimitiveFactory::Get(fwdParams); + + // prepare for execuation + const T* src_data = src_tensor.flat().data(); + // check wehther src need to reorder + if (src_md.data.format != eltwise_fwd->GetSrcMemoryFormat()) { + src.SetUsrMem(src_md, &src_tensor); + auto src_target_pd = memory::primitive_desc({{src_dims}, + MklDnnType(), eltwise_fwd->GetSrcMemoryFormat()}, cpu_engine); + src.CheckReorderToOpMem(src_target_pd); + src_data = const_cast( + reinterpret_cast(src.GetOpMem().get_data_handle())); + } + + // allocate dst tensor, always set it as MKL-DNN layout + std::shared_ptr + eltwise_fwd_pd = eltwise_fwd->GetEltwiseFwdPd(); MklDnnShape dnn_shape_dst; TensorShape tf_shape_dst; if (dnn_shape_src.IsMklTensor()) { dnn_shape_dst.SetMklTensor(true); - auto dst_pd = relu_fwd_pd->dst_primitive_desc(); + auto 
dst_pd = eltwise_fwd_pd->dst_primitive_desc(); dnn_shape_dst.SetMklLayout(&dst_pd); dnn_shape_dst.SetElemType(MklDnnType()); dnn_shape_dst.SetTfLayout(dnn_shape_src.GetDimension(), @@ -434,34 +841,32 @@ class MklReluOpBase : public OpKernel { dnn_shape_dst.SetMklTensor(false); tf_shape_dst = src_tensor.shape(); } - - // Allocate output and MklDnnShape tensors separately for possible - // in-place operation + + Tensor* dst_tensor = nullptr; OP_REQUIRES_OK(context, context->forward_input_or_allocate_output( {static_cast(src_index)}, static_cast(dst_index), tf_shape_dst, &dst_tensor)); AllocateOutputSetMklShape(context, dst_index, dnn_shape_dst); - // Destination memory descriptor is same as source memory descriptor. - auto &dst_md = src_md; - dst.SetUsrMem(dst_md, dst_tensor); - - // execute net - std::vector net; - auto relu_fwd = - relu_forward(*relu_fwd_pd, src.GetOpMem(), dst.GetOpMem()); - net.push_back(relu_fwd); - stream(stream::kind::eager).submit(net).wait(); - } catch (mkldnn::error& e) { + T* dst_data = dst_tensor->flat().data(); + + // execute eltwise + eltwise_fwd->Execute(src_data, dst_data); + } catch (mkldnn::error &e) { string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + ", in file " + - string(__FILE__) + ":" + std::to_string(__LINE__); - OP_REQUIRES_OK( - context, - errors::Aborted("Operation received an exception:", error_msg)); + ", message: " + string(e.message) + + ", in file " + string(__FILE__) + ":" + + std::to_string(__LINE__); + OP_REQUIRES_OK(context, + errors::Aborted("Operation received an exception:", + error_msg)); } } + + private: + engine cpu_engine = engine(engine::cpu, 0); + std::shared_ptr relu_fwd_pd; }; template @@ -470,16 +875,15 @@ class MklReluGradOpBase : public OpKernel { ~MklReluGradOpBase() {} explicit MklReluGradOpBase(OpKernelConstruction* context) - : OpKernel(context) {} + : OpKernel(context) { + } virtual void Compute_Scalar(OpKernelContext* context) = 0; void 
Compute(OpKernelContext* context) { try { - auto cpu_engine = engine(engine::cpu, 0); MklDnnData src(&cpu_engine); MklDnnData diff_dst(&cpu_engine); - MklDnnData diff_src(&cpu_engine); const size_t diff_dst_index = 0; // index of diff_dst input tensor const size_t src_index = 1; // index of src input tensor @@ -495,37 +899,23 @@ class MklReluGradOpBase : public OpKernel { int src_dims_size = src_tensor.dims(); if (src_dims_size == 0) { - Compute_Scalar(context); // scalar case doesn't use in-place operation + Compute_Scalar(context); return; } - // Set DNN primitives for src & diff_dst + // get a eltwise bwd from primitive pool + memory::dims src_dims = {}; memory::desc src_md({}, memory::data_undef, memory::format_undef); memory::desc diff_dst_md({}, memory::data_undef, memory::format_undef); - - // For creating Sum primitive, we need to ensure that all inputs are in - // same format. What that means is if we have a mixed input case - where - // one input is in Tensorflow format and one input is in MKL format -, - // then we need to ensure that all inputs are in same format for - // primitive construction. For performance reason, we say that all inputs - // are in MKL format in such case, and insert reorder for input that is - // in Tensorflow format into MKL format. On the other hand, if both the - // inputs are in MKL format or both are in Tensorflow format, then we - // dont need reorder. if (!dnn_shape_src.IsMklTensor() && !dnn_shape_diff_dst.IsMklTensor()) { - // If both the inputs are in Tensorflow format, we create blocked memory - // descriptor. 
- auto src_dims = TFShapeToMklDnnDims(src_tensor.shape()); + src_dims = TFShapeToMklDnnDims(src_tensor.shape()); auto src_strides = CalculateTFStrides(src_dims); src_md = MklDnnData::CreateBlockedMemDesc(src_dims, src_strides); diff_dst_md = src_md; } else if (dnn_shape_src.IsMklTensor() && !dnn_shape_diff_dst.IsMklTensor()) { - // If one input is in MKL format and other is in Tensorflow, then - // create respective descriptors describing the actual case. For input - // in Mkl format, we just get Mkl layout from MklDnnShape. For input in - // Tensorflow format, we create memory descriptor using data format. src_md = dnn_shape_src.GetMklLayout(); + src_dims = dnn_shape_src.GetSizesAsMklDnnDims(); memory::format src_mkl_data_format = dnn_shape_src.GetTfDataFormat(); auto src_tf_data_format = @@ -536,26 +926,27 @@ class MklReluGradOpBase : public OpKernel { memory::desc(diff_dst_dims, MklDnnType(), src_mkl_data_format); } else if (!dnn_shape_src.IsMklTensor() && dnn_shape_diff_dst.IsMklTensor()) { - // Same comment as above. diff_dst_md = dnn_shape_diff_dst.GetMklLayout(); memory::format diff_dst_mkl_data_format = dnn_shape_diff_dst.GetTfDataFormat(); auto diff_dst_tf_data_format = MklDnnDataFormatToTFDataFormat(diff_dst_mkl_data_format); - auto src_dims = TFShapeToMklDnnDimsInNCHW(src_tensor.shape(), - diff_dst_tf_data_format); + + src_dims = (src_tensor.dims() == 4) + ? TFShapeToMklDnnDimsInNCHW(src_tensor.shape(), + diff_dst_tf_data_format) + : TFShapeToMklDnnDimsInNCDHW(src_tensor.shape(), + diff_dst_tf_data_format); src_md = memory::desc(src_dims, MklDnnType(), diff_dst_mkl_data_format); } else { - // If both the inputs are in MKL format, we use Mkl layout of the input - // tensors. 
src_md = dnn_shape_src.GetMklLayout(); diff_dst_md = dnn_shape_diff_dst.GetMklLayout(); + src_dims = dnn_shape_src.GetSizesAsMklDnnDims(); } - src.SetUsrMem(src_md, &src_tensor); - diff_dst.SetUsrMem(diff_dst_md, &diff_dst_tensor); + T alpha = 0, beta = 0; // As per comment above, we tell MKLDNN that both the inputs are in same // format. So we set common memory descriptor in MKL format, if any of the @@ -570,24 +961,38 @@ class MklReluGradOpBase : public OpKernel { common_md = src_md; } - T alpha = 0, beta = 0; - std::shared_ptr relu_fwd_pd; - auto relu_fwd_desc = relu_forward::desc(prop_kind::forward_training, - alg_kind, src_md, alpha, beta); - relu_fwd_pd.reset( - new relu_forward::primitive_desc(relu_fwd_desc, cpu_engine)); - auto relu_bwd_desc = - relu_backward::desc(alg_kind, common_md, common_md, alpha, beta); - auto relu_bwd_pd = relu_backward::primitive_desc( - relu_bwd_desc, cpu_engine, *relu_fwd_pd); + MklEltwiseBwdParams bwdParams(src_dims, common_md, + alg_kind, alpha, beta); + MklEltwiseBwdPrimitive *eltwise_bwd = + MklEltwiseBwdPrimitiveFactory::Get(bwdParams); + auto eltwise_bwd_pd = eltwise_bwd->GetEltwiseBwdPd(); + + // check whether need reorder for src / diff_dst + const T* src_data = src_tensor.flat().data(); + if (src_md.data.format != eltwise_bwd->GetSrcMemoryFormat()) { + src.SetUsrMem(src_md, &src_tensor); + src.CheckReorderToOpMem( + eltwise_bwd_pd.get()->diff_src_primitive_desc()); + src_data = const_cast( + reinterpret_cast(src.GetOpMem().get_data_handle())); + } + + const T* diff_dst_data = diff_dst_tensor.flat().data(); + if (diff_dst_md.data.format != eltwise_bwd->GetDiffDstMemoryFormat()) { + diff_dst.SetUsrMem(diff_dst_md, &diff_dst_tensor); + diff_dst.CheckReorderToOpMem( + eltwise_bwd_pd.get()->diff_src_primitive_desc()); + diff_dst_data = const_cast( + reinterpret_cast(diff_dst.GetOpMem().get_data_handle())); + } // allocate diff_src tensor MklDnnShape dnn_shape_diff_src; TensorShape tf_shape_diff_src; if 
(dnn_shape_src.IsMklTensor() || dnn_shape_diff_dst.IsMklTensor()) { + auto diff_src_pd = eltwise_bwd_pd->diff_src_primitive_desc(); dnn_shape_diff_src.SetMklTensor(true); - auto diff_src_pd = relu_bwd_pd.diff_src_primitive_desc(); dnn_shape_diff_src.SetMklLayout(&diff_src_pd); dnn_shape_diff_src.SetElemType(MklDnnType()); if (dnn_shape_src.IsMklTensor()) { @@ -602,51 +1007,32 @@ class MklReluGradOpBase : public OpKernel { tf_shape_diff_src.AddDim(diff_src_pd.get_size() / sizeof(T)); } else { dnn_shape_diff_src.SetMklTensor(false); - // both src and diff_dst are TensorFlow layout, - // so it is ok to get TensorFlow shape. tf_shape_diff_src = src_tensor.shape(); } - // Allocate diff_src and MklDnnShape tensors separately for possible - // in-place operation - OP_REQUIRES_OK(context, context->forward_input_or_allocate_output( - {static_cast(diff_dst_index)}, - static_cast(diff_src_index), - tf_shape_diff_src, - &diff_src_tensor)); - AllocateOutputSetMklShape(context, diff_src_index, dnn_shape_diff_src); - - // diff_src memory descriptor is same as memory descriptor for both - // inputs. 
- diff_src.SetUsrMem(common_md, diff_src_tensor); - - PrepareAndExecuteNet(relu_bwd_pd, &src, &diff_src, &diff_dst); - } catch (mkldnn::error& e) { - string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + ", in file " + - string(__FILE__) + ":" + std::to_string(__LINE__); - OP_REQUIRES_OK( - context, - errors::Aborted("Operation received an exception:", error_msg)); + OP_REQUIRES_OK(context, context->forward_input_or_allocate_output( + {diff_dst_index}, diff_src_index, tf_shape_diff_src, + &diff_src_tensor)); + AllocateOutputSetMklShape(context, diff_src_index, dnn_shape_diff_src); + + T* diff_src_data = diff_src_tensor->flat().data(); + + // execute eltwise bwd + eltwise_bwd->Execute(src_data, diff_dst_data, diff_src_data); + } catch (mkldnn::error &e) { + string error_msg = "Status: " + std::to_string(e.status) + + ", message: " + string(e.message) + + ", in file " + string(__FILE__) + ":" + + std::to_string(__LINE__); + OP_REQUIRES_OK(context, + errors::Aborted("Operation received an exception:", + error_msg)); } } - void PrepareAndExecuteNet(const relu_backward::primitive_desc& relu_prim_desc, - MklDnnData* src, MklDnnData* diff_src, - MklDnnData* diff_dst) { - std::vector net; - - // Check if we need to reorder original input tensors into common_md layout - // that we set for primitive creation. diff_src_primitive_desc is same as - // common_md. 
- src->CheckReorderToOpMem(relu_prim_desc.diff_src_primitive_desc(), &net); - diff_dst->CheckReorderToOpMem(relu_prim_desc.diff_src_primitive_desc(), - &net); - - net.push_back(relu_backward(relu_prim_desc, src->GetOpMem(), - diff_dst->GetOpMem(), diff_src->GetOpMem())); - stream(stream::kind::eager).submit(net).wait(); - } + private: + engine cpu_engine = engine(engine::cpu, 0); + std::shared_ptr relu_fwd_pd; }; template -- GitLab From 2459fd5f01b2a135335b588803fd8946ea761387 Mon Sep 17 00:00:00 2001 From: Guozhong Zhuang Date: Fri, 17 Aug 2018 14:35:32 -0700 Subject: [PATCH 057/598] disable primitive reuse for conv2d fwd/bwd (some cases) to avoid big memory caching --- .../core/kernels/mkl_conv_grad_filter_ops.cc | 32 ++++++++++---- .../core/kernels/mkl_conv_grad_input_ops.cc | 43 +++++++++++++------ tensorflow/core/kernels/mkl_conv_ops.cc | 41 +++++++++++++----- tensorflow/core/util/mkl_util.h | 41 ++++++++++++++++-- 4 files changed, 123 insertions(+), 34 deletions(-) diff --git a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc index afbfaa83f3..701124f3a9 100644 --- a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc @@ -300,19 +300,24 @@ template class MklConvBwdFilterPrimitiveFactory : public MklPrimitiveFactory { public: static MklConvBwdFilterPrimitive* Get( - const MklConvBwdFilterParams& convBwdFilterDims) { + const MklConvBwdFilterParams& convBwdFilterDims, bool not_cache) { MklConvBwdFilterPrimitive* conv_bwd_filter = nullptr; - // look into the pool for reusable primitive - conv_bwd_filter = dynamic_cast*>( + if (not_cache) { /* Create new primitive always */ + conv_bwd_filter = new MklConvBwdFilterPrimitive(convBwdFilterDims); + } else { + // look into the pool for reusable primitive + conv_bwd_filter = dynamic_cast*> ( MklConvBwdFilterPrimitiveFactory::GetInstance().GetConvBwdFilter( convBwdFilterDims)); - if (conv_bwd_filter == 
nullptr) { - conv_bwd_filter = new MklConvBwdFilterPrimitive(convBwdFilterDims); - MklConvBwdFilterPrimitiveFactory::GetInstance().SetConvBwdFilter( - convBwdFilterDims, conv_bwd_filter); + if (conv_bwd_filter == nullptr) { + conv_bwd_filter = new MklConvBwdFilterPrimitive(convBwdFilterDims); + MklConvBwdFilterPrimitiveFactory::GetInstance().SetConvBwdFilter( + convBwdFilterDims, conv_bwd_filter); + } } + return conv_bwd_filter; } @@ -845,8 +850,13 @@ class MklConvCustomBackpropFilterOp MklConvBwdFilterParams convBwdFilterDims(fwd_src_dims, fwd_filter_dims, diff_bias_dims, diff_dst_dims, strides, dilations, padding_left, padding_right, TFPaddingToMklDnnPadding(this->padding_)); - conv_bwd_filter = - MklConvBwdFilterPrimitiveFactory::Get(convBwdFilterDims); + + // MKL DNN allocates large buffers when a conv gradient filter primtive is + // created. So we don't cache conv backward primitives when the env + // variable TF_MKL_OPTIMIZE_PRIMITVE_MEMUSE is set to true. + not_cache_ = MklPrimitiveFactory::IsPrimitiveMemOptEnabled(); + conv_bwd_filter = MklConvBwdFilterPrimitiveFactory::Get( + convBwdFilterDims, not_cache_); auto bwd_filter_pd = conv_bwd_filter->GetPrimitiveDesc(); // allocate output tensors: diff_fitler and diff_bias (w bias) @@ -938,6 +948,9 @@ class MklConvCustomBackpropFilterOp if (diff_filter_reorder_required) { diff_filter.InsertReorderToUserMem(); } + + // delete primitive since it is not cached. + if (not_cache_) delete conv_bwd_filter; } catch (mkldnn::error& e) { string error_msg = "Status: " + std::to_string(e.status) + ", message: " + string(e.message) + ", in file " + @@ -953,6 +966,7 @@ class MklConvCustomBackpropFilterOp const int kInputIndex_InputSizes = 0; const int kDilationH = 0, kDilationW = 1; engine cpu_engine_ = engine(engine::cpu, 0); + bool not_cache_; // Validate input shapes. // Function asserts that input shapes are valid. 
diff --git a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc index b5a98301e2..9372a24ef3 100644 --- a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc @@ -174,7 +174,6 @@ class MklConvBwdInputPrimitive : public MklPrimitive { } }; - void Setup(const MklConvBwdInputParams& convBwdInputDims) { // create memory descriptors for convolution data w/ no specified format context_.diff_src_md.reset(new memory::desc( @@ -242,19 +241,23 @@ class MklConvBwdInputPrimitiveFactory : public MklPrimitiveFactory { public: static MklConvBwdInputPrimitive* Get( - const MklConvBwdInputParams& convBwdInputDims) { + const MklConvBwdInputParams& convBwdInputDims, bool not_cache) { MklConvBwdInputPrimitive* conv_bwd_input = nullptr; - // look into the pool for reusable primitive - conv_bwd_input = dynamic_cast*>( - MklConvBwdInputPrimitiveFactory::GetInstance().GetConvBwdInput( - convBwdInputDims)); - - if (conv_bwd_input == nullptr) { + if (not_cache) { /* Always allocate primitive */ conv_bwd_input = new MklConvBwdInputPrimitive(convBwdInputDims); - MklConvBwdInputPrimitiveFactory::GetInstance().SetConvBwdInput( - convBwdInputDims, conv_bwd_input); + } else { + // look into the pool for reusable primitive + conv_bwd_input = dynamic_cast*>( + MklConvBwdInputPrimitiveFactory::GetInstance().GetConvBwdInput( + convBwdInputDims)); + if (conv_bwd_input == nullptr) { + conv_bwd_input = new MklConvBwdInputPrimitive(convBwdInputDims); + MklConvBwdInputPrimitiveFactory::GetInstance().SetConvBwdInput( + convBwdInputDims, conv_bwd_input); + } } + return conv_bwd_input; } @@ -708,8 +711,18 @@ class MklConvCustomBackpropInputOp : public MklConvBackpropCommonOp { MklConvBwdInputParams convBwdInputDims(fwd_src_dims, fwd_filter_dims, diff_dst_dims, strides, dilations, padding_left, padding_right, TFPaddingToMklDnnPadding(this->padding_)); - conv_bwd_input = - 
MklConvBwdInputPrimitiveFactory::Get(convBwdInputDims); + + // We don't cache those primitves if the env variable + // TF_MKL_OPTIMIZE_PRIMITVE_MEMUSE is true and if primitve descriptor + // includes potentialy large buffers. MKL DNN allocates buffers + // in the following cases + // 1. Legacy CPU without AVX512/AVX2, or + // 2. 1x1 convolution with stride != 1 + not_cache_ = MklPrimitiveFactory::IsPrimitiveMemOptEnabled() && + (MklPrimitiveFactory::IsLegacyPlatform() || + IsConv1x1StrideNot1(fwd_filter_dims, strides)); + conv_bwd_input = MklConvBwdInputPrimitiveFactory::Get(convBwdInputDims, + not_cache_); auto bwd_input_pd = conv_bwd_input->GetPrimitiveDesc(); // allocate output tensor @@ -755,6 +768,11 @@ class MklConvCustomBackpropInputOp : public MklConvBackpropCommonOp { // execute convolution input bwd conv_bwd_input->Execute(diff_src_data, filter_data, diff_dst_data); + + // delete primitive since it is not cached. + if (not_cache_) { + delete conv_bwd_input; + } } catch (mkldnn::error& e) { string error_msg = "Status: " + std::to_string(e.status) + ", message: " + string(e.message) + ", in file " + @@ -769,6 +787,7 @@ class MklConvCustomBackpropInputOp : public MklConvBackpropCommonOp { const int kInputIndex_Filter = 1, kInputIndex_InputSizes = 0; const int kDilationH = 0, kDilationW = 1; engine cpu_engine = engine(engine::cpu, 0); + bool not_cache_; // Validate input shapes. // Function asserts that input shapes are valid. 
diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc index c6295c7280..a5763e4b74 100644 --- a/tensorflow/core/kernels/mkl_conv_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_ops.cc @@ -271,18 +271,23 @@ class MklConvFwdPrimitive : public MklPrimitive { template class MklConvFwdPrimitiveFactory : public MklPrimitiveFactory { public: - static MklConvFwdPrimitive* Get(const MklConvFwdParams& convFwdDims) { + static MklConvFwdPrimitive* Get(const MklConvFwdParams& convFwdDims, + bool not_cache) { MklConvFwdPrimitive* conv_fwd = nullptr; - // try to find a suitable one in pool - conv_fwd = dynamic_cast*>( - MklConvFwdPrimitiveFactory::GetInstance().GetConvFwd(convFwdDims)); - - if (conv_fwd == nullptr) { + if (not_cache) { /* Always create new primitive */ conv_fwd = new MklConvFwdPrimitive(convFwdDims); - MklConvFwdPrimitiveFactory::GetInstance().SetConvFwd(convFwdDims, - conv_fwd); + } else { + // try to find a suitable one in pool + conv_fwd = dynamic_cast*>( + MklConvFwdPrimitiveFactory::GetInstance().GetConvFwd(convFwdDims)); + if (conv_fwd == nullptr) { + conv_fwd = new MklConvFwdPrimitive(convFwdDims); + MklConvFwdPrimitiveFactory::GetInstance().SetConvFwd(convFwdDims, + conv_fwd); + } } + return conv_fwd; } @@ -894,6 +899,16 @@ class MklConvOp : public OpKernel { // MKLDNN dilation starts from 0. for (int i = 0; i < dilations.size(); i++) dilations[i] -= 1; + // In some cases, primitve descriptor includes potentialy large buffers, + // we don't cache those primitves if the env variable + // TF_MKL_OPTIMIZE_PRIMITVE_MEMUSE is true. MKL DNN allocates buffers + // in the following cases + // 1. Legacy CPU without AVX512/AVX2, or + // 2. 
1x1 convolution with stride != 1 + not_cache_ = MklPrimitiveFactory::IsPrimitiveMemOptEnabled() && + (MklPrimitiveFactory::IsLegacyPlatform() || + IsConv1x1StrideNot1(filter_dims, strides)); + // get a conv2d fwd from primitive pool MklConvFwdPrimitive* conv_fwd = nullptr; if (biasEnabled) { @@ -902,12 +917,14 @@ class MklConvOp : public OpKernel { MklConvFwdParams convFwdDims(src_dims, filter_dims, bias_dims, dst_dims_mkl_order, strides, dilations, padding_left, padding_right); - conv_fwd = MklConvFwdPrimitiveFactory::Get(convFwdDims); + conv_fwd = MklConvFwdPrimitiveFactory::Get( + convFwdDims, not_cache_); } else { MklConvFwdParams convFwdDims(src_dims, filter_dims, NONE_DIMS, dst_dims_mkl_order, strides, dilations, padding_left, padding_right); - conv_fwd = MklConvFwdPrimitiveFactory::Get(convFwdDims); + conv_fwd = MklConvFwdPrimitiveFactory::Get( + convFwdDims, not_cache_); } // allocate output tensors output_tensor and filter_out_tensor @@ -952,6 +969,9 @@ class MklConvOp : public OpKernel { } else { conv_fwd->Execute(src_data, filter_data, dst_data); } + + // delete primitive since it is not cached. + if (not_cache_) delete conv_fwd; } catch (mkldnn::error &e) { string error_msg = tensorflow::strings::StrCat( "Status: ", e.status, ", message: ", string(e.message), ", in file ", @@ -970,6 +990,7 @@ class MklConvOp : public OpKernel { const int kOutputIndex_Dst = 0, kOutputIndex_Filter = 1; const int kDilationH = 0, kDilationW = 1; engine cpu_engine = engine(engine::cpu, 0); + bool not_cache_; // Allocate output tensor. void AllocateOutputTensor( diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h index 422be9356d..ee02debddf 100644 --- a/tensorflow/core/util/mkl_util.h +++ b/tensorflow/core/util/mkl_util.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_CORE_UTIL_MKL_UTIL_H_ #ifdef INTEL_MKL +#include #include #include #include @@ -32,6 +33,12 @@ limitations under the License. 
#error "at most one of INTEL_MKL_ML_ONLY and INTEL_MKL_DNN_ONLY may be defined" #endif +#ifdef INTEL_MKL_ML_ONLY +// Using pragma message since #warning doesn't work with all compilers +#pragma message("Compiling for INTEL MKL ML only will be deprecated soon.") +#pragma message("Please use MKL DNN (the default option for --config=mkl)") +#endif + #ifdef INTEL_MKL_ML_ONLY #include "mkl_dnn.h" #include "mkl_dnn_types.h" @@ -50,6 +57,7 @@ limitations under the License. #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/util/padding.h" #include "tensorflow/core/util/tensor_format.h" +#include "tensorflow/core/util/env_var.h" #ifndef INTEL_MKL_ML_ONLY #include "mkldnn.hpp" @@ -1994,7 +2002,9 @@ const mkldnn::memory::dims NONE_DIMS = {}; template class MklPrimitiveFactory { public: - MklPrimitiveFactory() {} + MklPrimitiveFactory() { + } + ~MklPrimitiveFactory() {} MklPrimitive* GetOp(const string& key) { @@ -2017,6 +2027,22 @@ class MklPrimitiveFactory { map[key] = op; } + /// Function to decide whether HW has AVX512 or AVX2 + /// For those legacy device(w/o AVX512 and AVX2), + /// MKL-DNN GEMM will be used. 
+ static inline bool IsLegacyPlatform() { + return (!port::TestCPUFeature(port::CPUFeature::AVX512F) + && !port::TestCPUFeature(port::CPUFeature::AVX2)); + } + + /// Fuction to check whether primitive memory optimization is enabled + static inline bool IsPrimitiveMemOptEnabled() { + bool is_primitive_mem_opt_enabled = true; + TF_CHECK_OK(ReadBoolFromEnvVar("TF_MKL_OPTIMIZE_PRIMITVE_MEMUSE", true, + &is_primitive_mem_opt_enabled)); + return is_primitive_mem_opt_enabled; + } + private: static inline std::unordered_map& GetHashMap() { static thread_local std::unordered_map map_; @@ -2089,7 +2115,7 @@ class MklReorderPrimitive : public MklPrimitive { context_.dst_mem->set_data_handle(to->get_data_handle()); } - private: + private: struct ReorderContext { std::shared_ptr src_mem; std::shared_ptr dst_mem; @@ -2131,7 +2157,7 @@ class MklReorderPrimitiveFactory : public MklPrimitiveFactory { return instance_; } - private: + private: MklReorderPrimitiveFactory() {} ~MklReorderPrimitiveFactory() {} @@ -2176,6 +2202,15 @@ inline primitive FindOrCreateReorder(const memory* from, const memory* to) { return *reorder_prim->GetPrimitive(); } +// utility function to determine if it is conv 1x1 and stride != 1 +// for purpose of temporarily disabling primitive reuse +inline bool IsConv1x1StrideNot1(memory::dims filter_dims, memory::dims strides) { + if (filter_dims.size() != 4 || strides.size() != 2) return false; + + return ((filter_dims[2] == 1) && (filter_dims[3] == 1) && + ((strides[0] != 1) || (strides[1] != 1))); +} + #endif // INTEL_MKL_DNN } // namespace tensorflow -- GitLab From 6a90dce2a099d4dba9cf9ef067c1520ecd8cd00c Mon Sep 17 00:00:00 2001 From: HuiyangFei Date: Fri, 17 Aug 2018 16:17:51 -0700 Subject: [PATCH 058/598] upgrade mkldnn to 0.16 for bazel build --- tensorflow/workspace.bzl | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 1847335656..c43575f970 100644 
--- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -60,31 +60,31 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): mkl_repository( name = "mkl_linux", urls = [ - "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.15/mklml_lnx_2018.0.3.20180406.tgz", - "https://github.com/intel/mkl-dnn/releases/download/v0.15/mklml_lnx_2018.0.3.20180406.tgz", + "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.16/mklml_lnx_2019.0.20180710.tgz", + "https://github.com/intel/mkl-dnn/releases/download/v0.16/mklml_lnx_2019.0.20180710.tgz", ], - sha256 = "d2305244fdc9b87db7426ed4496e87a4b3977ad3374d73b8000e8b7a5b7aa725", - strip_prefix = "mklml_lnx_2018.0.3.20180406", + sha256 = "e2233534a9d15c387e22260997af4312a39e9f86f791768409be273b5453c4e6", + strip_prefix = "mklml_lnx_2019.0.20180710", build_file = clean_dep("//third_party/mkl:mkl.BUILD"), ) mkl_repository( name = "mkl_windows", urls = [ - "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.15/mklml_win_2018.0.3.20180406.zip", - "https://github.com/intel/mkl-dnn/releases/download/v0.15/mklml_win_2018.0.3.20180406.zip", + "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.16/mklml_win_2019.0.20180710.zip", + "https://github.com/intel/mkl-dnn/releases/download/v0.16/mklml_win_2019.0.20180710.zip", ], - sha256 = "a584a5bf1c8d2ad70b90d12b52652030e9a338217719064fdb84b7ad0d693694", - strip_prefix = "mklml_win_2018.0.3.20180406", + sha256 = "3fdcff17b018a0082491adf3ba143358265336a801646e46e0191ec8d58d24a2", + strip_prefix = "mklml_win_2019.0.20180710", build_file = clean_dep("//third_party/mkl:mkl.BUILD"), ) mkl_repository( name = "mkl_darwin", urls = [ - "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.15/mklml_mac_2018.0.3.20180406.tgz", - "https://github.com/intel/mkl-dnn/releases/download/v0.15/mklml_mac_2018.0.3.20180406.tgz", + 
"https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.16/mklml_mac_2019.0.20180710.tgz", + "https://github.com/intel/mkl-dnn/releases/download/v0.16/mklml_mac_2019.0.20180710.tgz", ], - sha256 = "094e3dfd61c816136dc8d12a45cc611ce26c5f4828176a3644cd0b0efa15a25b", - strip_prefix = "mklml_mac_2018.0.3.20180406", + sha256 = "411a30014a938eb83fb9f37b3dbe8e371b106fc1dd621fc23123cadc72737ce6", + strip_prefix = "mklml_mac_2019.0.20180710", build_file = clean_dep("//third_party/mkl:mkl.BUILD"), ) @@ -95,11 +95,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): tf_http_archive( name = "mkl_dnn", urls = [ - "https://mirror.bazel.build/github.com/intel/mkl-dnn/archive/0c1cf54b63732e5a723c5670f66f6dfb19b64d20.tar.gz", - "https://github.com/intel/mkl-dnn/archive/0c1cf54b63732e5a723c5670f66f6dfb19b64d20.tar.gz", + "https://mirror.bazel.build/github.com/intel/mkl-dnn/archive/4e333787e0d66a1dca1218e99a891d493dbc8ef1.tar.gz", + "https://github.com/intel/mkl-dnn/archive/4e333787e0d66a1dca1218e99a891d493dbc8ef1.tar.gz", ], - sha256 = "da1f27f92453a65331197dd8e4992e810fb7b1c4e0b902a1da5611592df2b633", - strip_prefix = "mkl-dnn-0c1cf54b63732e5a723c5670f66f6dfb19b64d20", + sha256 = "363cc9239eacf8e7917753c6d8c94f767e4cd049160d0654a61ef32d5e1b3049", + strip_prefix = "mkl-dnn-4e333787e0d66a1dca1218e99a891d493dbc8ef1", build_file = clean_dep("//third_party/mkl_dnn:mkldnn.BUILD"), ) -- GitLab From c4858c15110286b1afd091c70ab4d99549b2e856 Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Sat, 18 Aug 2018 10:01:17 +0200 Subject: [PATCH 059/598] [tfgan] Respect use_loss_summaries in GANEstimator Since the refactor done in 47dea684efa41981e10299c2737317c504ce41af the `use_loss_summaries` argument of GANEstimator isn't respected anymore. This PR restores the original behavior and passes `use_loss_summaries` down to the loss functions. 
--- .../gan/python/estimator/python/gan_estimator_impl.py | 10 ++++++---- .../gan/python/estimator/python/gan_estimator_test.py | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py index 8e4affb9b4..3dd066a406 100644 --- a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py +++ b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py @@ -187,7 +187,7 @@ class GANEstimator(estimator.Estimator): return _get_estimator_spec( mode, gan_model, generator_loss_fn, discriminator_loss_fn, get_eval_metric_ops_fn, generator_optimizer, discriminator_optimizer, - get_hooks_fn) + get_hooks_fn, use_loss_summaries) super(GANEstimator, self).__init__( model_fn=_model_fn, model_dir=model_dir, config=config) @@ -214,15 +214,17 @@ def _get_gan_model( def _get_estimator_spec( mode, gan_model, generator_loss_fn, discriminator_loss_fn, get_eval_metric_ops_fn, generator_optimizer, discriminator_optimizer, - get_hooks_fn=None): + get_hooks_fn=None, use_loss_summaries=True): """Get the EstimatorSpec for the current mode.""" if mode == model_fn_lib.ModeKeys.PREDICT: estimator_spec = model_fn_lib.EstimatorSpec( mode=mode, predictions=gan_model.generated_data) else: gan_loss = tfgan_tuples.GANLoss( - generator_loss=generator_loss_fn(gan_model), - discriminator_loss=discriminator_loss_fn(gan_model)) + generator_loss=generator_loss_fn( + gan_model, add_summaries=use_loss_summaries), + discriminator_loss=discriminator_loss_fn( + gan_model, add_summaries=use_loss_summaries)) if mode == model_fn_lib.ModeKeys.EVAL: estimator_spec = _get_eval_estimator_spec( gan_model, gan_loss, get_eval_metric_ops_fn) diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py index 9ac9c6ca9c..83f8dd641f 100644 --- 
a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py +++ b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py @@ -116,7 +116,7 @@ def get_dummy_gan_model(): discriminator_fn=None) -def dummy_loss_fn(gan_model): +def dummy_loss_fn(gan_model, add_summaries=True): return math_ops.reduce_sum(gan_model.discriminator_real_outputs - gan_model.discriminator_gen_outputs) -- GitLab From 7a3ab5d6201b467c783e8d44e0b9180624e0dfbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?= Date: Sat, 18 Aug 2018 17:30:25 +0800 Subject: [PATCH 060/598] TST: add test case for partial shape --- .../kernel_tests/batch_dataset_op_test.py | 137 ++++++++++++++++-- 1 file changed, 128 insertions(+), 9 deletions(-) diff --git a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py index 42adfd17f0..ebc5160408 100644 --- a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py @@ -720,11 +720,47 @@ class RestructuredDatasetTest(test.TestCase): def test_assert_element_shape(self): + def create_dataset(_): + return (array_ops.ones(2, dtype=dtypes.float32), + array_ops.zeros((3, 4), dtype=dtypes.int32)) + + dataset = dataset_ops.Dataset.range(5).map(create_dataset) + expected_shapes = (tensor_shape.TensorShape(2), + tensor_shape.TensorShape((3, 4))) + self.assertEqual(expected_shapes, dataset.output_shapes) + + result = dataset.apply(batching.assert_element_shape(expected_shapes)) + self.assertEqual(expected_shapes, result.output_shapes) + + iterator = result.make_initializable_iterator() + init_op = iterator.initializer + get_next = iterator.get_next() + with self.test_session() as sess: + sess.run(init_op) + for _ in range(5): + sess.run(get_next) + with self.assertRaises(errors.OutOfRangeError): + sess.run(get_next) + + def 
test_assert_wrong_element_shape(self): + + def create_dataset(_): + return (array_ops.ones(2, dtype=dtypes.float32), + array_ops.zeros((3, 4), dtype=dtypes.int32)) + + dataset = dataset_ops.Dataset.range(3).map(create_dataset) + wrong_shapes = (tensor_shape.TensorShape(2), + tensor_shape.TensorShape((3, 10))) + with self.assertRaises(ValueError): + dataset.apply(batching.assert_element_shape(wrong_shapes)) + + def test_assert_element_shape_on_unknown_shape_dataset(self): + def create_unknown_shape_dataset(x): return script_ops.py_func( lambda _: ( # pylint: disable=g-long-lambda - np.ones(2, dtype=np.float32), - np.zeros((3, 4), dtype=np.int32)), + np.ones(2, dtype=np.float32), + np.zeros((3, 4), dtype=np.int32)), [x], [dtypes.float32, dtypes.int32]) @@ -748,7 +784,60 @@ class RestructuredDatasetTest(test.TestCase): with self.assertRaises(errors.OutOfRangeError): sess.run(get_next) - def test_assert_wrong_element_shape(self): + def test_assert_wrong_element_shape_on_unknown_shape_dataset(self): + + def create_unknown_shape_dataset(x): + return script_ops.py_func( + lambda _: ( # pylint: disable=g-long-lambda + np.ones(2, dtype=np.float32), + np.zeros((3, 4), dtype=np.int32)), + [x], + [dtypes.float32, dtypes.int32]) + + dataset = dataset_ops.Dataset.range(3).map(create_unknown_shape_dataset) + unknown_shapes = (tensor_shape.TensorShape(None), + tensor_shape.TensorShape(None)) + self.assertEqual(unknown_shapes, dataset.output_shapes) + + wrong_shapes = (tensor_shape.TensorShape(2), + tensor_shape.TensorShape((3, 10))) + iterator = ( + dataset.apply(batching.assert_element_shape(wrong_shapes)) + .make_initializable_iterator()) + init_op = iterator.initializer + get_next = iterator.get_next() + with self.test_session() as sess: + sess.run(init_op) + with self.assertRaises(errors.InvalidArgumentError): + sess.run(get_next) + + def test_assert_partial_element_shape(self): + + def create_dataset(_): + return (array_ops.ones(2, dtype=dtypes.float32), + array_ops.zeros((3, 
4), dtype=dtypes.int32)) + + dataset = dataset_ops.Dataset.range(5).map(create_dataset) + partial_expected_shape = (tensor_shape.TensorShape(None), # Unknown shape + tensor_shape.TensorShape((None, 4))) # Partial shape + result = dataset.apply( + batching.assert_element_shape(partial_expected_shape)) + # Partial shapes are merged with actual shapes: + actual_shapes = (tensor_shape.TensorShape(2), + tensor_shape.TensorShape((3, 4))) + self.assertEqual(actual_shapes, result.output_shapes) + + iterator = result.make_initializable_iterator() + init_op = iterator.initializer + get_next = iterator.get_next() + with self.test_session() as sess: + sess.run(init_op) + for _ in range(5): + sess.run(get_next) + with self.assertRaises(errors.OutOfRangeError): + sess.run(get_next) + + def test_assert_wrong_partial_element_shape(self): def create_dataset(_): return (array_ops.ones(2, dtype=dtypes.float32), @@ -756,17 +845,47 @@ class RestructuredDatasetTest(test.TestCase): dataset = dataset_ops.Dataset.range(3).map(create_dataset) wrong_shapes = (tensor_shape.TensorShape(2), - tensor_shape.TensorShape((3, 10))) + tensor_shape.TensorShape((None, 10))) with self.assertRaises(ValueError): dataset.apply(batching.assert_element_shape(wrong_shapes)) - def test_assert_wrong_element_shape_on_unknown_shape_dataset(self): + def test_assert_partial_element_shape_on_unknown_shape_dataset(self): + + def create_unknown_shape_dataset(x): + return script_ops.py_func( + lambda _: ( # pylint: disable=g-long-lambda + np.ones(2, dtype=np.float32), + np.zeros((3, 4), dtype=np.int32)), + [x], + [dtypes.float32, dtypes.int32]) + + dataset = dataset_ops.Dataset.range(5).map(create_unknown_shape_dataset) + unknown_shapes = (tensor_shape.TensorShape(None), + tensor_shape.TensorShape(None)) + self.assertEqual(unknown_shapes, dataset.output_shapes) + + expected_shapes = (tensor_shape.TensorShape(2), + tensor_shape.TensorShape((None, 4))) + result = 
dataset.apply(batching.assert_element_shape(expected_shapes)) + self.assertEqual(expected_shapes, result.output_shapes) + + iterator = result.make_initializable_iterator() + init_op = iterator.initializer + get_next = iterator.get_next() + with self.test_session() as sess: + sess.run(init_op) + for _ in range(5): + sess.run(get_next) + with self.assertRaises(errors.OutOfRangeError): + sess.run(get_next) + + def test_assert_wrong_partial_element_shape_on_unknown_shape_dataset(self): def create_unknown_shape_dataset(x): return script_ops.py_func( lambda _: ( # pylint: disable=g-long-lambda - np.ones(2, dtype=np.float32), - np.zeros((3, 4), dtype=np.int32)), + np.ones(2, dtype=np.float32), + np.zeros((3, 4), dtype=np.int32)), [x], [dtypes.float32, dtypes.int32]) @@ -776,9 +895,9 @@ class RestructuredDatasetTest(test.TestCase): self.assertEqual(unknown_shapes, dataset.output_shapes) wrong_shapes = (tensor_shape.TensorShape(2), - tensor_shape.TensorShape((3, 10))) + tensor_shape.TensorShape((None, 10))) iterator = ( - dataset.apply(batching.assert_element_shape(wrong_shapes)) + dataset.apply(batching.assert_element_shape(wrong_shapes)) .make_initializable_iterator()) init_op = iterator.initializer get_next = iterator.get_next() -- GitLab From 46522659b41d7c2fe93fec54feb93a8e4b56505d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?= Date: Sat, 18 Aug 2018 17:30:54 +0800 Subject: [PATCH 061/598] BUG: fix for partial shape --- .../contrib/data/python/ops/batching.py | 2 +- .../framework/python/framework/tensor_util.py | 33 +++++++++++++++---- 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/tensorflow/contrib/data/python/ops/batching.py b/tensorflow/contrib/data/python/ops/batching.py index 9f059942a6..a9535d9b83 100644 --- a/tensorflow/contrib/data/python/ops/batching.py +++ b/tensorflow/contrib/data/python/ops/batching.py @@ -668,7 +668,7 @@ def assert_element_shape(expected_shapes): flatten_tensors = 
nest.flatten(elements) flatten_shapes = nest.flatten(expected_shapes) checked_tensors = [ - with_shape(shape, tensor) + with_shape(shape, tensor) if shape else tensor # Overview unknown shape for shape, tensor in zip(flatten_shapes, flatten_tensors) ] return nest.pack_sequence_as(elements, checked_tensors) diff --git a/tensorflow/contrib/framework/python/framework/tensor_util.py b/tensorflow/contrib/framework/python/framework/tensor_util.py index 4e6eea8884..c8fc1789c7 100644 --- a/tensorflow/contrib/framework/python/framework/tensor_util.py +++ b/tensorflow/contrib/framework/python/framework/tensor_util.py @@ -23,6 +23,7 @@ import numpy as np from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor +from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import check_ops @@ -129,10 +130,25 @@ def remove_squeezable_dimensions(predictions, labels, name=None): return predictions, labels -def _all_equal(tensor0, tensor1): - with ops.name_scope('all_equal', values=[tensor0, tensor1]) as scope: +def _shape_tensor_equal(expected_shape, actual_shape): + """Returns whether actual_shape is equal to expected_shape. + + Note that -1 in `expected_shape` is recognized as unknown dimension. + + Args: + expected_shape: Integer list defining the expected shape, or tensor of same. + actual_shape: Shape of the tensor to test. + Returns: + New tensor. 
+ """ + with ops.name_scope('shape_tensor_equal', + values=[expected_shape, actual_shape]) as scope: return math_ops.reduce_all( - math_ops.equal(tensor0, tensor1, name='equal'), name=scope) + math_ops.logical_or( + math_ops.equal(expected_shape, -1), + math_ops.equal(expected_shape, actual_shape, 'equal'), + name='exclude_partial_shape'), + name=scope) def _is_rank(expected_rank, actual_tensor): @@ -153,6 +169,8 @@ def _is_rank(expected_rank, actual_tensor): def _is_shape(expected_shape, actual_tensor, actual_shape=None): """Returns whether actual_tensor's shape is expected_shape. + Note that -1 in `expected_shape` is recognized as unknown dimension. + Args: expected_shape: Integer list defining the expected shape, or tensor of same. actual_tensor: Tensor to test. @@ -164,15 +182,15 @@ def _is_shape(expected_shape, actual_tensor, actual_shape=None): is_rank = _is_rank(array_ops.size(expected_shape), actual_tensor) if actual_shape is None: actual_shape = array_ops.shape(actual_tensor, name='actual') - shape_equal = _all_equal( - ops.convert_to_tensor(expected_shape, name='expected'), - actual_shape) + shape_equal = _shape_tensor_equal(expected_shape, actual_shape) return math_ops.logical_and(is_rank, shape_equal, name=scope) def _assert_shape_op(expected_shape, actual_tensor): """Asserts actual_tensor's shape is expected_shape. + Note that unknown dimension in `expected_shape` will be ignored. + Args: expected_shape: List of integers defining the expected shape, or tensor of same. 
@@ -182,6 +200,9 @@ def _assert_shape_op(expected_shape, actual_tensor): """ with ops.name_scope('assert_shape', values=[actual_tensor]) as scope: actual_shape = array_ops.shape(actual_tensor, name='actual') + if (isinstance(expected_shape, tensor_shape.TensorShape) + and not expected_shape.is_fully_defined()): + expected_shape = [d if d else -1 for d in expected_shape.as_list()] is_shape = _is_shape(expected_shape, actual_tensor, actual_shape) return control_flow_ops.Assert( is_shape, [ -- GitLab From 42c116791beddd071e669f6455b8bd3f55cc1bcc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?= Date: Sat, 18 Aug 2018 17:31:22 +0800 Subject: [PATCH 062/598] TST: add test case for with_shape --- .../framework/python/framework/tensor_util_test.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tensorflow/contrib/framework/python/framework/tensor_util_test.py b/tensorflow/contrib/framework/python/framework/tensor_util_test.py index af1b404cb5..2fa1d33328 100644 --- a/tensorflow/contrib/framework/python/framework/tensor_util_test.py +++ b/tensorflow/contrib/framework/python/framework/tensor_util_test.py @@ -29,7 +29,7 @@ from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors_impl from tensorflow.python.framework import ops -from tensorflow.python.framework import sparse_tensor +from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import variables as variables_lib @@ -185,6 +185,16 @@ class WithShapeTest(test.TestCase): shape, unexpected_shapes) + def test_with_shape_2x2_with_partial_expected_shape(self): + with self.test_session(): + value = [[42, 43], [44, 45]] + actual_shape = [2, 2] + tensor = constant_op.constant(value, shape=actual_shape) + partial_expected_shape = 
tensor_shape.TensorShape([None, 2]) + # Won't raise any exception here: + tensor_with_shape = tensor_util.with_shape(partial_expected_shape, tensor) + np.testing.assert_array_equal(value, tensor_with_shape.eval()) + def test_with_shape_none(self): with self.test_session(): tensor_no_shape = array_ops.placeholder(dtypes.float32) -- GitLab From 7bf6f8bff1ec27ccfff67f6cd482309b9f8ef066 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?= Date: Sat, 18 Aug 2018 17:35:42 +0800 Subject: [PATCH 063/598] DOC: update document --- tensorflow/contrib/data/python/ops/batching.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tensorflow/contrib/data/python/ops/batching.py b/tensorflow/contrib/data/python/ops/batching.py index a9535d9b83..0d942f33e6 100644 --- a/tensorflow/contrib/data/python/ops/batching.py +++ b/tensorflow/contrib/data/python/ops/batching.py @@ -647,15 +647,17 @@ def assert_element_shape(expected_shapes): """Assert the shape of this `Dataset`. ```python - shapes = [tf.TensorShape([16, 256]), tf.TensorShape(None)] + shapes = [tf.TensorShape([16, 256]), tf.TensorShape([None, 2])] result = dataset.apply(tf.contrib.data.assert_element_shape(shapes)) - print(result.output_shapes) # ==> "((16, 256), )" + print(result.output_shapes) # ==> "((16, 256), (, 2))" ``` If dataset shapes and expected_shape, are fully defined, assert they match. Otherwise, add assert op that will validate the shapes when tensors are evaluated, and set shapes on tensors, respectively. + Note that unknown dimension in `expected_shapes` will be ignored. + Args: expected_shapes: A nested structure of `tf.TensorShape` objects. 
-- GitLab From 6e0f1120fd7a6df805a8b712d2d4a38042576b46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?= Date: Sat, 18 Aug 2018 18:12:19 +0800 Subject: [PATCH 064/598] BUG: merge partial shape --- tensorflow/contrib/data/python/ops/batching.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/tensorflow/contrib/data/python/ops/batching.py b/tensorflow/contrib/data/python/ops/batching.py index 0d942f33e6..3cad83fcb1 100644 --- a/tensorflow/contrib/data/python/ops/batching.py +++ b/tensorflow/contrib/data/python/ops/batching.py @@ -666,20 +666,31 @@ def assert_element_shape(expected_shapes): `tf.data.Dataset.apply` """ + def _merge_output_shape(original_shapes, expected_shapes): + flat_original_shapes = nest.flatten(original_shapes) + flat_new_shapes = nest.flatten_up_to(original_shapes, expected_shapes) + flat_merged_output_shapes = [ + original_shape.merge_with(new_shape) + for original_shape, new_shape in zip(flat_original_shapes, + flat_new_shapes)] + return nest.pack_sequence_as(original_shapes, flat_merged_output_shapes) + def _check_shape(*elements): flatten_tensors = nest.flatten(elements) flatten_shapes = nest.flatten(expected_shapes) checked_tensors = [ - with_shape(shape, tensor) if shape else tensor # Overview unknown shape + with_shape(shape, tensor) if shape else tensor # Ignore unknown shape for shape, tensor in zip(flatten_shapes, flatten_tensors) ] return nest.pack_sequence_as(elements, checked_tensors) def _apply_fn(dataset): + output_shapes = _merge_output_shape(dataset.output_shapes, + expected_shapes) return _RestructuredDataset( dataset.map(_check_shape), dataset.output_types, - output_shapes=expected_shapes, + output_shapes=output_shapes, output_classes=dataset.output_classes) return _apply_fn -- GitLab From 8fbafe6c7e75e1d931eca7202ea3a4c5ac8fc2dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?= Date: Sat, 18 Aug 2018 18:28:09 
+0800 Subject: [PATCH 065/598] CLN: fix code style --- .../kernel_tests/batch_dataset_op_test.py | 20 +++++++++---------- .../contrib/data/python/ops/batching.py | 12 +++++------ 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py index ebc5160408..9d8e955245 100644 --- a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py @@ -759,8 +759,8 @@ class RestructuredDatasetTest(test.TestCase): def create_unknown_shape_dataset(x): return script_ops.py_func( lambda _: ( # pylint: disable=g-long-lambda - np.ones(2, dtype=np.float32), - np.zeros((3, 4), dtype=np.int32)), + np.ones(2, dtype=np.float32), + np.zeros((3, 4), dtype=np.int32)), [x], [dtypes.float32, dtypes.int32]) @@ -789,8 +789,8 @@ class RestructuredDatasetTest(test.TestCase): def create_unknown_shape_dataset(x): return script_ops.py_func( lambda _: ( # pylint: disable=g-long-lambda - np.ones(2, dtype=np.float32), - np.zeros((3, 4), dtype=np.int32)), + np.ones(2, dtype=np.float32), + np.zeros((3, 4), dtype=np.int32)), [x], [dtypes.float32, dtypes.int32]) @@ -802,7 +802,7 @@ class RestructuredDatasetTest(test.TestCase): wrong_shapes = (tensor_shape.TensorShape(2), tensor_shape.TensorShape((3, 10))) iterator = ( - dataset.apply(batching.assert_element_shape(wrong_shapes)) + dataset.apply(batching.assert_element_shape(wrong_shapes)) .make_initializable_iterator()) init_op = iterator.initializer get_next = iterator.get_next() @@ -854,8 +854,8 @@ class RestructuredDatasetTest(test.TestCase): def create_unknown_shape_dataset(x): return script_ops.py_func( lambda _: ( # pylint: disable=g-long-lambda - np.ones(2, dtype=np.float32), - np.zeros((3, 4), dtype=np.int32)), + np.ones(2, dtype=np.float32), + np.zeros((3, 4), dtype=np.int32)), [x], [dtypes.float32, dtypes.int32]) @@ 
-884,8 +884,8 @@ class RestructuredDatasetTest(test.TestCase): def create_unknown_shape_dataset(x): return script_ops.py_func( lambda _: ( # pylint: disable=g-long-lambda - np.ones(2, dtype=np.float32), - np.zeros((3, 4), dtype=np.int32)), + np.ones(2, dtype=np.float32), + np.zeros((3, 4), dtype=np.int32)), [x], [dtypes.float32, dtypes.int32]) @@ -897,7 +897,7 @@ class RestructuredDatasetTest(test.TestCase): wrong_shapes = (tensor_shape.TensorShape(2), tensor_shape.TensorShape((None, 10))) iterator = ( - dataset.apply(batching.assert_element_shape(wrong_shapes)) + dataset.apply(batching.assert_element_shape(wrong_shapes)) .make_initializable_iterator()) init_op = iterator.initializer get_next = iterator.get_next() diff --git a/tensorflow/contrib/data/python/ops/batching.py b/tensorflow/contrib/data/python/ops/batching.py index 3cad83fcb1..9c2001c34f 100644 --- a/tensorflow/contrib/data/python/ops/batching.py +++ b/tensorflow/contrib/data/python/ops/batching.py @@ -666,13 +666,13 @@ def assert_element_shape(expected_shapes): `tf.data.Dataset.apply` """ - def _merge_output_shape(original_shapes, expected_shapes): + def _merge_output_shapes(original_shapes, expected_shapes): flat_original_shapes = nest.flatten(original_shapes) flat_new_shapes = nest.flatten_up_to(original_shapes, expected_shapes) flat_merged_output_shapes = [ - original_shape.merge_with(new_shape) - for original_shape, new_shape in zip(flat_original_shapes, - flat_new_shapes)] + original_shape.merge_with(new_shape) + for original_shape, new_shape in zip(flat_original_shapes, + flat_new_shapes)] return nest.pack_sequence_as(original_shapes, flat_merged_output_shapes) def _check_shape(*elements): @@ -685,8 +685,8 @@ def assert_element_shape(expected_shapes): return nest.pack_sequence_as(elements, checked_tensors) def _apply_fn(dataset): - output_shapes = _merge_output_shape(dataset.output_shapes, - expected_shapes) + output_shapes = _merge_output_shapes(dataset.output_shapes, + expected_shapes) return 
_RestructuredDataset( dataset.map(_check_shape), dataset.output_types, -- GitLab From 74c3a77ab3eb91f1ca36c3728e15827246f4d089 Mon Sep 17 00:00:00 2001 From: Artem Sobolev Date: Sun, 19 Aug 2018 12:45:42 +0300 Subject: [PATCH 066/598] Use tf.platform FLAGS wrapper instead of raw absl --- tensorflow/python/ops/parallel_for/pfor.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tensorflow/python/ops/parallel_for/pfor.py b/tensorflow/python/ops/parallel_for/pfor.py index 2e4b2fd64e..6689c309c7 100644 --- a/tensorflow/python/ops/parallel_for/pfor.py +++ b/tensorflow/python/ops/parallel_for/pfor.py @@ -21,8 +21,6 @@ from __future__ import print_function import collections -from absl import flags - from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops @@ -41,6 +39,7 @@ from tensorflow.python.ops import nn_ops from tensorflow.python.ops import parsing_ops from tensorflow.python.ops import sparse_ops from tensorflow.python.ops import tensor_array_ops +from tensorflow.python.platform import flags from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util import nest -- GitLab From 0c8c6fc35f5939c9ae54e29c0051090f49cee274 Mon Sep 17 00:00:00 2001 From: Artem Sobolev Date: Sun, 19 Aug 2018 12:46:57 +0300 Subject: [PATCH 067/598] Make SoftplusGrad convertible --- tensorflow/python/ops/parallel_for/pfor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/python/ops/parallel_for/pfor.py b/tensorflow/python/ops/parallel_for/pfor.py index 6689c309c7..58fa6447f3 100644 --- a/tensorflow/python/ops/parallel_for/pfor.py +++ b/tensorflow/python/ops/parallel_for/pfor.py @@ -2010,6 +2010,7 @@ def _convert_biasaddgrad(pfor_input): @RegisterPForWithArgs("ReluGrad") @RegisterPForWithArgs("TanhGrad") @RegisterPForWithArgs("SigmoidGrad") +@RegisterPForWithArgs("SoftplusGrad") def _convert_grads(pfor_input, op_type, *args, **kw_args): del args 
del kw_args -- GitLab From 8c3457521f719736a7ac109bf7debbedd7fe4584 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sun, 19 Aug 2018 20:48:25 +0000 Subject: [PATCH 068/598] Fix compilation failure with RDMA+GDR This fix tries to address the issue raised in 21696 where tensorflow failed to compile when both RDMA and GDR are on. The issue is that the memory allocator of GDR used the same name as RDMA. This fix fixes 21696. Signed-off-by: Yong Tang --- tensorflow/contrib/gdr/gdr_memory_manager.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tensorflow/contrib/gdr/gdr_memory_manager.cc b/tensorflow/contrib/gdr/gdr_memory_manager.cc index 7e6a0f14f6..c6bb02389d 100644 --- a/tensorflow/contrib/gdr/gdr_memory_manager.cc +++ b/tensorflow/contrib/gdr/gdr_memory_manager.cc @@ -186,22 +186,22 @@ class GdrMemoryManager : public RemoteMemoryManager { // TODO(byronyi): remove this class and its registration when the default // cpu_allocator() returns visitable allocator, or cpu_allocator() is no // longer in use. 
-class BFCRdmaAllocator : public BFCAllocator { +class BFCGdrAllocator : public BFCAllocator { public: - BFCRdmaAllocator() + BFCGdrAllocator() : BFCAllocator(new BasicCPUAllocator(port::kNUMANoAffinity), 1LL << 36, true, "cpu_rdma_bfc") {} }; -class BFCRdmaAllocatorFactory : public AllocatorFactory { +class BFCGdrAllocatorFactory : public AllocatorFactory { public: - Allocator* CreateAllocator() override { return new BFCRdmaAllocator; } + Allocator* CreateAllocator() override { return new BFCGdrAllocator; } virtual SubAllocator* CreateSubAllocator(int numa_node) { return new BasicCPUAllocator(numa_node); } }; -REGISTER_MEM_ALLOCATOR("BFCRdmaAllocator", 101, BFCRdmaAllocatorFactory); +REGISTER_MEM_ALLOCATOR("BFCGdrAllocator", 101, BFCGdrAllocatorFactory); GdrMemoryManager::GdrMemoryManager(const string& host, const string& port) : host_(host), -- GitLab From e1ca6dca575d58a7ab3264c8907ff05d98a6ddeb Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sun, 19 Aug 2018 20:54:59 +0000 Subject: [PATCH 069/598] Rename allocator name to cpu_gdr_bfc Signed-off-by: Yong Tang --- tensorflow/contrib/gdr/gdr_memory_manager.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/contrib/gdr/gdr_memory_manager.cc b/tensorflow/contrib/gdr/gdr_memory_manager.cc index c6bb02389d..f464760f90 100644 --- a/tensorflow/contrib/gdr/gdr_memory_manager.cc +++ b/tensorflow/contrib/gdr/gdr_memory_manager.cc @@ -190,7 +190,7 @@ class BFCGdrAllocator : public BFCAllocator { public: BFCGdrAllocator() : BFCAllocator(new BasicCPUAllocator(port::kNUMANoAffinity), 1LL << 36, - true, "cpu_rdma_bfc") {} + true, "cpu_gdr_bfc") {} }; class BFCGdrAllocatorFactory : public AllocatorFactory { public: -- GitLab From 8c4737fa73d74e0c445a1ac90a4f08e4196f0e34 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sun, 19 Aug 2018 23:12:22 +0000 Subject: [PATCH 070/598] Fix documentation issue with `tf.nn.conv1d` The `tf.nn.conv1d` supports float16, float32, and float64 though in 
`tf.nn.conv1d.__doc__` only float16 and float32 are mentioned. This fix updates the doc string to add float64 as the supported data type. Signed-off-by: Yong Tang --- tensorflow/python/ops/nn_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py index edc6e04b48..b6e8174ace 100644 --- a/tensorflow/python/ops/nn_ops.py +++ b/tensorflow/python/ops/nn_ops.py @@ -2454,7 +2454,7 @@ def conv1d(value, returned to the caller. Args: - value: A 3D `Tensor`. Must be of type `float16` or `float32`. + value: A 3D `Tensor`. Must be of type `float16`, `float32`, or `float64`. filters: A 3D `Tensor`. Must have the same type as `value`. stride: An `integer`. The number of entries by which the filter is moved right at each step. -- GitLab From 86d9ce130c5691cdba16024f7cc7987082acd294 Mon Sep 17 00:00:00 2001 From: Guozhong Zhuang Date: Mon, 20 Aug 2018 15:06:18 -0700 Subject: [PATCH 071/598] do not control primitive caching for small batch size --- tensorflow/core/kernels/mkl_conv_ops.cc | 1 + tensorflow/core/util/mkl_util.h | 2 ++ 2 files changed, 3 insertions(+) diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc index a5763e4b74..7351e9a526 100644 --- a/tensorflow/core/kernels/mkl_conv_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_ops.cc @@ -906,6 +906,7 @@ class MklConvOp : public OpKernel { // 1. Legacy CPU without AVX512/AVX2, or // 2. 
1x1 convolution with stride != 1 not_cache_ = MklPrimitiveFactory::IsPrimitiveMemOptEnabled() && + (src_dims[MklDnnDims::Dim_N] > kSmallBatchSize) && (MklPrimitiveFactory::IsLegacyPlatform() || IsConv1x1StrideNot1(filter_dims, strides)); diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h index ee02debddf..5d28b8a3e5 100644 --- a/tensorflow/core/util/mkl_util.h +++ b/tensorflow/core/util/mkl_util.h @@ -105,6 +105,8 @@ typedef enum { Dim3d_I = 1 } MklDnnDims3D; +static const int kSmallBatchSize = 32; + #ifdef INTEL_MKL_ML_ONLY class MklShape { public: -- GitLab From a5559a9d28bab6abfd65a9fad116ef9c6e13f8c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?= Date: Tue, 21 Aug 2018 11:51:19 +0800 Subject: [PATCH 072/598] CLN: rename _shape_tensor_equal to _shape_tensor_compatible --- .../contrib/framework/python/framework/tensor_util.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/contrib/framework/python/framework/tensor_util.py b/tensorflow/contrib/framework/python/framework/tensor_util.py index c8fc1789c7..bdf8aeb2b8 100644 --- a/tensorflow/contrib/framework/python/framework/tensor_util.py +++ b/tensorflow/contrib/framework/python/framework/tensor_util.py @@ -130,8 +130,8 @@ def remove_squeezable_dimensions(predictions, labels, name=None): return predictions, labels -def _shape_tensor_equal(expected_shape, actual_shape): - """Returns whether actual_shape is equal to expected_shape. +def _shape_tensor_compatible(expected_shape, actual_shape): + """Returns whether actual_shape is compatible with expected_shape. Note that -1 in `expected_shape` is recognized as unknown dimension. 
@@ -182,7 +182,7 @@ def _is_shape(expected_shape, actual_tensor, actual_shape=None): is_rank = _is_rank(array_ops.size(expected_shape), actual_tensor) if actual_shape is None: actual_shape = array_ops.shape(actual_tensor, name='actual') - shape_equal = _shape_tensor_equal(expected_shape, actual_shape) + shape_equal = _shape_tensor_compatible(expected_shape, actual_shape) return math_ops.logical_and(is_rank, shape_equal, name=scope) -- GitLab From 41d36dea21dba7d65567250f4b47242128c05df2 Mon Sep 17 00:00:00 2001 From: Austin Anderson Date: Tue, 21 Aug 2018 16:04:07 -0700 Subject: [PATCH 073/598] Update with feedback from gunan --- tensorflow/tools/dockerfiles/Dockerfile | 6 +++++- tensorflow/tools/dockerfiles/README.md | 22 +++++++++++++++------- tensorflow/tools/dockerfiles/assembler.py | 17 +++++++++++------ 3 files changed, 31 insertions(+), 14 deletions(-) diff --git a/tensorflow/tools/dockerfiles/Dockerfile b/tensorflow/tools/dockerfiles/Dockerfile index e8ca012298..1d0dc3247d 100644 --- a/tensorflow/tools/dockerfiles/Dockerfile +++ b/tensorflow/tools/dockerfiles/Dockerfile @@ -1,4 +1,8 @@ -FROM hadolint/hadolint:latest-debian +# TensorFlow Dockerfile Development Container +# +# You can use this image to quickly develop changes to the Dockerfile assembler +# or set of TF Docker partials. See README.md for usage instructions. +FROM debian:stretch LABEL maintainer="Austin Anderson " RUN apt-get update && apt-get install -y python3 python3-pip bash diff --git a/tensorflow/tools/dockerfiles/README.md b/tensorflow/tools/dockerfiles/README.md index 4786f8ec81..ed026c20a0 100644 --- a/tensorflow/tools/dockerfiles/README.md +++ b/tensorflow/tools/dockerfiles/README.md @@ -19,10 +19,19 @@ in the Dockerfile itself. ## Running -After building the image with the tag `tf` (for example): +After building the image with the tag `tf` (for example), use `docker run` to +run the images. Examples are below. 
+ +Note for new Docker users: the `-v` and `-u` flags share directories between +the Docker container and your machine, and very important. Without +`-v`, your work will be wiped once the container quits, and without `-u`, files +created by the container will have the wrong file permissions on your host +machine. If you are confused, check out the [Docker run +documentation](https://docs.docker.com/engine/reference/run/). ```bash -# A volume mount is optional but highly recommended, especially for Jupyter +# A volume mount (-v) is optional but highly recommended, especially for Jupyter +# User permissions (-u) are required if you use (-v). # CPU-based images $ docker run -u $(id -u):$(id -g) -v $(PWD):/my-devel -it tf @@ -32,13 +41,12 @@ $ docker run --runtime=nvidia -u $(id -u):$(id -g) -v $(PWD):/my-devel -it tf # Images with Jupyter run on port 8888, and needs a volume for notebooks $ docker run --user $(id -u):$(id -g) -p 8888:8888 -v $(PWD):/notebooks -it tf - -# Development images -$ docker run --user $(id -u):$(id -g) -it tf -docker$ git clone https://github.com/tensorflow/tensorflow ``` -## Maintaining +These images do not come with the TensorFlow source code -- but the development +images have git included, so you can `git clone` it yourself. + +## Contributing To make changes to TensorFlow's Dockerfiles, you'll update `spec.yml` and the `*.partial.Dockerfile` files in the `partials` directory, then run diff --git a/tensorflow/tools/dockerfiles/assembler.py b/tensorflow/tools/dockerfiles/assembler.py index 8e0e5923d6..8dd6cea720 100644 --- a/tensorflow/tools/dockerfiles/assembler.py +++ b/tensorflow/tools/dockerfiles/assembler.py @@ -34,8 +34,8 @@ flags.DEFINE_string( flags.DEFINE_string( 'output_dir', - '.', ('Path to an output directory for Dockerfiles. ' - 'Will be created if it doesn\'t exist.'), + 'dockerfiles', ('Path to an output directory for Dockerfiles. 
' + 'Will be created if it doesn\'t exist.'), short_name='o') flags.DEFINE_string( @@ -130,8 +130,9 @@ images: class TfDockerValidator(cerberus.Validator): """Custom Cerberus validator for TF dockerfile spec. - Note that each custom validator's docstring must end with a segment describing - its own validation schema. + Note: Each _validate_foo function's docstring must end with a segment + describing its own validation schema, e.g. "The rule's arguments are...". If + you add a new validator, you can copy/paste that section. """ def _validate_ispartial(self, ispartial, field, value): @@ -275,12 +276,16 @@ def construct_contents(partial_specs, image_spec): default = '' partial_contents = re.sub(r'ARG {}.*'.format(arg), 'ARG {}{}'.format( arg, default), partial_contents) + + # Store updated partial contents processed_partial_strings.append(partial_contents) + + # Join everything together return '\n'.join(processed_partial_strings) -# Create a directory and its parents, even if it already exists def mkdir_p(path): + """Create a directory and its parents, even if it already exists.""" try: os.makedirs(path) except OSError as e: @@ -486,7 +491,7 @@ def construct_dockerfiles(tf_spec): def main(argv): if len(argv) > 1: - raise app.UsageError('Too many command-line arguments.') + raise app.UsageError('Unexpected command line args found: {}'.format(argv)) with open(FLAGS.spec_file, 'r') as spec_file: tf_spec = yaml.load(spec_file) -- GitLab From 6528b69885fa00c21db648c004be93b823d36d0d Mon Sep 17 00:00:00 2001 From: avijit-nervana Date: Wed, 22 Aug 2018 11:33:42 -0700 Subject: [PATCH 074/598] Fixed the licence file names that was causing the nGraph build to fail. 
--- tensorflow/tools/pip_package/BUILD | 2 +- third_party/ngraph/ngraph.BUILD | 10 +--------- third_party/ngraph/ngraph_tf.BUILD | 10 +--------- third_party/ngraph/nlohmann_json.BUILD | 10 +--------- 4 files changed, 4 insertions(+), 28 deletions(-) diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD index 7645612cf1..91c5cd094c 100644 --- a/tensorflow/tools/pip_package/BUILD +++ b/tensorflow/tools/pip_package/BUILD @@ -209,7 +209,7 @@ filegroup( ) + if_ngraph([ "@ngraph//:LICENSE", "@ngraph_tf//:LICENSE", - "@nlohmann_json_lib//:LICENSE", + "@nlohmann_json_lib//:LICENSE.MIT", ]) + tf_additional_license_deps(), ) diff --git a/third_party/ngraph/ngraph.BUILD b/third_party/ngraph/ngraph.BUILD index f73ce4f674..31aa3cee51 100644 --- a/third_party/ngraph/ngraph.BUILD +++ b/third_party/ngraph/ngraph.BUILD @@ -1,14 +1,6 @@ licenses(["notice"]) # 3-Clause BSD -exports_files(["license.txt"]) - -filegroup( - name = "LICENSE", - srcs = [ - "license.txt", - ], - visibility = ["//visibility:public"], -) +exports_files(["LICENSE"]) cc_library( name = "ngraph_core", diff --git a/third_party/ngraph/ngraph_tf.BUILD b/third_party/ngraph/ngraph_tf.BUILD index 0c2c8a718f..4d96ccf2f2 100644 --- a/third_party/ngraph/ngraph_tf.BUILD +++ b/third_party/ngraph/ngraph_tf.BUILD @@ -1,14 +1,6 @@ licenses(["notice"]) # 3-Clause BSD -exports_files(["license.txt"]) - -filegroup( - name = "LICENSE", - srcs = [ - "license.txt", - ], - visibility = ["//visibility:public"], -) +exports_files(["LICENSE"]) load( "@org_tensorflow//tensorflow:tensorflow.bzl", diff --git a/third_party/ngraph/nlohmann_json.BUILD b/third_party/ngraph/nlohmann_json.BUILD index a0b18a51cb..04c8db6a96 100644 --- a/third_party/ngraph/nlohmann_json.BUILD +++ b/third_party/ngraph/nlohmann_json.BUILD @@ -1,14 +1,6 @@ licenses(["notice"]) # 3-Clause BSD -exports_files(["license.txt"]) - -filegroup( - name = "LICENSE", - srcs = [ - "license.txt", - ], - visibility = ["//visibility:public"], -) 
+exports_files(["LICENSE.MIT"]) cc_library( name = "nlohmann_json_lib", -- GitLab From 273fce7024f031ab90f8af475f74d64cb6a185ec Mon Sep 17 00:00:00 2001 From: Austin Anderson Date: Wed, 22 Aug 2018 11:43:32 -0700 Subject: [PATCH 075/598] Cleaned up requests from gunan --- tensorflow/tools/dockerfiles/Dockerfile | 15 ++++++++++++ tensorflow/tools/dockerfiles/README.md | 4 +-- tensorflow/tools/dockerfiles/assembler.py | 30 +++++++++++++++++++---- tensorflow/tools/dockerfiles/bashrc | 19 +++++++++++++- tensorflow/tools/dockerfiles/spec.yml | 15 ++++++++++++ 5 files changed, 75 insertions(+), 8 deletions(-) diff --git a/tensorflow/tools/dockerfiles/Dockerfile b/tensorflow/tools/dockerfiles/Dockerfile index 1d0dc3247d..2798e83cb7 100644 --- a/tensorflow/tools/dockerfiles/Dockerfile +++ b/tensorflow/tools/dockerfiles/Dockerfile @@ -1,3 +1,18 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# # TensorFlow Dockerfile Development Container # # You can use this image to quickly develop changes to the Dockerfile assembler diff --git a/tensorflow/tools/dockerfiles/README.md b/tensorflow/tools/dockerfiles/README.md index ed026c20a0..ea80e9feaa 100644 --- a/tensorflow/tools/dockerfiles/README.md +++ b/tensorflow/tools/dockerfiles/README.md @@ -30,7 +30,7 @@ machine. 
If you are confused, check out the [Docker run documentation](https://docs.docker.com/engine/reference/run/). ```bash -# A volume mount (-v) is optional but highly recommended, especially for Jupyter +# Volume mount (-v) is optional but highly recommended, especially for Jupyter. # User permissions (-u) are required if you use (-v). # CPU-based images @@ -63,5 +63,5 @@ $ docker build -t tf-assembler . $ docker run --user $(id -u):$(id -g) -it -v $(pwd):/tf tf-assembler bash # In the container... -/tf $ python3 ./assembler.py -o dockerfiles -s spec.yml --validate +/tf $ python3 ./assembler.py -o dockerfiles -s spec.yml ``` diff --git a/tensorflow/tools/dockerfiles/assembler.py b/tensorflow/tools/dockerfiles/assembler.py index 8dd6cea720..eb8d876cca 100644 --- a/tensorflow/tools/dockerfiles/assembler.py +++ b/tensorflow/tools/dockerfiles/assembler.py @@ -1,3 +1,18 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + """Assemble common TF Dockerfiles from many parts. This script constructs TF's Dockerfiles by aggregating partial @@ -34,8 +49,8 @@ flags.DEFINE_string( flags.DEFINE_string( 'output_dir', - 'dockerfiles', ('Path to an output directory for Dockerfiles. ' - 'Will be created if it doesn\'t exist.'), + './dockerfiles', ('Path to an output directory for Dockerfiles. 
' + 'Will be created if it doesn\'t exist.'), short_name='o') flags.DEFINE_string( @@ -178,6 +193,7 @@ class TfDockerValidator(cerberus.Validator): for partial in self.root_document.get('partials', dict()).values(): if value in partial.get('args', tuple()): return + self._error(field, '{} is not an arg used in any partial.'.format(value)) @@ -209,7 +225,6 @@ def build_partial_description(partial_spec): # Document each arg for arg, arg_data in partial_spec.get('args', dict()).items(): - # Wrap arg description with comment lines desc = arg_data.get('desc', '( no description )') desc = textwrap.fill( @@ -230,6 +245,7 @@ def build_partial_description(partial_spec): arg_data.get('default', '(unset)'), arg_options) lines.extend([arg_use, desc]) + return '\n'.join(lines) @@ -252,7 +268,6 @@ def construct_contents(partial_specs, image_spec): """ processed_partial_strings = [] for partial_name in image_spec['partials']: - # Apply image arg-defaults to existing arg defaults partial_spec = copy.deepcopy(partial_specs[partial_name]) args = partial_spec.get('args', dict()) @@ -316,7 +331,6 @@ def construct_documentation(header, partial_specs, image_spec): # Build documentation for each partial in the image for partial in image_spec['partials']: - # Copy partial data for default args unique to this image partial_spec = copy.deepcopy(partial_specs[partial]) args = partial_spec.get('args', dict()) @@ -365,6 +379,7 @@ def normalize_partial_args(partial_specs): if not isinstance(value, dict): new_value = {'default': value} args[arg] = new_value + return partial_specs @@ -410,8 +425,10 @@ def flatten_args_references(image_specs): new_args.extend(image_specs[arg]['arg-defaults']) else: new_args.append(arg) + image_spec['arg-defaults'] = new_args too_deep += 1 + return image_specs @@ -458,8 +475,10 @@ def flatten_partial_references(image_specs): new_partials.append(partial) else: new_partials.extend(image_specs[partial['image']]['partials']) + image_spec['partials'] = new_partials 
too_deep += 1 + return image_specs @@ -486,6 +505,7 @@ def construct_dockerfiles(tf_spec): image_spec) contents = construct_contents(partial_specs, image_spec) names_to_contents[name] = '\n'.join([documentation, contents]) + return names_to_contents diff --git a/tensorflow/tools/dockerfiles/bashrc b/tensorflow/tools/dockerfiles/bashrc index 7f54609e78..48cacf20f6 100644 --- a/tensorflow/tools/dockerfiles/bashrc +++ b/tensorflow/tools/dockerfiles/bashrc @@ -1,3 +1,19 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ============================================================================== + export PS1="\[\e[31m\]tf-docker\[\e[m\] \[\e[33m\]\w\[\e[m\] > " export TERM=xterm-256color alias grep="grep --color=auto" @@ -24,10 +40,11 @@ To avoid this, run the container by specifying your user's userid: $ docker run -u \$(id -u):\$(id -g) args... 
WARN else -cat < Date: Wed, 22 Aug 2018 11:47:31 -0700 Subject: [PATCH 076/598] Add license to header files --- tensorflow/tools/dockerfiles/assembler.py | 4 ++-- .../dockerfiles/cpu-devel-jupyter.Dockerfile | 17 ++++++++++++++++- .../dockerfiles/cpu-devel.Dockerfile | 17 ++++++++++++++++- .../dockerfiles/cpu-jupyter.Dockerfile | 17 ++++++++++++++++- .../dockerfiles/dockerfiles/cpu.Dockerfile | 17 ++++++++++++++++- .../dockerfiles/nvidia-devel-jupyter.Dockerfile | 17 ++++++++++++++++- .../dockerfiles/nvidia-devel.Dockerfile | 17 ++++++++++++++++- .../dockerfiles/nvidia-jupyter.Dockerfile | 17 ++++++++++++++++- .../dockerfiles/dockerfiles/nvidia.Dockerfile | 17 ++++++++++++++++- 9 files changed, 130 insertions(+), 10 deletions(-) diff --git a/tensorflow/tools/dockerfiles/assembler.py b/tensorflow/tools/dockerfiles/assembler.py index eb8d876cca..9cdd9bb0cb 100644 --- a/tensorflow/tools/dockerfiles/assembler.py +++ b/tensorflow/tools/dockerfiles/assembler.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== - """Assemble common TF Dockerfiles from many parts. 
This script constructs TF's Dockerfiles by aggregating partial @@ -324,7 +323,8 @@ def construct_documentation(header, partial_specs, image_spec): """ # Comment and wrap header and image description - commented_header = '\n'.join(['# ' + l for l in header.splitlines()]) + commented_header = '\n'.join( + [('# ' + l).rstrip() for l in header.splitlines()]) commented_desc = '\n'.join( ['# ' + l for l in image_spec.get('desc', '').splitlines()]) partial_descriptions = [] diff --git a/tensorflow/tools/dockerfiles/dockerfiles/cpu-devel-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/cpu-devel-jupyter.Dockerfile index e49e4f43bc..2d095e1041 100644 --- a/tensorflow/tools/dockerfiles/dockerfiles/cpu-devel-jupyter.Dockerfile +++ b/tensorflow/tools/dockerfiles/dockerfiles/cpu-devel-jupyter.Dockerfile @@ -1,5 +1,20 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# # THIS IS A GENERATED DOCKERFILE. -# +# # This file was assembled from multiple pieces, whose use is documented # below. Please refer to the the TensorFlow dockerfiles documentation for # more information. Build args are documented as their default value. 
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/cpu-devel.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/cpu-devel.Dockerfile index 60607cfd13..e0073f4a6e 100644 --- a/tensorflow/tools/dockerfiles/dockerfiles/cpu-devel.Dockerfile +++ b/tensorflow/tools/dockerfiles/dockerfiles/cpu-devel.Dockerfile @@ -1,5 +1,20 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# # THIS IS A GENERATED DOCKERFILE. -# +# # This file was assembled from multiple pieces, whose use is documented # below. Please refer to the the TensorFlow dockerfiles documentation for # more information. Build args are documented as their default value. diff --git a/tensorflow/tools/dockerfiles/dockerfiles/cpu-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/cpu-jupyter.Dockerfile index 5ea824aca5..2ec14dc295 100644 --- a/tensorflow/tools/dockerfiles/dockerfiles/cpu-jupyter.Dockerfile +++ b/tensorflow/tools/dockerfiles/dockerfiles/cpu-jupyter.Dockerfile @@ -1,5 +1,20 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# # THIS IS A GENERATED DOCKERFILE. -# +# # This file was assembled from multiple pieces, whose use is documented # below. Please refer to the the TensorFlow dockerfiles documentation for # more information. Build args are documented as their default value. diff --git a/tensorflow/tools/dockerfiles/dockerfiles/cpu.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/cpu.Dockerfile index 2d8db2a79e..1d99476fd9 100644 --- a/tensorflow/tools/dockerfiles/dockerfiles/cpu.Dockerfile +++ b/tensorflow/tools/dockerfiles/dockerfiles/cpu.Dockerfile @@ -1,5 +1,20 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# # THIS IS A GENERATED DOCKERFILE. -# +# # This file was assembled from multiple pieces, whose use is documented # below. Please refer to the the TensorFlow dockerfiles documentation for # more information. 
Build args are documented as their default value. diff --git a/tensorflow/tools/dockerfiles/dockerfiles/nvidia-devel-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/nvidia-devel-jupyter.Dockerfile index 8860bbe431..ceec20ef0f 100644 --- a/tensorflow/tools/dockerfiles/dockerfiles/nvidia-devel-jupyter.Dockerfile +++ b/tensorflow/tools/dockerfiles/dockerfiles/nvidia-devel-jupyter.Dockerfile @@ -1,5 +1,20 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# # THIS IS A GENERATED DOCKERFILE. -# +# # This file was assembled from multiple pieces, whose use is documented # below. Please refer to the the TensorFlow dockerfiles documentation for # more information. Build args are documented as their default value. diff --git a/tensorflow/tools/dockerfiles/dockerfiles/nvidia-devel.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/nvidia-devel.Dockerfile index 13e979704c..369a3ed62e 100644 --- a/tensorflow/tools/dockerfiles/dockerfiles/nvidia-devel.Dockerfile +++ b/tensorflow/tools/dockerfiles/dockerfiles/nvidia-devel.Dockerfile @@ -1,5 +1,20 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# # THIS IS A GENERATED DOCKERFILE. -# +# # This file was assembled from multiple pieces, whose use is documented # below. Please refer to the the TensorFlow dockerfiles documentation for # more information. Build args are documented as their default value. diff --git a/tensorflow/tools/dockerfiles/dockerfiles/nvidia-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/nvidia-jupyter.Dockerfile index b6b8eb9a9d..976fce037a 100644 --- a/tensorflow/tools/dockerfiles/dockerfiles/nvidia-jupyter.Dockerfile +++ b/tensorflow/tools/dockerfiles/dockerfiles/nvidia-jupyter.Dockerfile @@ -1,5 +1,20 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# # THIS IS A GENERATED DOCKERFILE. -# +# # This file was assembled from multiple pieces, whose use is documented # below. 
Please refer to the the TensorFlow dockerfiles documentation for # more information. Build args are documented as their default value. diff --git a/tensorflow/tools/dockerfiles/dockerfiles/nvidia.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/nvidia.Dockerfile index 8b306293d9..08a146d9fa 100644 --- a/tensorflow/tools/dockerfiles/dockerfiles/nvidia.Dockerfile +++ b/tensorflow/tools/dockerfiles/dockerfiles/nvidia.Dockerfile @@ -1,5 +1,20 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# # THIS IS A GENERATED DOCKERFILE. -# +# # This file was assembled from multiple pieces, whose use is documented # below. Please refer to the the TensorFlow dockerfiles documentation for # more information. Build args are documented as their default value. 
-- GitLab From 54dfc4c499df11facc8aaa141616422db504cc5c Mon Sep 17 00:00:00 2001 From: Austin Anderson Date: Wed, 22 Aug 2018 13:26:34 -0700 Subject: [PATCH 077/598] Name assembler Dockerfile --- tensorflow/tools/dockerfiles/README.md | 2 +- .../tools/dockerfiles/{Dockerfile => assembler.Dockerfile} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename tensorflow/tools/dockerfiles/{Dockerfile => assembler.Dockerfile} (100%) diff --git a/tensorflow/tools/dockerfiles/README.md b/tensorflow/tools/dockerfiles/README.md index ea80e9feaa..c484c162cb 100644 --- a/tensorflow/tools/dockerfiles/README.md +++ b/tensorflow/tools/dockerfiles/README.md @@ -57,7 +57,7 @@ You can use the `Dockerfile` in this directory to build an editing environment that has all of the Python dependencies you'll need: ```bash -$ docker build -t tf-assembler . +$ docker build -t tf-assembler -f assembler.Dockerfile . # Set --user to set correct permissions on generated files $ docker run --user $(id -u):$(id -g) -it -v $(pwd):/tf tf-assembler bash diff --git a/tensorflow/tools/dockerfiles/Dockerfile b/tensorflow/tools/dockerfiles/assembler.Dockerfile similarity index 100% rename from tensorflow/tools/dockerfiles/Dockerfile rename to tensorflow/tools/dockerfiles/assembler.Dockerfile -- GitLab From b425b0ed272a28ccb6dbc3d3ab937da257b7f02d Mon Sep 17 00:00:00 2001 From: Avijit <30507445+avijit-nervana@users.noreply.github.com> Date: Wed, 22 Aug 2018 17:51:05 -0700 Subject: [PATCH 078/598] Avijit/fix broken unit tests * Fixed the dependencies with `str(Label(...))` so that third_party codes that refer to the tensorflow as a submodule get it properly resolved. 
--- tensorflow/tensorflow.bzl | 4 ++-- third_party/mkl/build_defs.bzl | 24 ++++++++++-------------- 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index 6d6e8941c5..3562a5192d 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -396,7 +396,7 @@ def tf_cc_binary( srcs = srcs + tf_binary_additional_srcs(), deps = deps + tf_binary_dynamic_kernel_deps(kernels) + if_mkl_ml( [ - "//third_party/mkl:intel_binary_blob", + clean_dep("//third_party/mkl:intel_binary_blob"), ], ), data = data + tf_binary_dynamic_kernel_dsos(kernels), @@ -734,7 +734,7 @@ def tf_cc_test( }) + linkopts + _rpath_linkopts(name), deps = deps + tf_binary_dynamic_kernel_deps(kernels) + if_mkl_ml( [ - "//third_party/mkl:intel_binary_blob", + clean_dep("//third_party/mkl:intel_binary_blob"), ], ), data = data + tf_binary_dynamic_kernel_dsos(kernels), diff --git a/third_party/mkl/build_defs.bzl b/third_party/mkl/build_defs.bzl index 06a8c3518c..9970a772fe 100644 --- a/third_party/mkl/build_defs.bzl +++ b/third_party/mkl/build_defs.bzl @@ -26,7 +26,7 @@ def if_mkl(if_true, if_false = []): a select evaluating to either if_true or if_false as appropriate. """ return select({ - "//third_party/mkl:using_mkl": if_true, + str(Label("//third_party/mkl:using_mkl")): if_true, "//conditions:default": if_false, }) @@ -42,9 +42,8 @@ def if_mkl_ml(if_true, if_false = []): a select evaluating to either if_true or if_false as appropriate. """ return select({ - "//third_party/mkl_dnn:using_mkl_dnn_only": - if_false, - "//third_party/mkl:using_mkl": if_true, + str(Label("//third_party/mkl_dnn:using_mkl_dnn_only")): if_false, + str(Label("//third_party/mkl:using_mkl")): if_true, "//conditions:default": if_false, }) @@ -59,7 +58,7 @@ def if_mkl_ml_only(if_true, if_false = []): a select evaluating to either if_true or if_false as appropriate. 
""" return select({ - "//third_party/mkl:using_mkl_ml_only": if_true, + str(Label("//third_party/mkl:using_mkl_ml_only")): if_true, "//conditions:default": if_false, }) @@ -76,7 +75,7 @@ def if_mkl_lnx_x64(if_true, if_false = []): a select evaluating to either if_true or if_false as appropriate. """ return select({ - "//third_party/mkl:using_mkl_lnx_x64": if_true, + str(Label("//third_party/mkl:using_mkl_lnx_x64")): if_true, "//conditions:default": if_false, }) @@ -90,16 +89,13 @@ def mkl_deps(): inclusion in the deps attribute of rules. """ return select({ - "//third_party/mkl_dnn:using_mkl_dnn_only": - ["@mkl_dnn"], - "//third_party/mkl:using_mkl_ml_only": - ["//third_party/mkl:intel_binary_blob"], - "//third_party/mkl:using_mkl": - [ + str(Label("//third_party/mkl_dnn:using_mkl_dnn_only")): ["@mkl_dnn"], + str(Label("//third_party/mkl:using_mkl_ml_only")): ["//third_party/mkl:intel_binary_blob"], + str(Label("//third_party/mkl:using_mkl")): [ "//third_party/mkl:intel_binary_blob", - "@mkl_dnn" + "@mkl_dnn", ], - "//conditions:default": [] + "//conditions:default": [], }) def _enable_local_mkl(repository_ctx): -- GitLab From 5c7d4001bec1f497ca25bf177015c0dc13557448 Mon Sep 17 00:00:00 2001 From: pengwa Date: Thu, 23 Aug 2018 02:58:44 +0000 Subject: [PATCH 079/598] Revert "change format a bit" This reverts commit d76aaad2ea9ee4df8c32b382db758854315d230e. 
--- tensorflow/core/grappler/optimizers/memory_optimizer_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc b/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc index 1473e26cbd..49543645f6 100644 --- a/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc +++ b/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc @@ -114,7 +114,7 @@ TEST_F(RecomputeSubgraphTest, TwoInputSubgraphs) { (*pre_transform_node_map.GetNode("b")->mutable_attr())["_recompute_hint"] .set_i(0); - MemoryOptimizer optimizer(RewriterConfig::MANUAL, 1.0, + MemoryOptimizer optimizer(RewriterConfig::MANUAL,1.0, "some_name_scope/gradients"); GraphDef output; Status status = optimizer.Optimize(nullptr, item, &output); -- GitLab From aa54baf44a81b18375e7e9491ce5ba8e186d6a86 Mon Sep 17 00:00:00 2001 From: pengwa Date: Thu, 23 Aug 2018 02:58:56 +0000 Subject: [PATCH 080/598] Revert "fix build error" This reverts commit 2fcfb4abde9d847cff5a344cf06b2704cb6f9545. 
--- tensorflow/core/grappler/optimizers/memory_optimizer_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc b/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc index 49543645f6..a3f0e07861 100644 --- a/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc +++ b/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc @@ -114,7 +114,7 @@ TEST_F(RecomputeSubgraphTest, TwoInputSubgraphs) { (*pre_transform_node_map.GetNode("b")->mutable_attr())["_recompute_hint"] .set_i(0); - MemoryOptimizer optimizer(RewriterConfig::MANUAL,1.0, + MemoryOptimizer optimizer(RewriterConfig::MANUAL, "some_name_scope/gradients"); GraphDef output; Status status = optimizer.Optimize(nullptr, item, &output); -- GitLab From 1ffdea5249eda3bc309e26052f56aba4e597ba7c Mon Sep 17 00:00:00 2001 From: pengwa Date: Thu, 23 Aug 2018 02:59:05 +0000 Subject: [PATCH 081/598] Revert "Merge master change" This reverts commit afbe36c5126cf118c60cbf22454d99d429425334. 
--- tensorflow/core/grappler/optimizers/meta_optimizer.cc | 7 ------- tensorflow/core/grappler/optimizers/meta_optimizer.h | 4 ---- 2 files changed, 11 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc index 0d2b9a5763..e0ab7e00e9 100644 --- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc @@ -411,13 +411,6 @@ bool MetaOptimizerEnabled(const RewriterConfig& cfg) { !cfg.optimizers().empty() || !cfg.custom_optimizers().empty(); } -Status RunMetaOptimizer(const GrapplerItem& item, const RewriterConfig& cfg, - DeviceBase* cpu_device, Cluster* cluster, - GraphDef* optimized_graph) { - MetaOptimizer optimizer(cpu_device, cfg); - return optimizer.Optimize(cluster, item, optimized_graph); -} - Status RunMetaOptimizer(const GrapplerItem& item, const RewriterConfig& cfg, DeviceBase* cpu_device, Cluster* cluster, GraphDef* optimized_graph, diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.h b/tensorflow/core/grappler/optimizers/meta_optimizer.h index c267b5fd8e..74b6bb7f74 100644 --- a/tensorflow/core/grappler/optimizers/meta_optimizer.h +++ b/tensorflow/core/grappler/optimizers/meta_optimizer.h @@ -93,10 +93,6 @@ bool MetaOptimizerEnabled(const RewriterConfig& cfg); // during constant folding; if NULL, a new device is created for doing constant // folding. For performance, it is recommended to pass in an existing cpu_device // when possible. 
-Status RunMetaOptimizer(const GrapplerItem& item, const RewriterConfig& cfg, - DeviceBase* cpu_device, Cluster* cluster, - GraphDef* optimized_graph); - Status RunMetaOptimizer(const GrapplerItem& item, const RewriterConfig& cfg, DeviceBase* cpu_device, Cluster* cluster, GraphDef* optimized_graph, -- GitLab From a5657e75bf62540ee7d6accadc29ae21637d36f3 Mon Sep 17 00:00:00 2001 From: pengwa Date: Thu, 23 Aug 2018 02:59:14 +0000 Subject: [PATCH 082/598] Revert "consider gpu memory fraction option for memory optimizer" This reverts commit 0059fe57ce7f6b8397b72acfb0ef30013d748116. --- .../core/common_runtime/graph_execution_state.cc | 4 +--- .../core/grappler/optimizers/memory_optimizer.cc | 14 ++++++-------- .../core/grappler/optimizers/memory_optimizer.h | 3 --- .../core/grappler/optimizers/meta_optimizer.cc | 12 +++++------- .../core/grappler/optimizers/meta_optimizer.h | 9 +-------- 5 files changed, 13 insertions(+), 29 deletions(-) diff --git a/tensorflow/core/common_runtime/graph_execution_state.cc b/tensorflow/core/common_runtime/graph_execution_state.cc index d76f7b49b1..eb710bdbc5 100644 --- a/tensorflow/core/common_runtime/graph_execution_state.cc +++ b/tensorflow/core/common_runtime/graph_execution_state.cc @@ -407,8 +407,6 @@ Status GraphExecutionState::OptimizeGraph( const RewriterConfig& rewrite_options = session_options_->config.graph_options().rewrite_options(); - const GPUOptions& gpu_options = - session_options_->config.gpu_options(); if (grappler::MetaOptimizerEnabled(rewrite_options)) { // Adding this functionality in steps. The first step is to make sure @@ -495,7 +493,7 @@ Status GraphExecutionState::OptimizeGraph( grappler::VirtualCluster cluster(device_map, device_set_); GraphDef new_graph; TF_RETURN_IF_ERROR(grappler::RunMetaOptimizer( - item, rewrite_options, cpu_device, &cluster, &new_graph, gpu_options)); + item, rewrite_options, cpu_device, &cluster, &new_graph)); // Merge optimized graph function library with an original library. 
// Optimized graph might have new functions specialized for it's diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer.cc b/tensorflow/core/grappler/optimizers/memory_optimizer.cc index 5a2cec4358..1be5f8dcc2 100644 --- a/tensorflow/core/grappler/optimizers/memory_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/memory_optimizer.cc @@ -943,7 +943,7 @@ struct MemInfo { static bool IdentifySwappingCandidates( Cluster* cluster, GrapplerItem* item, std::unordered_set* skip_list, - std::unordered_map* nodes_to_swap, double memory_fraction) { + std::unordered_map* nodes_to_swap) { GraphMemory memory(*item); const std::unordered_map& devices = cluster->GetDevices(); @@ -966,10 +966,10 @@ static bool IdentifySwappingCandidates( } const GraphMemory::MemoryUsage& mem_usage = memory.GetPeakMemoryUsage(name); - if (mem_usage.used_memory <= memory_fraction * prop.memory_size()) { + if (mem_usage.used_memory <= prop.memory_size()) { continue; } - int64 required_savings = mem_usage.used_memory - memory_fraction * prop.memory_size(); + int64 required_savings = mem_usage.used_memory - prop.memory_size(); std::unordered_map op_completion_times; { @@ -1105,14 +1105,13 @@ static bool IdentifySwappingCandidates( bool SwappingPass(RewriterConfig::MemOptType optimization_level, Cluster* cluster, GrapplerItem* item, - std::unordered_set* skip_list, - double memory_fraction) { + std::unordered_set* skip_list) { std::unordered_map nodes_to_swap; if (optimization_level == RewriterConfig::DEFAULT_MEM_OPT || optimization_level == RewriterConfig::SWAPPING_HEURISTICS || optimization_level == RewriterConfig::HEURISTICS) { // Use heuristics to figure out what needs to be swapped; - IdentifySwappingCandidates(cluster, item, skip_list, &nodes_to_swap, memory_fraction); + IdentifySwappingCandidates(cluster, item, skip_list, &nodes_to_swap); } // Look for manual annotatations in the graph. 
for (auto& node : *item->graph.mutable_node()) { @@ -1325,8 +1324,7 @@ Status MemoryOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item, optimization_level_ == RewriterConfig::MANUAL) && cluster != nullptr) { updated_graph |= SwappingPass(optimization_level_, cluster, - &optimized_item, &skip_list, - per_process_gpu_memory_fraction_); + &optimized_item, &skip_list); } } diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer.h b/tensorflow/core/grappler/optimizers/memory_optimizer.h index 6e03f442d6..653ffaec4c 100644 --- a/tensorflow/core/grappler/optimizers/memory_optimizer.h +++ b/tensorflow/core/grappler/optimizers/memory_optimizer.h @@ -32,10 +32,8 @@ class MemoryOptimizer : public GraphOptimizer { // RewriterConfig::memory_optimizer_target_node_name_scope. explicit MemoryOptimizer( RewriterConfig::MemOptType optimization_level, - double per_process_gpu_memory_fraction = 1.0, const string& recomputation_targets_name_scope = "gradients/") : optimization_level_(optimization_level), - per_process_gpu_memory_fraction_(per_process_gpu_memory_fraction), recomputation_targets_name_scope_(recomputation_targets_name_scope) {} ~MemoryOptimizer() override {} @@ -49,7 +47,6 @@ class MemoryOptimizer : public GraphOptimizer { private: RewriterConfig::MemOptType optimization_level_; - double per_process_gpu_memory_fraction_; string recomputation_targets_name_scope_; }; diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc index e0ab7e00e9..143d9dc1c6 100644 --- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc @@ -83,7 +83,7 @@ std::unique_ptr MetaOptimizer::MakeNewOptimizer( MK_OPT("shape", new ShapeOptimizer()); MK_OPT("remap", new Remapper(cfg_.remapping())); MK_OPT("layout", new LayoutOptimizer()); - MK_OPT("memory", new MemoryOptimizer(RewriterConfig::MANUAL, gpu_options_.per_process_gpu_memory_fraction())); + 
MK_OPT("memory", new MemoryOptimizer(RewriterConfig::MANUAL)); MK_OPT("arithmetic", new ArithmeticOptimizer(cfg_.arithmetic_optimization())); MK_OPT("autoparallel", new AutoParallel(cfg_.auto_parallel().num_replicas())); MK_OPT("loop", new LoopOptimizer(cfg_.loop_optimization())); @@ -134,14 +134,13 @@ Status MetaOptimizer::InitializeOptimizers( optimizers->emplace_back(new LayoutOptimizer()); } if (cfg_.memory_optimization() != RewriterConfig::NO_MEM_OPT) { - double mem_fraction = gpu_options_.per_process_gpu_memory_fraction(); if (cfg_.memory_optimizer_target_node_name_scope().empty()) { optimizers->emplace_back( // Use the default target node name prefix "gradients/" - new MemoryOptimizer(cfg_.memory_optimization(), mem_fraction)); + new MemoryOptimizer(cfg_.memory_optimization())); } else { optimizers->emplace_back( - new MemoryOptimizer(cfg_.memory_optimization(), mem_fraction, + new MemoryOptimizer(cfg_.memory_optimization(), cfg_.memory_optimizer_target_node_name_scope())); } } @@ -413,9 +412,8 @@ bool MetaOptimizerEnabled(const RewriterConfig& cfg) { Status RunMetaOptimizer(const GrapplerItem& item, const RewriterConfig& cfg, DeviceBase* cpu_device, Cluster* cluster, - GraphDef* optimized_graph, - const GPUOptions& gpu_options) { - MetaOptimizer optimizer(cpu_device, cfg, gpu_options); + GraphDef* optimized_graph) { + MetaOptimizer optimizer(cpu_device, cfg); return optimizer.Optimize(cluster, item, optimized_graph); } diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.h b/tensorflow/core/grappler/optimizers/meta_optimizer.h index 74b6bb7f74..151a54cbdf 100644 --- a/tensorflow/core/grappler/optimizers/meta_optimizer.h +++ b/tensorflow/core/grappler/optimizers/meta_optimizer.h @@ -21,7 +21,6 @@ limitations under the License. 
#include "tensorflow/core/grappler/optimizers/graph_optimizer.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/protobuf/rewriter_config.pb.h" -#include "tensorflow/core/protobuf/config.pb.h" namespace tensorflow { namespace grappler { @@ -31,10 +30,6 @@ class MetaOptimizer : public GraphOptimizer { public: MetaOptimizer(DeviceBase* cpu_device, const RewriterConfig& cfg) : cpu_device_(cpu_device), cfg_(cfg) {} - - MetaOptimizer(DeviceBase* cpu_device, const RewriterConfig& cfg, const GPUOptions& gpu_options) - : cpu_device_(cpu_device), cfg_(cfg), gpu_options_(gpu_options) {} - ~MetaOptimizer() override = default; string name() const override { return "meta_optimizer"; }; @@ -82,7 +77,6 @@ class MetaOptimizer : public GraphOptimizer { GraphOptimizationResult* optimization_result); std::vector optimization_results_; - GPUOptions gpu_options_; }; bool MetaOptimizerEnabled(const RewriterConfig& cfg); @@ -95,8 +89,7 @@ bool MetaOptimizerEnabled(const RewriterConfig& cfg); // when possible. 
Status RunMetaOptimizer(const GrapplerItem& item, const RewriterConfig& cfg, DeviceBase* cpu_device, Cluster* cluster, - GraphDef* optimized_graph, - const GPUOptions& gpu_options); + GraphDef* optimized_graph); } // namespace grappler } // namespace tensorflow -- GitLab From f22e734cd3a50a86803f8a51f77061e83fbc44e7 Mon Sep 17 00:00:00 2001 From: pengwa Date: Thu, 23 Aug 2018 08:31:22 +0000 Subject: [PATCH 083/598] set memory_size for virtual_cluster, to make user specified per_process_gpu_memory_fraction take effect --- tensorflow/core/grappler/clusters/virtual_cluster.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/core/grappler/clusters/virtual_cluster.cc b/tensorflow/core/grappler/clusters/virtual_cluster.cc index 12e3e46f65..f543dca49e 100644 --- a/tensorflow/core/grappler/clusters/virtual_cluster.cc +++ b/tensorflow/core/grappler/clusters/virtual_cluster.cc @@ -45,6 +45,8 @@ VirtualCluster::VirtualCluster(const DeviceSet* device_set) for (const auto& device : device_set_->devices()) { DeviceProperties props = GetDeviceInfo(device->parsed_name()); if (props.type() == "UNKNOWN") continue; + auto attrs = device->attributes(); + props.set_memory_size(attrs.memory_limit()); devices_[device->name()] = props; } } -- GitLab From 28758916d4e970a4ccd9c6af6dc393c3cdc16c58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?= Date: Thu, 23 Aug 2018 20:17:00 +0800 Subject: [PATCH 084/598] DOC: add div_no_nan in math module of user_guide --- tensorflow/docs_src/api_guides/python/math_ops.md | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/docs_src/api_guides/python/math_ops.md b/tensorflow/docs_src/api_guides/python/math_ops.md index e738161e49..6ec18f48ef 100644 --- a/tensorflow/docs_src/api_guides/python/math_ops.md +++ b/tensorflow/docs_src/api_guides/python/math_ops.md @@ -24,6 +24,7 @@ operators to your graph. 
* `tf.realdiv` * `tf.truncatediv` * `tf.floor_div` +* `tf.div_no_nan` * `tf.truncatemod` * `tf.floormod` * `tf.mod` -- GitLab From 1f926c94ff1e4729b8c771d113eeacb4f6a3d8fd Mon Sep 17 00:00:00 2001 From: Austin Anderson Date: Thu, 23 Aug 2018 11:35:54 -0700 Subject: [PATCH 085/598] Change chmod 777 to a+rwx --- tensorflow/tools/dockerfiles/assembler.Dockerfile | 2 +- .../tools/dockerfiles/dockerfiles/cpu-devel-jupyter.Dockerfile | 2 +- tensorflow/tools/dockerfiles/dockerfiles/cpu-devel.Dockerfile | 2 +- tensorflow/tools/dockerfiles/dockerfiles/cpu-jupyter.Dockerfile | 2 +- tensorflow/tools/dockerfiles/dockerfiles/cpu.Dockerfile | 2 +- .../dockerfiles/dockerfiles/nvidia-devel-jupyter.Dockerfile | 2 +- .../tools/dockerfiles/dockerfiles/nvidia-devel.Dockerfile | 2 +- .../tools/dockerfiles/dockerfiles/nvidia-jupyter.Dockerfile | 2 +- tensorflow/tools/dockerfiles/dockerfiles/nvidia.Dockerfile | 2 +- tensorflow/tools/dockerfiles/partials/shell.partial.Dockerfile | 2 +- 10 files changed, 10 insertions(+), 10 deletions(-) diff --git a/tensorflow/tools/dockerfiles/assembler.Dockerfile b/tensorflow/tools/dockerfiles/assembler.Dockerfile index 2798e83cb7..7a8e07fced 100644 --- a/tensorflow/tools/dockerfiles/assembler.Dockerfile +++ b/tensorflow/tools/dockerfiles/assembler.Dockerfile @@ -27,4 +27,4 @@ WORKDIR /tf VOLUME ["/tf"] COPY bashrc /etc/bash.bashrc -RUN chmod 777 /etc/bash.bashrc +RUN chmod a+rwx /etc/bash.bashrc diff --git a/tensorflow/tools/dockerfiles/dockerfiles/cpu-devel-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/cpu-devel-jupyter.Dockerfile index 2d095e1041..084839dad9 100644 --- a/tensorflow/tools/dockerfiles/dockerfiles/cpu-devel-jupyter.Dockerfile +++ b/tensorflow/tools/dockerfiles/dockerfiles/cpu-devel-jupyter.Dockerfile @@ -88,7 +88,7 @@ RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8 apt-get install -y bazel COPY bashrc /etc/bash.bashrc -RUN chmod 777 /etc/bash.bashrc +RUN chmod a+rwx /etc/bash.bashrc RUN 
${PIP} install jupyter diff --git a/tensorflow/tools/dockerfiles/dockerfiles/cpu-devel.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/cpu-devel.Dockerfile index e0073f4a6e..160d7c02e2 100644 --- a/tensorflow/tools/dockerfiles/dockerfiles/cpu-devel.Dockerfile +++ b/tensorflow/tools/dockerfiles/dockerfiles/cpu-devel.Dockerfile @@ -86,4 +86,4 @@ RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8 apt-get install -y bazel COPY bashrc /etc/bash.bashrc -RUN chmod 777 /etc/bash.bashrc +RUN chmod a+rwx /etc/bash.bashrc diff --git a/tensorflow/tools/dockerfiles/dockerfiles/cpu-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/cpu-jupyter.Dockerfile index 2ec14dc295..d85af6c404 100644 --- a/tensorflow/tools/dockerfiles/dockerfiles/cpu-jupyter.Dockerfile +++ b/tensorflow/tools/dockerfiles/dockerfiles/cpu-jupyter.Dockerfile @@ -57,7 +57,7 @@ ARG TF_PACKAGE=tensorflow RUN ${PIP} install ${TF_PACKAGE} COPY bashrc /etc/bash.bashrc -RUN chmod 777 /etc/bash.bashrc +RUN chmod a+rwx /etc/bash.bashrc RUN ${PIP} install jupyter diff --git a/tensorflow/tools/dockerfiles/dockerfiles/cpu.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/cpu.Dockerfile index 1d99476fd9..35c41b49fd 100644 --- a/tensorflow/tools/dockerfiles/dockerfiles/cpu.Dockerfile +++ b/tensorflow/tools/dockerfiles/dockerfiles/cpu.Dockerfile @@ -55,4 +55,4 @@ ARG TF_PACKAGE=tensorflow RUN ${PIP} install ${TF_PACKAGE} COPY bashrc /etc/bash.bashrc -RUN chmod 777 /etc/bash.bashrc +RUN chmod a+rwx /etc/bash.bashrc diff --git a/tensorflow/tools/dockerfiles/dockerfiles/nvidia-devel-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/nvidia-devel-jupyter.Dockerfile index ceec20ef0f..666aabe6a2 100644 --- a/tensorflow/tools/dockerfiles/dockerfiles/nvidia-devel-jupyter.Dockerfile +++ b/tensorflow/tools/dockerfiles/dockerfiles/nvidia-devel-jupyter.Dockerfile @@ -108,7 +108,7 @@ RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8 apt-get 
install -y bazel COPY bashrc /etc/bash.bashrc -RUN chmod 777 /etc/bash.bashrc +RUN chmod a+rwx /etc/bash.bashrc RUN ${PIP} install jupyter diff --git a/tensorflow/tools/dockerfiles/dockerfiles/nvidia-devel.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/nvidia-devel.Dockerfile index 369a3ed62e..a6e280082e 100644 --- a/tensorflow/tools/dockerfiles/dockerfiles/nvidia-devel.Dockerfile +++ b/tensorflow/tools/dockerfiles/dockerfiles/nvidia-devel.Dockerfile @@ -106,4 +106,4 @@ RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8 apt-get install -y bazel COPY bashrc /etc/bash.bashrc -RUN chmod 777 /etc/bash.bashrc +RUN chmod a+rwx /etc/bash.bashrc diff --git a/tensorflow/tools/dockerfiles/dockerfiles/nvidia-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/nvidia-jupyter.Dockerfile index 976fce037a..2a46fc0696 100644 --- a/tensorflow/tools/dockerfiles/dockerfiles/nvidia-jupyter.Dockerfile +++ b/tensorflow/tools/dockerfiles/dockerfiles/nvidia-jupyter.Dockerfile @@ -78,7 +78,7 @@ ARG TF_PACKAGE=tensorflow-gpu RUN ${PIP} install ${TF_PACKAGE} COPY bashrc /etc/bash.bashrc -RUN chmod 777 /etc/bash.bashrc +RUN chmod a+rwx /etc/bash.bashrc RUN ${PIP} install jupyter diff --git a/tensorflow/tools/dockerfiles/dockerfiles/nvidia.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/nvidia.Dockerfile index 08a146d9fa..690eb68b22 100644 --- a/tensorflow/tools/dockerfiles/dockerfiles/nvidia.Dockerfile +++ b/tensorflow/tools/dockerfiles/dockerfiles/nvidia.Dockerfile @@ -76,4 +76,4 @@ ARG TF_PACKAGE=tensorflow-gpu RUN ${PIP} install ${TF_PACKAGE} COPY bashrc /etc/bash.bashrc -RUN chmod 777 /etc/bash.bashrc +RUN chmod a+rwx /etc/bash.bashrc diff --git a/tensorflow/tools/dockerfiles/partials/shell.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/shell.partial.Dockerfile index 87a0b4795b..d641a11b06 100644 --- a/tensorflow/tools/dockerfiles/partials/shell.partial.Dockerfile +++ 
b/tensorflow/tools/dockerfiles/partials/shell.partial.Dockerfile @@ -1,2 +1,2 @@ COPY bashrc /etc/bash.bashrc -RUN chmod 777 /etc/bash.bashrc +RUN chmod a+rwx /etc/bash.bashrc -- GitLab From 5bfdb84bc54e52b509a366991ea4f5156c1fca34 Mon Sep 17 00:00:00 2001 From: Austin Anderson Date: Thu, 23 Aug 2018 12:56:37 -0700 Subject: [PATCH 086/598] Update jupyter --- .../dockerfiles/dockerfiles/cpu-devel-jupyter.Dockerfile | 4 ++-- .../tools/dockerfiles/dockerfiles/cpu-jupyter.Dockerfile | 4 ++-- .../dockerfiles/dockerfiles/nvidia-devel-jupyter.Dockerfile | 4 ++-- .../tools/dockerfiles/dockerfiles/nvidia-jupyter.Dockerfile | 4 ++-- .../tools/dockerfiles/partials/jupyter.partial.Dockerfile | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/tensorflow/tools/dockerfiles/dockerfiles/cpu-devel-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/cpu-devel-jupyter.Dockerfile index 084839dad9..dbbad7d03a 100644 --- a/tensorflow/tools/dockerfiles/dockerfiles/cpu-devel-jupyter.Dockerfile +++ b/tensorflow/tools/dockerfiles/dockerfiles/cpu-devel-jupyter.Dockerfile @@ -92,8 +92,8 @@ RUN chmod a+rwx /etc/bash.bashrc RUN ${PIP} install jupyter -RUN mkdir /notebooks && chmod 777 /notebooks -RUN mkdir /.local && chmod 777 /.local +RUN mkdir /notebooks && chmod a+rwx /notebooks +RUN mkdir /.local && chmod a+rwx /.local WORKDIR /notebooks EXPOSE 8888 diff --git a/tensorflow/tools/dockerfiles/dockerfiles/cpu-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/cpu-jupyter.Dockerfile index d85af6c404..8d5d653ab7 100644 --- a/tensorflow/tools/dockerfiles/dockerfiles/cpu-jupyter.Dockerfile +++ b/tensorflow/tools/dockerfiles/dockerfiles/cpu-jupyter.Dockerfile @@ -61,8 +61,8 @@ RUN chmod a+rwx /etc/bash.bashrc RUN ${PIP} install jupyter -RUN mkdir /notebooks && chmod 777 /notebooks -RUN mkdir /.local && chmod 777 /.local +RUN mkdir /notebooks && chmod a+rwx /notebooks +RUN mkdir /.local && chmod a+rwx /.local WORKDIR /notebooks EXPOSE 8888 diff --git 
a/tensorflow/tools/dockerfiles/dockerfiles/nvidia-devel-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/nvidia-devel-jupyter.Dockerfile index 666aabe6a2..0f5fedf2fe 100644 --- a/tensorflow/tools/dockerfiles/dockerfiles/nvidia-devel-jupyter.Dockerfile +++ b/tensorflow/tools/dockerfiles/dockerfiles/nvidia-devel-jupyter.Dockerfile @@ -112,8 +112,8 @@ RUN chmod a+rwx /etc/bash.bashrc RUN ${PIP} install jupyter -RUN mkdir /notebooks && chmod 777 /notebooks -RUN mkdir /.local && chmod 777 /.local +RUN mkdir /notebooks && chmod a+rwx /notebooks +RUN mkdir /.local && chmod a+rwx /.local WORKDIR /notebooks EXPOSE 8888 diff --git a/tensorflow/tools/dockerfiles/dockerfiles/nvidia-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/nvidia-jupyter.Dockerfile index 2a46fc0696..f1799113b1 100644 --- a/tensorflow/tools/dockerfiles/dockerfiles/nvidia-jupyter.Dockerfile +++ b/tensorflow/tools/dockerfiles/dockerfiles/nvidia-jupyter.Dockerfile @@ -82,8 +82,8 @@ RUN chmod a+rwx /etc/bash.bashrc RUN ${PIP} install jupyter -RUN mkdir /notebooks && chmod 777 /notebooks -RUN mkdir /.local && chmod 777 /.local +RUN mkdir /notebooks && chmod a+rwx /notebooks +RUN mkdir /.local && chmod a+rwx /.local WORKDIR /notebooks EXPOSE 8888 diff --git a/tensorflow/tools/dockerfiles/partials/jupyter.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/jupyter.partial.Dockerfile index b5a9566bbe..2c9b9f3f9a 100644 --- a/tensorflow/tools/dockerfiles/partials/jupyter.partial.Dockerfile +++ b/tensorflow/tools/dockerfiles/partials/jupyter.partial.Dockerfile @@ -1,7 +1,7 @@ RUN ${PIP} install jupyter -RUN mkdir /notebooks && chmod 777 /notebooks -RUN mkdir /.local && chmod 777 /.local +RUN mkdir /notebooks && chmod a+rwx /notebooks +RUN mkdir /.local && chmod a+rwx /.local WORKDIR /notebooks EXPOSE 8888 -- GitLab From e634e74a1628d763bc826dc733a4167998edde32 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Fri, 24 Aug 2018 14:21:26 +0000 Subject: [PATCH 087/598] Change 
BFCGdrAllocator to 102 for priority. Signed-off-by: Yong Tang --- tensorflow/contrib/gdr/gdr_memory_manager.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/contrib/gdr/gdr_memory_manager.cc b/tensorflow/contrib/gdr/gdr_memory_manager.cc index f464760f90..726f74c7b7 100644 --- a/tensorflow/contrib/gdr/gdr_memory_manager.cc +++ b/tensorflow/contrib/gdr/gdr_memory_manager.cc @@ -201,7 +201,7 @@ class BFCGdrAllocatorFactory : public AllocatorFactory { } }; -REGISTER_MEM_ALLOCATOR("BFCGdrAllocator", 101, BFCGdrAllocatorFactory); +REGISTER_MEM_ALLOCATOR("BFCGdrAllocator", 102, BFCGdrAllocatorFactory); GdrMemoryManager::GdrMemoryManager(const string& host, const string& port) : host_(host), -- GitLab From 0dd4c121e5efa9eeddd2a5f11b8247717ae255f1 Mon Sep 17 00:00:00 2001 From: Santosh Kumar Date: Fri, 24 Aug 2018 22:16:01 +0530 Subject: [PATCH 088/598] R --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 823c688096..91f49f8e95 100644 --- a/README.md +++ b/README.md @@ -104,12 +104,12 @@ The TensorFlow project strives to abide by generally accepted best practices in ## For more information -* [Tensorflow Blog](https://medium.com/tensorflow) +* [TensorFlow Blog](https://medium.com/tensorflow) * [TensorFlow Course at Stanford](https://web.stanford.edu/class/cs20si) * [TensorFlow Model Zoo](https://github.com/tensorflow/models) * [TensorFlow MOOC on Udacity](https://www.udacity.com/course/deep-learning--ud730) * [TensorFlow Roadmap](https://www.tensorflow.org/community/roadmap) -* [Tensorflow Twitter](https://twitter.com/tensorflow) +* [TensorFlow Twitter](https://twitter.com/tensorflow) * [TensorFlow Website](https://www.tensorflow.org) * [TensorFlow White Papers](https://www.tensorflow.org/about/bib) * [TensorFlow YouTube Channel](https://www.youtube.com/channel/UC0rqucBdTuFTjJiefW5t-IQ) -- GitLab From 851674874aac61c4a6e4cced1917b5dbe033980a Mon Sep 17 00:00:00 2001 From: Karmel 
Allison Date: Fri, 24 Aug 2018 11:04:10 -0700 Subject: [PATCH 089/598] Automated rollback of commit 73c7768904554b5b2b6420556b52bfaf43453423. Revert #19792. PiperOrigin-RevId: 210124433 --- tensorflow/python/kernel_tests/matmul_op_test.py | 4 ++-- tensorflow/python/ops/math_ops.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/kernel_tests/matmul_op_test.py b/tensorflow/python/kernel_tests/matmul_op_test.py index 9eaafb4435..b167278984 100644 --- a/tensorflow/python/kernel_tests/matmul_op_test.py +++ b/tensorflow/python/kernel_tests/matmul_op_test.py @@ -142,7 +142,7 @@ class MatMulStatsTest(test_lib.TestCase): for op in g.get_operations(): flops = ops.get_stats_for_node_def(g, op.node_def, "flops").value if op.name == "MatMul": - self.assertEqual(6975, flops) + self.assertEqual(7200, flops) def testTransposedStatistics(self): g = ops.Graph() @@ -153,7 +153,7 @@ class MatMulStatsTest(test_lib.TestCase): for op in g.get_operations(): flops = ops.get_stats_for_node_def(g, op.node_def, "flops").value if op.name == "MatMul": - self.assertEqual(6975, flops) + self.assertEqual(7200, flops) try: diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index 0a31bd51dd..9b0ab00c7a 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -2072,7 +2072,7 @@ def _calc_mat_mul_flops(graph, node): output_shape = graph_util.tensor_shape_from_node_def_name(graph, node.name) output_shape.assert_is_fully_defined() output_count = np.prod(output_shape.as_list()) - return ops.OpStats("flops", ((2 * k - 1) * output_count)) + return ops.OpStats("flops", (k * output_count * 2)) def _as_indexed_slices(x, optimize=True): -- GitLab From 6919cab24959bee9dff367009a0b6cad86d7b30b Mon Sep 17 00:00:00 2001 From: Scott Zhu Date: Fri, 24 Aug 2018 11:09:08 -0700 Subject: [PATCH 090/598] Update StackedRNNCell to have the same order of state_size as the cell. 
PiperOrigin-RevId: 210125409 --- tensorflow/python/keras/layers/recurrent.py | 53 ++++++++++++------- .../python/keras/layers/recurrent_test.py | 18 ++++++- tensorflow/python/kernel_tests/rnn_test.py | 10 ++-- 3 files changed, 55 insertions(+), 26 deletions(-) diff --git a/tensorflow/python/keras/layers/recurrent.py b/tensorflow/python/keras/layers/recurrent.py index 65171acfb6..cff612a8de 100644 --- a/tensorflow/python/keras/layers/recurrent.py +++ b/tensorflow/python/keras/layers/recurrent.py @@ -73,19 +73,27 @@ class StackedRNNCells(Layer): '`state_size` attribute. ' 'received cells:', cells) self.cells = cells + # reverse_state_order determines whether the state size will be in a reverse + # order of the cells' state. User might want to set this to True to keep the + # existing behavior. This is only useful when use RNN(return_state=True) + # since the state will be returned as the same order of state_size. + self.reverse_state_order = kwargs.pop('reverse_state_order', False) + if self.reverse_state_order: + logging.warning('reverse_state_order=True in StackedRNNCells will soon ' + 'be deprecated. Please update the code to work with the ' + 'natural order of states if you reply on the RNN states, ' + 'eg RNN(return_state=True).') super(StackedRNNCells, self).__init__(**kwargs) @property def state_size(self): - # States are a flat list - # in reverse order of the cell stack. - # This allows to preserve the requirement - # `stack.state_size[0] == output_dim`. - # e.g. states of a 2-layer LSTM would be - # `[h2, c2, h1, c1]` + # States are a flat list of the individual cell state size. + # e.g. states of a 2-layer LSTM would be `[h1, c1, h2, c2]`. # (assuming one LSTM has states [h, c]) + # In the case of reverse_state_order=True, the state_size will be + # [h2, c2, h1, c1]. 
state_size = [] - for cell in self.cells[::-1]: + for cell in self.cells[::-1] if self.reverse_state_order else self.cells: if _is_multiple_state(cell.state_size): state_size += list(cell.state_size) else: @@ -96,15 +104,16 @@ class StackedRNNCells(Layer): def output_size(self): if getattr(self.cells[-1], 'output_size', None) is not None: return self.cells[-1].output_size + elif _is_multiple_state(self.cells[-1].state_size): + return self.cells[-1].state_size[0] else: - return self.state_size[0] + return self.cells[-1].state_size def get_initial_state(self, inputs=None, batch_size=None, dtype=None): - # The init state is in reverse order of cell's initial state since the - # state_size is in reverse order. It is flattened into a list also because - # the state_size is a flattened list. + # The init state is flattened into a list because state_size is a flattened + # list. initial_states = [] - for cell in self.cells[::-1]: + for cell in self.cells[::-1] if self.reverse_state_order else self.cells: get_initial_state_fn = getattr(cell, 'get_initial_state', None) if get_initial_state_fn: initial_states.append(get_initial_state_fn( @@ -118,14 +127,15 @@ class StackedRNNCells(Layer): def call(self, inputs, states, constants=None, **kwargs): # Recover per-cell states. nested_states = [] - for cell in self.cells[::-1]: + for cell in self.cells[::-1] if self.reverse_state_order else self.cells: if _is_multiple_state(cell.state_size): nested_states.append(states[:len(cell.state_size)]) states = states[len(cell.state_size):] else: nested_states.append([states[0]]) states = states[1:] - nested_states = nested_states[::-1] + if self.reverse_state_order: + nested_states = nested_states[::-1] # Call the cells in order and store the returned states. new_nested_states = [] @@ -139,11 +149,12 @@ class StackedRNNCells(Layer): new_nested_states.append(states) # Format the new states as a flat list - # in reverse cell order. 
- states = [] - for cell_states in new_nested_states[::-1]: - states += cell_states - return inputs, states + new_states = [] + if self.reverse_state_order: + new_nested_states = new_nested_states[::-1] + for cell_states in new_nested_states: + new_states += cell_states + return inputs, new_states @tf_utils.shape_type_conversion def build(self, input_shape): @@ -156,7 +167,9 @@ class StackedRNNCells(Layer): cell.build([input_shape] + constants_shape) else: cell.build(input_shape) - if _is_multiple_state(cell.state_size): + if getattr(cell, 'output_size', None) is not None: + output_dim = cell.output_size + elif _is_multiple_state(cell.state_size): output_dim = cell.state_size[0] else: output_dim = cell.state_size diff --git a/tensorflow/python/keras/layers/recurrent_test.py b/tensorflow/python/keras/layers/recurrent_test.py index b52bfc05a5..a3861e44d5 100644 --- a/tensorflow/python/keras/layers/recurrent_test.py +++ b/tensorflow/python/keras/layers/recurrent_test.py @@ -103,7 +103,8 @@ class RNNTest(test.TestCase): MinimalRNNCell(16, 8), MinimalRNNCell(32, 16)] layer = keras.layers.RNN(cells) - assert layer.cell.state_size == (32, 32, 16, 16, 8, 8) + self.assertEqual(layer.cell.state_size, (8, 8, 16, 16, 32, 32)) + self.assertEqual(layer.cell.output_size, 32) y = layer(x) model = keras.models.Model(x, y) model.compile(optimizer='rmsprop', loss='mse') @@ -550,6 +551,21 @@ class RNNTest(test.TestCase): timesteps = 2 layer = keras.layers.RNN(cells, return_state=True, return_sequences=True) output_shape = layer.compute_output_shape((None, timesteps, embedding_dim)) + expected_output_shape = [(None, timesteps, 6), + (None, 3), + (None, 3), + (None, 6), + (None, 6)] + self.assertEqual( + [tuple(o.as_list()) for o in output_shape], + expected_output_shape) + + # Test reverse_state_order = True for stacked cell. 
+ stacked_cell = keras.layers.StackedRNNCells( + cells, reverse_state_order=True) + layer = keras.layers.RNN( + stacked_cell, return_state=True, return_sequences=True) + output_shape = layer.compute_output_shape((None, timesteps, embedding_dim)) expected_output_shape = [(None, timesteps, 6), (None, 6), (None, 6), diff --git a/tensorflow/python/kernel_tests/rnn_test.py b/tensorflow/python/kernel_tests/rnn_test.py index c4f200a22e..78f2993d27 100644 --- a/tensorflow/python/kernel_tests/rnn_test.py +++ b/tensorflow/python/kernel_tests/rnn_test.py @@ -441,11 +441,11 @@ class RNNTest(test.TestCase): cell, inputs, dtype=dtypes.float32) self.assertEqual(outputs.shape.as_list(), [None, timestep, output_shape]) self.assertEqual(len(state), 4) - self.assertEqual(state[0].shape.as_list(), [None, output_shape]) - self.assertEqual(state[1].shape.as_list(), [None, output_shape]) - self.assertEqual(state[2].shape.as_list(), [None, 2 * output_shape]) - self.assertEqual(state[3].shape.as_list(), [None, 2 * output_shape]) - loss = losses.softmax_cross_entropy(predict, state[0]) + self.assertEqual(state[0].shape.as_list(), [None, 2 * output_shape]) + self.assertEqual(state[1].shape.as_list(), [None, 2 * output_shape]) + self.assertEqual(state[2].shape.as_list(), [None, output_shape]) + self.assertEqual(state[3].shape.as_list(), [None, output_shape]) + loss = losses.softmax_cross_entropy(predict, state[2]) train_op = training.GradientDescentOptimizer(0.001).minimize(loss) sess.run([variables_lib.global_variables_initializer()]) -- GitLab From e4643892197b202c6a0d3a9e10c18153eefb193a Mon Sep 17 00:00:00 2001 From: Derek Murray Date: Fri, 24 Aug 2018 11:15:25 -0700 Subject: [PATCH 091/598] [tf.data] Remove `num_parallel_parser_calls` argument from `tf.contrib.data.make_csv_dataset()`. This argument was unused, and it was not actually parallelizing the parsing (just rearranging the arguments), so remove it to avoid confusing users when it doesn't make things faster. 
PiperOrigin-RevId: 210126817 --- tensorflow/contrib/data/python/ops/readers.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tensorflow/contrib/data/python/ops/readers.py b/tensorflow/contrib/data/python/ops/readers.py index 1c180663db..29005859d7 100644 --- a/tensorflow/contrib/data/python/ops/readers.py +++ b/tensorflow/contrib/data/python/ops/readers.py @@ -325,7 +325,6 @@ def make_csv_dataset( shuffle_seed=None, prefetch_buffer_size=1, num_parallel_reads=1, - num_parallel_parser_calls=2, sloppy=False, num_rows_for_inference=100, compression_type=None, @@ -392,8 +391,6 @@ def make_csv_dataset( batches consumed per training step. num_parallel_reads: Number of threads used to read CSV records from files. If >1, the results will be interleaved. - num_parallel_parser_calls: Number of parallel invocations of the CSV parsing - function on CSV records. sloppy: If `True`, reading performance will be improved at the cost of non-deterministic ordering. If `False`, the order of elements produced is deterministic prior to shuffling (elements are still @@ -502,7 +499,7 @@ def make_csv_dataset( # indefinitely, and all batches will be full-sized. 
dataset = dataset.batch(batch_size=batch_size, drop_remainder=num_epochs is None) - dataset = dataset.map(map_fn, num_parallel_calls=num_parallel_parser_calls) + dataset = dataset.map(map_fn) dataset = dataset.prefetch(prefetch_buffer_size) return dataset -- GitLab From 216d15054e866e8aa44a8af92353f1284cdcbfc0 Mon Sep 17 00:00:00 2001 From: Yu-Cheng Ling Date: Fri, 24 Aug 2018 11:18:31 -0700 Subject: [PATCH 092/598] Fix create_ios_frameworks.sh PiperOrigin-RevId: 210127306 --- .../contrib/lite/lib_package/create_ios_frameworks.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tensorflow/contrib/lite/lib_package/create_ios_frameworks.sh b/tensorflow/contrib/lite/lib_package/create_ios_frameworks.sh index b58ae26601..6195426d6d 100755 --- a/tensorflow/contrib/lite/lib_package/create_ios_frameworks.sh +++ b/tensorflow/contrib/lite/lib_package/create_ios_frameworks.sh @@ -14,6 +14,7 @@ # limitations under the License. # ============================================================================== +# TODO(ycling): Refactoring - Move this script into `tools/make`. set -e echo "Starting" @@ -32,7 +33,7 @@ echo "Headers, populating: TensorFlow Lite" cd $TFLITE_DIR/../../.. find tensorflow/contrib/lite -name '*.h' \ - -not -path 'tensorflow/contrib/lite/downloads/*' \ + -not -path 'tensorflow/contrib/lite/tools/*' \ -not -path 'tensorflow/contrib/lite/examples/*' \ -not -path 'tensorflow/contrib/lite/gen/*' \ -not -path 'tensorflow/contrib/lite/toco/*' \ @@ -44,7 +45,7 @@ tar xf tmp.tar rm -f tmp.tar echo "Headers, populating: Flatbuffer" -cd $TFLITE_DIR/downloads/flatbuffers/include/ +cd $TFLITE_DIR/tools/make/downloads/flatbuffers/include/ find . 
-name '*.h' | tar -cf $FW_DIR_TFLITE_HDRS/tmp.tar -T - cd $FW_DIR_TFLITE_HDRS tar xf tmp.tar @@ -57,7 +58,7 @@ cp $TFLITE_DIR/../../../bazel-genfiles/tensorflow/tools/lib_package/include/tens $FW_DIR_TFLITE echo "Copying static libraries" -cp $TFLITE_DIR/gen/lib/libtensorflow-lite.a \ +cp $TFLITE_DIR/tools/make/gen/lib/libtensorflow-lite.a \ $FW_DIR_TFLITE/tensorflow_lite # This is required, otherwise they interfere with the documentation of the -- GitLab From 197309b5d56436b523b8b03ddf2a23555c37365e Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 24 Aug 2018 11:20:19 -0700 Subject: [PATCH 093/598] Removed redundant std::string -> string conversions. PiperOrigin-RevId: 210127626 --- tensorflow/compiler/xla/shape_util.cc | 8 ++++---- tensorflow/compiler/xla/text_literal_reader.cc | 4 ++-- tensorflow/compiler/xla/text_literal_writer.cc | 2 +- tensorflow/core/lib/monitoring/collection_registry.cc | 8 ++++---- tensorflow/core/lib/monitoring/collection_registry.h | 4 ++-- tensorflow/core/lib/monitoring/metric_def.h | 4 ++-- tensorflow/core/lib/strings/numbers.h | 4 ++-- tensorflow/core/lib/strings/str_util.cc | 5 ++--- tensorflow/core/lib/strings/str_util.h | 2 +- tensorflow/core/platform/env.cc | 4 ++-- tensorflow/core/platform/file_system.cc | 2 +- tensorflow/core/platform/file_system_helper.cc | 2 +- tensorflow/core/platform/file_system_test.cc | 2 +- tensorflow/core/util/command_line_flags.cc | 2 +- tensorflow/core/util/env_var.cc | 8 ++++---- tensorflow/core/util/example_proto_fast_parsing.cc | 2 +- tensorflow/stream_executor/lib/env.h | 2 +- tensorflow/stream_executor/lib/path.cc | 2 +- tensorflow/stream_executor/lib/str_util.h | 2 +- 19 files changed, 34 insertions(+), 35 deletions(-) diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc index 9a3d1ba83d..31ddd57eef 100644 --- a/tensorflow/compiler/xla/shape_util.cc +++ b/tensorflow/compiler/xla/shape_util.cc @@ -565,7 +565,7 @@ StatusOr 
ParseShapeStringInternal(absl::string_view* s) { break; } else if (must_end) { return InvalidArgument("Expected end of tuple; got: \"%s\"", - std::string(*s).c_str()); + string(*s).c_str()); } shapes.emplace_back(); TF_ASSIGN_OR_RETURN(shapes.back(), ParseShapeStringInternal(s)); @@ -594,7 +594,7 @@ StatusOr ParseShapeStringInternal(absl::string_view* s) { if (!absl::SimpleAtoi(input, &element)) { return InvalidArgument( "Invalid s64 value in parsed shape string: \"%s\" in \"%s\"", - string(input).c_str(), std::string(*s).c_str()); + string(input).c_str(), string(*s).c_str()); } return element; }; @@ -649,7 +649,7 @@ StatusOr ParseShapeStringInternal(absl::string_view* s) { } return InvalidArgument("Invalid shape string to parse: \"%s\"", - std::string(*s).c_str()); + string(*s).c_str()); } } // namespace @@ -657,7 +657,7 @@ StatusOr ParseShapeStringInternal(absl::string_view* s) { TF_ASSIGN_OR_RETURN(Shape shape, ParseShapeStringInternal(&s)); if (!s.empty()) { return InvalidArgument("Invalid shape string to parse: \"%s\"", - std::string(s).c_str()); + string(s).c_str()); } return shape; } diff --git a/tensorflow/compiler/xla/text_literal_reader.cc b/tensorflow/compiler/xla/text_literal_reader.cc index 392ad84ef4..9835e3d803 100644 --- a/tensorflow/compiler/xla/text_literal_reader.cc +++ b/tensorflow/compiler/xla/text_literal_reader.cc @@ -95,9 +95,9 @@ StatusOr> TextLiteralReader::ReadAllLines() { line.c_str()); } float value; - if (!absl::SimpleAtof(std::string(value_string).c_str(), &value)) { + if (!absl::SimpleAtof(absl::string_view(value_string), &value)) { return InvalidArgument("could not parse value as float: \"%s\"", - std::string(value_string).c_str()); + string(value_string).c_str()); } coordinates = absl::StrSplit(coordinates_string, ','); coordinate_values.clear(); diff --git a/tensorflow/compiler/xla/text_literal_writer.cc b/tensorflow/compiler/xla/text_literal_writer.cc index 20b3245bfb..00147015a6 100644 --- 
a/tensorflow/compiler/xla/text_literal_writer.cc +++ b/tensorflow/compiler/xla/text_literal_writer.cc @@ -33,7 +33,7 @@ namespace xla { /* static */ Status TextLiteralWriter::WriteToPath(const Literal& literal, absl::string_view path) { std::unique_ptr f; - auto s = tensorflow::Env::Default()->NewWritableFile(std::string(path), &f); + auto s = tensorflow::Env::Default()->NewWritableFile(string(path), &f); if (!s.ok()) { return s; } diff --git a/tensorflow/core/lib/monitoring/collection_registry.cc b/tensorflow/core/lib/monitoring/collection_registry.cc index 8c28620ff9..fface033cb 100644 --- a/tensorflow/core/lib/monitoring/collection_registry.cc +++ b/tensorflow/core/lib/monitoring/collection_registry.cc @@ -38,15 +38,15 @@ void Collector::CollectMetricDescriptor( mutex_lock l(mu_); return collected_metrics_->metric_descriptor_map .insert(std::make_pair( - std::string(metric_def->name()), + string(metric_def->name()), std::unique_ptr(new MetricDescriptor()))) .first->second.get(); }(); - metric_descriptor->name = std::string(metric_def->name()); - metric_descriptor->description = std::string(metric_def->description()); + metric_descriptor->name = string(metric_def->name()); + metric_descriptor->description = string(metric_def->description()); for (const StringPiece label_name : metric_def->label_descriptions()) { - metric_descriptor->label_names.push_back(std::string(label_name)); + metric_descriptor->label_names.emplace_back(label_name); } metric_descriptor->metric_kind = metric_def->kind(); diff --git a/tensorflow/core/lib/monitoring/collection_registry.h b/tensorflow/core/lib/monitoring/collection_registry.h index 20f0444f8b..c204d52cfe 100644 --- a/tensorflow/core/lib/monitoring/collection_registry.h +++ b/tensorflow/core/lib/monitoring/collection_registry.h @@ -72,7 +72,7 @@ class MetricCollector { registration_time_millis_(registration_time_millis), collector_(collector), point_set_(point_set) { - point_set_->metric_name = std::string(metric_def->name()); + 
point_set_->metric_name = string(metric_def->name()); } const MetricDef* const metric_def_; @@ -261,7 +261,7 @@ class Collector { auto* const point_set = [&]() { mutex_lock l(mu_); return collected_metrics_->point_set_map - .insert(std::make_pair(std::string(metric_def->name()), + .insert(std::make_pair(string(metric_def->name()), std::unique_ptr(new PointSet()))) .first->second.get(); }(); diff --git a/tensorflow/core/lib/monitoring/metric_def.h b/tensorflow/core/lib/monitoring/metric_def.h index 6f94685665..756e5c2af8 100644 --- a/tensorflow/core/lib/monitoring/metric_def.h +++ b/tensorflow/core/lib/monitoring/metric_def.h @@ -98,8 +98,8 @@ class AbstractMetricDef { const std::vector& label_descriptions) : kind_(kind), value_type_(value_type), - name_(std::string(name)), - description_(std::string(description)), + name_(name), + description_(description), label_descriptions_(std::vector(label_descriptions.begin(), label_descriptions.end())) {} diff --git a/tensorflow/core/lib/strings/numbers.h b/tensorflow/core/lib/strings/numbers.h index e0a5281d68..959290ba8c 100644 --- a/tensorflow/core/lib/strings/numbers.h +++ b/tensorflow/core/lib/strings/numbers.h @@ -140,11 +140,11 @@ inline bool ProtoParseNumeric(StringPiece s, uint64* value) { } inline bool ProtoParseNumeric(StringPiece s, float* value) { - return safe_strtof(std::string(s).c_str(), value); + return safe_strtof(s, value); } inline bool ProtoParseNumeric(StringPiece s, double* value) { - return safe_strtod(std::string(s).c_str(), value); + return safe_strtod(s, value); } // Convert strings to number of type T. 
diff --git a/tensorflow/core/lib/strings/str_util.cc b/tensorflow/core/lib/strings/str_util.cc index cab8f81585..3aba5ec80e 100644 --- a/tensorflow/core/lib/strings/str_util.cc +++ b/tensorflow/core/lib/strings/str_util.cc @@ -332,7 +332,7 @@ string StringReplace(StringPiece s, StringPiece oldsub, StringPiece newsub, bool replace_all) { // TODO(jlebar): We could avoid having to shift data around in the string if // we had a StringPiece::find() overload that searched for a StringPiece. - string res = std::string(s); + string res(s); size_t pos = 0; while ((pos = res.find(oldsub.data(), pos, oldsub.size())) != string::npos) { res.replace(pos, oldsub.size(), newsub.data(), newsub.size()); @@ -448,8 +448,7 @@ bool SplitAndParseAsFloats(StringPiece text, char delim, std::vector* result) { return SplitAndParseAsInts(text, delim, [](StringPiece str, float* value) { - return strings::safe_strtof( - std::string(str).c_str(), value); + return strings::safe_strtof(str, value); }, result); } diff --git a/tensorflow/core/lib/strings/str_util.h b/tensorflow/core/lib/strings/str_util.h index 58e87fcb9e..9f52cf29fc 100644 --- a/tensorflow/core/lib/strings/str_util.h +++ b/tensorflow/core/lib/strings/str_util.h @@ -205,7 +205,7 @@ std::vector Split(StringPiece text, StringPiece delims, Predicate p) { if ((i == text.size()) || (delims.find(text[i]) != StringPiece::npos)) { StringPiece token(text.data() + token_start, i - token_start); if (p(token)) { - result.push_back(std::string(token)); + result.emplace_back(token); } token_start = i + 1; } diff --git a/tensorflow/core/platform/env.cc b/tensorflow/core/platform/env.cc index 47c59d435b..afc4201e53 100644 --- a/tensorflow/core/platform/env.cc +++ b/tensorflow/core/platform/env.cc @@ -92,7 +92,7 @@ Env::Env() : file_system_registry_(new FileSystemRegistryImpl) {} Status Env::GetFileSystemForFile(const string& fname, FileSystem** result) { StringPiece scheme, host, path; io::ParseURI(fname, &scheme, &host, &path); - FileSystem* 
file_system = file_system_registry_->Lookup(std::string(scheme)); + FileSystem* file_system = file_system_registry_->Lookup(string(scheme)); if (!file_system) { if (scheme.empty()) { scheme = "[local]"; @@ -166,7 +166,7 @@ bool Env::FilesExist(const std::vector& files, for (const auto& file : files) { StringPiece scheme, host, path; io::ParseURI(file, &scheme, &host, &path); - files_per_fs[std::string(scheme)].push_back(file); + files_per_fs[string(scheme)].push_back(file); } std::unordered_map per_file_status; diff --git a/tensorflow/core/platform/file_system.cc b/tensorflow/core/platform/file_system.cc index 922773684b..3ab542a5d8 100644 --- a/tensorflow/core/platform/file_system.cc +++ b/tensorflow/core/platform/file_system.cc @@ -158,7 +158,7 @@ Status FileSystem::RecursivelyCreateDir(const string& dirname) { std::reverse(sub_dirs.begin(), sub_dirs.end()); // Now create the directories. - string built_path = std::string(remaining_dir); + string built_path(remaining_dir); for (const StringPiece sub_dir : sub_dirs) { built_path = io::JoinPath(built_path, sub_dir); Status status = CreateDir(io::CreateURI(scheme, host, built_path)); diff --git a/tensorflow/core/platform/file_system_helper.cc b/tensorflow/core/platform/file_system_helper.cc index 0ba0e6304f..342cf28e38 100644 --- a/tensorflow/core/platform/file_system_helper.cc +++ b/tensorflow/core/platform/file_system_helper.cc @@ -59,7 +59,7 @@ Status GetMatchingPaths(FileSystem* fs, Env* env, const string& pattern, string fixed_prefix = pattern.substr(0, pattern.find_first_of("*?[\\")); string eval_pattern = pattern; std::vector all_files; - string dir = std::string(io::Dirname(fixed_prefix)); + string dir(io::Dirname(fixed_prefix)); // If dir is empty then we need to fix up fixed_prefix and eval_pattern to // include . as the top level directory. 
if (dir.empty()) { diff --git a/tensorflow/core/platform/file_system_test.cc b/tensorflow/core/platform/file_system_test.cc index c0a16c95f9..a637d42a92 100644 --- a/tensorflow/core/platform/file_system_test.cc +++ b/tensorflow/core/platform/file_system_test.cc @@ -125,7 +125,7 @@ class InterPlanetaryFileSystem : public NullFileSystem { ASSERT_EQ(scheme, "ipfs"); ASSERT_EQ(host, "solarsystem"); str_util::ConsumePrefix(&path, "/"); - *parsed_path = std::string(path); + *parsed_path = string(path); } std::map> celestial_bodies_ = { diff --git a/tensorflow/core/util/command_line_flags.cc b/tensorflow/core/util/command_line_flags.cc index b281acb2b0..55f1e30880 100644 --- a/tensorflow/core/util/command_line_flags.cc +++ b/tensorflow/core/util/command_line_flags.cc @@ -32,7 +32,7 @@ bool ParseStringFlag(tensorflow::StringPiece arg, tensorflow::StringPiece flag, if (str_util::ConsumePrefix(&arg, "--") && str_util::ConsumePrefix(&arg, flag) && str_util::ConsumePrefix(&arg, "=")) { - *value_parsing_ok = hook(std::string(arg)); + *value_parsing_ok = hook(string(arg)); return true; } diff --git a/tensorflow/core/util/env_var.cc b/tensorflow/core/util/env_var.cc index 8d43bcc927..2604a5d66a 100644 --- a/tensorflow/core/util/env_var.cc +++ b/tensorflow/core/util/env_var.cc @@ -28,7 +28,7 @@ namespace tensorflow { Status ReadBoolFromEnvVar(StringPiece env_var_name, bool default_val, bool* value) { *value = default_val; - const char* tf_env_var_val = getenv(std::string(env_var_name).c_str()); + const char* tf_env_var_val = getenv(string(env_var_name).c_str()); if (tf_env_var_val == nullptr) { return Status::OK(); } @@ -48,7 +48,7 @@ Status ReadBoolFromEnvVar(StringPiece env_var_name, bool default_val, Status ReadInt64FromEnvVar(StringPiece env_var_name, int64 default_val, int64* value) { *value = default_val; - const char* tf_env_var_val = getenv(std::string(env_var_name).c_str()); + const char* tf_env_var_val = getenv(string(env_var_name).c_str()); if (tf_env_var_val == 
nullptr) { return Status::OK(); } @@ -62,11 +62,11 @@ Status ReadInt64FromEnvVar(StringPiece env_var_name, int64 default_val, Status ReadStringFromEnvVar(StringPiece env_var_name, StringPiece default_val, string* value) { - const char* tf_env_var_val = getenv(std::string(env_var_name).c_str()); + const char* tf_env_var_val = getenv(string(env_var_name).c_str()); if (tf_env_var_val != nullptr) { *value = tf_env_var_val; } else { - *value = std::string(default_val); + *value = string(default_val); } return Status::OK(); } diff --git a/tensorflow/core/util/example_proto_fast_parsing.cc b/tensorflow/core/util/example_proto_fast_parsing.cc index 1fec0010a1..a38cd1d09f 100644 --- a/tensorflow/core/util/example_proto_fast_parsing.cc +++ b/tensorflow/core/util/example_proto_fast_parsing.cc @@ -353,7 +353,7 @@ bool TestFastParse(const string& serialized, Example* example) { // I.e. last entry in the map overwrites all the previous ones. parsed::FeatureMapEntry& name_and_feature = parsed_example[parsed_example_size - i - 1]; - string name = std::string(name_and_feature.first); + string name(name_and_feature.first); if ((*features.mutable_feature()).count(name) > 0) continue; auto& value = (*features.mutable_feature())[name]; diff --git a/tensorflow/stream_executor/lib/env.h b/tensorflow/stream_executor/lib/env.h index 3ef8deb72e..d78bbfd425 100644 --- a/tensorflow/stream_executor/lib/env.h +++ b/tensorflow/stream_executor/lib/env.h @@ -32,7 +32,7 @@ inline Status FileExists(const string& filename) { } inline Status FileExists(const port::StringPiece& filename) { - return Env::Default()->FileExists(std::string(filename)); + return Env::Default()->FileExists(string(filename)); } } // namespace port diff --git a/tensorflow/stream_executor/lib/path.cc b/tensorflow/stream_executor/lib/path.cc index 58a862206c..3d3da103e1 100644 --- a/tensorflow/stream_executor/lib/path.cc +++ b/tensorflow/stream_executor/lib/path.cc @@ -33,7 +33,7 @@ string JoinPathImpl(std::initializer_list 
paths) { if (path.empty()) continue; if (result.empty()) { - result = std::string(path); + result = string(path); continue; } diff --git a/tensorflow/stream_executor/lib/str_util.h b/tensorflow/stream_executor/lib/str_util.h index b02fe4f56f..e77dfcef76 100644 --- a/tensorflow/stream_executor/lib/str_util.h +++ b/tensorflow/stream_executor/lib/str_util.h @@ -31,7 +31,7 @@ inline string StripSuffixString(port::StringPiece str, port::StringPiece suffix) if (tensorflow::str_util::EndsWith(str, suffix)) { str.remove_suffix(suffix.size()); } - return std::string(str); + return string(str); } using tensorflow::str_util::Lowercase; -- GitLab From 0eade17dc823ed26c0b82987de61f61c8f1886a7 Mon Sep 17 00:00:00 2001 From: Akshay Modi Date: Fri, 24 Aug 2018 11:21:19 -0700 Subject: [PATCH 094/598] Add a helper to be able to connect to cloud TPUs easily in the colab env. PiperOrigin-RevId: 210127772 --- tensorflow/contrib/eager/python/BUILD | 14 ++++ tensorflow/contrib/eager/python/remote.py | 73 +++++++++++++++++++ .../contrib/eager/python/remote_test.py | 13 ++++ tensorflow/contrib/eager/python/tfe.py | 3 + 4 files changed, 103 insertions(+) create mode 100644 tensorflow/contrib/eager/python/remote.py diff --git a/tensorflow/contrib/eager/python/BUILD b/tensorflow/contrib/eager/python/BUILD index fa3f1bb7ad..84517b57c7 100644 --- a/tensorflow/contrib/eager/python/BUILD +++ b/tensorflow/contrib/eager/python/BUILD @@ -14,6 +14,7 @@ py_library( ":datasets", ":metrics", ":network", + ":remote", ":saver", "//tensorflow/python:framework_ops", "//tensorflow/python:framework_test_lib", @@ -223,11 +224,24 @@ py_test( ], ) +py_library( + name = "remote", + srcs = ["remote.py"], + srcs_version = "PY2AND3", + visibility = ["//tensorflow:internal"], + deps = [ + "//tensorflow/core:protos_all_py", + "//tensorflow/python:platform", + "//tensorflow/python/eager:context", + ], +) + py_test( name = "remote_test", srcs = ["remote_test.py"], srcs_version = "PY2AND3", deps = [ + ":remote", 
"//tensorflow/contrib/eager/python:tfe", "//tensorflow/python:array_ops", "//tensorflow/python:client", diff --git a/tensorflow/contrib/eager/python/remote.py b/tensorflow/contrib/eager/python/remote.py new file mode 100644 index 0000000000..b74cf394f6 --- /dev/null +++ b/tensorflow/contrib/eager/python/remote.py @@ -0,0 +1,73 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Helpers to connect to remote servers.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +from tensorflow.core.protobuf.cluster_pb2 import ClusterDef +from tensorflow.core.protobuf.tensorflow_server_pb2 import ServerDef +from tensorflow.python.eager import context + + +def connect_to_remote_host(remote_host=None, job_name="worker"): + """Connects to a single machine to enable remote execution on it. + + Will make devices on the remote host available to use. Note that calling this + more than once will work, but will invalidate any tensor handles on the old + remote devices. + + Using the default job_name of worker, you can schedule ops to run remotely as + follows: + ```python + # Enable eager execution, and connect to the remote host. 
+ tf.enable_eager_execution() + tf.contrib.eager.connect_to_remote_host("exampleaddr.com:9876") + + with ops.device("job:worker/replica:0/task:1/device:CPU:0"): + # The following tensors should be resident on the remote device, and the op + # will also execute remotely. + x1 = array_ops.ones([2, 2]) + x2 = array_ops.ones([2, 2]) + y = math_ops.matmul(x1, x2) + ``` + + Args: + remote_host: The addr of the remote server in host-port format. + job_name: The job name under which the new server will be accessible. + + Raises: + ValueError: if remote_host is None. + """ + if remote_host is None: + raise ValueError("Must provide an remote_host") + cluster_def = ClusterDef() + job_def = cluster_def.job.add() + job_def.name = job_name + job_def.tasks[0] = "127.0.0.1:0" + job_def.tasks[1] = remote_host + + server_def = ServerDef( + cluster=cluster_def, + job_name=job_name, + task_index=0, + protocol="grpc") + + # TODO(nareshmodi): Make this default since it works in more situations. + os.environ["TF_EAGER_REMOTE_USE_SEND_TENSOR_RPC"] = "1" + context.set_server_def(server_def) diff --git a/tensorflow/contrib/eager/python/remote_test.py b/tensorflow/contrib/eager/python/remote_test.py index 76f48eeb1c..13029db975 100644 --- a/tensorflow/contrib/eager/python/remote_test.py +++ b/tensorflow/contrib/eager/python/remote_test.py @@ -23,6 +23,7 @@ import os import numpy as np +from tensorflow.contrib.eager.python import remote from tensorflow.core.protobuf import cluster_pb2 from tensorflow.core.protobuf import tensorflow_server_pb2 from tensorflow.python.eager import backprop @@ -85,6 +86,7 @@ class RemoteExecutionTest(test.TestCase): self._cached_server1_target = self._cached_server1.target[len("grpc://"):] self._cached_server2_target = self._cached_server2.target[len("grpc://"):] + def setUp(self): # Start the local server. 
context.set_server_def( server_def=get_server_def( @@ -172,6 +174,17 @@ class RemoteExecutionTest(test.TestCase): y = math_ops.matmul(x1, x1) np.testing.assert_array_equal([[2, 2], [2, 2]], y.numpy()) + @run_sync_and_async + def testConnectToRemoteServer(self): + """Basic server connection.""" + remote.connect_to_remote_host(self._cached_server1_target) + + with ops.device("job:worker/replica:0/task:1/device:CPU:0"): + x1 = array_ops.ones([2, 2]) + x2 = array_ops.ones([2, 2]) + y = math_ops.matmul(x1, x2) + np.testing.assert_array_equal([[2, 2], [2, 2]], y.numpy()) + if __name__ == "__main__": ops.enable_eager_execution() diff --git a/tensorflow/contrib/eager/python/tfe.py b/tensorflow/contrib/eager/python/tfe.py index 4dfd083443..fe7f1b72fc 100644 --- a/tensorflow/contrib/eager/python/tfe.py +++ b/tensorflow/contrib/eager/python/tfe.py @@ -74,6 +74,8 @@ To use, at program startup, call `tf.enable_eager_execution()`. @@TensorSpec +@@connect_to_cloud_tpu + @@DEVICE_PLACEMENT_EXPLICIT @@DEVICE_PLACEMENT_WARN @@DEVICE_PLACEMENT_SILENT @@ -94,6 +96,7 @@ from tensorflow.contrib.eager.python.network import Network from tensorflow.contrib.eager.python.network import Sequential from tensorflow.contrib.eager.python.network import save_network_checkpoint from tensorflow.contrib.eager.python.network import restore_network_checkpoint +from tensorflow.contrib.eager.python.remote import connect_to_remote_host from tensorflow.contrib.eager.python.saver import get_optimizer_variables from tensorflow.contrib.eager.python.saver import restore_variables_on_create from tensorflow.contrib.eager.python.saver import Saver -- GitLab From 83839064dd8061089a7fdf69e1065655b432c4fd Mon Sep 17 00:00:00 2001 From: Derek Murray Date: Fri, 24 Aug 2018 11:25:58 -0700 Subject: [PATCH 095/598] [tf.data] Optimize `tf.contrib.data.sample_from_datasets()` when the weights are not a dataset. 
Previously, we were recomputing the logits from the weights for each element, which is only necessary when the weights can differ for each element. PiperOrigin-RevId: 210128640 --- .../contrib/data/python/ops/interleave_ops.py | 59 +++++++++++++------ 1 file changed, 40 insertions(+), 19 deletions(-) diff --git a/tensorflow/contrib/data/python/ops/interleave_ops.py b/tensorflow/contrib/data/python/ops/interleave_ops.py index cca9bf6742..54a92ab185 100644 --- a/tensorflow/contrib/data/python/ops/interleave_ops.py +++ b/tensorflow/contrib/data/python/ops/interleave_ops.py @@ -216,25 +216,46 @@ def sample_from_datasets(datasets, weights=None, seed=None): length of the `datasets` element. """ num_datasets = len(datasets) - if weights is None: - weights = dataset_ops.Dataset.from_tensors([1.0] * num_datasets).repeat() - elif not isinstance(weights, dataset_ops.Dataset): - weights = ops.convert_to_tensor(weights, name="weights") - if weights.dtype not in (dtypes.float32, dtypes.float64): - raise TypeError("`weights` must be convertible to a tensor of " - "`tf.float32` or `tf.float64` elements.") - if not weights.shape.is_compatible_with([num_datasets]): - raise ValueError("`weights` must be a vector of length `len(datasets)`.") - weights = dataset_ops.Dataset.from_tensors(weights).repeat() - - # The `stateless_multinomial()` op expects log-probabilities, as opposed to - # weights. - logits_ds = weights.map(lambda *p: math_ops.log(p, name="logits")) - def select_dataset(logits, seed): - return array_ops.squeeze( - stateless.stateless_multinomial(logits, 1, seed=seed), axis=[0, 1]) - selector_input = dataset_ops.Dataset.zip( - (logits_ds, random_ops.RandomDataset(seed).batch(2))).map(select_dataset) + if not isinstance(weights, dataset_ops.Dataset): + if weights is None: + # Select inputs with uniform probability. + logits = [[1.0] * num_datasets] + else: + # Use the given `weights` as the probability of choosing the respective + # input. 
+ weights = ops.convert_to_tensor(weights, name="weights") + if weights.dtype not in (dtypes.float32, dtypes.float64): + raise TypeError("`weights` must be convertible to a tensor of " + "`tf.float32` or `tf.float64` elements.") + if not weights.shape.is_compatible_with([num_datasets]): + raise ValueError( + "`weights` must be a vector of length `len(datasets)`.") + + # The `stateless_multinomial()` op expects log-probabilities, as opposed + # to weights. + logits = array_ops.expand_dims(math_ops.log(weights, name="logits"), 0) + + def select_dataset_constant_logits(seed): + return array_ops.squeeze( + stateless.stateless_multinomial(logits, 1, seed=seed), axis=[0, 1]) + + selector_input = random_ops.RandomDataset(seed).batch(2).map( + select_dataset_constant_logits) + else: + # Use each element of the given `weights` dataset as the probability of + # choosing the respective input. + + # The `stateless_multinomial()` op expects log-probabilities, as opposed to + # weights. + logits_ds = weights.map(lambda *p: math_ops.log(p, name="logits")) + + def select_dataset_varying_logits(logits, seed): + return array_ops.squeeze( + stateless.stateless_multinomial(logits, 1, seed=seed), axis=[0, 1]) + + selector_input = dataset_ops.Dataset.zip( + (logits_ds, random_ops.RandomDataset(seed).batch(2) + )).map(select_dataset_varying_logits) return _DirectedInterleaveDataset(selector_input, datasets) -- GitLab From 89cd5087643bdf7a2a12996e8d21b916c7f25ec3 Mon Sep 17 00:00:00 2001 From: Allen Lavoie Date: Fri, 24 Aug 2018 11:26:38 -0700 Subject: [PATCH 096/598] Add a max_to_keep=None option to CheckpointManager Doesn't delete anything. Also keeps paths to all checkpoints; I will follow up with a way to remove them manually. 
PiperOrigin-RevId: 210128785 --- .../python/training/checkpoint_management.py | 14 ++++-- .../training/checkpoint_management_test.py | 44 +++++++++++++++++++ 2 files changed, 55 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/training/checkpoint_management.py b/tensorflow/python/training/checkpoint_management.py index 85f2904318..b7aa8264b0 100644 --- a/tensorflow/python/training/checkpoint_management.py +++ b/tensorflow/python/training/checkpoint_management.py @@ -510,7 +510,10 @@ class CheckpointManager(object): max_to_keep: An integer, the number of checkpoints to keep. Unless preserved by `keep_checkpoint_every_n_hours`, checkpoints will be deleted from the active set, oldest first, until only `max_to_keep` - checkpoints remain. + checkpoints remain. If `None`, no checkpoints are deleted and everything + stays in the active set. Note that `max_to_keep=None` will keep all + checkpoint paths in memory and in the checkpoint state protocol buffer + on disk. keep_checkpoint_every_n_hours: Upon removal from the active set, a checkpoint will be preserved if it has been at least `keep_checkpoint_every_n_hours` since the last preserved checkpoint. The @@ -521,9 +524,10 @@ class CheckpointManager(object): """ self._checkpoint = checkpoint self._save_counter_assign = None - if not max_to_keep or max_to_keep < 0: + if max_to_keep is not None and max_to_keep <= 0: raise ValueError( - "Expected a positive integer for `max_to_max_to_keep`, got %d." + ("Expected a positive integer or `None` for `max_to_max_to_keep`, " + "got %d.") % (max_to_keep,)) self._max_to_keep = max_to_keep self._keep_checkpoint_every_n_hours = keep_checkpoint_every_n_hours @@ -586,6 +590,10 @@ class CheckpointManager(object): def _sweep(self): """Deletes or preserves managed checkpoints.""" + if not self._max_to_keep: + # Does not update self._last_preserved_timestamp, since everything is kept + # in the active set. 
+ return while len(self._maybe_delete) > self._max_to_keep: filename, timestamp = self._maybe_delete.popitem(last=False) # Even if we're keeping this checkpoint due to diff --git a/tensorflow/python/training/checkpoint_management_test.py b/tensorflow/python/training/checkpoint_management_test.py index 22c2cc678a..d7162265e6 100644 --- a/tensorflow/python/training/checkpoint_management_test.py +++ b/tensorflow/python/training/checkpoint_management_test.py @@ -26,6 +26,7 @@ import tempfile from google.protobuf import text_format from tensorflow.core.protobuf import saver_pb2 +from tensorflow.python.eager import context from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops as ops_lib from tensorflow.python.framework import test_util @@ -332,6 +333,49 @@ class CheckpointManagerTest(test.TestCase): self.assertTrue(checkpoint_management.checkpoint_exists(second_path)) self.assertFalse(checkpoint_management.checkpoint_exists(first_path)) + @test_util.run_in_graph_and_eager_modes + def testKeepAll(self): + checkpoint = util.Checkpoint() + directory = os.path.join( + self.get_temp_dir(), + # Avoid sharing directories between eager and graph + # TODO(allenl): stop run_in_graph_and_eager_modes reusing directories + str(context.executing_eagerly())) + manager = checkpoint_management.CheckpointManager( + checkpoint, directory, max_to_keep=None) + first_path = manager.save() + second_path = manager.save() + third_path = manager.save() + self.assertTrue(checkpoint_management.checkpoint_exists(third_path)) + self.assertTrue(checkpoint_management.checkpoint_exists(second_path)) + self.assertTrue(checkpoint_management.checkpoint_exists(first_path)) + self.assertEqual(third_path, manager.latest_checkpoint) + self.assertEqual([first_path, second_path, third_path], + manager.checkpoints) + del manager + manager = checkpoint_management.CheckpointManager( + checkpoint, directory, max_to_keep=None) + fourth_path = manager.save() + 
self.assertEqual([first_path, second_path, third_path, fourth_path], + manager.checkpoints) + del manager + manager = checkpoint_management.CheckpointManager( + checkpoint, directory, max_to_keep=3) + self.assertEqual([first_path, second_path, third_path, fourth_path], + manager.checkpoints) + self.assertTrue(checkpoint_management.checkpoint_exists(fourth_path)) + self.assertTrue(checkpoint_management.checkpoint_exists(third_path)) + self.assertTrue(checkpoint_management.checkpoint_exists(second_path)) + self.assertTrue(checkpoint_management.checkpoint_exists(first_path)) + fifth_path = manager.save() + self.assertEqual([third_path, fourth_path, fifth_path], + manager.checkpoints) + self.assertTrue(checkpoint_management.checkpoint_exists(fifth_path)) + self.assertTrue(checkpoint_management.checkpoint_exists(fourth_path)) + self.assertTrue(checkpoint_management.checkpoint_exists(third_path)) + self.assertFalse(checkpoint_management.checkpoint_exists(second_path)) + self.assertFalse(checkpoint_management.checkpoint_exists(first_path)) + @test_util.run_in_graph_and_eager_modes @test.mock.patch.object(checkpoint_management, "time") def testSaveRestoreState(self, mock_time): -- GitLab From 0fc04d33486fb4667666048750ef113fb9c92829 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Fri, 24 Aug 2018 11:28:51 -0700 Subject: [PATCH 097/598] [XLA] Implement resize_images(BILINEAR, align_corners=false) PiperOrigin-RevId: 210129265 --- tensorflow/compiler/tests/image_ops_test.py | 26 ++- .../tf2xla/kernels/image_resize_ops.cc | 153 +++++++++++++----- 2 files changed, 140 insertions(+), 39 deletions(-) diff --git a/tensorflow/compiler/tests/image_ops_test.py b/tensorflow/compiler/tests/image_ops_test.py index 1a82fcbb2a..6fe5a66e0e 100644 --- a/tensorflow/compiler/tests/image_ops_test.py +++ b/tensorflow/compiler/tests/image_ops_test.py @@ -410,13 +410,14 @@ class ResizeBilinearTest(xla_test.XLATestCase): image_np, target_shape, expected=None, - large_tolerance=False): + large_tolerance=False, + align_corners=True): if expected is None: self.fail("expected must be specified") with self.cached_session() as sess, self.test_scope(): image = array_ops.placeholder(image_np.dtype) resized = gen_image_ops.resize_bilinear( - image, target_shape, align_corners=True) + image, target_shape, align_corners=align_corners) out = sess.run(resized, {image: image_np[np.newaxis, :, :, np.newaxis]}) if large_tolerance: self.assertAllClose( @@ -579,6 +580,27 @@ class ResizeBilinearTest(xla_test.XLATestCase): dtype=np.float32)), large_tolerance=True) + def testNonAlignCorners3x2To6x4(self): + input_data = [[64, 32], [32, 64], [50, 100]] + expected_data = [[64.0, 48.0, 32.0, 32.0], [48.0, 48.0, 48.0, 48.0], + [32.0, 48.0, 64.0, 64.0], [41.0, 61.5, 82.0, 82.0], + [50.0, 75.0, 100.0, 100.0], [50.0, 75.0, 100.0, 100.0]] + for dtype in self.float_types: + self._assertForwardOpMatchesExpected( + np.array(input_data, dtype=dtype), [6, 4], + expected=np.array(expected_data, dtype=np.float32), + align_corners=False) + + def testNonAlignCorners6x4To3x2(self): + input_data = [[127, 127, 64, 64], [127, 127, 64, 64], [64, 64, 127, 127], + [64, 64, 127, 127], [50, 50, 100, 100], [50, 50, 100, 100]] + expected_data = [[127, 64], [64, 127], [50, 100]] + for 
dtype in self.float_types: + self._assertForwardOpMatchesExpected( + np.array(input_data, dtype=dtype), [3, 2], + expected=np.array(expected_data, dtype=dtype), + align_corners=False) + class NonMaxSuppressionTest(xla_test.XLATestCase): diff --git a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc index 2a92d9e80b..8e071bf0b7 100644 --- a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc @@ -78,7 +78,8 @@ struct ResizeConvolutionDims { std::vector stride; }; ResizeConvolutionDims ComputeResizeConvolutionParameters( - gtl::ArraySlice in_size, gtl::ArraySlice out_size) { + gtl::ArraySlice in_size, gtl::ArraySlice out_size, + bool align_corners) { CHECK_EQ(in_size.size(), out_size.size()); int num_spatial_dims = in_size.size(); ResizeConvolutionDims dims; @@ -94,15 +95,32 @@ ResizeConvolutionDims ComputeResizeConvolutionParameters( // entry before resizing. dims.stride[i] = dims.kernel_size[i] = 1; } else { - int64 gcd = MathUtil::GCD(static_cast(in_size[i] - 1), - static_cast(out_size[i] - 1)); - dims.stride[i] = (in_size[i] - 1) / gcd; - dims.kernel_size[i] = (out_size[i] - 1) / gcd; + // The scaling factor changes depending on the alignment of corners. + const int64 in_size_factor = align_corners ? in_size[i] - 1 : in_size[i]; + const int64 out_size_factor = + align_corners ? out_size[i] - 1 : out_size[i]; + + int64 gcd = MathUtil::GCD(static_cast(in_size_factor), + static_cast(out_size_factor)); + dims.stride[i] = in_size_factor / gcd; + dims.kernel_size[i] = out_size_factor / gcd; } } return dims; } +// The upper padding of the input needed by ConvGeneralDilated calls is +// determined by solving two related relationships (assuming rhs_dilation == 0): +// 1. dilated_input_dim = lower_padding + upper_padding +// + lhs_dilation * (in_size - 1) + 1 +// 2. 
dilated_input_dim = (2 * dims.kernel-size - 1) +// + dims.stride * (out_size - 1) +int64 CalculateUpperPadding(int64 in_size, int64 out_size, int64 kernel_size, + int64 stride) { + return (2 * kernel_size - 1) + (out_size - 1) * stride - (kernel_size - 1) - + 1 - (kernel_size * (in_size - 1)); +} + // Form a 2D convolution kernel like: // 1 2 3 2 1 // 2 4 6 4 2 @@ -173,7 +191,8 @@ xla::XlaOp ResizeUsingDilationAndConvolution(xla::XlaBuilder* builder, const int num_spatial_dims, std::vector in_size, std::vector out_size, - const int64 channels) { + const int64 channels, + const bool align_corners) { // Picture for a 1x3 to 1x4 resize: // stride = 2, kernel size = 3 // Input: @@ -198,27 +217,82 @@ xla::XlaOp ResizeUsingDilationAndConvolution(xla::XlaBuilder* builder, dimension_numbers.set_kernel_output_feature_dimension(num_spatial_dims); ResizeConvolutionDims dims = - ComputeResizeConvolutionParameters(in_size, out_size); + ComputeResizeConvolutionParameters(in_size, out_size, align_corners); xla::XlaOp output; + + // Concatenation and padding below currently assumes num_spatial_dims is 2 to + // prevent needless code complexity. + CHECK_EQ(num_spatial_dims, 2) + << "ResizeUsingDilationAndConvolution pads only 2 dimensions currently."; + std::vector upper_padding(num_spatial_dims); + for (int i = 0; i < num_spatial_dims; ++i) { + upper_padding[i] = dims.kernel_size[i] - 1; + } + xla::XlaOp input_data = input; + + if (!align_corners) { + // When Tensorflow does not align_corners, the resize indexing can access + // beyond the upper bound and is instead clamped to prevent out of bounds + // reads. This is conceptually the same as extending the edges of the input. + // We emulate this by copying the last row/column of the input. + // Calculate what padding would be needed then determine how far to extend + // the border before lhs dilation. 
+ std::vector num_extended(num_spatial_dims); + upper_padding[0] = CalculateUpperPadding( + in_size[0], out_size[0], dims.kernel_size[0], dims.stride[0]); + upper_padding[1] = CalculateUpperPadding( + in_size[1], out_size[1], dims.kernel_size[1], dims.stride[1]); + num_extended[0] = upper_padding[0] / (dims.kernel_size[0]); + num_extended[1] = upper_padding[1] / (dims.kernel_size[1]); + + if (num_extended[0] > 0) { + auto slice = + xla::Slice(input_data, {0, in_size[0] - 1, 0, 0}, + {1, in_size[0], in_size[1], channels}, {1, 1, 1, 1}); + for (int i = 0; i < num_extended[0]; i++) { + input_data = xla::ConcatInDim(builder, {input_data, slice}, 1); + } + } + + if (num_extended[1] > 0) { + auto slice = + xla::Slice(input_data, {0, 0, in_size[1] - 1, 0}, + {1, in_size[0] + num_extended[0], in_size[1], channels}, + {1, 1, 1, 1}); + for (int i = 0; i < num_extended[1]; i++) { + input_data = xla::ConcatInDim(builder, {input_data, slice}, 2); + } + } + + // Setting in_size to (in_size + num_extended) due to the above Slice and + // ConcatInDim. Recalculate needed padding after the above Slice/Concat. + upper_padding[0] = + CalculateUpperPadding(in_size[0] + num_extended[0], out_size[0], + dims.kernel_size[0], dims.stride[0]); + upper_padding[1] = + CalculateUpperPadding(in_size[1] + num_extended[1], out_size[1], + dims.kernel_size[1], dims.stride[1]); + } + // Split convolutions into independent dimensions if they would be a very // large kernel. 
if (dims.kernel_size[0] * dims.kernel_size[1] < kMax2DKernelSize) { xla::XlaOp kernel = MakeBilinearResizeKernel(builder, dims.kernel_size, channels); - output = xla::ConvGeneralDilated( - input, kernel, dims.stride, - /*padding=*/ - {{dims.kernel_size[0] - 1, dims.kernel_size[0] - 1}, - {dims.kernel_size[1] - 1, dims.kernel_size[1] - 1}}, - /*lhs_dilation=*/dims.kernel_size, - /*rhs_dilation=*/{1, 1}, dimension_numbers); + output = + xla::ConvGeneralDilated(input_data, kernel, dims.stride, + /*padding=*/ + {{dims.kernel_size[0] - 1, upper_padding[0]}, + {dims.kernel_size[1] - 1, upper_padding[1]}}, + /*lhs_dilation=*/dims.kernel_size, + /*rhs_dilation=*/{1, 1}, dimension_numbers); } else { xla::XlaOp kernel0 = MakeBilinearResizeKernelInDim(builder, dims.kernel_size, channels, 0); output = xla::ConvGeneralDilated( - input, kernel0, {dims.stride[0], 1}, + input_data, kernel0, {dims.stride[0], 1}, /*padding=*/ - {{dims.kernel_size[0] - 1, dims.kernel_size[0] - 1}, {0, 0}}, + {{dims.kernel_size[0] - 1, upper_padding[0]}, {0, 0}}, /*lhs_dilation=*/{dims.kernel_size[0], 1}, /*rhs_dilation=*/{1, 1}, dimension_numbers); xla::XlaOp kernel1 = @@ -226,7 +300,7 @@ xla::XlaOp ResizeUsingDilationAndConvolution(xla::XlaBuilder* builder, output = xla::ConvGeneralDilated( output, kernel1, {1, dims.stride[1]}, /*padding=*/ - {{0, 0}, {dims.kernel_size[1] - 1, dims.kernel_size[1] - 1}}, + {{0, 0}, {dims.kernel_size[1] - 1, upper_padding[1]}}, /*lhs_dilation=*/{1, dims.kernel_size[1]}, /*rhs_dilation=*/{1, 1}, dimension_numbers); } @@ -247,9 +321,10 @@ xla::XlaOp ResizeUsingDilationAndConvolutionGradOp(xla::XlaBuilder* builder, const int num_spatial_dims, std::vector in_size, std::vector grad_size, - const int64 channels) { + const int64 channels, + const bool align_corners) { ResizeConvolutionDims dims = - ComputeResizeConvolutionParameters(in_size, grad_size); + ComputeResizeConvolutionParameters(in_size, grad_size, align_corners); // To form the backward convolution, we keep the 
kernel unchanged (it is // already symmetric) and swap the roles of strides and LHS dilation. @@ -343,10 +418,6 @@ class ResizeBilinearOp : public XlaOpKernel { public: explicit ResizeBilinearOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("align_corners", &align_corners_)); - OP_REQUIRES( - ctx, align_corners_ == true, - errors::Unimplemented( - "ResizeBilinear with align_corners=False is not yet implemented")); } void Compile(XlaOpKernelContext* ctx) override { @@ -379,20 +450,19 @@ class ResizeBilinearOp : public XlaOpKernel { // If in_size[i] > 1 and out_size[i] == 1, slice out the first input in // dimension i. - std::vector slice_size = in_size; bool slice_input = false; for (int i = 0; i < num_spatial_dims; ++i) { if (in_size[i] > 1 && out_size[i] == 1) { // If in_size[i] > 1 but out_size[i] == 1, then we slice out the first // entry before resizing. slice_input = true; - slice_size[i] = 1; + in_size[i] = 1; } } if (slice_input) { - input = xla::Slice(input, {0, 0, 0, 0}, - {batch, slice_size[0], slice_size[1], channels}, - {1, 1, 1, 1}); + input = + xla::Slice(input, {0, 0, 0, 0}, + {batch, in_size[0], in_size[1], channels}, {1, 1, 1, 1}); } // Output is always type float. @@ -408,6 +478,9 @@ class ResizeBilinearOp : public XlaOpKernel { // operations along different dimensions. // Given sufficient numerical stability and a cxd is same as resizing axb -> exf -> cxd. + // This does not work in the case of align_corners_=false because of special + // padding requirements that cause multiple resizes to be very different + // from a single resize. // // This makes the convolutions kernels smaller and the operation faster. 
xla::XlaOp output = input; @@ -417,21 +490,24 @@ class ResizeBilinearOp : public XlaOpKernel { (static_cast(out_size[0]) - 1) / ((in_size[0] - 1) * 2), (static_cast(out_size[1]) - 1) / ((in_size[1] - 1) * 2)}; if ((k[0] == std::floor(k[0])) && (k[1] == std::floor(k[1])) && - k[0] > 1 && k[1] > 1) { + k[0] > 1 && k[1] > 1 && align_corners_) { std::vector next_out_size = {(in_size[0] - 1) * 2 + 1, (in_size[1] - 1) * 2 + 1}; - output = ResizeUsingDilationAndConvolution( - b, input, num_spatial_dims, in_size, next_out_size, channels); + output = ResizeUsingDilationAndConvolution(b, input, num_spatial_dims, + in_size, next_out_size, + channels, align_corners_); input = output; in_size = next_out_size; } else { - output = ResizeUsingDilationAndConvolution( - b, input, num_spatial_dims, in_size, out_size, channels); + output = ResizeUsingDilationAndConvolution(b, input, num_spatial_dims, + in_size, out_size, + channels, align_corners_); in_size = out_size; } } else { output = ResizeUsingDilationAndConvolution(b, input, num_spatial_dims, - in_size, out_size, channels); + in_size, out_size, channels, + align_corners_); in_size = out_size; } } @@ -511,17 +587,20 @@ class ResizeBilinearGradOp : public XlaOpKernel { std::vector next_grad_size = {(in_size[0] - 1) * 2 + 1, (in_size[1] - 1) * 2 + 1}; output = ResizeUsingDilationAndConvolutionGradOp( - b, grad, num_spatial_dims, in_size, next_grad_size, channels); + b, grad, num_spatial_dims, in_size, next_grad_size, channels, + align_corners_); grad = output; in_size = next_grad_size; } else { output = ResizeUsingDilationAndConvolutionGradOp( - b, grad, num_spatial_dims, in_size, grad_size, channels); + b, grad, num_spatial_dims, in_size, grad_size, channels, + align_corners_); in_size = grad_size; } } else { output = ResizeUsingDilationAndConvolutionGradOp( - b, grad, num_spatial_dims, in_size, grad_size, channels); + b, grad, num_spatial_dims, in_size, grad_size, channels, + align_corners_); in_size = grad_size; } } -- GitLab 
From 5968329fba2801e48758b1fc44f500572aa1f952 Mon Sep 17 00:00:00 2001 From: Sanjoy Das Date: Fri, 24 Aug 2018 11:38:02 -0700 Subject: [PATCH 098/598] Make compile_options a mandatory const-ref argument. PiperOrigin-RevId: 210130976 --- tensorflow/compiler/jit/kernels/xla_launch_op.cc | 2 +- tensorflow/compiler/jit/xla_compilation_cache.cc | 15 +++++++-------- tensorflow/compiler/jit/xla_compilation_cache.h | 6 +++--- .../compiler/jit/xla_compile_on_demand_op.cc | 2 +- 4 files changed, 12 insertions(+), 13 deletions(-) diff --git a/tensorflow/compiler/jit/kernels/xla_launch_op.cc b/tensorflow/compiler/jit/kernels/xla_launch_op.cc index ddb27a38ae..fde4135bf7 100644 --- a/tensorflow/compiler/jit/kernels/xla_launch_op.cc +++ b/tensorflow/compiler/jit/kernels/xla_launch_op.cc @@ -187,7 +187,7 @@ void XlaLocalLaunchBase::Compute(OpKernelContext* ctx) { OP_REQUIRES_OK( ctx, cache->Compile(options, function_, constant_args, variables, ctx, - &kernel, &executable, &compile_options)); + &kernel, &executable, compile_options)); VLOG(1) << "Executing XLA Computation..."; diff --git a/tensorflow/compiler/jit/xla_compilation_cache.cc b/tensorflow/compiler/jit/xla_compilation_cache.cc index 7140d47a94..ef6b0e67d3 100644 --- a/tensorflow/compiler/jit/xla_compilation_cache.cc +++ b/tensorflow/compiler/jit/xla_compilation_cache.cc @@ -230,7 +230,7 @@ Status XlaCompilationCache::Compile( const std::map& variable_args, OpKernelContext* ctx, const XlaCompiler::CompilationResult** compilation_result, xla::LocalExecutable** executable, - const XlaCompiler::CompileOptions* compile_options) { + const XlaCompiler::CompileOptions& compile_options) { return CompileImpl(options, function, constant_args, variable_args, ctx, compilation_result, executable, compile_options, false); } @@ -241,7 +241,7 @@ Status XlaCompilationCache::CompileSingleOp( const std::map& variable_args, OpKernelContext* ctx, const XlaCompiler::CompilationResult** compilation_result, xla::LocalExecutable** executable, - 
const XlaCompiler::CompileOptions* compile_options) { + const XlaCompiler::CompileOptions& compile_options) { const NodeDef& def = ctx->op_kernel().def(); NameAttrList name; name.set_name(def.op()); @@ -256,7 +256,7 @@ Status XlaCompilationCache::CompileImpl( const std::map& variable_args, OpKernelContext* ctx, const XlaCompiler::CompilationResult** compilation_result, xla::LocalExecutable** executable, - const XlaCompiler::CompileOptions* compile_options, + const XlaCompiler::CompileOptions& compile_options, bool compile_single_op) { CHECK_NE(executable, nullptr); VLOG(1) << "XlaCompilationCache::Compile " << DebugString(); @@ -324,13 +324,12 @@ Status XlaCompilationCache::CompileImpl( entry->compiled = true; if (compile_single_op) { - entry->compilation_status = compiler.CompileSingleOp( - compile_options ? *compile_options : XlaCompiler::CompileOptions(), - signature.name, ctx, args, &entry->compilation_result); + entry->compilation_status = + compiler.CompileSingleOp(compile_options, signature.name, ctx, args, + &entry->compilation_result); } else { entry->compilation_status = compiler.CompileFunction( - compile_options ? 
*compile_options : XlaCompiler::CompileOptions(), - function, args, &entry->compilation_result); + compile_options, function, args, &entry->compilation_result); } TF_RETURN_IF_ERROR(entry->compilation_status); CHECK_EQ(entry->executable.get(), nullptr); diff --git a/tensorflow/compiler/jit/xla_compilation_cache.h b/tensorflow/compiler/jit/xla_compilation_cache.h index fc5f008f4f..10ad87e38c 100644 --- a/tensorflow/compiler/jit/xla_compilation_cache.h +++ b/tensorflow/compiler/jit/xla_compilation_cache.h @@ -70,7 +70,7 @@ class XlaCompilationCache : public ResourceBase { OpKernelContext* ctx, const XlaCompiler::CompilationResult** compilation_result, xla::LocalExecutable** executable, - const XlaCompiler::CompileOptions* compile_options); + const XlaCompiler::CompileOptions& compile_options); // As above, but calls XlaCompiler::CompileSingleOp instead of // XlaCompiler::CompileFunction. @@ -80,7 +80,7 @@ class XlaCompilationCache : public ResourceBase { const std::map& variable_args, OpKernelContext* ctx, const XlaCompiler::CompilationResult** compilation_result, xla::LocalExecutable** executable, - const XlaCompiler::CompileOptions* compile_options); + const XlaCompiler::CompileOptions& compile_options); xla::LocalClient* client() const { return client_; } const DeviceType& device_type() const { return device_type_; } @@ -96,7 +96,7 @@ class XlaCompilationCache : public ResourceBase { OpKernelContext* ctx, const XlaCompiler::CompilationResult** compilation_result, xla::LocalExecutable** executable, - const XlaCompiler::CompileOptions* compile_options, + const XlaCompiler::CompileOptions& compile_options, bool compile_single_op); // Takes `result` which has been compiled from a Tensorflow subgraph to a diff --git a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc index dd84fb34c1..3ba48e8c31 100644 --- a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc +++ 
b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc @@ -177,7 +177,7 @@ Status XlaCompileOnDemandOp::Compile( std::map variable_args = GetVariables(ctx); return cache->CompileSingleOp(options, constant_arguments, variable_args, ctx, - result, executable, &compile_options); + result, executable, compile_options); } void XlaCompileOnDemandOp::Compute(OpKernelContext* ctx) { -- GitLab From 1b2d917a597bfde092ab62a15413aa3eba54f259 Mon Sep 17 00:00:00 2001 From: Sanjoy Das Date: Fri, 24 Aug 2018 12:03:00 -0700 Subject: [PATCH 099/598] [TF:XLA] Bump open source llvm revision to r340606 PiperOrigin-RevId: 210134848 --- tensorflow/workspace.bzl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index c4c4bf46bf..34b4a66c41 100644 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -493,11 +493,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): tf_http_archive( name = "llvm", urls = [ - "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/271b93a605571db9bb9755756656071d82d29852.tar.gz", - "https://github.com/llvm-mirror/llvm/archive/271b93a605571db9bb9755756656071d82d29852.tar.gz", + "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/97d7bcd5c024ee6aec4eecbc723bb6d4f4c3dc3d.tar.gz", + "https://github.com/llvm-mirror/llvm/archive/97d7bcd5c024ee6aec4eecbc723bb6d4f4c3dc3d.tar.gz", ], - sha256 = "fdd914bd0174fc7c0cd9d94fee9e8cc538c1df4871eea0d1df831fd3de040579", - strip_prefix = "llvm-271b93a605571db9bb9755756656071d82d29852", + sha256 = "2889b79ab979e676e344974cfeefbaf2c21c7c69a015bd584e8ae67b87b136bc", + strip_prefix = "llvm-97d7bcd5c024ee6aec4eecbc723bb6d4f4c3dc3d", build_file = clean_dep("//third_party/llvm:llvm.autogenerated.BUILD"), ) -- GitLab From 50167b8c99767648945e26cd04e25f9ad02066ae Mon Sep 17 00:00:00 2001 From: Shivani Agrawal Date: Fri, 24 Aug 2018 12:06:04 -0700 Subject: [PATCH 100/598] [tf.data] Adds test for `ParseExampleDataset` 
serialization. PiperOrigin-RevId: 210135480 --- .../python/kernel_tests/serialization/BUILD | 13 +++++ .../dataset_serialization_test_base.py | 42 ++++++++++------ ...arse_example_dataset_serialization_test.py | 50 +++++++++++++++++++ tensorflow/contrib/data/python/ops/BUILD | 1 - tensorflow/core/kernels/data/BUILD | 1 + .../kernels/data/parse_example_dataset_op.cc | 8 +-- 6 files changed, 94 insertions(+), 21 deletions(-) create mode 100644 tensorflow/contrib/data/python/kernel_tests/serialization/parse_example_dataset_serialization_test.py diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/BUILD b/tensorflow/contrib/data/python/kernel_tests/serialization/BUILD index 7b9ea191a4..4881f63ab9 100644 --- a/tensorflow/contrib/data/python/kernel_tests/serialization/BUILD +++ b/tensorflow/contrib/data/python/kernel_tests/serialization/BUILD @@ -317,6 +317,19 @@ py_test( ], ) +py_test( + name = "parse_example_dataset_serialization_test", + size = "medium", + srcs = ["parse_example_dataset_serialization_test.py"], + srcs_version = "PY2AND3", + tags = ["no_pip"], + deps = [ + ":dataset_serialization_test_base", + "//tensorflow/contrib/data/python/kernel_tests:reader_dataset_ops_test_base", + "//tensorflow/python:client_testlib", + ], +) + py_test( name = "prefetch_dataset_serialization_test", size = "small", diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/dataset_serialization_test_base.py b/tensorflow/contrib/data/python/kernel_tests/serialization/dataset_serialization_test_base.py index 9fdbcb66bf..595cecef4d 100644 --- a/tensorflow/contrib/data/python/kernel_tests/serialization/dataset_serialization_test_base.py +++ b/tensorflow/contrib/data/python/kernel_tests/serialization/dataset_serialization_test_base.py @@ -510,7 +510,6 @@ class DatasetSerializationTestBase(test.TestCase): else: init_op, get_next_op, saver = self._build_graph( ds_fn, sparse_tensors=sparse_tensors) - get_next_op = remove_variants(get_next_op) return 
init_op, get_next_op, saver for i in range(len(break_points) + 1): @@ -616,29 +615,40 @@ class DatasetSerializationTestBase(test.TestCase): # `get_next` may be a tuple e.g. in TensorSliceDataset. Since Collections # do not support tuples we flatten the tensors and restore the shape in # `_get_iterator_ops_from_collection`. - - # TODO(shivaniagrwal): `output_classes` is a nested structure of classes, - # this base class is specific to current test cases. Update when tests are - # added with `output_classes` as a nested structure with at least one of the - # component being `tf.SparseTensor`. - if (sparse_tensors or - self._get_output_classes(ds_fn) is sparse_tensor.SparseTensor): + if sparse_tensors: # specific for deprecated `from_sparse_tensor_slices`. ops.add_to_collection("iterator_ops", get_next.indices) ops.add_to_collection("iterator_ops", get_next.values) ops.add_to_collection("iterator_ops", get_next.dense_shape) - else: - for el in nest.flatten(get_next): - ops.add_to_collection("iterator_ops", el) + return + + get_next_list = nest.flatten(get_next) + for i, output_class in enumerate( + nest.flatten(self._get_output_classes(ds_fn))): + if output_class is sparse_tensor.SparseTensor: + ops.add_to_collection("iterator_ops", get_next_list[i].indices) + ops.add_to_collection("iterator_ops", get_next_list[i].values) + ops.add_to_collection("iterator_ops", get_next_list[i].dense_shape) + else: + ops.add_to_collection("iterator_ops", get_next_list[i]) def _get_iterator_ops_from_collection(self, ds_fn, sparse_tensors=False): all_ops = ops.get_collection("iterator_ops") - if (sparse_tensors or - self._get_output_classes(ds_fn) is sparse_tensor.SparseTensor): + if sparse_tensors: # specific for deprecated `from_sparse_tensor_slices`. 
init_op, indices, values, dense_shape = all_ops return init_op, sparse_tensor.SparseTensor(indices, values, dense_shape) - else: - return all_ops[0], nest.pack_sequence_as( - self._get_output_types(ds_fn), all_ops[1:]) + get_next_list = [] + i = 1 + for output_class in nest.flatten(self._get_output_classes(ds_fn)): + if output_class is sparse_tensor.SparseTensor: + indices, values, dense_shape = all_ops[i:i + 3] + i += 3 + get_next_list.append( + sparse_tensor.SparseTensor(indices, values, dense_shape)) + else: + get_next_list.append(all_ops[i]) + i += 1 + return all_ops[0], nest.pack_sequence_as( + self._get_output_types(ds_fn), get_next_list) def _get_output_types(self, ds_fn): with ops.Graph().as_default(): diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/parse_example_dataset_serialization_test.py b/tensorflow/contrib/data/python/kernel_tests/serialization/parse_example_dataset_serialization_test.py new file mode 100644 index 0000000000..d3fa84e74c --- /dev/null +++ b/tensorflow/contrib/data/python/kernel_tests/serialization/parse_example_dataset_serialization_test.py @@ -0,0 +1,50 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for the ParseExampleDataset serialization.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.data.python.kernel_tests import reader_dataset_ops_test_base +from tensorflow.contrib.data.python.kernel_tests.serialization import dataset_serialization_test_base +from tensorflow.python.platform import test + + +class ParseExampleDatasetSerializationTest( + reader_dataset_ops_test_base.ReadBatchFeaturesTestBase, + dataset_serialization_test_base.DatasetSerializationTestBase): + + def ParseExampleDataset(self, num_repeat, batch_size): + return self.make_batch_feature( + filenames=self.test_filenames, + num_epochs=num_repeat, + batch_size=batch_size, + reader_num_threads=5, + parser_num_threads=10) + + def testSerializationCore(self): + num_repeat = 5 + batch_size = 2 + num_outputs = self._num_records * self._num_files * num_repeat // batch_size + # pylint: disable=g-long-lambda + self.run_core_tests( + lambda: self.ParseExampleDataset( + num_repeat=num_repeat, batch_size=batch_size), + lambda: self.ParseExampleDataset(num_repeat=10, batch_size=4), + num_outputs) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/contrib/data/python/ops/BUILD b/tensorflow/contrib/data/python/ops/BUILD index 0bd5b403e2..4b45cc7e36 100644 --- a/tensorflow/contrib/data/python/ops/BUILD +++ b/tensorflow/contrib/data/python/ops/BUILD @@ -82,7 +82,6 @@ py_library( ":interleave_ops", ":parsing_ops", ":shuffle_ops", - ":stats_ops", "//tensorflow/python:constant_op", "//tensorflow/python:dataset_ops_gen", "//tensorflow/python:dtypes", diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD index 82ff2a365d..7716043055 100644 --- a/tensorflow/core/kernels/data/BUILD +++ b/tensorflow/core/kernels/data/BUILD @@ -237,6 +237,7 @@ cc_library( srcs = 
["parse_example_dataset_op.cc"], deps = [ ":parallel_map_iterator", + "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", ], ) diff --git a/tensorflow/core/kernels/data/parse_example_dataset_op.cc b/tensorflow/core/kernels/data/parse_example_dataset_op.cc index cc5007ee92..6a0522e4f3 100644 --- a/tensorflow/core/kernels/data/parse_example_dataset_op.cc +++ b/tensorflow/core/kernels/data/parse_example_dataset_op.cc @@ -14,6 +14,7 @@ limitations under the License. ==============================================================================*/ #include +#include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/framework/stats_aggregator.h" #include "tensorflow/core/kernels/data/parallel_map_iterator.h" #include "tensorflow/core/util/example_proto_fast_parsing.h" @@ -166,8 +167,6 @@ class ParseExampleDatasetOp : public UnaryDatasetOpKernel { const std::vector& output_shapes) : DatasetBase(DatasetContext(ctx)), input_(input), - device_threadpool_( - ctx->device()->tensorflow_cpu_worker_threads()->workers), dense_defaults_(std::move(dense_defaults)), sparse_keys_(std::move(sparse_keys)), dense_keys_(std::move(dense_keys)), @@ -190,6 +189,8 @@ class ParseExampleDatasetOp : public UnaryDatasetOpKernel { std::vector input_element, std::vector* result, StatusCallback done) { (*ctx->runner())([this, ctx, input_element, result, done]() { + thread::ThreadPool* device_threadpool = + ctx->lib()->device()->tensorflow_cpu_worker_threads()->workers; std::vector slice_vec; for (Tensor t : input_element) { auto serialized_t = t.flat(); @@ -205,7 +206,7 @@ class ParseExampleDatasetOp : public UnaryDatasetOpKernel { config.collect_feature_stats = true; } example::Result example_result; - Status s = FastParseExample(config, slice_vec, {}, device_threadpool_, + Status s = FastParseExample(config, slice_vec, {}, device_threadpool, &example_result); if (s.ok()) { (*result).resize(key_to_output_index_.size()); @@ -339,7 +340,6 @@ class 
ParseExampleDatasetOp : public UnaryDatasetOpKernel { private: const DatasetBase* const input_; - thread::ThreadPool* const device_threadpool_; const std::vector dense_defaults_; const std::vector sparse_keys_; const std::vector dense_keys_; -- GitLab From a9e0c06a87857f73fdfc375abf7c69a2c28d87bf Mon Sep 17 00:00:00 2001 From: Karmel Allison Date: Fri, 24 Aug 2018 12:57:28 -0700 Subject: [PATCH 101/598] Adds Eager execution support to convenience functions for building serving input receiver functions. PiperOrigin-RevId: 210142408 --- tensorflow/python/estimator/export/export.py | 22 ++++++++++++++++++- .../python/estimator/export/export_test.py | 7 ++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/estimator/export/export.py b/tensorflow/python/estimator/export/export.py index 7723fcce74..55aace5fa9 100644 --- a/tensorflow/python/estimator/export/export.py +++ b/tensorflow/python/estimator/export/export.py @@ -311,13 +311,33 @@ def build_parsing_serving_input_receiver_fn(feature_spec, def _placeholder_from_tensor(t, default_batch_size=None): + """Creates a placeholder that matches the dtype and shape of passed tensor. + + Args: + t: Tensor or EagerTensor + default_batch_size: the number of query examples expected per batch. + Leave unset for variable batch size (recommended). + + Returns: + Placeholder that matches the passed tensor. + """ batch_shape = tensor_shape.TensorShape([default_batch_size]) shape = batch_shape.concatenate(t.get_shape()[1:]) # Reuse the feature tensor's op name (t.op.name) for the placeholder, # excluding the index from the tensor's name (t.name): # t.name = "%s:%d" % (t.op.name, t._value_index) - return array_ops.placeholder(dtype=t.dtype, shape=shape, name=t.op.name) + try: + name = t.op.name + except AttributeError: + # In Eager mode, tensors don't have ops or names, and while they do have + # IDs, those are not maintained across runs. 
The name here is used + # primarily for debugging, and is not critical to the placeholder. + # So, in order to make this Eager-compatible, continue with an empty + # name if none is available. + name = None + + return array_ops.placeholder(dtype=t.dtype, shape=shape, name=name) def _placeholders_from_receiver_tensors_dict(input_vals, diff --git a/tensorflow/python/estimator/export/export_test.py b/tensorflow/python/estimator/export/export_test.py index e87b88327f..3eed1ab163 100644 --- a/tensorflow/python/estimator/export/export_test.py +++ b/tensorflow/python/estimator/export/export_test.py @@ -416,6 +416,7 @@ class ExportTest(test_util.TensorFlowTestCase): tensor_shape.unknown_shape(), v.receiver_tensors["feature_2"].shape) + @test_util.run_in_graph_and_eager_modes def test_build_raw_serving_input_receiver_fn(self): features = {"feature_1": constant_op.constant(["hello"]), "feature_2": constant_op.constant([42])} @@ -434,6 +435,7 @@ class ExportTest(test_util.TensorFlowTestCase): dtypes.int32, serving_input_receiver.receiver_tensors["feature_2"].dtype) + @test_util.run_in_graph_and_eager_modes def test_build_raw_supervised_input_receiver_fn(self): features = {"feature_1": constant_op.constant(["hello"]), "feature_2": constant_op.constant([42])} @@ -454,6 +456,7 @@ class ExportTest(test_util.TensorFlowTestCase): self.assertEqual( dtypes.int32, input_receiver.receiver_tensors["feature_2"].dtype) + @test_util.run_in_graph_and_eager_modes def test_build_raw_supervised_input_receiver_fn_raw_tensors(self): features = {"feature_1": constant_op.constant(["hello"]), "feature_2": constant_op.constant([42])} @@ -477,6 +480,7 @@ class ExportTest(test_util.TensorFlowTestCase): self.assertEqual(set(["input", "label"]), set(input_receiver.receiver_tensors.keys())) + @test_util.run_in_graph_and_eager_modes def test_build_raw_supervised_input_receiver_fn_batch_size(self): features = {"feature_1": constant_op.constant(["hello"]), "feature_2": constant_op.constant([42])} @@ -489,6 
+493,7 @@ class ExportTest(test_util.TensorFlowTestCase): self.assertEqual([10], input_receiver.receiver_tensors["feature_1"].shape) self.assertEqual([10], input_receiver.features["feature_1"].shape) + @test_util.run_in_graph_and_eager_modes def test_build_raw_supervised_input_receiver_fn_overlapping_keys(self): features = {"feature_1": constant_op.constant(["hello"]), "feature_2": constant_op.constant([42])} @@ -497,6 +502,7 @@ class ExportTest(test_util.TensorFlowTestCase): with self.assertRaises(ValueError): export.build_raw_supervised_input_receiver_fn(features, labels) + @test_util.run_in_graph_and_eager_modes def test_build_supervised_input_receiver_fn_from_input_fn(self): def dummy_input_fn(): return ({"x": constant_op.constant([[1], [1]]), @@ -514,6 +520,7 @@ class ExportTest(test_util.TensorFlowTestCase): self.assertEqual(set(["x", "y", "label"]), set(input_receiver.receiver_tensors.keys())) + @test_util.run_in_graph_and_eager_modes def test_build_supervised_input_receiver_fn_from_input_fn_args(self): def dummy_input_fn(feature_key="x"): return ({feature_key: constant_op.constant([[1], [1]]), -- GitLab From 94d267dfa6ee106dbf57c42a452925749bbe2f1a Mon Sep 17 00:00:00 2001 From: Justin Lebar Date: Fri, 24 Aug 2018 13:19:52 -0700 Subject: [PATCH 102/598] Internal change PiperOrigin-RevId: 210145594 --- tensorflow/compiler/xla/packed_literal_reader.cc | 4 ++-- tensorflow/compiler/xla/service/hlo_lexer.cc | 6 +++--- tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc | 2 +- .../compiler/xla/tools/hex_floats_to_packed_literal.cc | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tensorflow/compiler/xla/packed_literal_reader.cc b/tensorflow/compiler/xla/packed_literal_reader.cc index 55c4a80e29..012df87551 100644 --- a/tensorflow/compiler/xla/packed_literal_reader.cc +++ b/tensorflow/compiler/xla/packed_literal_reader.cc @@ -64,7 +64,7 @@ StatusOr> PackedLiteralReader::Read( tensorflow::gtl::ArraySlice field = result->data(); char* data = 
tensorflow::bit_cast(field.data()); uint64 bytes = elements * sizeof(float); - tensorflow::StringPiece sp; + tensorflow::StringPiece sp; // non-absl OK auto s = file_->Read(offset_, bytes, &sp, data); offset_ += sp.size(); if (!s.ok()) { @@ -85,7 +85,7 @@ bool PackedLiteralReader::IsExhausted() const { // Try to read a single byte from offset_. If we can't, we've // exhausted the data. char single_byte[1]; - tensorflow::StringPiece sp; + tensorflow::StringPiece sp; // non-absl OK auto s = file_->Read(offset_, sizeof(single_byte), &sp, single_byte); return !s.ok(); } diff --git a/tensorflow/compiler/xla/service/hlo_lexer.cc b/tensorflow/compiler/xla/service/hlo_lexer.cc index a2ad4c8315..0e49d343d6 100644 --- a/tensorflow/compiler/xla/service/hlo_lexer.cc +++ b/tensorflow/compiler/xla/service/hlo_lexer.cc @@ -409,9 +409,9 @@ TokKind HloLexer::LexString() { string error; // TODO(b/113077997): Change to absl::CUnescape once it works properly with // copy-on-write std::string implementations. - if (!tensorflow::str_util::CUnescape( - tensorflow::StringPiece(raw.data(), raw.size()), &str_val_, - &error)) { + if (!tensorflow::str_util::CUnescape( // non-absl ok + tensorflow::StringPiece(raw.data(), raw.size()), // non-absl ok + &str_val_, &error)) { LOG(ERROR) << "Failed unescaping string: " << raw << ". 
error: " << error; return TokKind::kError; } diff --git a/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc b/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc index f855212cc7..4876533449 100644 --- a/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc +++ b/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc @@ -36,7 +36,7 @@ using tensorflow::TensorShapeProto; string GetOpDefName(const HloInstruction* instruction) { string name = StrCat("hlo-", HloOpcodeString(instruction->opcode())); - tensorflow::str_util::TitlecaseString(&name, "-"); + tensorflow::str_util::TitlecaseString(&name, "-"); // non-absl ok name.erase(std::remove(name.begin(), name.end(), '-'), name.end()); if (instruction->opcode() == HloOpcode::kFusion) { diff --git a/tensorflow/compiler/xla/tools/hex_floats_to_packed_literal.cc b/tensorflow/compiler/xla/tools/hex_floats_to_packed_literal.cc index e0549b1c47..75b63c3b84 100644 --- a/tensorflow/compiler/xla/tools/hex_floats_to_packed_literal.cc +++ b/tensorflow/compiler/xla/tools/hex_floats_to_packed_literal.cc @@ -67,7 +67,7 @@ int main(int argc, char** argv) { floats.push_back(value); } - tensorflow::StringPiece content( + tensorflow::StringPiece content( // non-absl ok tensorflow::bit_cast(floats.data()), floats.size() * sizeof(float)); TF_CHECK_OK(tensorflow::WriteStringToFile(tensorflow::Env::Default(), -- GitLab From c31402273e6d60b4a53b28e372ef6c722a710495 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 24 Aug 2018 13:44:52 -0700 Subject: [PATCH 103/598] Make sure all assignments to a mirrored variable happen. Failure mode being fixed is when you session.run(assignment) and assignment is the MirroredVariable value returned by ResourceVariable.assign*, only one of the components of assignment is executed. Now that it is safer, allow session.run() on Mirrored values (not just MirroredVariables). 
PiperOrigin-RevId: 210149461 --- .../python/mirrored_strategy_multigpu_test.py | 14 ++++++++++- .../contrib/distribute/python/values.py | 23 +++++++++++++++++-- 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py index 1dfd80fb49..ac2697958d 100644 --- a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py +++ b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py @@ -888,8 +888,18 @@ class MirroredVariableUpdateTest(test.TestCase): self.assertIsInstance(mirrored_var, values.MirroredVariable) self.evaluate(variables.global_variables_initializer()) self.assertEquals(1.0, self.evaluate(mirrored_var)) - mirrored_var_result = self.evaluate(mirrored_var.assign_add(6.0)) + + # read_value == True + mirrored_var_result = self.evaluate( + mirrored_var.assign_add(6.0, read_value=True)) self.assertEquals(7.0, mirrored_var_result) + self.assertEquals(7.0, self.evaluate(mirrored_var.get("/device:CPU:0"))) + self.assertEquals(7.0, self.evaluate(mirrored_var.get("/device:GPU:0"))) + + # read_value == False + self.evaluate(mirrored_var.assign_add(2.0, read_value=False)) + self.assertEquals(9.0, self.evaluate(mirrored_var.get("/device:CPU:0"))) + self.assertEquals(9.0, self.evaluate(mirrored_var.get("/device:GPU:0"))) @test_util.run_in_graph_and_eager_modes(config=config) def testAssignAddMirroredVarTowerContext(self): @@ -956,6 +966,8 @@ class MirroredVariableUpdateTest(test.TestCase): self.assertEquals(5.0, self.evaluate(mirrored_var)) mirrored_var_result = self.evaluate(mirrored_var.assign_sub(2.0)) self.assertEquals(3.0, mirrored_var_result) + self.assertEquals(3.0, self.evaluate(mirrored_var.get("/device:GPU:0"))) + self.assertEquals(3.0, self.evaluate(mirrored_var.get("/device:CPU:0"))) @test_util.run_in_graph_and_eager_modes(config=config) def 
testAssignSubMirroredVarTowerContext(self): diff --git a/tensorflow/contrib/distribute/python/values.py b/tensorflow/contrib/distribute/python/values.py index 1b9fdef5b0..e73d9c193e 100644 --- a/tensorflow/contrib/distribute/python/values.py +++ b/tensorflow/contrib/distribute/python/values.py @@ -183,6 +183,14 @@ class Mirrored(DistributedDelegate): return self._index[device] return list(self._index.values())[0] + def _as_graph_element(self): + obj = self.get() + # pylint: disable=protected-access + conv_fn = getattr(obj, "_as_graph_element", None) + if conv_fn and callable(conv_fn): + return conv_fn() + return obj + def _assign_on_device(device, variable, tensor): with ops.device(device): @@ -354,8 +362,19 @@ class MirroredVariable(DistributedVariable, Mirrored, # We are calling assign on the mirrored variable in cross tower context, # use update to update the variable. - return distribution_strategy_context.get_distribution_strategy().update( - self, f, *args, **kwargs) + strategy = distribution_strategy_context.get_distribution_strategy() + updates = strategy.update(self, f, *args, **kwargs) + grouped = strategy.group(updates) + if isinstance(updates, DistributedValues) and updates.is_tensor_like: + # Make sure we run all updates. Without this, something like + # session.run(mirrored_var.assign*(...)) may only update one tower. 
+ index = {} + for d in updates.devices: + with ops.device(d), ops.control_dependencies([grouped]): + index[d] = array_ops.identity(updates.get(d)) + return Mirrored(index) + else: + return grouped else: _assert_tower_context() # We are calling an assign function on the mirrored variable in tower -- GitLab From 97950c6ae853afd1d58bd2e820602656d6951d95 Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Fri, 24 Aug 2018 13:54:11 -0700 Subject: [PATCH 104/598] Modify TPU Distribution Strategy to allow ResNet-based Distribution Strategy to be open sourced PiperOrigin-RevId: 210151074 --- tensorflow/contrib/distribute/python/tpu_strategy.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tensorflow/contrib/distribute/python/tpu_strategy.py b/tensorflow/contrib/distribute/python/tpu_strategy.py index 0ca7d403b3..6202a0750a 100644 --- a/tensorflow/contrib/distribute/python/tpu_strategy.py +++ b/tensorflow/contrib/distribute/python/tpu_strategy.py @@ -81,12 +81,13 @@ class TPUStrategy(one_device_strategy.OneDeviceStrategy): self._tpu_metadata = get_tpu_system_metadata(self._tpu_cluster_resolver) self._num_cores_override = num_cores - # TODO(priyag): This should not be hardcoded here. - self._host = '/device:CPU:0' # TODO(sourabhbajaj): Remove this once performance of running one step # at a time is comparable to multiple steps. self.steps_per_run = steps_per_run + # TODO(frankchn): This should not be hardcoded here for pod purposes. + self._host = self.tpu_host_cpu_device(0) + def distribute_dataset(self, dataset_fn): # TODO(priyag): Perhaps distribute across cores here. 
return self._call_dataset_fn(dataset_fn) @@ -263,3 +264,9 @@ class TPUStrategy(one_device_strategy.OneDeviceStrategy): @property def num_towers(self): return self._num_cores_override or self._tpu_metadata.num_cores + + def tpu_host_cpu_device(self, host_id): + if self._tpu_cluster_resolver.get_master() in ('', 'local'): + return '/replica:0/task:0/device:CPU:0' + return '/job:%s/task:%d/device:CPU:0' % ('tpu_worker', host_id) + -- GitLab From 652f7b13866577e61c5616ffce25aa2aa5ec585d Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 24 Aug 2018 13:58:21 -0700 Subject: [PATCH 105/598] Internal change. PiperOrigin-RevId: 210151786 --- .../contrib/lite/delegates/nnapi/nnapi_delegate_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate_test.cc b/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate_test.cc index c39013bb42..720d6b741e 100644 --- a/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate_test.cc +++ b/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate_test.cc @@ -2055,7 +2055,7 @@ class SVDFOpModel : public BaseSVDFOpModel { } }; -TEST(NNAPIDelegate, SVDFBlackBoxTestRank1) { +TEST(NNAPIDelegate, DISABLED_SVDFBlackBoxTestRank1) { SVDFOpModel svdf(/*batches=*/2, /*units=*/4, /*input_size=*/3, /*memory_size=*/10, /*rank=*/1); svdf.SetWeightsFeature({-0.31930989, -0.36118156, 0.0079667, 0.37613347, @@ -2078,7 +2078,7 @@ TEST(NNAPIDelegate, SVDFBlackBoxTestRank1) { svdf.VerifyGoldens(svdf_input, svdf_golden_output_rank_1, sizeof(svdf_input)); } -TEST(NNAPIDelegate, SVDFBlackBoxTestRank2) { +TEST(NNAPIDelegate, DISABLED_SVDFBlackBoxTestRank2) { SVDFOpModel svdf(/*batches=*/2, /*units=*/4, /*input_size=*/3, /*memory_size=*/10, /*rank=*/2); svdf.SetWeightsFeature({-0.31930989, 0.0079667, 0.39296314, 0.37613347, -- GitLab From 0b17e5d00b11ee84ec9454e3913d0605b57be4ab Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Fri, 24 Aug 2018 14:14:41 -0700 Subject: [PATCH 106/598] Automated rollback of commit 07c5a99c450acbceb6f472f9e3a48675286a8e34 PiperOrigin-RevId: 210154869 --- tensorflow/core/kernels/resize_bilinear_op.cc | 34 +++++---- .../kernels/resize_nearest_neighbor_op.cc | 75 ++++++++++--------- 2 files changed, 57 insertions(+), 52 deletions(-) diff --git a/tensorflow/core/kernels/resize_bilinear_op.cc b/tensorflow/core/kernels/resize_bilinear_op.cc index dde59e8e74..f10c9a19a7 100644 --- a/tensorflow/core/kernels/resize_bilinear_op.cc +++ b/tensorflow/core/kernels/resize_bilinear_op.cc @@ -277,13 +277,13 @@ struct ResizeBilinearGrad { typename TTypes::ConstTensor input_grad, const float height_scale, const float width_scale, typename TTypes::Tensor output_grad) { - const int batch = output_grad.dimension(0); - const int64 original_height = output_grad.dimension(1); - const int64 original_width = output_grad.dimension(2); - const int channels = output_grad.dimension(3); + const Eigen::Index batch = output_grad.dimension(0); + const Eigen::Index original_height = output_grad.dimension(1); + const Eigen::Index original_width = output_grad.dimension(2); + const Eigen::Index channels = output_grad.dimension(3); - const int64 resized_height = input_grad.dimension(1); - const int64 resized_width = input_grad.dimension(2); + const Eigen::Index resized_height = input_grad.dimension(1); + const Eigen::Index resized_width = input_grad.dimension(2); output_grad.setZero(); @@ -294,22 +294,24 @@ struct ResizeBilinearGrad { // + top_right * (1 - y) * x // + bottom_left * y * (1 - x) // + bottom_right * y * x - for (int64 b = 0; b < batch; ++b) { - for (int64 y = 0; y < resized_height; ++y) { + for (Eigen::Index b = 0; b < batch; ++b) { + for (Eigen::Index y = 0; y < resized_height; ++y) { const float in_y = y * height_scale; - const int64 top_y_index = static_cast(floorf(in_y)); - const int64 bottom_y_index = - std::min(static_cast(ceilf(in_y)), 
original_height - 1); + const Eigen::Index top_y_index = + static_cast(floorf(in_y)); + const Eigen::Index bottom_y_index = std::min( + static_cast(ceilf(in_y)), original_height - 1); const float y_lerp = in_y - top_y_index; const float inverse_y_lerp = (1.0f - y_lerp); - for (int64 x = 0; x < resized_width; ++x) { + for (Eigen::Index x = 0; x < resized_width; ++x) { const float in_x = x * width_scale; - const int64 left_x_index = static_cast(floorf(in_x)); - const int64 right_x_index = - std::min(static_cast(ceilf(in_x)), original_width - 1); + const Eigen::Index left_x_index = + static_cast(floorf(in_x)); + const Eigen::Index right_x_index = std::min( + static_cast(ceilf(in_x)), original_width - 1); const float x_lerp = in_x - left_x_index; const float inverse_x_lerp = (1.0f - x_lerp); - for (int64 c = 0; c < channels; ++c) { + for (Eigen::Index c = 0; c < channels; ++c) { output_grad(b, top_y_index, left_x_index, c) += T(input_grad(b, y, x, c) * inverse_y_lerp * inverse_x_lerp); output_grad(b, top_y_index, right_x_index, c) += diff --git a/tensorflow/core/kernels/resize_nearest_neighbor_op.cc b/tensorflow/core/kernels/resize_nearest_neighbor_op.cc index 8ec526c2b2..e985d3e5a5 100644 --- a/tensorflow/core/kernels/resize_nearest_neighbor_op.cc +++ b/tensorflow/core/kernels/resize_nearest_neighbor_op.cc @@ -88,25 +88,27 @@ struct ResizeNearestNeighbor { bool operator()(const CPUDevice& d, typename TTypes::ConstTensor input, const float height_scale, const float width_scale, typename TTypes::Tensor output) { - const int batch_size = input.dimension(0); - const int64 in_height = input.dimension(1); - const int64 in_width = input.dimension(2); - const int channels = input.dimension(3); - - const int64 out_height = output.dimension(1); - const int64 out_width = output.dimension(2); - - for (int b = 0; b < batch_size; ++b) { - for (int y = 0; y < out_height; ++y) { - const int64 in_y = std::min( - (align_corners) ? 
static_cast(roundf(y * height_scale)) - : static_cast(floorf(y * height_scale)), - in_height - 1); - for (int x = 0; x < out_width; ++x) { - const int64 in_x = std::min( - (align_corners) ? static_cast(roundf(x * width_scale)) - : static_cast(floorf(x * width_scale)), - in_width - 1); + const Eigen::Index batch_size = input.dimension(0); + const Eigen::Index in_height = input.dimension(1); + const Eigen::Index in_width = input.dimension(2); + const Eigen::Index channels = input.dimension(3); + + const Eigen::Index out_height = output.dimension(1); + const Eigen::Index out_width = output.dimension(2); + + for (Eigen::Index b = 0; b < batch_size; ++b) { + for (Eigen::Index y = 0; y < out_height; ++y) { + const Eigen::Index in_y = + std::min((align_corners) + ? static_cast(roundf(y * height_scale)) + : static_cast(floorf(y * height_scale)), + in_height - 1); + for (Eigen::Index x = 0; x < out_width; ++x) { + const Eigen::Index in_x = + std::min((align_corners) + ? static_cast(roundf(x * width_scale)) + : static_cast(floorf(x * width_scale)), + in_width - 1); std::copy_n(&input(b, in_y, in_x, 0), channels, &output(b, y, x, 0)); } } @@ -199,28 +201,29 @@ struct ResizeNearestNeighborGrad { bool operator()(const CPUDevice& d, typename TTypes::ConstTensor input, const float height_scale, const float width_scale, typename TTypes::Tensor output) { - const int batch_size = input.dimension(0); - const int64 in_height = input.dimension(1); - const int64 in_width = input.dimension(2); - const int channels = input.dimension(3); + const Eigen::Index batch_size = input.dimension(0); + const Eigen::Index in_height = input.dimension(1); + const Eigen::Index in_width = input.dimension(2); + const Eigen::Index channels = input.dimension(3); - const int64 out_height = output.dimension(1); - const int64 out_width = output.dimension(2); + const Eigen::Index out_height = output.dimension(1); + const Eigen::Index out_width = output.dimension(2); output.setZero(); - for (int y = 0; y < 
in_height; ++y) { - const int64 out_y = std::min( - (align_corners) ? static_cast(roundf(y * height_scale)) - : static_cast(floorf(y * height_scale)), + for (Eigen::Index y = 0; y < in_height; ++y) { + const Eigen::Index out_y = std::min( + (align_corners) ? static_cast(roundf(y * height_scale)) + : static_cast(floorf(y * height_scale)), out_height - 1); - for (int x = 0; x < in_width; ++x) { - const int64 out_x = std::min( - (align_corners) ? static_cast(roundf(x * width_scale)) - : static_cast(floorf(x * width_scale)), - out_width - 1); - for (int b = 0; b < batch_size; ++b) { - for (int c = 0; c < channels; ++c) { + for (Eigen::Index x = 0; x < in_width; ++x) { + const Eigen::Index out_x = + std::min((align_corners) + ? static_cast(roundf(x * width_scale)) + : static_cast(floorf(x * width_scale)), + out_width - 1); + for (Eigen::Index b = 0; b < batch_size; ++b) { + for (Eigen::Index c = 0; c < channels; ++c) { output(b, out_y, out_x, c) += input(b, y, x, c); } } -- GitLab From 2bfd7f4ac7f627cd63c2e723fbcfd74e2daaee4b Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 24 Aug 2018 14:15:10 -0700 Subject: [PATCH 107/598] Quantize mean operator for uint8. PiperOrigin-RevId: 210154945 --- .../internal/reference/reference_ops.h | 64 +++++++++++++++++++ tensorflow/contrib/lite/kernels/reduce.cc | 26 ++++++-- .../contrib/lite/kernels/reduce_test.cc | 27 ++++++++ 3 files changed, 112 insertions(+), 5 deletions(-) diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h index 3492a6c2f9..ff77f61191 100644 --- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h +++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h @@ -4224,6 +4224,70 @@ inline void Mean(const T* input_data, const Dims<4>& input_dims, } } +// Computes the mean of elements across dimensions given in axis. 
+// It does so in two stages, first calculates the sum of elements along the axis +// then divides it by the number of element in axis for quantized values. +template +inline bool Mean(const T* input_data, int32 input_zero_point, float input_scale, + const int* input_dims, const int input_num_dims, + T* output_data, int32 output_zero_point, float output_scale, + const int* output_dims, const int output_num_dims, + const int* axis, const int num_axis_dimensions, bool keep_dims, + int* temp_index, int* resolved_axis, U* temp_sum) { + // Reset output data. + size_t num_outputs = 1; + for (int idx = 0; idx < output_num_dims; ++idx) { + size_t current = static_cast(output_dims[idx]); + // Overflow prevention. + if (num_outputs > std::numeric_limits::max() / current) { + return false; + } + num_outputs *= current; + } + for (size_t idx = 0; idx < num_outputs; ++idx) { + output_data[idx] = T(); + temp_sum[idx] = U(); + } + + // Resolve axis. + int num_resolved_axis = 0; + if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis, + &num_resolved_axis)) { + return false; + } + + if (!ReduceSumImpl(input_data, input_dims, output_dims, input_num_dims, + output_num_dims, resolved_axis, num_resolved_axis, + temp_index, temp_sum)) { + return false; + } + + // Calculate mean by dividing output_data by num of aggregated element. + U num_elements_in_axis = 1; + for (int idx = 0; idx < num_resolved_axis; ++idx) { + size_t current = static_cast(input_dims[resolved_axis[idx]]); + // Overflow prevention. + if (current > (std::numeric_limits::max() / num_elements_in_axis)) { + return false; + } + num_elements_in_axis *= current; + } + + if (num_elements_in_axis > 0) { + const float scale = input_scale / output_scale; + const float bias = -input_zero_point * scale; + for (size_t idx = 0; idx < num_outputs; ++idx) { + float float_mean = static_cast(temp_sum[idx]) / + static_cast(num_elements_in_axis); + + // Convert to float value. 
+ output_data[idx] = + static_cast(round(float_mean * scale + bias)) + output_zero_point; + } + } + return true; +} + template void Minimum(const RuntimeShape& input1_shape, const T* input1_data, const T* input2_data, const RuntimeShape& output_shape, diff --git a/tensorflow/contrib/lite/kernels/reduce.cc b/tensorflow/contrib/lite/kernels/reduce.cc index 29374a0c27..839b48cb83 100644 --- a/tensorflow/contrib/lite/kernels/reduce.cc +++ b/tensorflow/contrib/lite/kernels/reduce.cc @@ -256,11 +256,27 @@ TfLiteStatus EvalMean(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE(context, TF_LITE_MEAN(reference_ops, int64_t, int64_t)); break; case kTfLiteUInt8: - TF_LITE_ENSURE_EQ(context, op_context.input->params.scale, - op_context.output->params.scale); - TF_LITE_ENSURE_EQ(context, op_context.input->params.zero_point, - op_context.output->params.zero_point); - TF_LITE_ENSURE(context, TF_LITE_MEAN(reference_ops, uint8_t, int)); + if (op_context.input->params.zero_point == + op_context.output->params.zero_point && + op_context.input->params.scale == op_context.output->params.scale) { + TF_LITE_ENSURE(context, TF_LITE_MEAN(reference_ops, uint8_t, int)); + } else { + TF_LITE_ENSURE( + context, + reference_ops::Mean<>( + GetTensorData(op_context.input), + op_context.input->params.zero_point, + op_context.input->params.scale, op_context.input->dims->data, + op_context.input->dims->size, + GetTensorData(op_context.output), + op_context.output->params.zero_point, + op_context.output->params.scale, + op_context.output->dims->data, op_context.output->dims->size, + GetTensorData(op_context.axis), num_axis, + op_context.params->keep_dims, GetTensorData(temp_index), + GetTensorData(resolved_axis), + GetTensorData(temp_sum))); + } break; default: return kTfLiteError; diff --git a/tensorflow/contrib/lite/kernels/reduce_test.cc b/tensorflow/contrib/lite/kernels/reduce_test.cc index d9aca64356..69a07f76b6 100644 --- a/tensorflow/contrib/lite/kernels/reduce_test.cc +++ 
b/tensorflow/contrib/lite/kernels/reduce_test.cc @@ -338,6 +338,33 @@ TEST(DynamicUint8MeanOpTest, KeepDims) { ElementsAreArray(ArrayFloatNear({9.2815, 0.3695}, kQuantizedTolerance))); } +TEST(DynamicUint8MeanOpTest, QuantizedScalar) { + float kQuantizedTolerance = GetTolerance(-10.0, 12.0); + std::vector data = {0.643}; + MeanOpDynamicModel m({TensorType_UINT8, {}, 0.0, 1.0}, + {TensorType_UINT8, {}, -10.0, 12.0}, + {TensorType_INT32, {1}}, true); + std::vector axis = {0}; + m.QuantizeAndPopulate(m.Input(), data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), IsEmpty()); + EXPECT_THAT(m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear({0.643}, kQuantizedTolerance))); +} + +TEST(ConstUint8MeanOpTest, QuantizedKeepDims) { + float kQuantizedTolerance = GetTolerance(-5.0, 5.0); + std::vector data = {0.4, 0.2, 0.3, 0.4, 0.5, 0.6}; + MeanOpConstModel m({TensorType_UINT8, {3, 2}, 0.0, 1.0}, + {TensorType_UINT8, {3}, -5.0, 5.0}, {1}, {1}, true); + m.QuantizeAndPopulate(m.Input(), data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1})); + EXPECT_THAT( + m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear({0.3, 0.35, 0.55}, kQuantizedTolerance))); +} + // Tests for reduce_sum TEST(ConstFloatSumOpTest, NotKeepDims) { -- GitLab From c2b86ca5e3dde4d79ec8d1e3b993c73006a144c1 Mon Sep 17 00:00:00 2001 From: Yunxing Dai Date: Fri, 24 Aug 2018 14:21:15 -0700 Subject: [PATCH 108/598] We don't have the need to reorder parameters. 
PiperOrigin-RevId: 210155953 --- tensorflow/compiler/tf2xla/xla_compiler.cc | 9 +--- tensorflow/compiler/tf2xla/xla_compiler.h | 6 +-- .../compiler/tf2xla/xla_compiler_test.cc | 48 +++++++++++++++++++ 3 files changed, 53 insertions(+), 10 deletions(-) diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc index 8e7aad2686..eabfc6b6e2 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler.cc @@ -465,8 +465,6 @@ Status XlaCompiler::BuildArguments( // XLA computation as runtime parameters. input_mapping->clear(); input_mapping->reserve(args.size()); - std::vector resources; - resources.reserve(args.size()); // Fills in constant arguments, and computes non-constant argument order. for (std::vector::size_type i = 0; i < args.size(); @@ -485,8 +483,9 @@ Status XlaCompiler::BuildArguments( /*tensor_array_gradients=*/arg.tensor_array_gradients, &resource)); arg_expression.set_resource(resource); if (arg.initialized) { - resources.push_back(i); + input_mapping->push_back(i); } + break; case XlaCompiler::Argument::kParameter: { input_mapping->push_back(i); @@ -500,10 +499,6 @@ Status XlaCompiler::BuildArguments( } } - // Append parameters containing variable values after the other runtime - // parameters. - input_mapping->insert(input_mapping->end(), resources.begin(), - resources.end()); if (input_mapping->empty()) { return Status::OK(); } diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h index fde47dbdec..da1ae02f32 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.h +++ b/tensorflow/compiler/tf2xla/xla_compiler.h @@ -212,9 +212,9 @@ class XlaCompiler { struct CompilationResult { // Vector that maps from the parameters of the XLA computation to their - // original argument positions. 
To handle compile-time constant inputs and - // resources, the parameters to the XLA computation may be a subset of the - // original arguments, and are not necessarily in the same order.) + // original argument positions. To handle compile-time constant inputs, the + // parameters to the XLA computation may be a subset of the original + // arguments. The relative ordering of parameters are maintained. std::vector input_mapping; // Input shapes of the computation. If we are flattening inputs, these are diff --git a/tensorflow/compiler/tf2xla/xla_compiler_test.cc b/tensorflow/compiler/tf2xla/xla_compiler_test.cc index 6e5a0198f6..740f6dc25c 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler_test.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler_test.cc @@ -280,6 +280,54 @@ TEST_F(XlaCompilerTest, OutOfOrderGraph) { EXPECT_TRUE(xla::LiteralTestUtil::Equal(*param0_literal, *actual_literal)); } +// Tests that the compiler doesn't reorder the parameters. +TEST_F(XlaCompilerTest, MixedOrderArguments) { + for (bool swap_order : {false, true}) { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto var = + ops::_Arg(scope.WithOpName("V"), DT_RESOURCE, swap_order ? 0 : 1); + auto a = ops::_Arg(scope.WithOpName("A"), DT_INT32, swap_order ? 1 : 0); + // Adds an identity op around the resource to make sure identity ops + // propagate resources correctly. + auto identity = ops::Identity(scope.WithOpName("VIdentity"), var); + auto write = ops::AssignAddVariableOp(scope, identity, a); + auto read = ops::ReadVariableOp( + scope.WithControlDependencies(std::vector{write}), var, + DT_INT32); + auto read_plus_one = ops::Add(scope, read, ops::Const(scope, 1)); + auto d = ops::_Retval(scope.WithOpName("D"), read_plus_one, 0); + std::unique_ptr graph(new Graph(OpRegistry::Global())); + TF_ASSERT_OK(scope.ToGraph(graph.get())); + + // Builds a description of the arguments. 
+ std::vector args(2); + args[0].kind = XlaCompiler::Argument::kParameter; + args[0].type = DT_INT32; + args[0].shape = TensorShape({2}); + args[1].kind = XlaCompiler::Argument::kResource; + args[1].resource_kind = XlaResource::kVariable; + args[1].initialized = true; + args[1].type = DT_INT32; + args[1].shape = TensorShape({2}); + + if (swap_order) { + // Even after swapping arguments, the compiler should maintain the new + // ordering of parameters. + std::swap(args[0], args[1]); + } + // Compiles the graph. + XlaCompiler compiler(DefaultOptions()); + + XlaCompiler::CompileOptions compile_options; + compile_options.always_return_tuple = false; + XlaCompiler::CompilationResult result; + TF_ASSERT_OK(compiler.CompileGraph(compile_options, "add", std::move(graph), + args, &result)); + + EXPECT_THAT(result.input_mapping, ::testing::ElementsAre(0, 1)); + } +} + TEST_F(XlaCompilerTest, HasSaneErrorOnNonCompileTimeConstantInputToReshape) { // Builds a graph that adds reshapes a tensor, but with the shape not // statically known. -- GitLab From 9252450e4566c913ee6e24f359da004c34f0e3ce Mon Sep 17 00:00:00 2001 From: Shivani Agrawal Date: Fri, 24 Aug 2018 14:39:00 -0700 Subject: [PATCH 109/598] [tf.data] Removes obsolete comment. PiperOrigin-RevId: 210159082 --- tensorflow/core/kernels/data/parse_example_dataset_op.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/core/kernels/data/parse_example_dataset_op.cc b/tensorflow/core/kernels/data/parse_example_dataset_op.cc index 6a0522e4f3..9057800d94 100644 --- a/tensorflow/core/kernels/data/parse_example_dataset_op.cc +++ b/tensorflow/core/kernels/data/parse_example_dataset_op.cc @@ -290,7 +290,6 @@ class ParseExampleDatasetOp : public UnaryDatasetOpKernel { return "ParseExampleDatasetOp::Dataset"; } - // TODO(b/111553342): Add/Check support for checkpointing. 
protected: Status AsGraphDefInternal(SerializationContext* ctx, DatasetGraphDefBuilder* b, -- GitLab From 67ce3fa0b49f65d535fac4cda73edc83cda91b25 Mon Sep 17 00:00:00 2001 From: Gunhan Gulsoy Date: Fri, 24 Aug 2018 14:46:22 -0700 Subject: [PATCH 110/598] Drop contrib tests in windows to speed up windows bazel build. PiperOrigin-RevId: 210160250 --- tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh | 3 +-- tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh index 5d0a8efc69..177ef390db 100644 --- a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh +++ b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh @@ -57,8 +57,7 @@ PY_TEST_DIR="py_test_dir" SKIP_TEST=0 RELEASE_BUILD=0 -TEST_TARGET="//${PY_TEST_DIR}/tensorflow/python/... \ - //${PY_TEST_DIR}/tensorflow/contrib/... " +TEST_TARGET="//${PY_TEST_DIR}/tensorflow/python/..." # --skip_test Skip running tests # --enable_remote_cache Add options to enable remote cache for build and test diff --git a/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh index f31b0a64e0..28d5565b98 100644 --- a/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh +++ b/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh @@ -57,8 +57,7 @@ PY_TEST_DIR="py_test_dir" SKIP_TEST=0 RELEASE_BUILD=0 -TEST_TARGET="//${PY_TEST_DIR}/tensorflow/python/... \ - //${PY_TEST_DIR}/tensorflow/contrib/... " +TEST_TARGET="//${PY_TEST_DIR}/tensorflow/python/..." 
# --skip_test Skip running tests # --enable_remote_cache Add options to enable remote cache for build and test -- GitLab From 28e4880df833f443711ced08d8bc3c64f87f4b44 Mon Sep 17 00:00:00 2001 From: Jacques Pienaar Date: Fri, 24 Aug 2018 15:01:19 -0700 Subject: [PATCH 111/598] Simplify executor vlog for not dead nodes. Previously a "not dead" node's execution would be reported with "is dead: false" which resulted in folks seeing "is dead" first. Instead don't print anything if the node isn't dead, else print "is dead". PiperOrigin-RevId: 210162695 --- tensorflow/core/common_runtime/executor.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc index 63ed860b9f..02193dae5a 100644 --- a/tensorflow/core/common_runtime/executor.cc +++ b/tensorflow/core/common_runtime/executor.cc @@ -1618,7 +1618,7 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) { if (vlog_) { VLOG(1) << "Process node: " << id << " step " << params.step_id << " " - << SummarizeNode(*node) << " is dead: " << tagged_node.is_dead + << SummarizeNode(*node) << (tagged_node.is_dead ? " is dead" : "") << " device: " << device->name(); } @@ -1680,7 +1680,7 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) { VLOG(2) << "Async kernel done: " << state->item->node->id() << " step " << step_id_ << " " << SummarizeNode(*state->item->node) - << " is dead: " << state->tagged_node.is_dead + << (state->tagged_node.is_dead ? " is dead" : "") << " device: " << device->name(); } @@ -1734,7 +1734,7 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) { if (vlog_) { VLOG(2) << "Synchronous kernel done: " << id << " step " << params.step_id << " " << SummarizeNode(*node) - << " is dead: " << tagged_node.is_dead + << (tagged_node.is_dead ? 
" is dead: " : "") << " device: " << device->name(); } -- GitLab From d8dc2a1cd23db7a6b15083f14042bab272756ae2 Mon Sep 17 00:00:00 2001 From: Yuefeng Zhou Date: Fri, 24 Aug 2018 15:19:16 -0700 Subject: [PATCH 112/598] Reduce learning_rate multi-worker MirroredStrategy test. PiperOrigin-RevId: 210165808 --- .../distribute/python/mirrored_strategy_multigpu_test.py | 5 +++-- tensorflow/contrib/distribute/python/strategy_test_lib.py | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py index ac2697958d..a12ff662db 100644 --- a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py +++ b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py @@ -1271,7 +1271,8 @@ class MultiWorkerMirroredStrategyTest( return strategy def testMinimizeLossGraph(self): - self._test_minimize_loss_graph(self._get_distribution_strategy()) + self._test_minimize_loss_graph(self._get_distribution_strategy(), + learning_rate=0.05) class MultiWorkerMirroredStrategyTestWithChief( @@ -1288,7 +1289,7 @@ class MultiWorkerMirroredStrategyTestWithChief( def testMinimizeLossGraph(self): strategy = mirrored_strategy.MirroredStrategy(num_gpus=context.num_gpus()) strategy.configure(cluster_spec=self._cluster_spec) - self._test_minimize_loss_graph(strategy) + self._test_minimize_loss_graph(strategy, learning_rate=0.05) if __name__ == "__main__": diff --git a/tensorflow/contrib/distribute/python/strategy_test_lib.py b/tensorflow/contrib/distribute/python/strategy_test_lib.py index 371b97ba96..6ee26e19ac 100644 --- a/tensorflow/contrib/distribute/python/strategy_test_lib.py +++ b/tensorflow/contrib/distribute/python/strategy_test_lib.py @@ -130,7 +130,8 @@ class DistributionTestBase(test.TestCase): # Error should go down self.assertLess(error_after, error_before) - def _test_minimize_loss_graph(self, d, 
soft_placement=False): + def _test_minimize_loss_graph(self, d, soft_placement=False, + learning_rate=0.2): config = config_pb2.ConfigProto() config.allow_soft_placement = soft_placement config.gpu_options.per_process_gpu_memory_fraction = 0.3 @@ -150,7 +151,7 @@ class DistributionTestBase(test.TestCase): grad_fn = backprop.implicit_grad(loss) def update(v, g): - return v.assign_sub(0.2 * g) + return v.assign_sub(learning_rate * g) one = d.broadcast(constant_op.constant([[1.]])) -- GitLab From 40d2e84a74369f88bde3440e75d4ceed11de93f7 Mon Sep 17 00:00:00 2001 From: Yuefeng Zhou Date: Fri, 24 Aug 2018 15:37:53 -0700 Subject: [PATCH 113/598] Fix a bug in collective_all_reduce_strategy_test: unwrap PerDevice value before session run. PiperOrigin-RevId: 210168939 --- .../python/collective_all_reduce_strategy_test.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py index 0d966d0e90..e284969b1a 100644 --- a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py +++ b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py @@ -164,14 +164,18 @@ class CollectiveAllReduceStrategyTestBase( distribution.reduce( variable_scope.VariableAggregation.MEAN, x, destinations='/cpu:0'))[0] + x = distribution.unwrap(x)[0] sess.run( variables.global_variables_initializer(), options=self._run_options) x_value, reduced_x_value = sess.run( [x, reduced_x], options=self._run_options) - self.assertTrue(np.array_equal(x_value, reduced_x_value)) - return np.array_equal(x_value, reduced_x_value) + self.assertTrue( + np.allclose(x_value, reduced_x_value, atol=1e-5), + msg=('x_value = %r, reduced_x_value = %r' % (x_value, + reduced_x_value))) + return np.allclose(x_value, reduced_x_value, atol=1e-5) class DistributedCollectiveAllReduceStrategyTest( -- GitLab From 
6d7261ef22835dc51fb157bdb1db349fd26d8f86 Mon Sep 17 00:00:00 2001 From: Shashi Shekhar Date: Fri, 24 Aug 2018 16:07:15 -0700 Subject: [PATCH 114/598] Add documentation for the accuracy tool. - Adds some documentation for the accuracy tool. - Some build fixes to worker with older version of NDK. PiperOrigin-RevId: 210173534 --- tensorflow/contrib/lite/tools/accuracy/BUILD | 183 +++--------------- .../contrib/lite/tools/accuracy/README.md | 40 ++++ .../contrib/lite/tools/accuracy/ilsvrc/BUILD | 171 ++++++++++++++++ .../lite/tools/accuracy/ilsvrc/README.md | 138 +++++++++++++ .../{ => ilsvrc}/imagenet_accuracy_eval.cc | 8 +- .../{ => ilsvrc}/imagenet_model_evaluator.cc | 6 +- .../{ => ilsvrc}/imagenet_model_evaluator.h | 2 +- .../{ => ilsvrc}/imagenet_topk_eval.cc | 2 +- .../{ => ilsvrc}/imagenet_topk_eval.h | 0 .../{ => ilsvrc}/imagenet_topk_eval_test.cc | 6 +- .../{ => ilsvrc}/inception_preprocessing.cc | 2 +- .../{ => ilsvrc}/inception_preprocessing.h | 0 .../inception_preprocessing_test.cc | 2 +- .../{ => ilsvrc}/testdata/grace_hopper.jpg | Bin tensorflow/core/BUILD | 2 + 15 files changed, 398 insertions(+), 164 deletions(-) create mode 100644 tensorflow/contrib/lite/tools/accuracy/README.md create mode 100644 tensorflow/contrib/lite/tools/accuracy/ilsvrc/BUILD create mode 100644 tensorflow/contrib/lite/tools/accuracy/ilsvrc/README.md rename tensorflow/contrib/lite/tools/accuracy/{ => ilsvrc}/imagenet_accuracy_eval.cc (94%) rename tensorflow/contrib/lite/tools/accuracy/{ => ilsvrc}/imagenet_model_evaluator.cc (96%) rename tensorflow/contrib/lite/tools/accuracy/{ => ilsvrc}/imagenet_model_evaluator.h (97%) rename tensorflow/contrib/lite/tools/accuracy/{ => ilsvrc}/imagenet_topk_eval.cc (97%) rename tensorflow/contrib/lite/tools/accuracy/{ => ilsvrc}/imagenet_topk_eval.h (100%) rename tensorflow/contrib/lite/tools/accuracy/{ => ilsvrc}/imagenet_topk_eval_test.cc (96%) rename tensorflow/contrib/lite/tools/accuracy/{ => ilsvrc}/inception_preprocessing.cc (97%) rename 
tensorflow/contrib/lite/tools/accuracy/{ => ilsvrc}/inception_preprocessing.h (100%) rename tensorflow/contrib/lite/tools/accuracy/{ => ilsvrc}/inception_preprocessing_test.cc (98%) rename tensorflow/contrib/lite/tools/accuracy/{ => ilsvrc}/testdata/grace_hopper.jpg (100%) diff --git a/tensorflow/contrib/lite/tools/accuracy/BUILD b/tensorflow/contrib/lite/tools/accuracy/BUILD index db09de2909..21941f5c8b 100644 --- a/tensorflow/contrib/lite/tools/accuracy/BUILD +++ b/tensorflow/contrib/lite/tools/accuracy/BUILD @@ -5,67 +5,21 @@ package(default_visibility = [ licenses(["notice"]) # Apache 2.0 load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test") +load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts", "tflite_linkopts") -cc_library( - name = "inception_preprocessing", - srcs = ["inception_preprocessing.cc"], - hdrs = ["inception_preprocessing.h"], - copts = [ - "-D__ANDROID_TYPES_FULL__", - "-DSUPPORT_SELECTIVE_REGISTRATION", +common_linkopts = tflite_linkopts() + select({ + "//conditions:default": [], + "//tensorflow:android": [ + "-pie", + "-llog", ], - deps = [ - ":stage", - "//tensorflow/cc:cc_ops", - "//tensorflow/cc:scope", - ] + select( - { - "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib", - "//tensorflow/core/kernels:android_tensorflow_image_op", - ], - "//conditions:default": [ - "//tensorflow/core:tensorflow", - "//tensorflow/core:protos_all_cc", - "//tensorflow/core:core_cpu", - "//tensorflow/core:framework", - "//tensorflow/core:lib", - "//tensorflow/core:ops", - ], - }, - ), -) - -tf_cc_test( - name = "inception_preprocessing_test", - srcs = ["inception_preprocessing_test.cc"], - args = [ - "--test_image=$(location :testdata/grace_hopper.jpg)", - ], - data = [":testdata/grace_hopper.jpg"], - deps = [ - ":inception_preprocessing", - ":android_required_build_flags", - "@com_google_googletest//:gtest", - ] + select( - { - "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib", - 
"//tensorflow/core:android_test_lib", - ], - "//conditions:default": [ - "//tensorflow/core:core_cpu", - "//tensorflow/core:framework_internal", - "//tensorflow/core:lib", - ], - }, - ), -) +}) cc_library( name = "utils", srcs = ["utils.cc"], hdrs = ["utils.h"], + copts = tflite_copts(), deps = [ "//tensorflow/contrib/lite:framework", "//tensorflow/contrib/lite/kernels:builtin_ops", @@ -88,6 +42,8 @@ tf_cc_test( "--test_model_file=$(location //tensorflow/contrib/lite:testdata/multi_add.bin)", ], data = ["//tensorflow/contrib/lite:testdata/multi_add.bin"], + linkopts = common_linkopts, + linkstatic = 1, deps = [ ":utils", "@com_google_googletest//:gtest", @@ -95,7 +51,7 @@ tf_cc_test( { "//tensorflow:android": [ "//tensorflow/core:android_tensorflow_lib", - "//tensorflow/core:android_test_lib", + "//tensorflow/core:android_tensorflow_test_lib", ], "//conditions:default": [ "//tensorflow/core:framework_internal", @@ -108,6 +64,7 @@ tf_cc_test( cc_library( name = "run_tflite_model_op", srcs = ["run_tflite_model_op.cc"], + copts = tflite_copts(), deps = [ ":utils", "//tensorflow/contrib/lite:framework", @@ -133,6 +90,7 @@ cc_library( cc_library( name = "android_required_build_flags", srcs = ["android_required_build_flags.cc"], + copts = tflite_copts(), ) tf_cc_test( @@ -142,6 +100,8 @@ tf_cc_test( "--test_model_file=$(location //tensorflow/contrib/lite:testdata/multi_add.bin)", ], data = ["//tensorflow/contrib/lite:testdata/multi_add.bin"], + linkopts = common_linkopts, + linkstatic = 1, deps = [ "//tensorflow/cc:cc_ops", "//tensorflow/cc:scope", @@ -152,7 +112,7 @@ tf_cc_test( { "//tensorflow:android": [ "//tensorflow/core:android_tensorflow_lib", - "//tensorflow/core:android_test_lib", + "//tensorflow/core:android_tensorflow_test_lib", ], "//conditions:default": [ "//tensorflow/core:core_cpu", @@ -170,6 +130,7 @@ tf_cc_test( cc_library( name = "stage", hdrs = ["stage.h"], + copts = tflite_copts(), deps = [ "//tensorflow/cc:scope", ], @@ -189,6 +150,8 @@ cc_library( 
tf_cc_test( name = "file_reader_stage_test", srcs = ["file_reader_stage_test.cc"], + linkopts = common_linkopts, + linkstatic = 1, deps = [ ":file_reader_stage", "@com_google_googletest//:gtest", @@ -197,7 +160,7 @@ tf_cc_test( "//tensorflow:android": [ "//tensorflow/core:android_tensorflow_lib", "//tensorflow/core/kernels:android_whole_file_read_ops", - "//tensorflow/core:android_test_lib", + "//tensorflow/core:android_tensorflow_test_lib", ], "//conditions:default": [ "//tensorflow/core:core_cpu", @@ -211,6 +174,7 @@ cc_library( name = "run_tflite_model_stage", srcs = ["run_tflite_model_stage.cc"], hdrs = ["run_tflite_model_stage.h"], + copts = tflite_copts(), deps = [ ":run_tflite_model_op", ":stage", @@ -222,6 +186,7 @@ cc_library( cc_library( name = "accuracy_eval_stage", hdrs = ["accuracy_eval_stage.h"], + copts = tflite_copts(), deps = [ ] + select( { @@ -235,47 +200,11 @@ cc_library( ), ) -cc_library( - name = "imagenet_topk_eval", - srcs = ["imagenet_topk_eval.cc"], - hdrs = ["imagenet_topk_eval.h"], - deps = [ - ":accuracy_eval_stage", - ] + select( - { - "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib", - ], - "//conditions:default": [ - "//tensorflow/core:framework", - ], - }, - ), -) - -tf_cc_test( - name = "imagenet_topk_eval_test", - srcs = ["imagenet_topk_eval_test.cc"], - deps = [ - ":imagenet_topk_eval", - "@com_google_googletest//:gtest", - ] + select( - { - "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib", - "//tensorflow/core:android_test_lib", - ], - "//conditions:default": [ - "//tensorflow/core:framework", - ], - }, - ), -) - cc_library( name = "eval_pipeline", srcs = ["eval_pipeline.cc"], hdrs = ["eval_pipeline.h"], + copts = tflite_copts(), deps = [ ":accuracy_eval_stage", ":stage", @@ -295,6 +224,8 @@ cc_library( tf_cc_test( name = "eval_pipeline_test", srcs = ["eval_pipeline_test.cc"], + linkopts = common_linkopts, + linkstatic = 1, deps = [ ":eval_pipeline", "//tensorflow/cc:cc_ops", @@ -303,7 
+234,7 @@ tf_cc_test( { "//tensorflow:android": [ "//tensorflow/core:android_tensorflow_lib", - "//tensorflow/core:android_test_lib", + "//tensorflow/core:android_tensorflow_test_lib", ], "//conditions:default": [ "//tensorflow/core:framework", @@ -319,6 +250,7 @@ cc_library( name = "eval_pipeline_builder", srcs = ["eval_pipeline_builder.cc"], hdrs = ["eval_pipeline_builder.h"], + copts = tflite_copts(), deps = [ ":eval_pipeline", ":accuracy_eval_stage", @@ -343,6 +275,8 @@ cc_library( tf_cc_test( name = "eval_pipeline_builder_test", srcs = ["eval_pipeline_builder_test.cc"], + linkopts = common_linkopts, + linkstatic = 1, deps = [ ":eval_pipeline_builder", "//tensorflow/cc:cc_ops", @@ -351,7 +285,7 @@ tf_cc_test( { "//tensorflow:android": [ "//tensorflow/core:android_tensorflow_lib", - "//tensorflow/core:android_test_lib", + "//tensorflow/core:android_tensorflow_test_lib", ], "//conditions:default": [ "//tensorflow/core:framework", @@ -366,6 +300,7 @@ tf_cc_test( cc_library( name = "csv_writer", hdrs = ["csv_writer.h"], + copts = tflite_copts(), deps = select( { "//tensorflow:android": [ @@ -377,59 +312,3 @@ cc_library( }, ), ) - -cc_library( - name = "imagenet_model_evaluator", - srcs = ["imagenet_model_evaluator.cc"], - hdrs = ["imagenet_model_evaluator.h"], - deps = [ - ":android_required_build_flags", - ":eval_pipeline", - ":eval_pipeline_builder", - ":file_reader_stage", - ":imagenet_topk_eval", - ":inception_preprocessing", - ":run_tflite_model_stage", - ":utils", - "@com_google_absl//absl/memory", - "//tensorflow/cc:cc_ops", - "//tensorflow/cc:scope", - ] + select( - { - "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib", - "//tensorflow/core/kernels:android_whole_file_read_ops", - "//tensorflow/core/kernels:android_tensorflow_image_op", - ], - "//conditions:default": [ - "//tensorflow/core:tensorflow", - "//tensorflow/core:framework_internal", - "//tensorflow/core:framework", - "//tensorflow/core:lib", - "//tensorflow/core:core_cpu", - 
], - }, - ), -) - -tf_cc_binary( - name = "imagenet_accuracy_eval", - srcs = ["imagenet_accuracy_eval.cc"], - deps = [ - ":android_required_build_flags", - ":csv_writer", - ":imagenet_model_evaluator", - ":imagenet_topk_eval", - "@com_google_absl//absl/memory", - ] + select( - { - "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib", - ], - "//conditions:default": [ - "//tensorflow/core:lib", - "//tensorflow/core:framework_internal", - ], - }, - ), -) diff --git a/tensorflow/contrib/lite/tools/accuracy/README.md b/tensorflow/contrib/lite/tools/accuracy/README.md new file mode 100644 index 0000000000..ad28fc3c70 --- /dev/null +++ b/tensorflow/contrib/lite/tools/accuracy/README.md @@ -0,0 +1,40 @@ +## TFLite accuracy library. + +This library provides evaluation pipelines that can be used to evaluate +accuracy and other metrics of a model. The resulting binary can be run on +a desktop or on a mobile device. + +## Usage +The tool provides an evaluation pipeline with different stages. Each +stage outputs a Tensorflow graph. +A sample usage is shown below. + +```C++ +// First build the pipeline. +EvalPipelineBuilder builder; +std::unique_ptr eval_pipeline; +auto status = builder.WithInput("pipeline_input", DT_FLOAT) + .WithInputStage(&input_stage) + .WithRunModelStage(&run_model_stage) + .WithPreprocessingStage(&preprocess_stage) + .WithAccuracyEval(&eval) + .Build(scope, &eval_pipeline); +TF_CHECK_OK(status); + +// Now run the pipeline with inputs and outputs. +std::unique_ptr session(NewSession(SessionOptions())); +TF_CHECK_OK(eval_pipeline.AttachSession(std::move(session))); +Tensor input = ... read input for the model ... +Tensor ground_truth = ... read ground truth for the model ... +TF_CHECK_OK(eval_pipeline.Run(input1, ground_truth1)); +``` +For further examples, check the usage in [imagenet accuracy evaluation binary] +(ilsvrc/imagenet_accuracy_eval.cc) + +## Measuring accuracy of published models. 
+ +### ILSVRC (Imagenet Large Scale Visual Recognition Challenge) classification task +For measuring accuracy for [ILSVRC 2012 image classification task] +(http://www.image-net.org/challenges/LSVRC/2012/), the binary can be built +using these +[instructions](ilsvrc/) diff --git a/tensorflow/contrib/lite/tools/accuracy/ilsvrc/BUILD b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/BUILD new file mode 100644 index 0000000000..db4b688a45 --- /dev/null +++ b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/BUILD @@ -0,0 +1,171 @@ +package(default_visibility = [ + "//visibility:public", +]) + +licenses(["notice"]) # Apache 2.0 + +load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test") +load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts", "tflite_linkopts") + +common_linkopts = tflite_linkopts() + select({ + "//conditions:default": [], + "//tensorflow:android": [ + "-pie", + "-llog", + ], +}) + +cc_library( + name = "inception_preprocessing", + srcs = ["inception_preprocessing.cc"], + hdrs = ["inception_preprocessing.h"], + copts = tflite_copts(), + deps = [ + "//tensorflow/contrib/lite/tools/accuracy:android_required_build_flags", + "//tensorflow/contrib/lite/tools/accuracy:stage", + "//tensorflow/cc:cc_ops", + "//tensorflow/cc:scope", + ] + select( + { + "//tensorflow:android": [ + "//tensorflow/core:android_tensorflow_lib", + "//tensorflow/core/kernels:android_tensorflow_image_op", + ], + "//conditions:default": [ + "//tensorflow/core:tensorflow", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:core_cpu", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:ops", + ], + }, + ), +) + +tf_cc_test( + name = "inception_preprocessing_test", + srcs = ["inception_preprocessing_test.cc"], + args = [ + "--test_image=$(location :testdata/grace_hopper.jpg)", + ], + data = [":testdata/grace_hopper.jpg"], + linkopts = common_linkopts, + linkstatic = 1, + deps = [ + ":inception_preprocessing", +
"//tensorflow/contrib/lite/tools/accuracy:android_required_build_flags", + "@com_google_googletest//:gtest", + ] + select( + { + "//tensorflow:android": [ + "//tensorflow/core:android_tensorflow_lib", + "//tensorflow/core:android_tensorflow_test_lib", + ], + "//conditions:default": [ + "//tensorflow/core:core_cpu", + "//tensorflow/core:framework_internal", + "//tensorflow/core:lib", + ], + }, + ), +) + +cc_library( + name = "imagenet_topk_eval", + srcs = ["imagenet_topk_eval.cc"], + hdrs = ["imagenet_topk_eval.h"], + copts = tflite_copts(), + deps = [ + "//tensorflow/contrib/lite/tools/accuracy:accuracy_eval_stage", + ] + select( + { + "//tensorflow:android": [ + "//tensorflow/core:android_tensorflow_lib", + ], + "//conditions:default": [ + "//tensorflow/core:framework", + ], + }, + ), +) + +tf_cc_test( + name = "imagenet_topk_eval_test", + srcs = ["imagenet_topk_eval_test.cc"], + linkopts = common_linkopts, + linkstatic = 1, + deps = [ + ":imagenet_topk_eval", + "@com_google_googletest//:gtest", + ] + select( + { + "//tensorflow:android": [ + "//tensorflow/core:android_tensorflow_lib", + "//tensorflow/core:android_tensorflow_test_lib", + ], + "//conditions:default": [ + "//tensorflow/core:framework", + ], + }, + ), +) + +cc_library( + name = "imagenet_model_evaluator", + srcs = ["imagenet_model_evaluator.cc"], + hdrs = ["imagenet_model_evaluator.h"], + copts = tflite_copts(), + deps = [ + ":imagenet_topk_eval", + ":inception_preprocessing", + "//tensorflow/contrib/lite/tools/accuracy:android_required_build_flags", + "//tensorflow/contrib/lite/tools/accuracy:eval_pipeline", + "//tensorflow/contrib/lite/tools/accuracy:eval_pipeline_builder", + "//tensorflow/contrib/lite/tools/accuracy:file_reader_stage", + "//tensorflow/contrib/lite/tools/accuracy:run_tflite_model_stage", + "//tensorflow/contrib/lite/tools/accuracy:utils", + "@com_google_absl//absl/memory", + "//tensorflow/cc:cc_ops", + "//tensorflow/cc:scope", + ] + select( + { + "//tensorflow:android": [ + 
"//tensorflow/core:android_tensorflow_lib", + "//tensorflow/core/kernels:android_whole_file_read_ops", + "//tensorflow/core/kernels:android_tensorflow_image_op", + ], + "//conditions:default": [ + "//tensorflow/core:tensorflow", + "//tensorflow/core:framework_internal", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:core_cpu", + ], + }, + ), +) + +tf_cc_binary( + name = "imagenet_accuracy_eval", + srcs = ["imagenet_accuracy_eval.cc"], + copts = tflite_copts(), + linkopts = common_linkopts, + deps = [ + ":imagenet_model_evaluator", + ":imagenet_topk_eval", + "@com_google_absl//absl/memory", + "//tensorflow/contrib/lite/tools/accuracy:android_required_build_flags", + "//tensorflow/contrib/lite/tools/accuracy:csv_writer", + ] + select( + { + "//tensorflow:android": [ + "//tensorflow/core:android_tensorflow_lib", + ], + "//conditions:default": [ + "//tensorflow/core:lib", + "//tensorflow/core:framework_internal", + ], + }, + ), +) diff --git a/tensorflow/contrib/lite/tools/accuracy/ilsvrc/README.md b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/README.md new file mode 100644 index 0000000000..3c6a0d85b3 --- /dev/null +++ b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/README.md @@ -0,0 +1,138 @@ +## Accuracy evaluation for ILSVRC 2012 (Imagenet Large Scale Visual Recognition Challenge) image classification task + +This binary can evaluate the accuracy of TFLite models trained for the [ILSVRC 2012 image classification task] +(http://www.image-net.org/challenges/LSVRC/2012/). +The binary takes the path to validation images and labels as inputs. It outputs the accuracy after running the TFLite model on the validation sets. + +To run the binary download the ILSVRC 2012 devkit [see instructions](#downloading-ilsvrc) and run the [`generate_validation_ground_truth` script](#ground-truth-label-generation) to generate the ground truth labels. 
+ +## Parameters +The binary takes the following parameters: + +* `model_file` : `string` \ + Path to the TFLite model file. + +* `ground_truth_images_path`: `string` \ + The path to the directory containing ground truth images. + +* `ground_truth_labels`: `string` \ + Path to ground truth labels file. This file should contain the same number of labels as the number of images in the ground truth directory. The labels are assumed to be in the + same order as the sorted filenames of the images. See [ground truth label generation](#ground-truth-label-generation) + section for more information about how to generate labels for images. + +* `model_output_labels`: `string` \ + Path to the file containing labels, that is used to interpret the output of + the model. E.g. in case of mobilenets, this is the path to + `mobilenet_labels.txt` where each label is in the same order as the output + 1001 dimension tensor. + +* `output_file_path`: `string` \ + This is the path to the output file. The output is a CSV file that has top-10 accuracies in each row. Each line of the output file is the cumulative accuracy after processing images in sorted order. So the first line is the accuracy after processing the first image, and the second line is the accuracy after processing the first two images. The last line of the file is the accuracy after processing the entire validation set. + +and the following optional parameters: +* `num_images`: `int` (default=0) \ + The number of images to process. If 0, all images in the directory are processed; otherwise only `num_images` will be processed. + +## Downloading ILSVRC +In order to use this tool to run evaluation on the full 50K ImageNet dataset, +download the data set from http://image-net.org/request. + +## Ground truth label generation +The ILSVRC 2012 devkit `validation_ground_truth.txt` contains IDs that correspond to the synset of the image. +The accuracy binary however expects the ground truth labels to contain the actual name of the +category instead of synset ids.
A conversion script has been provided to convert the validation ground truth to +category labels. The `validation_ground_truth.txt` can be converted by the following steps: + +``` +ILSVRC_2012_DEVKIT_DIR=[set to path to ILSVRC 2012 devkit] +VALIDATION_LABELS=[set to path to output] + +python generate_validation_labels -- \ +--ilsvrc_devkit_dir=${ILSVRC_2012_DEVKIT_DIR} \ +--validation_labels_output=${VALIDATION_LABELS} +``` + +## Running the binary + +### On Android + +(0) Refer to https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android for configuring NDK and SDK. + +(1) Build using the following command: + +``` +bazel build -c opt \ + --config=android_arm \ + --config=monolithic \ + --cxxopt='--std=c++11' \ + --copt=-D__ANDROID_TYPES_FULL__ \ + --copt=-DSUPPORT_SELECTIVE_REGISTRATION \ + //tensorflow/contrib/lite/tools/accuracy/ilsvrc:imagenet_accuracy_eval +``` + +(2) Connect your phone. Push the binary to your phone with adb push + (make the directory if required): + +``` +adb push bazel-bin/tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_accuracy_eval /data/local/tmp +``` + +(3) Make the binary executable. + +``` +adb shell chmod +x /data/local/tmp/imagenet_accuracy_eval +``` + +(4) Push the TFLite model that you need to test. For example: + +``` +adb push mobilenet_quant_v1_224.tflite /data/local/tmp +``` + +(5) Push the imagenet images to device, make sure device has sufficient storage available before pushing the dataset: + +``` +adb shell mkdir /data/local/tmp/ilsvrc_images && \ +adb push ${IMAGENET_IMAGES_DIR} /data/local/tmp/ilsvrc_images +``` + +(6) Push the generated validation ground labels to device. + +``` +adb push ${VALIDATION_LABELS} /data/local/tmp/ilsvrc_validation_labels.txt +``` + +(7) Push the model labels text file to device. + +``` +adb push ${MODEL_LABELS_TXT} /data/local/tmp/model_output_labels.txt +``` + +(8) Run the binary. 
+ +``` +adb shell /data/local/tmp/imagenet_accuracy_eval \ + --model_file=/data/local/tmp/mobilenet_quant_v1_224.tflite \ + --ground_truth_images_path=/data/local/tmp/ilsvrc_images \ + --ground_truth_labels=/data/local/tmp/ilsvrc_validation_labels.txt \ + --model_output_labels=/data/local/tmp/model_output_labels.txt \ + --output_file_path=/data/local/tmp/accuracy_output.txt \ + --num_images=0 # Run on all images. +``` + +### On Desktop + +(1) Build and run using the following command: + +``` +bazel run -c opt \ + --cxxopt='--std=c++11' \ + -- \ + //tensorflow/contrib/lite/tools/accuracy/ilsvrc:imagenet_accuracy_eval \ + --model_file=mobilenet_quant_v1_224.tflite \ + --ground_truth_images_path=${IMAGENET_IMAGES_DIR} \ + --ground_truth_labels=${VALIDATION_LABELS} \ + --model_output_labels=${MODEL_LABELS_TXT} \ + --output_file_path=/tmp/accuracy_output.txt \ + --num_images=0 # Run on all images. +``` diff --git a/tensorflow/contrib/lite/tools/accuracy/imagenet_accuracy_eval.cc b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_accuracy_eval.cc similarity index 94% rename from tensorflow/contrib/lite/tools/accuracy/imagenet_accuracy_eval.cc rename to tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_accuracy_eval.cc index 8103d6adb5..f361341f7c 100644 --- a/tensorflow/contrib/lite/tools/accuracy/imagenet_accuracy_eval.cc +++ b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_accuracy_eval.cc @@ -18,8 +18,8 @@ limitations under the License. 
#include "absl/memory/memory.h" #include "tensorflow/contrib/lite/tools/accuracy/csv_writer.h" -#include "tensorflow/contrib/lite/tools/accuracy/imagenet_model_evaluator.h" -#include "tensorflow/contrib/lite/tools/accuracy/imagenet_topk_eval.h" +#include "tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h" +#include "tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/util/command_line_flags.h" @@ -126,7 +126,9 @@ int Main(int argc, char* argv[]) { std::vector columns; columns.reserve(evaluator->params().num_ranks); for (int i = 0; i < evaluator->params().num_ranks; i++) { - columns.push_back("Top " + std::to_string(i + 1)); + string column_name = "Top "; + tensorflow::strings::StrAppend(&column_name, i + 1); + columns.push_back(column_name); } ResultsWriter results_writer( diff --git a/tensorflow/contrib/lite/tools/accuracy/imagenet_model_evaluator.cc b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc similarity index 96% rename from tensorflow/contrib/lite/tools/accuracy/imagenet_model_evaluator.cc rename to tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc index 6ddde8e7c0..a88a4a0fce 100644 --- a/tensorflow/contrib/lite/tools/accuracy/imagenet_model_evaluator.cc +++ b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/contrib/lite/tools/accuracy/imagenet_model_evaluator.h" +#include "tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h" #include #include @@ -25,8 +25,8 @@ limitations under the License. 
#include "tensorflow/contrib/lite/tools/accuracy/eval_pipeline.h" #include "tensorflow/contrib/lite/tools/accuracy/eval_pipeline_builder.h" #include "tensorflow/contrib/lite/tools/accuracy/file_reader_stage.h" -#include "tensorflow/contrib/lite/tools/accuracy/imagenet_topk_eval.h" -#include "tensorflow/contrib/lite/tools/accuracy/inception_preprocessing.h" +#include "tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.h" +#include "tensorflow/contrib/lite/tools/accuracy/ilsvrc/inception_preprocessing.h" #include "tensorflow/contrib/lite/tools/accuracy/run_tflite_model_stage.h" #include "tensorflow/contrib/lite/tools/accuracy/utils.h" #include "tensorflow/core/platform/init_main.h" diff --git a/tensorflow/contrib/lite/tools/accuracy/imagenet_model_evaluator.h b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h similarity index 97% rename from tensorflow/contrib/lite/tools/accuracy/imagenet_model_evaluator.h rename to tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h index 0308ac95b6..5f42b2a50e 100644 --- a/tensorflow/contrib/lite/tools/accuracy/imagenet_model_evaluator.h +++ b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h @@ -18,7 +18,7 @@ limitations under the License. 
#include #include -#include "tensorflow/contrib/lite/tools/accuracy/imagenet_topk_eval.h" +#include "tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.h" #include "tensorflow/contrib/lite/tools/accuracy/utils.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/lib/core/status.h" diff --git a/tensorflow/contrib/lite/tools/accuracy/imagenet_topk_eval.cc b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.cc similarity index 97% rename from tensorflow/contrib/lite/tools/accuracy/imagenet_topk_eval.cc rename to tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.cc index 1595bbee2f..d46075d234 100644 --- a/tensorflow/contrib/lite/tools/accuracy/imagenet_topk_eval.cc +++ b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/contrib/lite/tools/accuracy/imagenet_topk_eval.h" +#include "tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.h" #include diff --git a/tensorflow/contrib/lite/tools/accuracy/imagenet_topk_eval.h b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.h similarity index 100% rename from tensorflow/contrib/lite/tools/accuracy/imagenet_topk_eval.h rename to tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.h diff --git a/tensorflow/contrib/lite/tools/accuracy/imagenet_topk_eval_test.cc b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_topk_eval_test.cc similarity index 96% rename from tensorflow/contrib/lite/tools/accuracy/imagenet_topk_eval_test.cc rename to tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_topk_eval_test.cc index 256cd1d529..ff332af5c5 100644 --- a/tensorflow/contrib/lite/tools/accuracy/imagenet_topk_eval_test.cc +++ 
b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_topk_eval_test.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/contrib/lite/tools/accuracy/imagenet_topk_eval.h" +#include "tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.h" #include namespace tensorflow { @@ -40,7 +40,9 @@ std::vector CreateGroundTruth() { std::vector ground_truth; ground_truth.reserve(kNumCategories); for (int i = 0; i < kNumCategories; i++) { - ground_truth.push_back(std::to_string(i)); + string category; + strings::StrAppend(&category, i); + ground_truth.push_back(category); } return ground_truth; } diff --git a/tensorflow/contrib/lite/tools/accuracy/inception_preprocessing.cc b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/inception_preprocessing.cc similarity index 97% rename from tensorflow/contrib/lite/tools/accuracy/inception_preprocessing.cc rename to tensorflow/contrib/lite/tools/accuracy/ilsvrc/inception_preprocessing.cc index 7afef88637..7512b39c32 100644 --- a/tensorflow/contrib/lite/tools/accuracy/inception_preprocessing.cc +++ b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/inception_preprocessing.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/contrib/lite/tools/accuracy/inception_preprocessing.h" +#include "tensorflow/contrib/lite/tools/accuracy/ilsvrc/inception_preprocessing.h" #include diff --git a/tensorflow/contrib/lite/tools/accuracy/inception_preprocessing.h b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/inception_preprocessing.h similarity index 100% rename from tensorflow/contrib/lite/tools/accuracy/inception_preprocessing.h rename to tensorflow/contrib/lite/tools/accuracy/ilsvrc/inception_preprocessing.h diff --git a/tensorflow/contrib/lite/tools/accuracy/inception_preprocessing_test.cc b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/inception_preprocessing_test.cc similarity index 98% rename from tensorflow/contrib/lite/tools/accuracy/inception_preprocessing_test.cc rename to tensorflow/contrib/lite/tools/accuracy/ilsvrc/inception_preprocessing_test.cc index db574476f6..3587878ba3 100644 --- a/tensorflow/contrib/lite/tools/accuracy/inception_preprocessing_test.cc +++ b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/inception_preprocessing_test.cc @@ -17,7 +17,7 @@ limitations under the License. 
#include #include -#include "tensorflow/contrib/lite/tools/accuracy/inception_preprocessing.h" +#include "tensorflow/contrib/lite/tools/accuracy/ilsvrc/inception_preprocessing.h" #include "tensorflow/core/platform/init_main.h" #include "tensorflow/core/public/session.h" #include "tensorflow/core/util/command_line_flags.h" diff --git a/tensorflow/contrib/lite/tools/accuracy/testdata/grace_hopper.jpg b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/testdata/grace_hopper.jpg similarity index 100% rename from tensorflow/contrib/lite/tools/accuracy/testdata/grace_hopper.jpg rename to tensorflow/contrib/lite/tools/accuracy/ilsvrc/testdata/grace_hopper.jpg diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 836c3ce34e..0882cc3c8b 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -2268,6 +2268,8 @@ cc_library( srcs = if_android([ "lib/gif/gif_io.cc", "platform/gif.h", + "lib/strings/strcat.h", + "lib/strings/numbers.h", ]), hdrs = [ "lib/bfloat16/bfloat16.h", -- GitLab From f6c3c9733ed39f14ee3c32bc51ec62315b48ad31 Mon Sep 17 00:00:00 2001 From: Francois Chollet Date: Fri, 24 Aug 2018 16:13:18 -0700 Subject: [PATCH 115/598] Upgrade Keras applications and Keras preprocessing. 
PiperOrigin-RevId: 210174523 --- .../docs_src/install/install_sources.md | 6 +- .../install/install_sources_windows.md | 4 +- .../python/keras/applications/__init__.py | 51 +- .../keras/applications/applications_test.py | 8 +- .../python/keras/applications/densenet.py | 47 +- .../keras/applications/imagenet_utils.py | 33 +- .../keras/applications/inception_resnet_v2.py | 26 +- .../python/keras/applications/inception_v3.py | 25 +- .../python/keras/applications/mobilenet.py | 25 +- .../python/keras/applications/mobilenet_v2.py | 24 +- .../python/keras/applications/nasnet.py | 35 +- .../python/keras/applications/resnet50.py | 24 +- tensorflow/python/keras/applications/vgg16.py | 24 +- tensorflow/python/keras/applications/vgg19.py | 24 +- .../python/keras/applications/xception.py | 25 +- .../python/keras/preprocessing/__init__.py | 2 + .../python/keras/preprocessing/image.py | 492 +++++++++++++++++- .../python/keras/preprocessing/sequence.py | 63 ++- .../tools/api/generator/api_init_files.bzl | 1 + .../tools/api/generator/api_init_files_v1.bzl | 1 + tensorflow/tools/ci_build/Dockerfile.cmake | 4 +- .../ci_build/install/install_pip_packages.sh | 8 +- .../install/install_python3.5_pip_packages.sh | 4 +- .../install/install_python3.6_pip_packages.sh | 4 +- tensorflow/tools/docker/Dockerfile | 4 +- tensorflow/tools/docker/Dockerfile.devel | 4 +- tensorflow/tools/docker/Dockerfile.devel-gpu | 4 +- .../docker/Dockerfile.devel-gpu-cuda9-cudnn7 | 4 +- tensorflow/tools/docker/Dockerfile.devel-mkl | 4 +- .../tools/docker/Dockerfile.devel-mkl-horovod | 4 +- tensorflow/tools/docker/Dockerfile.gpu | 4 +- tensorflow/tools/docker/Dockerfile.mkl | 4 +- .../tools/docker/Dockerfile.mkl-horovod | 4 +- tensorflow/tools/pip_package/setup.py | 4 +- 34 files changed, 851 insertions(+), 149 deletions(-) diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md index e8e13142e9..44ea18fa7b 100644 --- 
a/tensorflow/docs_src/install/install_sources.md +++ b/tensorflow/docs_src/install/install_sources.md @@ -180,9 +180,9 @@ If you follow these instructions, you will not need to disable SIP. After installing pip, invoke the following commands: -
 $ sudo pip install six numpy wheel mock h5py
- $ sudo pip install keras_applications==1.0.4 --no-deps
- $ sudo pip install keras_preprocessing==1.0.2 --no-deps
+
 $ pip install six numpy wheel mock h5py
+ $ pip install keras_applications==1.0.5 --no-deps
+ $ pip install keras_preprocessing==1.0.3 --no-deps
 
Note: These are just the minimum requirements to _build_ tensorflow. Installing diff --git a/tensorflow/docs_src/install/install_sources_windows.md b/tensorflow/docs_src/install/install_sources_windows.md index a1da122317..40dce106d6 100644 --- a/tensorflow/docs_src/install/install_sources_windows.md +++ b/tensorflow/docs_src/install/install_sources_windows.md @@ -94,8 +94,8 @@ Assume you already have `pip3` in `%PATH%`, issue the following command:
 C:\> pip3 install six numpy wheel
-C:\> pip3 install keras_applications==1.0.4 --no-deps
-C:\> pip3 install keras_preprocessing==1.0.2 --no-deps
+C:\> pip3 install keras_applications==1.0.5 --no-deps
+C:\> pip3 install keras_preprocessing==1.0.3 --no-deps
 
diff --git a/tensorflow/python/keras/applications/__init__.py b/tensorflow/python/keras/applications/__init__.py index cd9462d6b5..a8b6d55e41 100644 --- a/tensorflow/python/keras/applications/__init__.py +++ b/tensorflow/python/keras/applications/__init__.py @@ -14,6 +14,7 @@ # ============================================================================== """Keras Applications are canned architectures with pre-trained weights.""" # pylint: disable=g-import-not-at-top +# pylint: disable=g-bad-import-order from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -25,13 +26,49 @@ from tensorflow.python.keras import engine from tensorflow.python.keras import layers from tensorflow.python.keras import models from tensorflow.python.keras import utils +from tensorflow.python.util import tf_inspect + +# `get_submodules_from_kwargs` has been introduced in 1.0.5, but we would +# like to be able to handle prior versions. Note that prior to 1.0.5, +# `keras_applications` did not expose a `__version__` attribute. +if not hasattr(keras_applications, 'get_submodules_from_kwargs'): + + if 'engine' in tf_inspect.getfullargspec( + keras_applications.set_keras_submodules)[0]: + keras_applications.set_keras_submodules( + backend=backend, + layers=layers, + models=models, + utils=utils, + engine=engine) + else: + keras_applications.set_keras_submodules( + backend=backend, + layers=layers, + models=models, + utils=utils) + + +def keras_modules_injection(base_fun): + """Decorator injecting tf.keras replacements for Keras modules. + + Arguments: + base_fun: Application function to decorate (e.g. `MobileNet`). + + Returns: + Decorated function that injects keyword argument for the tf.keras + modules required by the Applications. 
+ """ + + def wrapper(*args, **kwargs): + if hasattr(keras_applications, 'get_submodules_from_kwargs'): + kwargs['backend'] = backend + kwargs['layers'] = layers + kwargs['models'] = models + kwargs['utils'] = utils + return base_fun(*args, **kwargs) + return wrapper -keras_applications.set_keras_submodules( - backend=backend, - engine=engine, - layers=layers, - models=models, - utils=utils) from tensorflow.python.keras.applications.densenet import DenseNet121 from tensorflow.python.keras.applications.densenet import DenseNet169 @@ -39,7 +76,7 @@ from tensorflow.python.keras.applications.densenet import DenseNet201 from tensorflow.python.keras.applications.inception_resnet_v2 import InceptionResNetV2 from tensorflow.python.keras.applications.inception_v3 import InceptionV3 from tensorflow.python.keras.applications.mobilenet import MobileNet -# TODO(fchollet): enable MobileNetV2 in next version. +from tensorflow.python.keras.applications.mobilenet_v2 import MobileNetV2 from tensorflow.python.keras.applications.nasnet import NASNetLarge from tensorflow.python.keras.applications.nasnet import NASNetMobile from tensorflow.python.keras.applications.resnet50 import ResNet50 diff --git a/tensorflow/python/keras/applications/applications_test.py b/tensorflow/python/keras/applications/applications_test.py index ef3198a937..b15ca5990a 100644 --- a/tensorflow/python/keras/applications/applications_test.py +++ b/tensorflow/python/keras/applications/applications_test.py @@ -32,7 +32,8 @@ MODEL_LIST = [ (applications.InceptionV3, 2048), (applications.InceptionResNetV2, 1536), (applications.MobileNet, 1024), - # TODO(fchollet): enable MobileNetV2 in next version. + # TODO(fchollet): enable MobileNetV2 tests when a new TensorFlow test image + # is released with keras_applications upgraded to 1.0.5 or above. 
(applications.DenseNet121, 1024), (applications.DenseNet169, 1664), (applications.DenseNet201, 1920), @@ -43,11 +44,6 @@ MODEL_LIST = [ class ApplicationsTest(test.TestCase, parameterized.TestCase): - @parameterized.parameters(*MODEL_LIST) - def test_classification_model(self, model_fn, _): - model = model_fn(classes=1000, weights=None) - self.assertEqual(model.output_shape[-1], 1000) - @parameterized.parameters(*MODEL_LIST) def test_feature_extration_model(self, model_fn, output_dim): model = model_fn(include_top=False, weights=None) diff --git a/tensorflow/python/keras/applications/densenet.py b/tensorflow/python/keras/applications/densenet.py index fbdcc66d2d..172848bbdb 100644 --- a/tensorflow/python/keras/applications/densenet.py +++ b/tensorflow/python/keras/applications/densenet.py @@ -20,18 +20,39 @@ from __future__ import division from __future__ import print_function from keras_applications import densenet + +from tensorflow.python.keras.applications import keras_modules_injection from tensorflow.python.util.tf_export import tf_export -DenseNet121 = densenet.DenseNet121 -DenseNet169 = densenet.DenseNet169 -DenseNet201 = densenet.DenseNet201 -decode_predictions = densenet.decode_predictions -preprocess_input = densenet.preprocess_input - -tf_export('keras.applications.densenet.DenseNet121', - 'keras.applications.DenseNet121')(DenseNet121) -tf_export('keras.applications.densenet.DenseNet169', - 'keras.applications.DenseNet169')(DenseNet169) -tf_export('keras.applications.densenet.DenseNet201', - 'keras.applications.DenseNet201')(DenseNet201) -tf_export('keras.applications.densenet.preprocess_input')(preprocess_input) + +@tf_export('keras.applications.densenet.DenseNet121', + 'keras.applications.DenseNet121') +@keras_modules_injection +def DenseNet121(*args, **kwargs): + return densenet.DenseNet121(*args, **kwargs) + + +@tf_export('keras.applications.densenet.DenseNet169', + 'keras.applications.DenseNet169') +@keras_modules_injection +def DenseNet169(*args, 
**kwargs): + return densenet.DenseNet169(*args, **kwargs) + + +@tf_export('keras.applications.densenet.DenseNet201', + 'keras.applications.DenseNet201') +@keras_modules_injection +def DenseNet201(*args, **kwargs): + return densenet.DenseNet201(*args, **kwargs) + + +@tf_export('keras.applications.densenet.decode_predictions') +@keras_modules_injection +def decode_predictions(*args, **kwargs): + return densenet.decode_predictions(*args, **kwargs) + + +@tf_export('keras.applications.densenet.preprocess_input') +@keras_modules_injection +def preprocess_input(*args, **kwargs): + return densenet.preprocess_input(*args, **kwargs) diff --git a/tensorflow/python/keras/applications/imagenet_utils.py b/tensorflow/python/keras/applications/imagenet_utils.py index 70f8f6fb32..c25b5c2bdd 100644 --- a/tensorflow/python/keras/applications/imagenet_utils.py +++ b/tensorflow/python/keras/applications/imagenet_utils.py @@ -19,27 +19,18 @@ from __future__ import division from __future__ import print_function from keras_applications import imagenet_utils + +from tensorflow.python.keras.applications import keras_modules_injection from tensorflow.python.util.tf_export import tf_export -decode_predictions = imagenet_utils.decode_predictions -preprocess_input = imagenet_utils.preprocess_input -tf_export( - 'keras.applications.imagenet_utils.decode_predictions', - 'keras.applications.densenet.decode_predictions', - 'keras.applications.inception_resnet_v2.decode_predictions', - 'keras.applications.inception_v3.decode_predictions', - 'keras.applications.mobilenet.decode_predictions', - 'keras.applications.mobilenet_v2.decode_predictions', - 'keras.applications.nasnet.decode_predictions', - 'keras.applications.resnet50.decode_predictions', - 'keras.applications.vgg16.decode_predictions', - 'keras.applications.vgg19.decode_predictions', - 'keras.applications.xception.decode_predictions', -)(decode_predictions) -tf_export( - 'keras.applications.imagenet_utils.preprocess_input', - 
'keras.applications.resnet50.preprocess_input', - 'keras.applications.vgg16.preprocess_input', - 'keras.applications.vgg19.preprocess_input', -)(preprocess_input) +@tf_export('keras.applications.imagenet_utils.decode_predictions') +@keras_modules_injection +def decode_predictions(*args, **kwargs): + return imagenet_utils.decode_predictions(*args, **kwargs) + + +@tf_export('keras.applications.imagenet_utils.preprocess_input') +@keras_modules_injection +def preprocess_input(*args, **kwargs): + return imagenet_utils.preprocess_input(*args, **kwargs) diff --git a/tensorflow/python/keras/applications/inception_resnet_v2.py b/tensorflow/python/keras/applications/inception_resnet_v2.py index 63debb4e0d..0b9ef371fa 100644 --- a/tensorflow/python/keras/applications/inception_resnet_v2.py +++ b/tensorflow/python/keras/applications/inception_resnet_v2.py @@ -20,13 +20,25 @@ from __future__ import division from __future__ import print_function from keras_applications import inception_resnet_v2 + +from tensorflow.python.keras.applications import keras_modules_injection from tensorflow.python.util.tf_export import tf_export -InceptionResNetV2 = inception_resnet_v2.InceptionResNetV2 -decode_predictions = inception_resnet_v2.decode_predictions -preprocess_input = inception_resnet_v2.preprocess_input -tf_export('keras.applications.inception_resnet_v2.InceptionResNetV2', - 'keras.applications.InceptionResNetV2')(InceptionResNetV2) -tf_export( - 'keras.applications.inception_resnet_v2.preprocess_input')(preprocess_input) +@tf_export('keras.applications.inception_resnet_v2.InceptionResNetV2', + 'keras.applications.InceptionResNetV2') +@keras_modules_injection +def InceptionResNetV2(*args, **kwargs): + return inception_resnet_v2.InceptionResNetV2(*args, **kwargs) + + +@tf_export('keras.applications.inception_resnet_v2.decode_predictions') +@keras_modules_injection +def decode_predictions(*args, **kwargs): + return inception_resnet_v2.decode_predictions(*args, **kwargs) + +
+@tf_export('keras.applications.inception_resnet_v2.preprocess_input') +@keras_modules_injection +def preprocess_input(*args, **kwargs): + return inception_resnet_v2.preprocess_input(*args, **kwargs) diff --git a/tensorflow/python/keras/applications/inception_v3.py b/tensorflow/python/keras/applications/inception_v3.py index 87534086c8..ab76826e17 100644 --- a/tensorflow/python/keras/applications/inception_v3.py +++ b/tensorflow/python/keras/applications/inception_v3.py @@ -20,12 +20,25 @@ from __future__ import division from __future__ import print_function from keras_applications import inception_v3 + +from tensorflow.python.keras.applications import keras_modules_injection from tensorflow.python.util.tf_export import tf_export -InceptionV3 = inception_v3.InceptionV3 -decode_predictions = inception_v3.decode_predictions -preprocess_input = inception_v3.preprocess_input -tf_export('keras.applications.inception_v3.InceptionV3', - 'keras.applications.InceptionV3')(InceptionV3) -tf_export('keras.applications.inception_v3.preprocess_input')(preprocess_input) +@tf_export('keras.applications.inception_v3.InceptionV3', + 'keras.applications.InceptionV3') +@keras_modules_injection +def InceptionV3(*args, **kwargs): + return inception_v3.InceptionV3(*args, **kwargs) + + +@tf_export('keras.applications.inception_v3.decode_predictions') +@keras_modules_injection +def decode_predictions(*args, **kwargs): + return inception_v3.decode_predictions(*args, **kwargs) + + +@tf_export('keras.applications.inception_v3.preprocess_input') +@keras_modules_injection +def preprocess_input(*args, **kwargs): + return inception_v3.preprocess_input(*args, **kwargs) diff --git a/tensorflow/python/keras/applications/mobilenet.py b/tensorflow/python/keras/applications/mobilenet.py index 3528f027b3..1f71a5ae99 100644 --- a/tensorflow/python/keras/applications/mobilenet.py +++ b/tensorflow/python/keras/applications/mobilenet.py @@ -20,12 +20,25 @@ from __future__ import division from __future__ 
import print_function from keras_applications import mobilenet + +from tensorflow.python.keras.applications import keras_modules_injection from tensorflow.python.util.tf_export import tf_export -MobileNet = mobilenet.MobileNet -decode_predictions = mobilenet.decode_predictions -preprocess_input = mobilenet.preprocess_input -tf_export('keras.applications.mobilenet.MobileNet', - 'keras.applications.MobileNet')(MobileNet) -tf_export('keras.applications.mobilenet.preprocess_input')(preprocess_input) +@tf_export('keras.applications.mobilenet.MobileNet', + 'keras.applications.MobileNet') +@keras_modules_injection +def MobileNet(*args, **kwargs): + return mobilenet.MobileNet(*args, **kwargs) + + +@tf_export('keras.applications.mobilenet.decode_predictions') +@keras_modules_injection +def decode_predictions(*args, **kwargs): + return mobilenet.decode_predictions(*args, **kwargs) + + +@tf_export('keras.applications.mobilenet.preprocess_input') +@keras_modules_injection +def preprocess_input(*args, **kwargs): + return mobilenet.preprocess_input(*args, **kwargs) diff --git a/tensorflow/python/keras/applications/mobilenet_v2.py b/tensorflow/python/keras/applications/mobilenet_v2.py index 9194c3ee14..52ac5959ad 100644 --- a/tensorflow/python/keras/applications/mobilenet_v2.py +++ b/tensorflow/python/keras/applications/mobilenet_v2.py @@ -19,4 +19,26 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -# TODO(fchollet): export MobileNetV2 as part of the public API in next version. 
+from keras_applications import mobilenet_v2 + +from tensorflow.python.keras.applications import keras_modules_injection +from tensorflow.python.util.tf_export import tf_export + + +@tf_export('keras.applications.mobilenet_v2.MobileNetV2', + 'keras.applications.MobileNetV2') +@keras_modules_injection +def MobileNetV2(*args, **kwargs): + return mobilenet_v2.MobileNetV2(*args, **kwargs) + + +@tf_export('keras.applications.mobilenet_v2.decode_predictions') +@keras_modules_injection +def decode_predictions(*args, **kwargs): + return mobilenet_v2.decode_predictions(*args, **kwargs) + + +@tf_export('keras.applications.mobilenet_v2.preprocess_input') +@keras_modules_injection +def preprocess_input(*args, **kwargs): + return mobilenet_v2.preprocess_input(*args, **kwargs) diff --git a/tensorflow/python/keras/applications/nasnet.py b/tensorflow/python/keras/applications/nasnet.py index 26ff5db53f..44fc329d57 100644 --- a/tensorflow/python/keras/applications/nasnet.py +++ b/tensorflow/python/keras/applications/nasnet.py @@ -20,15 +20,32 @@ from __future__ import division from __future__ import print_function from keras_applications import nasnet + +from tensorflow.python.keras.applications import keras_modules_injection from tensorflow.python.util.tf_export import tf_export -NASNetMobile = nasnet.NASNetMobile -NASNetLarge = nasnet.NASNetLarge -decode_predictions = nasnet.decode_predictions -preprocess_input = nasnet.preprocess_input -tf_export('keras.applications.nasnet.NASNetMobile', - 'keras.applications.NASNetMobile')(NASNetMobile) -tf_export('keras.applications.nasnet.NASNetLarge', - 'keras.applications.NASNetLarge')(NASNetLarge) -tf_export('keras.applications.nasnet.preprocess_input')(preprocess_input) +@tf_export('keras.applications.nasnet.NASNetMobile', + 'keras.applications.NASNetMobile') +@keras_modules_injection +def NASNetMobile(*args, **kwargs): + return nasnet.NASNetMobile(*args, **kwargs) + + +@tf_export('keras.applications.nasnet.NASNetLarge', + 
'keras.applications.NASNetLarge') +@keras_modules_injection +def NASNetLarge(*args, **kwargs): + return nasnet.NASNetLarge(*args, **kwargs) + + +@tf_export('keras.applications.nasnet.decode_predictions') +@keras_modules_injection +def decode_predictions(*args, **kwargs): + return nasnet.decode_predictions(*args, **kwargs) + + +@tf_export('keras.applications.nasnet.preprocess_input') +@keras_modules_injection +def preprocess_input(*args, **kwargs): + return nasnet.preprocess_input(*args, **kwargs) diff --git a/tensorflow/python/keras/applications/resnet50.py b/tensorflow/python/keras/applications/resnet50.py index 4d804a3c44..80d3f9044f 100644 --- a/tensorflow/python/keras/applications/resnet50.py +++ b/tensorflow/python/keras/applications/resnet50.py @@ -20,11 +20,25 @@ from __future__ import division from __future__ import print_function from keras_applications import resnet50 + +from tensorflow.python.keras.applications import keras_modules_injection from tensorflow.python.util.tf_export import tf_export -ResNet50 = resnet50.ResNet50 -decode_predictions = resnet50.decode_predictions -preprocess_input = resnet50.preprocess_input -tf_export('keras.applications.resnet50.ResNet50', - 'keras.applications.ResNet50')(ResNet50) +@tf_export('keras.applications.resnet50.ResNet50', + 'keras.applications.ResNet50') +@keras_modules_injection +def ResNet50(*args, **kwargs): + return resnet50.ResNet50(*args, **kwargs) + + +@tf_export('keras.applications.resnet50.decode_predictions') +@keras_modules_injection +def decode_predictions(*args, **kwargs): + return resnet50.decode_predictions(*args, **kwargs) + + +@tf_export('keras.applications.resnet50.preprocess_input') +@keras_modules_injection +def preprocess_input(*args, **kwargs): + return resnet50.preprocess_input(*args, **kwargs) diff --git a/tensorflow/python/keras/applications/vgg16.py b/tensorflow/python/keras/applications/vgg16.py index c420d9b81e..8557d26931 100644 --- a/tensorflow/python/keras/applications/vgg16.py +++ 
b/tensorflow/python/keras/applications/vgg16.py @@ -20,11 +20,25 @@ from __future__ import division from __future__ import print_function from keras_applications import vgg16 + +from tensorflow.python.keras.applications import keras_modules_injection from tensorflow.python.util.tf_export import tf_export -VGG16 = vgg16.VGG16 -decode_predictions = vgg16.decode_predictions -preprocess_input = vgg16.preprocess_input -tf_export('keras.applications.vgg16.VGG16', - 'keras.applications.VGG16')(VGG16) +@tf_export('keras.applications.vgg16.VGG16', + 'keras.applications.VGG16') +@keras_modules_injection +def VGG16(*args, **kwargs): + return vgg16.VGG16(*args, **kwargs) + + +@tf_export('keras.applications.vgg16.decode_predictions') +@keras_modules_injection +def decode_predictions(*args, **kwargs): + return vgg16.decode_predictions(*args, **kwargs) + + +@tf_export('keras.applications.vgg16.preprocess_input') +@keras_modules_injection +def preprocess_input(*args, **kwargs): + return vgg16.preprocess_input(*args, **kwargs) diff --git a/tensorflow/python/keras/applications/vgg19.py b/tensorflow/python/keras/applications/vgg19.py index 73d3d1d1c3..8fc04413a0 100644 --- a/tensorflow/python/keras/applications/vgg19.py +++ b/tensorflow/python/keras/applications/vgg19.py @@ -20,11 +20,25 @@ from __future__ import division from __future__ import print_function from keras_applications import vgg19 + +from tensorflow.python.keras.applications import keras_modules_injection from tensorflow.python.util.tf_export import tf_export -VGG19 = vgg19.VGG19 -decode_predictions = vgg19.decode_predictions -preprocess_input = vgg19.preprocess_input -tf_export('keras.applications.vgg19.VGG19', - 'keras.applications.VGG19')(VGG19) +@tf_export('keras.applications.vgg19.VGG19', + 'keras.applications.VGG19') +@keras_modules_injection +def VGG19(*args, **kwargs): + return vgg19.VGG19(*args, **kwargs) + + +@tf_export('keras.applications.vgg19.decode_predictions') +@keras_modules_injection +def 
decode_predictions(*args, **kwargs): + return vgg19.decode_predictions(*args, **kwargs) + + +@tf_export('keras.applications.vgg19.preprocess_input') +@keras_modules_injection +def preprocess_input(*args, **kwargs): + return vgg19.preprocess_input(*args, **kwargs) diff --git a/tensorflow/python/keras/applications/xception.py b/tensorflow/python/keras/applications/xception.py index 5b221ac8e0..960e6dec69 100644 --- a/tensorflow/python/keras/applications/xception.py +++ b/tensorflow/python/keras/applications/xception.py @@ -20,12 +20,25 @@ from __future__ import division from __future__ import print_function from keras_applications import xception + +from tensorflow.python.keras.applications import keras_modules_injection from tensorflow.python.util.tf_export import tf_export -Xception = xception.Xception -decode_predictions = xception.decode_predictions -preprocess_input = xception.preprocess_input -tf_export('keras.applications.xception.Xception', - 'keras.applications.Xception')(Xception) -tf_export('keras.applications.xception.preprocess_input')(preprocess_input) +@tf_export('keras.applications.xception.Xception', + 'keras.applications.Xception') +@keras_modules_injection +def Xception(*args, **kwargs): + return xception.Xception(*args, **kwargs) + + +@tf_export('keras.applications.xception.decode_predictions') +@keras_modules_injection +def decode_predictions(*args, **kwargs): + return xception.decode_predictions(*args, **kwargs) + + +@tf_export('keras.applications.xception.preprocess_input') +@keras_modules_injection +def preprocess_input(*args, **kwargs): + return xception.preprocess_input(*args, **kwargs) diff --git a/tensorflow/python/keras/preprocessing/__init__.py b/tensorflow/python/keras/preprocessing/__init__.py index 2f08f88600..0860eed3cf 100644 --- a/tensorflow/python/keras/preprocessing/__init__.py +++ b/tensorflow/python/keras/preprocessing/__init__.py @@ -23,6 +23,8 @@ import keras_preprocessing from tensorflow.python.keras import backend from 
tensorflow.python.keras import utils +# This exists for compatibility with prior version of keras_preprocessing. +# TODO(fchollet): remove in the future. keras_preprocessing.set_keras_submodules(backend=backend, utils=utils) from tensorflow.python.keras.preprocessing import image diff --git a/tensorflow/python/keras/preprocessing/image.py b/tensorflow/python/keras/preprocessing/image.py index ba227385ef..e33993950d 100644 --- a/tensorflow/python/keras/preprocessing/image.py +++ b/tensorflow/python/keras/preprocessing/image.py @@ -27,6 +27,9 @@ try: except ImportError: pass +from tensorflow.python.keras import backend +from tensorflow.python.keras import utils +from tensorflow.python.util import tf_inspect from tensorflow.python.util.tf_export import tf_export random_rotation = image.random_rotation @@ -38,14 +41,482 @@ random_channel_shift = image.random_channel_shift apply_brightness_shift = image.apply_brightness_shift random_brightness = image.random_brightness apply_affine_transform = image.apply_affine_transform -array_to_img = image.array_to_img -img_to_array = image.img_to_array -save_img = image.save_img load_img = image.load_img -ImageDataGenerator = image.ImageDataGenerator -Iterator = image.Iterator -NumpyArrayIterator = image.NumpyArrayIterator -DirectoryIterator = image.DirectoryIterator + + +@tf_export('keras.preprocessing.image.array_to_img') +def array_to_img(x, data_format=None, scale=True, dtype=None): + """Converts a 3D Numpy array to a PIL Image instance. + + Arguments: + x: Input Numpy array. + data_format: Image data format. + either "channels_first" or "channels_last". + scale: Whether to rescale image values + to be within `[0, 255]`. + dtype: Dtype to use. + + Returns: + A PIL Image instance. + + Raises: + ImportError: if PIL is not available. + ValueError: if invalid `x` or `data_format` is passed. 
+ """ + + if data_format is None: + data_format = backend.image_data_format() + kwargs = {} + if 'dtype' in tf_inspect.getfullargspec(image.array_to_img)[0]: + if dtype is None: + dtype = backend.floatx() + kwargs['dtype'] = dtype + return image.array_to_img(x, data_format=data_format, scale=scale, **kwargs) + + +@tf_export('keras.preprocessing.image.img_to_array') +def img_to_array(img, data_format=None, dtype=None): + """Converts a PIL Image instance to a Numpy array. + + Arguments: + img: PIL Image instance. + data_format: Image data format, + either "channels_first" or "channels_last". + dtype: Dtype to use for the returned array. + + Returns: + A 3D Numpy array. + + Raises: + ValueError: if invalid `img` or `data_format` is passed. + """ + + if data_format is None: + data_format = backend.image_data_format() + kwargs = {} + if 'dtype' in tf_inspect.getfullargspec(image.img_to_array)[0]: + if dtype is None: + dtype = backend.floatx() + kwargs['dtype'] = dtype + return image.img_to_array(img, data_format=data_format, **kwargs) + + +@tf_export('keras.preprocessing.image.save_img') +def save_img(path, + x, + data_format=None, + file_format=None, + scale=True, + **kwargs): + """Saves an image stored as a Numpy array to a path or file object. + + Arguments: + path: Path or file object. + x: Numpy array. + data_format: Image data format, + either "channels_first" or "channels_last". + file_format: Optional file format override. If omitted, the + format to use is determined from the filename extension. + If a file object was used instead of a filename, this + parameter should always be used. + scale: Whether to rescale image values to be within `[0, 255]`. + **kwargs: Additional keyword arguments passed to `PIL.Image.save()`. 
+ """ + if data_format is None: + data_format = backend.image_data_format() + image.save_img(path, + x, + data_format=data_format, + file_format=file_format, + scale=scale, **kwargs) + + +@tf_export('keras.preprocessing.image.Iterator') +class Iterator(image.Iterator, utils.Sequence): + pass + + +@tf_export('keras.preprocessing.image.DirectoryIterator') +class DirectoryIterator(image.DirectoryIterator, Iterator): + """Iterator capable of reading images from a directory on disk. + + Arguments: + directory: Path to the directory to read images from. + Each subdirectory in this directory will be + considered to contain images from one class, + or alternatively you could specify class subdirectories + via the `classes` argument. + image_data_generator: Instance of `ImageDataGenerator` + to use for random transformations and normalization. + target_size: tuple of integers, dimensions to resize input images to. + color_mode: One of `"rgb"`, `"rgba"`, `"grayscale"`. + Color mode to read images. + classes: Optional list of strings, names of subdirectories + containing images from each class (e.g. `["dogs", "cats"]`). + It will be computed automatically if not set. + class_mode: Mode for yielding the targets: + `"binary"`: binary targets (if there are only two classes), + `"categorical"`: categorical targets, + `"sparse"`: integer targets, + `"input"`: targets are images identical to input images (mainly + used to work with autoencoders), + `None`: no targets get yielded (only input images are yielded). + batch_size: Integer, size of a batch. + shuffle: Boolean, whether to shuffle the data between epochs. + seed: Random seed for data shuffling. + data_format: String, one of `channels_first`, `channels_last`. + save_to_dir: Optional directory where to save the pictures + being yielded, in a viewable format. This is useful + for visualizing the random transformations being + applied, for debugging purposes. 
+ save_prefix: String prefix to use for saving sample + images (if `save_to_dir` is set). + save_format: Format to use for saving sample images + (if `save_to_dir` is set). + subset: Subset of data (`"training"` or `"validation"`) if + validation_split is set in ImageDataGenerator. + interpolation: Interpolation method used to resample the image if the + target size is different from that of the loaded image. + Supported methods are "nearest", "bilinear", and "bicubic". + If PIL version 1.1.3 or newer is installed, "lanczos" is also + supported. If PIL version 3.4.0 or newer is installed, "box" and + "hamming" are also supported. By default, "nearest" is used. + dtype: Dtype to use for generated arrays. + """ + + def __init__(self, directory, image_data_generator, + target_size=(256, 256), + color_mode='rgb', + classes=None, + class_mode='categorical', + batch_size=32, + shuffle=True, + seed=None, + data_format=None, + save_to_dir=None, + save_prefix='', + save_format='png', + follow_links=False, + subset=None, + interpolation='nearest', + dtype=None): + if data_format is None: + data_format = backend.image_data_format() + kwargs = {} + if 'dtype' in tf_inspect.getfullargspec( + image.ImageDataGenerator.__init__)[0]: + if dtype is None: + dtype = backend.floatx() + kwargs['dtype'] = dtype + super(DirectoryIterator, self).__init__( + directory, image_data_generator, + target_size=target_size, + color_mode=color_mode, + classes=classes, + class_mode=class_mode, + batch_size=batch_size, + shuffle=shuffle, + seed=seed, + data_format=data_format, + save_to_dir=save_to_dir, + save_prefix=save_prefix, + save_format=save_format, + follow_links=follow_links, + subset=subset, + interpolation=interpolation, + **kwargs) + + +@tf_export('keras.preprocessing.image.NumpyArrayIterator') +class NumpyArrayIterator(image.NumpyArrayIterator, Iterator): + """Iterator yielding data from a Numpy array. + + Arguments: + x: Numpy array of input data or tuple. 
+ If tuple, the second elements is either + another numpy array or a list of numpy arrays, + each of which gets passed + through as an output without any modifications. + y: Numpy array of targets data. + image_data_generator: Instance of `ImageDataGenerator` + to use for random transformations and normalization. + batch_size: Integer, size of a batch. + shuffle: Boolean, whether to shuffle the data between epochs. + sample_weight: Numpy array of sample weights. + seed: Random seed for data shuffling. + data_format: String, one of `channels_first`, `channels_last`. + save_to_dir: Optional directory where to save the pictures + being yielded, in a viewable format. This is useful + for visualizing the random transformations being + applied, for debugging purposes. + save_prefix: String prefix to use for saving sample + images (if `save_to_dir` is set). + save_format: Format to use for saving sample images + (if `save_to_dir` is set). + subset: Subset of data (`"training"` or `"validation"`) if + validation_split is set in ImageDataGenerator. + dtype: Dtype to use for the generated arrays. 
+ """ + + def __init__(self, x, y, image_data_generator, + batch_size=32, + shuffle=False, + sample_weight=None, + seed=None, + data_format=None, + save_to_dir=None, + save_prefix='', + save_format='png', + subset=None, + dtype=None): + if data_format is None: + data_format = backend.image_data_format() + kwargs = {} + if 'dtype' in tf_inspect.getfullargspec( + image.NumpyArrayIterator.__init__)[0]: + if dtype is None: + dtype = backend.floatx() + kwargs['dtype'] = dtype + super(NumpyArrayIterator, self).__init__( + x, y, image_data_generator, + batch_size=batch_size, + shuffle=shuffle, + sample_weight=sample_weight, + seed=seed, + data_format=data_format, + save_to_dir=save_to_dir, + save_prefix=save_prefix, + save_format=save_format, + subset=subset, + **kwargs) + + +@tf_export('keras.preprocessing.image.ImageDataGenerator') +class ImageDataGenerator(image.ImageDataGenerator): + """Generate batches of tensor image data with real-time data augmentation. + + The data will be looped over (in batches). + + Arguments: + featurewise_center: Boolean. + Set input mean to 0 over the dataset, feature-wise. + samplewise_center: Boolean. Set each sample mean to 0. + featurewise_std_normalization: Boolean. + Divide inputs by std of the dataset, feature-wise. + samplewise_std_normalization: Boolean. Divide each input by its std. + zca_epsilon: epsilon for ZCA whitening. Default is 1e-6. + zca_whitening: Boolean. Apply ZCA whitening. + rotation_range: Int. Degree range for random rotations. + width_shift_range: Float, 1-D array-like or int + - float: fraction of total width, if < 1, or pixels if >= 1. + - 1-D array-like: random elements from the array. + - int: integer number of pixels from interval + `(-width_shift_range, +width_shift_range)` + - With `width_shift_range=2` possible values + are integers `[-1, 0, +1]`, + same as with `width_shift_range=[-1, 0, +1]`, + while with `width_shift_range=1.0` possible values are floats + in the interval [-1.0, +1.0). 
+ height_shift_range: Float, 1-D array-like or int + - float: fraction of total height, if < 1, or pixels if >= 1. + - 1-D array-like: random elements from the array. + - int: integer number of pixels from interval + `(-height_shift_range, +height_shift_range)` + - With `height_shift_range=2` possible values + are integers `[-1, 0, +1]`, + same as with `height_shift_range=[-1, 0, +1]`, + while with `height_shift_range=1.0` possible values are floats + in the interval [-1.0, +1.0). + brightness_range: Tuple or list of two floats. Range for picking + a brightness shift value from. + shear_range: Float. Shear Intensity + (Shear angle in counter-clockwise direction in degrees) + zoom_range: Float or [lower, upper]. Range for random zoom. + If a float, `[lower, upper] = [1-zoom_range, 1+zoom_range]`. + channel_shift_range: Float. Range for random channel shifts. + fill_mode: One of {"constant", "nearest", "reflect" or "wrap"}. + Default is 'nearest'. + Points outside the boundaries of the input are filled + according to the given mode: + - 'constant': kkkkkkkk|abcd|kkkkkkkk (cval=k) + - 'nearest': aaaaaaaa|abcd|dddddddd + - 'reflect': abcddcba|abcd|dcbaabcd + - 'wrap': abcdabcd|abcd|abcdabcd + cval: Float or Int. + Value used for points outside the boundaries + when `fill_mode = "constant"`. + horizontal_flip: Boolean. Randomly flip inputs horizontally. + vertical_flip: Boolean. Randomly flip inputs vertically. + rescale: rescaling factor. Defaults to None. + If None or 0, no rescaling is applied, + otherwise we multiply the data by the value provided + (after applying all other transformations). + preprocessing_function: function that will be implied on each input. + The function will run after the image is resized and augmented. + The function should take one argument: + one image (Numpy tensor with rank 3), + and should output a Numpy tensor with the same shape. + data_format: Image data format, + either "channels_first" or "channels_last". 
+ "channels_last" mode means that the images should have shape + `(samples, height, width, channels)`, + "channels_first" mode means that the images should have shape + `(samples, channels, height, width)`. + It defaults to the `image_data_format` value found in your + Keras config file at `~/.keras/keras.json`. + If you never set it, then it will be "channels_last". + validation_split: Float. Fraction of images reserved for validation + (strictly between 0 and 1). + dtype: Dtype to use for the generated arrays. + + Examples: + + Example of using `.flow(x, y)`: + + ```python + (x_train, y_train), (x_test, y_test) = cifar10.load_data() + y_train = np_utils.to_categorical(y_train, num_classes) + y_test = np_utils.to_categorical(y_test, num_classes) + datagen = ImageDataGenerator( + featurewise_center=True, + featurewise_std_normalization=True, + rotation_range=20, + width_shift_range=0.2, + height_shift_range=0.2, + horizontal_flip=True) + # compute quantities required for featurewise normalization + # (std, mean, and principal components if ZCA whitening is applied) + datagen.fit(x_train) + # fits the model on batches with real-time data augmentation: + model.fit_generator(datagen.flow(x_train, y_train, batch_size=32), + steps_per_epoch=len(x_train) / 32, epochs=epochs) + # here's a more "manual" example + for e in range(epochs): + print('Epoch', e) + batches = 0 + for x_batch, y_batch in datagen.flow(x_train, y_train, batch_size=32): + model.fit(x_batch, y_batch) + batches += 1 + if batches >= len(x_train) / 32: + # we need to break the loop by hand because + # the generator loops indefinitely + break + ``` + + Example of using `.flow_from_directory(directory)`: + + ```python + train_datagen = ImageDataGenerator( + rescale=1./255, + shear_range=0.2, + zoom_range=0.2, + horizontal_flip=True) + test_datagen = ImageDataGenerator(rescale=1./255) + train_generator = train_datagen.flow_from_directory( + 'data/train', + target_size=(150, 150), + batch_size=32, + 
class_mode='binary') + validation_generator = test_datagen.flow_from_directory( + 'data/validation', + target_size=(150, 150), + batch_size=32, + class_mode='binary') + model.fit_generator( + train_generator, + steps_per_epoch=2000, + epochs=50, + validation_data=validation_generator, + validation_steps=800) + ``` + + Example of transforming images and masks together. + + ```python + # we create two instances with the same arguments + data_gen_args = dict(featurewise_center=True, + featurewise_std_normalization=True, + rotation_range=90, + width_shift_range=0.1, + height_shift_range=0.1, + zoom_range=0.2) + image_datagen = ImageDataGenerator(**data_gen_args) + mask_datagen = ImageDataGenerator(**data_gen_args) + # Provide the same seed and keyword arguments to the fit and flow methods + seed = 1 + image_datagen.fit(images, augment=True, seed=seed) + mask_datagen.fit(masks, augment=True, seed=seed) + image_generator = image_datagen.flow_from_directory( + 'data/images', + class_mode=None, + seed=seed) + mask_generator = mask_datagen.flow_from_directory( + 'data/masks', + class_mode=None, + seed=seed) + # combine generators into one which yields image and masks + train_generator = zip(image_generator, mask_generator) + model.fit_generator( + train_generator, + steps_per_epoch=2000, + epochs=50) + ``` + """ + + def __init__(self, + featurewise_center=False, + samplewise_center=False, + featurewise_std_normalization=False, + samplewise_std_normalization=False, + zca_whitening=False, + zca_epsilon=1e-6, + rotation_range=0, + width_shift_range=0., + height_shift_range=0., + brightness_range=None, + shear_range=0., + zoom_range=0., + channel_shift_range=0., + fill_mode='nearest', + cval=0., + horizontal_flip=False, + vertical_flip=False, + rescale=None, + preprocessing_function=None, + data_format=None, + validation_split=0.0, + dtype=None): + if data_format is None: + data_format = backend.image_data_format() + kwargs = {} + if 'dtype' in tf_inspect.getfullargspec( + 
image.ImageDataGenerator.__init__)[0]: + if dtype is None: + dtype = backend.floatx() + kwargs['dtype'] = dtype + super(ImageDataGenerator, self).__init__( + featurewise_center=featurewise_center, + samplewise_center=samplewise_center, + featurewise_std_normalization=featurewise_std_normalization, + samplewise_std_normalization=samplewise_std_normalization, + zca_whitening=zca_whitening, + zca_epsilon=zca_epsilon, + rotation_range=rotation_range, + width_shift_range=width_shift_range, + height_shift_range=height_shift_range, + brightness_range=brightness_range, + shear_range=shear_range, + zoom_range=zoom_range, + channel_shift_range=channel_shift_range, + fill_mode=fill_mode, + cval=cval, + horizontal_flip=horizontal_flip, + vertical_flip=vertical_flip, + rescale=rescale, + preprocessing_function=preprocessing_function, + data_format=data_format, + validation_split=validation_split, + **kwargs) tf_export('keras.preprocessing.image.random_rotation')(random_rotation) tf_export('keras.preprocessing.image.random_shift')(random_shift) @@ -59,11 +530,4 @@ tf_export( tf_export('keras.preprocessing.image.random_brightness')(random_brightness) tf_export( 'keras.preprocessing.image.apply_affine_transform')(apply_affine_transform) -tf_export('keras.preprocessing.image.array_to_img')(array_to_img) -tf_export('keras.preprocessing.image.img_to_array')(img_to_array) -tf_export('keras.preprocessing.image.save_img')(save_img) tf_export('keras.preprocessing.image.load_img')(load_img) -tf_export('keras.preprocessing.image.ImageDataGenerator')(ImageDataGenerator) -tf_export('keras.preprocessing.image.Iterator')(Iterator) -tf_export('keras.preprocessing.image.NumpyArrayIterator')(NumpyArrayIterator) -tf_export('keras.preprocessing.image.DirectoryIterator')(DirectoryIterator) diff --git a/tensorflow/python/keras/preprocessing/sequence.py b/tensorflow/python/keras/preprocessing/sequence.py index 116d3108d9..f014668909 100644 --- a/tensorflow/python/keras/preprocessing/sequence.py +++ 
b/tensorflow/python/keras/preprocessing/sequence.py @@ -21,6 +21,7 @@ from __future__ import print_function from keras_preprocessing import sequence +from tensorflow.python.keras import utils from tensorflow.python.util.tf_export import tf_export pad_sequences = sequence.pad_sequences @@ -28,11 +29,67 @@ make_sampling_table = sequence.make_sampling_table skipgrams = sequence.skipgrams # TODO(fchollet): consider making `_remove_long_seq` public. _remove_long_seq = sequence._remove_long_seq # pylint: disable=protected-access -TimeseriesGenerator = sequence.TimeseriesGenerator + + +@tf_export('keras.preprocessing.sequence.TimeseriesGenerator') +class TimeseriesGenerator(sequence.TimeseriesGenerator, utils.Sequence): + """Utility class for generating batches of temporal data. + This class takes in a sequence of data-points gathered at + equal intervals, along with time series parameters such as + stride, length of history, etc., to produce batches for + training/validation. + # Arguments + data: Indexable generator (such as list or Numpy array) + containing consecutive data points (timesteps). + The data should be at 2D, and axis 0 is expected + to be the time dimension. + targets: Targets corresponding to timesteps in `data`. + It should have same length as `data`. + length: Length of the output sequences (in number of timesteps). + sampling_rate: Period between successive individual timesteps + within sequences. For rate `r`, timesteps + `data[i]`, `data[i-r]`, ... `data[i - length]` + are used for create a sample sequence. + stride: Period between successive output sequences. + For stride `s`, consecutive output samples would + be centered around `data[i]`, `data[i+s]`, `data[i+2*s]`, etc. + start_index: Data points earlier than `start_index` will not be used + in the output sequences. This is useful to reserve part of the + data for test or validation. + end_index: Data points later than `end_index` will not be used + in the output sequences. 
This is useful to reserve part of the + data for test or validation. + shuffle: Whether to shuffle output samples, + or instead draw them in chronological order. + reverse: Boolean: if `true`, timesteps in each output sample will be + in reverse chronological order. + batch_size: Number of timeseries samples in each batch + (except maybe the last one). + # Returns + A [Sequence](/utils/#sequence) instance. + # Examples + ```python + from keras.preprocessing.sequence import TimeseriesGenerator + import numpy as np + data = np.array([[i] for i in range(50)]) + targets = np.array([[i] for i in range(50)]) + data_gen = TimeseriesGenerator(data, targets, + length=10, sampling_rate=2, + batch_size=2) + assert len(data_gen) == 20 + batch_0 = data_gen[0] + x, y = batch_0 + assert np.array_equal(x, + np.array([[[0], [2], [4], [6], [8]], + [[1], [3], [5], [7], [9]]])) + assert np.array_equal(y, + np.array([[10], [11]])) + ``` + """ + pass + tf_export('keras.preprocessing.sequence.pad_sequences')(pad_sequences) tf_export( 'keras.preprocessing.sequence.make_sampling_table')(make_sampling_table) tf_export('keras.preprocessing.sequence.skipgrams')(skipgrams) -tf_export( - 'keras.preprocessing.sequence.TimeseriesGenerator')(TimeseriesGenerator) diff --git a/tensorflow/python/tools/api/generator/api_init_files.bzl b/tensorflow/python/tools/api/generator/api_init_files.bzl index 7001e566ce..64f0469482 100644 --- a/tensorflow/python/tools/api/generator/api_init_files.bzl +++ b/tensorflow/python/tools/api/generator/api_init_files.bzl @@ -25,6 +25,7 @@ TENSORFLOW_API_INIT_FILES = [ "keras/applications/inception_resnet_v2/__init__.py", "keras/applications/inception_v3/__init__.py", "keras/applications/mobilenet/__init__.py", + "keras/applications/mobilenet_v2/__init__.py", "keras/applications/nasnet/__init__.py", "keras/applications/resnet50/__init__.py", "keras/applications/vgg16/__init__.py", diff --git a/tensorflow/python/tools/api/generator/api_init_files_v1.bzl 
b/tensorflow/python/tools/api/generator/api_init_files_v1.bzl index 73d11199d9..bc2f3516d1 100644 --- a/tensorflow/python/tools/api/generator/api_init_files_v1.bzl +++ b/tensorflow/python/tools/api/generator/api_init_files_v1.bzl @@ -25,6 +25,7 @@ TENSORFLOW_API_INIT_FILES_V1 = [ "keras/applications/inception_resnet_v2/__init__.py", "keras/applications/inception_v3/__init__.py", "keras/applications/mobilenet/__init__.py", + "keras/applications/mobilenet_v2/__init__.py", "keras/applications/nasnet/__init__.py", "keras/applications/resnet50/__init__.py", "keras/applications/vgg16/__init__.py", diff --git a/tensorflow/tools/ci_build/Dockerfile.cmake b/tensorflow/tools/ci_build/Dockerfile.cmake index 4587bcf891..b7450c83de 100644 --- a/tensorflow/tools/ci_build/Dockerfile.cmake +++ b/tensorflow/tools/ci_build/Dockerfile.cmake @@ -28,8 +28,8 @@ RUN pip install --upgrade astor RUN pip install --upgrade gast RUN pip install --upgrade numpy RUN pip install --upgrade termcolor -RUN pip install keras_applications==1.0.4 -RUN pip install keras_preprocessing==1.0.2 +RUN pip install keras_applications==1.0.5 +RUN pip install keras_preprocessing==1.0.3 # Install golang RUN apt-get install -t xenial-backports -y golang-1.9 diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh index bb316ecfc9..af478eded4 100755 --- a/tensorflow/tools/ci_build/install/install_pip_packages.sh +++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh @@ -115,10 +115,10 @@ pip2 install --upgrade setuptools==39.1.0 pip3 install --upgrade setuptools==39.1.0 # Keras -pip2 install keras_applications==1.0.4 --no-deps -pip3 install keras_applications==1.0.4 --no-deps -pip2 install keras_preprocessing==1.0.2 --no-deps -pip3 install keras_preprocessing==1.0.2 --no-deps +pip2 install keras_applications==1.0.5 --no-deps +pip3 install keras_applications==1.0.5 --no-deps +pip2 install keras_preprocessing==1.0.3 --no-deps +pip3 
install keras_preprocessing==1.0.3 --no-deps # Install last working version of setuptools. pip2 install --upgrade setuptools==39.1.0 diff --git a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh index 15e4396ce3..93ea0c3db6 100755 --- a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh +++ b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh @@ -85,8 +85,8 @@ pip3.5 install --upgrade termcolor pip3.5 install --upgrade setuptools==39.1.0 # Keras -pip3.5 install keras_applications==1.0.4 -pip3.5 install keras_preprocessing==1.0.2 +pip3.5 install keras_applications==1.0.5 +pip3.5 install keras_preprocessing==1.0.3 # Install last working version of setuptools. pip3.5 install --upgrade setuptools==39.1.0 diff --git a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh index 0fc3eee71c..7a9eef7c64 100755 --- a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh +++ b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh @@ -101,7 +101,7 @@ pip3 install --upgrade termcolor pip3 install --upgrade setuptools==39.1.0 # Keras -pip3 install keras_applications==1.0.4 -pip3 install keras_preprocessing==1.0.2 +pip3 install keras_applications==1.0.5 +pip3 install keras_preprocessing==1.0.3 # LINT.ThenChange(//tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh) diff --git a/tensorflow/tools/docker/Dockerfile b/tensorflow/tools/docker/Dockerfile index 2c31d784e5..0114ef9dbf 100644 --- a/tensorflow/tools/docker/Dockerfile +++ b/tensorflow/tools/docker/Dockerfile @@ -29,8 +29,8 @@ RUN pip --no-cache-dir install \ h5py \ ipykernel \ jupyter \ - keras_applications==1.0.4 \ - keras_preprocessing==1.0.2 \ + keras_applications==1.0.5 \ + keras_preprocessing==1.0.3 \ matplotlib \ numpy==1.14.5 \ pandas \ diff --git 
a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel index bacdea72ce..aec5ca965e 100644 --- a/tensorflow/tools/docker/Dockerfile.devel +++ b/tensorflow/tools/docker/Dockerfile.devel @@ -33,8 +33,8 @@ RUN pip --no-cache-dir install \ h5py \ ipykernel \ jupyter \ - keras_applications==1.0.4 \ - keras_preprocessing==1.0.2 \ + keras_applications==1.0.5 \ + keras_preprocessing==1.0.3 \ matplotlib \ mock \ numpy==1.14.5 \ diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu index 4f89e3f701..ba421d9978 100644 --- a/tensorflow/tools/docker/Dockerfile.devel-gpu +++ b/tensorflow/tools/docker/Dockerfile.devel-gpu @@ -49,8 +49,8 @@ RUN pip --no-cache-dir install \ h5py \ ipykernel \ jupyter \ - keras_applications==1.0.4 \ - keras_preprocessing==1.0.2 \ + keras_applications==1.0.5 \ + keras_preprocessing==1.0.3 \ matplotlib \ mock \ numpy==1.14.5 \ diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu-cuda9-cudnn7 b/tensorflow/tools/docker/Dockerfile.devel-gpu-cuda9-cudnn7 index 056b4755f4..eb139ec5f8 100644 --- a/tensorflow/tools/docker/Dockerfile.devel-gpu-cuda9-cudnn7 +++ b/tensorflow/tools/docker/Dockerfile.devel-gpu-cuda9-cudnn7 @@ -37,8 +37,8 @@ RUN pip --no-cache-dir install --upgrade \ RUN pip --no-cache-dir install \ ipykernel \ jupyter \ - keras_applications==1.0.4 \ - keras_preprocessing==1.0.2 \ + keras_applications==1.0.5 \ + keras_preprocessing==1.0.3 \ matplotlib \ numpy \ scipy \ diff --git a/tensorflow/tools/docker/Dockerfile.devel-mkl b/tensorflow/tools/docker/Dockerfile.devel-mkl index 2df770e525..371451d2aa 100755 --- a/tensorflow/tools/docker/Dockerfile.devel-mkl +++ b/tensorflow/tools/docker/Dockerfile.devel-mkl @@ -52,8 +52,8 @@ RUN ${PIP} --no-cache-dir install \ h5py \ ipykernel \ jupyter \ - keras_applications==1.0.4 \ - keras_preprocessing==1.0.2 \ + keras_applications==1.0.5 \ + keras_preprocessing==1.0.3 \ matplotlib \ mock \ numpy \ diff --git 
a/tensorflow/tools/docker/Dockerfile.devel-mkl-horovod b/tensorflow/tools/docker/Dockerfile.devel-mkl-horovod index ab2eec1728..987b582d10 100755 --- a/tensorflow/tools/docker/Dockerfile.devel-mkl-horovod +++ b/tensorflow/tools/docker/Dockerfile.devel-mkl-horovod @@ -45,8 +45,8 @@ RUN ${PIP} --no-cache-dir install \ h5py \ ipykernel \ jupyter \ - keras_applications==1.0.4 \ - keras_preprocessing==1.0.2 \ + keras_applications==1.0.5 \ + keras_preprocessing==1.0.3 \ matplotlib \ mock \ numpy \ diff --git a/tensorflow/tools/docker/Dockerfile.gpu b/tensorflow/tools/docker/Dockerfile.gpu index aa0e0face1..806b8836c7 100644 --- a/tensorflow/tools/docker/Dockerfile.gpu +++ b/tensorflow/tools/docker/Dockerfile.gpu @@ -37,8 +37,8 @@ RUN pip --no-cache-dir install \ h5py \ ipykernel \ jupyter \ - keras_applications==1.0.4 \ - keras_preprocessing==1.0.2 \ + keras_applications==1.0.5 \ + keras_preprocessing==1.0.3 \ matplotlib \ numpy==1.14.5 \ pandas \ diff --git a/tensorflow/tools/docker/Dockerfile.mkl b/tensorflow/tools/docker/Dockerfile.mkl index 69553302d8..641c9e3b16 100755 --- a/tensorflow/tools/docker/Dockerfile.mkl +++ b/tensorflow/tools/docker/Dockerfile.mkl @@ -38,8 +38,8 @@ RUN ${PIP} --no-cache-dir install \ h5py \ ipykernel \ jupyter \ - keras_applications==1.0.4 \ - keras_preprocessing==1.0.2 \ + keras_applications==1.0.5 \ + keras_preprocessing==1.0.3 \ matplotlib \ numpy \ pandas \ diff --git a/tensorflow/tools/docker/Dockerfile.mkl-horovod b/tensorflow/tools/docker/Dockerfile.mkl-horovod index 756716ee0e..2b11679f54 100755 --- a/tensorflow/tools/docker/Dockerfile.mkl-horovod +++ b/tensorflow/tools/docker/Dockerfile.mkl-horovod @@ -38,8 +38,8 @@ RUN ${PIP} --no-cache-dir install \ h5py \ ipykernel \ jupyter \ - keras_applications==1.0.4 \ - keras_preprocessing==1.0.2 \ + keras_applications==1.0.5 \ + keras_preprocessing==1.0.3 \ matplotlib \ numpy \ pandas \ diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py index 
5e179079c5..8cefbef82d 100644 --- a/tensorflow/tools/pip_package/setup.py +++ b/tensorflow/tools/pip_package/setup.py @@ -51,8 +51,8 @@ REQUIRED_PACKAGES = [ 'absl-py >= 0.1.6', 'astor >= 0.6.0', 'gast >= 0.2.0', - 'keras_applications == 1.0.4', - 'keras_preprocessing == 1.0.2', + 'keras_applications >= 1.0.5', + 'keras_preprocessing >= 1.0.3', 'numpy >= 1.13.3, <= 1.14.5', 'six >= 1.10.0', 'protobuf >= 3.6.0', -- GitLab From 58615322fd1fcdfffc78f1f57e284549f6f6ed6c Mon Sep 17 00:00:00 2001 From: Allen Lavoie Date: Fri, 24 Aug 2018 16:17:28 -0700 Subject: [PATCH 116/598] De-flake checkpoint_management_test Modifies the clock by an epsilon to fix creation and instant reloads of CheckpointState. PiperOrigin-RevId: 210175050 --- tensorflow/python/training/checkpoint_management.py | 4 +++- tensorflow/python/training/checkpoint_management_test.py | 3 --- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/training/checkpoint_management.py b/tensorflow/python/training/checkpoint_management.py index b7aa8264b0..38910fb246 100644 --- a/tensorflow/python/training/checkpoint_management.py +++ b/tensorflow/python/training/checkpoint_management.py @@ -538,7 +538,9 @@ class CheckpointManager(object): self._maybe_delete = collections.OrderedDict() if recovered_state is None: self._latest_checkpoint = None - self._last_preserved_timestamp = current_clock + # Set the clock back slightly to avoid race conditions when quckly + # re-creating a CheckpointManager. + self._last_preserved_timestamp = current_clock - 1. 
else: self._latest_checkpoint = recovered_state.model_checkpoint_path self._last_preserved_timestamp = recovered_state.last_preserved_timestamp diff --git a/tensorflow/python/training/checkpoint_management_test.py b/tensorflow/python/training/checkpoint_management_test.py index d7162265e6..8ef5048299 100644 --- a/tensorflow/python/training/checkpoint_management_test.py +++ b/tensorflow/python/training/checkpoint_management_test.py @@ -389,8 +389,6 @@ class CheckpointManagerTest(test.TestCase): mock_time.time.return_value = first_time first_manager.save() state = checkpoint_management.get_checkpoint_state(directory) - self.assertEqual([first_time], state.all_model_checkpoint_timestamps) - self.assertEqual(3., state.last_preserved_timestamp) second_time = first_time + 3610. second_name = os.path.join(directory, "ckpt-2") mock_time.time.return_value = second_time @@ -398,7 +396,6 @@ class CheckpointManagerTest(test.TestCase): state = checkpoint_management.get_checkpoint_state(directory) self.assertEqual([first_time, second_time], state.all_model_checkpoint_timestamps) - self.assertEqual(3., state.last_preserved_timestamp) self.assertEqual([first_name, second_name], first_manager.checkpoints) self.assertEqual(second_name, first_manager.latest_checkpoint) del first_manager -- GitLab From 2bc4f495f7bf315356dc61d45a22222933c9a49e Mon Sep 17 00:00:00 2001 From: Pavithra Vijay Date: Fri, 24 Aug 2018 16:36:39 -0700 Subject: [PATCH 117/598] Fix error when getting optimizer variables with distribution strategy - add `_in_graph_mode` property to DistributedVariable PiperOrigin-RevId: 210177702 --- tensorflow/contrib/distribute/python/examples/keras_mnist.py | 3 +-- tensorflow/contrib/distribute/python/values.py | 4 ++++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tensorflow/contrib/distribute/python/examples/keras_mnist.py b/tensorflow/contrib/distribute/python/examples/keras_mnist.py index e33e6fdeb8..a20069c4fe 100644 --- 
a/tensorflow/contrib/distribute/python/examples/keras_mnist.py +++ b/tensorflow/contrib/distribute/python/examples/keras_mnist.py @@ -109,8 +109,7 @@ def main(_): # `distribute` argument. `fit`, `evaluate` and `predict` will be distributed # based on the strategy instantiated. model.compile(loss=tf.keras.losses.categorical_crossentropy, - optimizer=tf.train.GradientDescentOptimizer( - learning_rate=0.001), + optimizer=tf.train.RMSPropOptimizer(learning_rate=0.001), metrics=['accuracy'], distribute=strategy) diff --git a/tensorflow/contrib/distribute/python/values.py b/tensorflow/contrib/distribute/python/values.py index e73d9c193e..3ccaa2690e 100644 --- a/tensorflow/contrib/distribute/python/values.py +++ b/tensorflow/contrib/distribute/python/values.py @@ -304,6 +304,10 @@ class DistributedVariable(DistributedDelegate): self._primary_var.op.type) return self.get().op + @property + def _in_graph_mode(self): + return self._primary_var._in_graph_mode # pylint: disable=protected-access + def read_value(self): return distribution_strategy_context.get_distribution_strategy().read_var( self) -- GitLab From 88dacea6db2e53a3f5f14c44e8b9d56905fa9d0e Mon Sep 17 00:00:00 2001 From: Michael Kuperstein Date: Fri, 24 Aug 2018 16:37:13 -0700 Subject: [PATCH 118/598] [XLA] Use literal hashes in CSE, and make checking same-shaped literals fast. Previously all literals would hash to the same value, which would cause a lot of expensive literal equality checks. 
PiperOrigin-RevId: 210177790 --- tensorflow/compiler/xla/literal.cc | 6 ++++++ tensorflow/compiler/xla/service/hlo_cse.cc | 4 ++++ 2 files changed, 10 insertions(+) diff --git a/tensorflow/compiler/xla/literal.cc b/tensorflow/compiler/xla/literal.cc index 30b890737b..0c0b619d50 100644 --- a/tensorflow/compiler/xla/literal.cc +++ b/tensorflow/compiler/xla/literal.cc @@ -1434,6 +1434,12 @@ bool LiteralBase::Piece::EqualElementsInternal( bool LiteralBase::Piece::EqualElements(const LiteralBase::Piece& other) const { DCHECK(ShapeUtil::Compatible(subshape(), other.subshape())); + if (ShapeUtil::Equal(subshape(), other.subshape()) && + LayoutUtil::IsDenseArray(subshape())) { + CHECK_EQ(size_bytes(), other.size_bytes()); + return memcmp(buffer(), other.buffer(), size_bytes()) == 0; + } + std::vector multi_index; switch (subshape().element_type()) { case PRED: diff --git a/tensorflow/compiler/xla/service/hlo_cse.cc b/tensorflow/compiler/xla/service/hlo_cse.cc index 06484f4012..cb367adf5e 100644 --- a/tensorflow/compiler/xla/service/hlo_cse.cc +++ b/tensorflow/compiler/xla/service/hlo_cse.cc @@ -35,6 +35,7 @@ limitations under the License. #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/gtl/flatset.h" #include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/lib/hash/hash.h" namespace xla { @@ -103,6 +104,9 @@ int64 CseHash(const HloInstruction* instruction) { for (auto operand : instruction->operands()) { hash = tensorflow::Hash64Combine(hash, operand->unique_id()); } + if (instruction->opcode() == HloOpcode::kConstant) { + hash = tensorflow::Hash64Combine(hash, instruction->literal().Hash()); + } return hash; } -- GitLab From 829b6691f905e17641840e83b3941cadcc7a2463 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 24 Aug 2018 16:55:35 -0700 Subject: [PATCH 119/598] Deprecate C++ kernel for matrix exponential, which is now implemented as a python function. 
PiperOrigin-RevId: 210180168 --- .../base_api/api_def_MatrixExponential.pbtxt | 31 ++----------------- .../core/kernels/matrix_exponential_op.cc | 1 + tensorflow/core/ops/linalg_ops.cc | 2 ++ tensorflow/core/public/version.h | 4 ++- tensorflow/python/kernel_tests/BUILD | 2 +- .../kernel_tests/matrix_logarithm_op_test.py | 30 ++++++++++++------ 6 files changed, 29 insertions(+), 41 deletions(-) diff --git a/tensorflow/core/api_def/base_api/api_def_MatrixExponential.pbtxt b/tensorflow/core/api_def/base_api/api_def_MatrixExponential.pbtxt index d7b56aec87..46da1de1c3 100644 --- a/tensorflow/core/api_def/base_api/api_def_MatrixExponential.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_MatrixExponential.pbtxt @@ -1,32 +1,5 @@ op { graph_op_name: "MatrixExponential" - in_arg { - name: "input" - description: < { TF_DISALLOW_COPY_AND_ASSIGN(MatrixExponentialOp); }; +// Deprecated kernels (2018/08/21). REGISTER_LINALG_OP("MatrixExponential", (MatrixExponentialOp), float); REGISTER_LINALG_OP("MatrixExponential", (MatrixExponentialOp), double); REGISTER_LINALG_OP("MatrixExponential", (MatrixExponentialOp), diff --git a/tensorflow/core/ops/linalg_ops.cc b/tensorflow/core/ops/linalg_ops.cc index f37f79ddbf..1d4d51a25d 100644 --- a/tensorflow/core/ops/linalg_ops.cc +++ b/tensorflow/core/ops/linalg_ops.cc @@ -235,6 +235,8 @@ REGISTER_OP("MatrixInverse") .SetShapeFn(BatchUnchangedSquareShapeFn); REGISTER_OP("MatrixExponential") + .Deprecated( + 27, "Use Python implementation tf.linalg.matrix_exponential instead.") .Input("input: T") .Output("output: T") .Attr("T: {double, float, complex64, complex128}") diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index 563564119f..4129c93af5 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -96,10 +96,12 @@ limitations under the License. // GraphDef. (7dec2017) // 27. Deprecate TensorArray ops v2 in favor of v3 and deprecated io_ops // deprecated in favor of V2 ops. 
(2018/01/23) +// 28. Deprecate MatrixExponential op in favor of Python implementation. +// (2018/08/21). #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 26 +#define TF_GRAPH_DEF_VERSION 27 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD index b9c5f26cb7..a9982a7ae0 100644 --- a/tensorflow/python/kernel_tests/BUILD +++ b/tensorflow/python/kernel_tests/BUILD @@ -601,7 +601,7 @@ tf_py_test( tf_py_test( name = "matrix_logarithm_op_test", - size = "small", + size = "medium", srcs = ["matrix_logarithm_op_test.py"], additional_deps = [ "//third_party/py/numpy", diff --git a/tensorflow/python/kernel_tests/matrix_logarithm_op_test.py b/tensorflow/python/kernel_tests/matrix_logarithm_op_test.py index 24edc4f59f..723a15fbd1 100644 --- a/tensorflow/python/kernel_tests/matrix_logarithm_op_test.py +++ b/tensorflow/python/kernel_tests/matrix_logarithm_op_test.py @@ -30,6 +30,7 @@ from tensorflow.python.ops import gen_linalg_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import random_ops from tensorflow.python.ops import variables +from tensorflow.python.ops.linalg import linalg_impl from tensorflow.python.platform import test @@ -39,7 +40,7 @@ class LogarithmOpTest(test.TestCase): inp = x.astype(np_type) with self.test_session(use_gpu=True): # Verify that expm(logm(A)) == A. 
- tf_ans = gen_linalg_ops.matrix_exponential( + tf_ans = linalg_impl.matrix_exponential( gen_linalg_ops.matrix_logarithm(inp)) out = tf_ans.eval() self.assertAllClose(inp, out, rtol=1e-4, atol=1e-3) @@ -98,16 +99,25 @@ class LogarithmOpTest(test.TestCase): self._verifyLogarithmComplex(np.empty([0, 2, 2], dtype=np.complex64)) self._verifyLogarithmComplex(np.empty([2, 0, 0], dtype=np.complex64)) - def testRandomSmallAndLarge(self): + def testRandomSmallAndLargeComplex64(self): np.random.seed(42) - for dtype in np.complex64, np.complex128: - for batch_dims in [(), (1,), (3,), (2, 2)]: - for size in 8, 31, 32: - shape = batch_dims + (size, size) - matrix = np.random.uniform( - low=-1.0, high=1.0, - size=np.prod(shape)).reshape(shape).astype(dtype) - self._verifyLogarithmComplex(matrix) + for batch_dims in [(), (1,), (3,), (2, 2)]: + for size in 8, 31, 32: + shape = batch_dims + (size, size) + matrix = np.random.uniform( + low=-1.0, high=1.0, + size=np.prod(shape)).reshape(shape).astype(np.complex64) + self._verifyLogarithmComplex(matrix) + + def testRandomSmallAndLargeComplex128(self): + np.random.seed(42) + for batch_dims in [(), (1,), (3,), (2, 2)]: + for size in 8, 31, 32: + shape = batch_dims + (size, size) + matrix = np.random.uniform( + low=-1.0, high=1.0, + size=np.prod(shape)).reshape(shape).astype(np.complex128) + self._verifyLogarithmComplex(matrix) def testConcurrentExecutesWithoutError(self): with self.test_session(use_gpu=True) as sess: -- GitLab From 392ee7aa0b6557ffb1b9261328dd4f533de700d0 Mon Sep 17 00:00:00 2001 From: Shashi Shekhar Date: Fri, 24 Aug 2018 16:56:18 -0700 Subject: [PATCH 120/598] Fix links in docs. 
PiperOrigin-RevId: 210180247 --- tensorflow/contrib/lite/tools/accuracy/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/contrib/lite/tools/accuracy/README.md b/tensorflow/contrib/lite/tools/accuracy/README.md index ad28fc3c70..769ef201d2 100644 --- a/tensorflow/contrib/lite/tools/accuracy/README.md +++ b/tensorflow/contrib/lite/tools/accuracy/README.md @@ -29,7 +29,7 @@ Tensor ground_truth = ... read ground truth for the model ... TF_CHECK_OK(eval_pipeline.Run(input1, ground_truth1)); ``` For further examples, check the usage in [imagenet accuracy evaluation binary] -(ilsvrc/imagenet_accuracy_eval.cc) +(https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_accuracy_eval.cc) ## Measuring accuracy of published models. @@ -37,4 +37,4 @@ For further examples, check the usage in [imagenet accuracy evaluation binary] For measuring accuracy for [ILSVRC 2012 image classification task] (http://www.image-net.org/challenges/LSVRC/2012/), the binary can be built using these -[instructions](accuracy/ilsvrc/) +[instructions.](ilsvrc/) -- GitLab From f1b0140b311e0b888bfe8373716f252dfb5e0439 Mon Sep 17 00:00:00 2001 From: Akshay Agrawal Date: Fri, 24 Aug 2018 16:57:22 -0700 Subject: [PATCH 121/598] Clean up the Python function API. Also adds an internal API for constructing graph functions with an input signature that bypasses PolymorphicFunction. 
PiperOrigin-RevId: 210180397 --- tensorflow/python/eager/function.py | 286 +++++++++--------- tensorflow/python/eager/function_test.py | 101 ++++--- .../framework/function_def_to_graph_test.py | 6 +- 3 files changed, 195 insertions(+), 198 deletions(-) diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py index 3171ef9d62..9dc5648861 100644 --- a/tensorflow/python/eager/function.py +++ b/tensorflow/python/eager/function.py @@ -449,15 +449,15 @@ def _flatten(sequence): return outputs -class GraphCallable(object): +class Function(object): """Callable object encapsulating a function definition and its gradient. - `GraphCallable` is a callable that encapsulates a function definition and + `Function` is a callable that encapsulates a function definition and is differentiable under `tf.GradientTape` objects. """ def __init__(self, func_graph, attrs=None): - """Initialize a GraphCallable. + """Initialize a Function. Args: func_graph: An instance of FuncGraph: the function body to wrap. @@ -481,7 +481,7 @@ class GraphCallable(object): self._inference_function = _EagerDefinedFunction( _inference_name(self._func_graph.name), self._func_graph, self._func_graph.inputs, self._func_graph.outputs, self._attrs) - self._backward_graph_callable = None + self._backward_graph_function = None # Map holding distributed variables, keyed by resource handle tensors. 
self._distributed_variables = {} @@ -494,14 +494,94 @@ class GraphCallable(object): for component_variable in component_variables: self._distributed_variables[component_variable.handle] = variable + def __call__(self, *args): + """Executes the wrapped function.""" + ctx = context.context() + device_functions = _get_device_functions(ctx, ops.get_default_graph()) + if device_functions != self._device_functions: + raise ValueError( + "The current device stack does not match the device stack under " + "which the TensorFlow function '%s' was created.\n" + "Current device stack: %s\n%s device stack: %s" % + (self._inference_function.name, device_functions, + self._inference_function.name, self._device_functions)) + + for v in self._func_graph.variables: + if v.trainable: + tape.watch_variable(v) + + captures = self._resolve_captured_inputs() + tensor_inputs = [x for x in nest.flatten(args) if isinstance(x, ops.Tensor)] + args = tensor_inputs + captures + + if tape.should_record(tensor_inputs) or tape.should_record(captures): + return self._backprop_call(args) + + outputs = self._inference_function.call(ctx, args) + return self._build_call_outputs(outputs) + @property def graph(self): + """Returns the graph from which this function was constructed.""" return self._func_graph @property def variables(self): + """Returns all variables touched by this function.""" return self._func_graph.variables + @property + def inputs(self): + """Returns tensors in `self.graph` corresponding to arguments.""" + return self._func_graph.inputs + + @property + def outputs(self): + """Returns tensors in `self.graph` corresponding to return values.""" + return self._func_graph.outputs + + @property + def captured_inputs(self): + """Returns external Tensors captured by this function. + + self.__call__(*args) passes `args + self.captured_inputs` to the function. 
+ """ + return self._captured_inputs + + @property + def function_def(self): + """Returns a `FunctionDef` object representing this function.""" + return self._inference_function.definition + + @property + def output_shapes(self): + """The function's output shapes.""" + # TODO(ebrevdo): Should we only keep the output shapes associated + # with len(self._python_returns) outputs? + # TODO(akshayka): Consider removing this. + outputs_list = nest.flatten(self._func_graph.structured_outputs) + j = 0 + for i, o in enumerate(outputs_list): + if o is not None: + if isinstance(o, ops.IndexedSlices): + # Extract the shape of the `IndexedSlices` object's `values` field. + outputs_list[i] = self._output_shapes[j] # the `values` shape + if o.dense_shape is not None: + j += 3 # skip over shapes for `values`, `indices`, `dense_shape` + else: + j += 2 # skip over shapes for `values`, `indices` + else: + outputs_list[i] = self._output_shapes[j] + j += 1 + return nest.pack_sequence_as(self._func_graph.structured_outputs, + outputs_list) + + @property + def output_dtypes(self): + # TODO(akshayka): Consider removing this. + return nest.map_structure(lambda x: x.dtype if x is not None else None, + self._func_graph.structured_outputs) + def _construct_backprop_function(self): """Constructs the backprop function object for this function.""" backwards_graph = FuncGraph(_backward_name(self._func_graph.name)) @@ -522,7 +602,7 @@ class GraphCallable(object): self._attrs) # The ordering of `backwards_graph.inputs` is important: inputs of - # `self._backward_graph_callable` correspond to outputs of + # `self._backward_graph_function` correspond to outputs of # `self._forward_function`. 
backwards_graph.inputs = gradients_wrt_outputs + list( backwards_graph.captures.values()) @@ -531,7 +611,7 @@ class GraphCallable(object): backwards_graph.outputs.extend( grad for grad in _flatten(gradients_wrt_inputs) if grad is not None) backwards_graph.structured_outputs = gradients_wrt_inputs - self._backward_graph_callable = GraphCallable( + self._backward_graph_function = Function( backwards_graph, attrs=self._attrs) def _backprop_call(self, args): @@ -545,7 +625,7 @@ class GraphCallable(object): Returns: The call output. """ - if self._backward_graph_callable is None: + if self._backward_graph_function is None: self._construct_backprop_function() ctx = context.context() @@ -560,49 +640,12 @@ class GraphCallable(object): side_outputs = outputs[self._num_outputs:] def backward_function(*args): - return self._backward_graph_callable(*(list(args) + side_outputs)) # pylint: disable=not-callable + return self._backward_graph_function(*(list(args) + side_outputs)) # pylint: disable=not-callable tape.record_operation(self._forward_function.signature.name, real_outputs, args, backward_function) return self._build_call_outputs(real_outputs) - @property - def output_shapes(self): - """The function's output shapes.""" - # TODO(ebrevdo): Should we only keep the output shapes associated - # with len(self._python_returns) outputs? - outputs_list = nest.flatten(self._func_graph.structured_outputs) - j = 0 - for i, o in enumerate(outputs_list): - if o is not None: - if isinstance(o, ops.IndexedSlices): - # Extract the shape of the `IndexedSlices` object's `values` field. 
- outputs_list[i] = self._output_shapes[j] # the `values` shape - if o.dense_shape is not None: - j += 3 # skip over shapes for `values`, `indices`, `dense_shape` - else: - j += 2 # skip over shapes for `values`, `indices` - else: - outputs_list[i] = self._output_shapes[j] - j += 1 - return nest.pack_sequence_as(self._func_graph.structured_outputs, - outputs_list) - - @property - def output_dtypes(self): - return nest.map_structure(lambda x: x.dtype if x is not None else None, - self._func_graph.structured_outputs) - - @property - def captured_inputs(self): - # TODO(akshayka): Should this return `_resolve_captured_inputs()`? - return self._captured_inputs - - @property - def name(self): - """Returns the name of the function in Eager-compatible format.""" - return self._inference_function.name.encode("utf-8") - def _resolve_captured_inputs(self): """Resolve captured distributed variables to their current values. @@ -629,32 +672,6 @@ class GraphCallable(object): return resolved_captured_inputs return self._captured_inputs - def __call__(self, *args): - """Executes the wrapped function.""" - ctx = context.context() - device_functions = _get_device_functions(ctx, ops.get_default_graph()) - if device_functions != self._device_functions: - raise ValueError( - "The current device stack does not match the device stack under " - "which the TensorFlow function '%s' was created.\n" - "Current device stack: %s\n%s device stack: %s" % - (self._inference_function.name, device_functions, - self._inference_function.name, self._device_functions)) - - for v in self._func_graph.variables: - if v.trainable: - tape.watch_variable(v) - - captures = self._resolve_captured_inputs() - tensor_inputs = [x for x in nest.flatten(args) if isinstance(x, ops.Tensor)] - args = tensor_inputs + captures - - if tape.should_record(tensor_inputs) or tape.should_record(captures): - return self._backprop_call(args) - - outputs = self._inference_function.call(ctx, args) - return 
self._build_call_outputs(outputs) - def _build_call_outputs(self, result): """Maps the fdef output list to actual output structure. @@ -868,13 +885,13 @@ def _deterministic_dict_values(dictionary): return tuple(dictionary[key] for key in sorted(dictionary)) -class _PolymorphicFunction(object): +class PolymorphicFunction(object): """Wrapper class for the graph functions defined for a Python function. See the documentation for `defun` for more information on the semantics of defined functions. - _PolymorphicFunction class is thread-compatible meaning that minimal + PolymorphicFunction class is thread-compatible meaning that minimal usage of defuns (defining and calling) is thread-safe, but if users call other methods or invoke the base `python_function` themselves, external synchronization is necessary. @@ -896,9 +913,6 @@ class _PolymorphicFunction(object): Raises: ValueError: if `input_signature` is not None and the `python_function`'s argspec has keyword arguments. - TypeError: if `input_signature` contains anything other than - `TensorSpec` objects, or (if not None) is anything other than a tuple or - list. 
""" if isinstance(python_function, functools.partial): @@ -910,7 +924,7 @@ class _PolymorphicFunction(object): self._args_to_prepend = tuple() self._kwds_to_include = {} self._name = name - self._arguments_to_functions = {} + self._function_cache = collections.OrderedDict() self._variables = [] self._lock = threading.Lock() @@ -945,15 +959,40 @@ class _PolymorphicFunction(object): self._input_signature = tuple(input_signature) self._flat_input_signature = tuple(nest.flatten(input_signature)) - if any(not isinstance(arg, tensor_spec.TensorSpec) - for arg in self._flat_input_signature): - raise TypeError("Invalid input_signature %s; input_signature must be " - "a possibly nested sequence of TensorSpec objects.") + + def __call__(self, *args, **kwds): + """Calls a graph function specialized to the inputs.""" + graph_function, inputs = self._maybe_define_function(*args, **kwds) + return graph_function(*inputs) + + @property + def python_function(self): + """Returns the wrapped Python function.""" + return self._python_function + + # TODO(akshayka): Remove this property. + @property + def variables(self): + """Returns the union of all variables referenced by cached `Function`s`.""" + return self._variables + + def get_concrete_function(self, *args, **kwargs): + """Returns a `Function` object specialized to inputs and execution context. + + `args` and `kwargs` are ignored if this `PolymorphicFunction` was created + with an `input_signature`. + + Args: + *args: inputs to specialize on. + **kwargs: inputs to specialize on. + """ + graph_function, _ = self._maybe_define_function(*args, **kwargs) + return graph_function def __get__(self, instance, owner): """Makes it possible to defun instance methods.""" del owner - # `instance` here is the instance that this `_PolymorphicFunction` was + # `instance` here is the instance that this `PolymorphicFunction` was # accessed through; e.g., for # # class Foo(object): @@ -963,7 +1002,7 @@ class _PolymorphicFunction(object): # ... 
# # foo = Foo() - # foo.bar() # `foo.bar` is a `_PolymorphicFunction` instance + # foo.bar() # `foo.bar` is a `PolymorphicFunction` instance # # then `instance` will be `foo` (and `owner` will be `Foo`). return functools.partial(self.__call__, instance) @@ -992,7 +1031,7 @@ class _PolymorphicFunction(object): Canonicalize the inputs to the Python function using its fullargspec. In particular, we parse the varags and kwargs that this - `_PolymorphicFunction` was called with into a tuple corresponding to the + `PolymorphicFunction` was called with into a tuple corresponding to the Python function's positional (named) arguments and a dictionary corresponding to its kwargs. @@ -1077,33 +1116,26 @@ class _PolymorphicFunction(object): ops.get_default_graph()) with self._lock: try: - graph_function = self._arguments_to_functions.get(cache_key, None) + graph_function = self._function_cache.get(cache_key, None) except TypeError: raise TypeError("Arguments supplied to `defun`-generated functions " "must be hashable.") if graph_function is None: - graph_function = GraphCallable( + graph_function = Function( func_graph_from_py_func(self._name, self._python_function, args, kwds, self._input_signature)) self._variables.extend( [v for v in graph_function.variables if v not in self._variables]) - self._arguments_to_functions[cache_key] = graph_function + self._function_cache[cache_key] = graph_function return graph_function, (args, kwds) - def __call__(self, *args, **kwds): - """Calls a graph function specialized for this input signature.""" - graph_function, inputs = self._maybe_define_function(*args, **kwds) - return graph_function(*inputs) - - def call_python_function(self, *args, **kwargs): - """Directly calls the wrapped python function.""" - return self._python_function(*args, **kwargs) - @property - def variables(self): - """Returns a list of variables used in any of the defined functions.""" - return self._variables +def _validate_signature(signature): + if any(not 
isinstance(arg, tensor_spec.TensorSpec) + for arg in nest.flatten(signature)): + raise TypeError("Invalid input_signature %s; input_signature must be " + "a possibly nested sequence of TensorSpec objects.") def defun(func=None, input_signature=None): @@ -1416,7 +1448,15 @@ def defun(func=None, input_signature=None): function (and return zero or more `tf.Tensor` objects). If `func` is None, returns a decorator that, when invoked with a single `func` argument, returns a callable equivalent to the case above. + + Raises: + TypeError: If `input_signature` is neither `None` nor a sequence of + `tf.contrib.eager.TensorSpec` objects. """ + + if input_signature is not None: + _validate_signature(input_signature) + # TODO(apassos): deal with captured global state. Deal with control flow. def decorated(function): try: @@ -1425,8 +1465,7 @@ def defun(func=None, input_signature=None): name = "function" return tf_decorator.make_decorator( function, - _PolymorphicFunction( - function, name, input_signature=input_signature)) + PolymorphicFunction(function, name, input_signature=input_signature)) # This code path is for the `foo = tfe.defun(foo, ...)` use case if func is not None: @@ -1442,51 +1481,6 @@ def defun(func=None, input_signature=None): return decorated -def make_defun_op(func, *args, **kwds): - """Compile func into graph_mode, assuming func arguments are *args, **kwargs. - - `make_defun_op` converts a function that constructs a TensorFlow graph into - a function object and attaches it to the graph. The resulting function - object can be queried for its properties, and called directly with different - inputs to execute. - - More details on use cases and limitations are available in the - documentation for `defun`. 
- - Example: - ```python - def f(x, y): - return tf.reduce_mean(tf.multiply(x ** 2, 3) + y) - - def g(x, y): - return tf.reduce_mean(tf.multiply(x ** 2, 3) + y) - - z = tf.constant([[0.0, 0.0]]) - g_op = make_defun_op(g, z, z) - - assert g_op.output_shapes == tf.TensorShape([]) - assert g_op.output_types == tf.float32 - - x = tf.constant([[2.0, 3.0]]) - y = tf.constant([[3.0, -2.0]]) - - # The plain function and defun-compiled function should return the same value. - assert f(x, y).numpy() == g_op(x, y).numpy() - ``` - - Args: - func: function to be compiled. - *args: List arguments to pass to `func` when attaching to the graph. - **kwds: Keyword arguments to pass to `func` when attaching to the graph. - - Returns: - A wrapper object which can be queried for its output properties, - and which can be called directly the way a `@defun` wrapped function - can. - """ - return GraphCallable(func_graph_from_py_func(func.__name__, func, args, kwds)) - - class AutomaticControlDependencies(object): """Context manager to automatically add control dependencies. 
diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py index 4f23b3c4da..8381d2f55c 100644 --- a/tensorflow/python/eager/function_test.py +++ b/tensorflow/python/eager/function_test.py @@ -130,16 +130,16 @@ class FunctionTest(test.TestCase): with ops.Graph().as_default(): self.assertEqual(f().shape, ()) - def testBasicDefunOpGraphMode(self): + def testBasicGraphFunction(self): matmul = function.defun(math_ops.matmul) + @function.defun def sq(a): return matmul(a, a) t = constant_op.constant([[1.0, 2.0], [3.0, 4.0]]) - sq_op = function.make_defun_op(sq, t) - + sq_op = sq.get_concrete_function(t) self.assertEqual(sq_op.output_shapes, tensor_shape.TensorShape([2, 2])) out = sq_op(t) self.assertAllEqual(out, math_ops.matmul(t, t).numpy()) @@ -223,33 +223,32 @@ class FunctionTest(test.TestCase): g, = gradients_impl.gradients(f_c, c) self.assertAllEqual(sess.run(g), [[1.0]]) - def testNestedInputsDefunOpGraphMode(self): + def testNestedInputsGraphFunction(self): matmul = function.defun(math_ops.matmul) pair = collections.namedtuple('pair', ['a', 'b']) + @function.defun def a_times_b(inputs): return matmul(inputs.a['a'], inputs.b['b']) t = constant_op.constant([[1.0, 2.0], [3.0, 4.0]]) - inputs = pair({'a': t}, {'b': t}) - sq_op = function.make_defun_op(a_times_b, inputs) - + sq_op = a_times_b.get_concrete_function(inputs) self.assertEqual(sq_op.output_shapes, tensor_shape.TensorShape([2, 2])) out = sq_op(inputs) self.assertAllEqual(out, math_ops.matmul(t, t).numpy()) - def testNestedOutputDefunOpGraphMode(self): + def testNestedOutputGraphFunction(self): matmul = function.defun(math_ops.matmul) + @function.defun def sq(a): return (matmul(a, a), {'b': constant_op.constant(1.0)}) t = constant_op.constant([[1.0, 2.0], [3.0, 4.0]]) - sq_op = function.make_defun_op(sq, t) - + sq_op = sq.get_concrete_function(t) self.assertEqual(sq_op.output_shapes, (tensor_shape.TensorShape([2, 2]), {'b': tensor_shape.TensorShape([])})) @@ -259,28 
+258,28 @@ class FunctionTest(test.TestCase): self.assertAllEqual(a, math_ops.matmul(t, t).numpy()) self.assertAllEqual(b['b'].numpy(), 1.0) - def testDefunOpGraphModeWithGradients(self): + def testGraphFunctionWithGradients(self): v = resource_variable_ops.ResourceVariable(1.0, name='v') + @function.defun def step(): def inner(): return v * v return backprop.implicit_grad(inner)()[0][0] - step_op = function.make_defun_op(step) - + step_op = step.get_concrete_function() self.assertEqual(step_op.output_dtypes, dtypes.float32) self.assertEqual(step_op.output_shapes, tensor_shape.TensorShape([])) self.assertAllEqual(step_op(), 2.0) - def testDefunOpGraphModeNoneOutput(self): + def testGraphFunctionNoneOutput(self): + @function.defun def fn(unused_a, unused_b): return None x = constant_op.constant(1) - fn_op = function.make_defun_op(fn, x, x) - + fn_op = fn.get_concrete_function(x, x) self.assertEqual(fn_op.output_dtypes, None) self.assertEqual(fn_op.output_shapes, None) self.assertAllEqual(fn_op(x, x), None) @@ -321,13 +320,13 @@ class FunctionTest(test.TestCase): x = random_ops.random_uniform([2, 2]).numpy() defined = function.defun(f) defined(x) - self.assertEqual(len(defined._arguments_to_functions), 1) + self.assertEqual(len(defined._function_cache), 1) x = random_ops.random_uniform([2, 2]).numpy() defined(x) # A NumPy array with different values but the same shape and dtype # shouldn't trigger another function definition. 
- self.assertEqual(len(defined._arguments_to_functions), 1) + self.assertEqual(len(defined._function_cache), 1) def testDefunCapturedInt32(self): x = constant_op.constant(1, dtype=dtypes.int32) @@ -686,17 +685,19 @@ class FunctionTest(test.TestCase): def testReturningIndexedSlicesWithDefun(self): def validate(indexed_slice): + @function.defun def f(): return indexed_slice - output = function.defun(f)() + output = f() self.assertTrue(isinstance(output, ops.IndexedSlices)) self.assertAllEqual(indexed_slice.values, output.values) self.assertAllEqual(indexed_slice.indices, output.indices) self.assertAllEqual(indexed_slice.dense_shape, output.dense_shape) self.assertEqual( - function.make_defun_op(f).output_shapes, indexed_slice.values.shape) + f.get_concrete_function().output_shapes, + indexed_slice.values.shape) arg = ops.IndexedSlices( values=constant_op.constant([1, 2]), @@ -1035,14 +1036,14 @@ class FunctionTest(test.TestCase): defined = function.defun(multi_device_fn) outputs = self.evaluate(defined()) - self.assertEqual(len(defined._arguments_to_functions), 1) + self.assertEqual(len(defined._function_cache), 1) self.assertIn(compat.as_bytes('CPU:0'), outputs[0]) self.assertIn(compat.as_bytes('CPU:1'), outputs[1]) self.assertIn(compat.as_bytes('CPU:2'), outputs[2]) with ops.device('/cpu:3'): outputs = self.evaluate(defined()) - self.assertEqual(len(defined._arguments_to_functions), 2) + self.assertEqual(len(defined._function_cache), 2) self.assertIn(compat.as_bytes('CPU:0'), outputs[0]) self.assertIn(compat.as_bytes('CPU:1'), outputs[1]) self.assertIn(compat.as_bytes('CPU:2'), outputs[2]) @@ -1050,12 +1051,12 @@ class FunctionTest(test.TestCase): # This should retrieve the call-site-device agnostic function defined() - self.assertEqual(len(defined._arguments_to_functions), 2) + self.assertEqual(len(defined._function_cache), 2) # And this should retrieve the function created for '/cpu:3' with ops.device('/cpu:3'): defined() - 
self.assertEqual(len(defined._arguments_to_functions), 2) + self.assertEqual(len(defined._function_cache), 2) @test_util.run_in_graph_and_eager_modes( config=config_pb2.ConfigProto(device_count={'CPU': 2})) @@ -1064,8 +1065,9 @@ class FunctionTest(test.TestCase): def func(): return constant_op.constant(0) + defined = function.defun(func) with ops.device('cpu:0'): - cpu_graph_function = function.make_defun_op(func) + cpu_graph_function = defined.get_concrete_function() with ops.device('cpu:0'): self.assertEqual( @@ -1087,8 +1089,7 @@ class FunctionTest(test.TestCase): with ops.device(None): cpu_graph_function() - default_graph_function = function.make_defun_op(func) - + default_graph_function = defined.get_concrete_function() self.assertEqual( self.evaluate(default_graph_function()), self.evaluate(func())) @@ -1130,7 +1131,7 @@ class FunctionTest(test.TestCase): def cache_keys(): """Sanitizes cache keys of non-input metadata.""" - return tuple(key[:3] for key in defined._arguments_to_functions) + return tuple(key[:3] for key in defined._function_cache) # `True` corresponds to the fact that we're executing eagerly self.assertIn((0, 1, 20), cache_keys()) @@ -1140,18 +1141,18 @@ class FunctionTest(test.TestCase): # This matches the previous call. defined(foo=1) - self.assertEqual(len(defined._arguments_to_functions), 2) + self.assertEqual(len(defined._function_cache), 2) defined(1, 2, 3) self.assertIn((1, 2, 3), cache_keys()) # This matches the previous call. defined(1, bar=2, baz=3) - self.assertEqual(len(defined._arguments_to_functions), 3) + self.assertEqual(len(defined._function_cache), 3) # This matches the previous call. 
defined(1, baz=3, bar=2) - self.assertEqual(len(defined._arguments_to_functions), 3) + self.assertEqual(len(defined._function_cache), 3) def testFunctoolsPartialUnwrappedCorrectly(self): @@ -1177,7 +1178,7 @@ class FunctionTest(test.TestCase): defined = function.defun(foo, input_signature=signature) a = array_ops.ones([2]) out = defined(a) - self.assertEqual(len(defined._arguments_to_functions), 1) + self.assertEqual(len(defined._function_cache), 1) self.assertAllEqual(out, a) def bar(a): @@ -1188,13 +1189,13 @@ class FunctionTest(test.TestCase): defined = function.defun(bar, input_signature=signature) a = array_ops.ones([2, 1]) out = defined(a) - self.assertEqual(len(defined._arguments_to_functions), 1) + self.assertEqual(len(defined._function_cache), 1) self.assertAllEqual(out, a) # Changing the second dimension shouldn't create a new function. b = array_ops.ones([2, 3]) out = defined(b) - self.assertEqual(len(defined._arguments_to_functions), 1) + self.assertEqual(len(defined._function_cache), 1) self.assertAllEqual(out, b) def testNestedInputSignatures(self): @@ -1211,7 +1212,7 @@ class FunctionTest(test.TestCase): a = array_ops.ones([2, 1]) b = array_ops.ones([1]) out = defined([a, a], b) - self.assertEqual(len(defined._arguments_to_functions), 1) + self.assertEqual(len(defined._function_cache), 1) nest.assert_same_structure(out, [[a, a], b]) self.assertAllEqual(out[0][0], a) self.assertAllEqual(out[0][1], a) @@ -1222,7 +1223,7 @@ class FunctionTest(test.TestCase): b = array_ops.ones([2, 5]) c = array_ops.ones([1]) out = defined([a, b], c) - self.assertEqual(len(defined._arguments_to_functions), 1) + self.assertEqual(len(defined._function_cache), 1) nest.assert_same_structure(out, [[a, b], c]) self.assertAllEqual(out[0][0], a) self.assertAllEqual(out[0][1], b) @@ -1258,13 +1259,13 @@ class FunctionTest(test.TestCase): # Signatures must consist exclusively of `TensorSpec` objects. 
signature = [(2, 3), tensor_spec.TensorSpec([2, 3], dtypes.float32)] with self.assertRaisesRegexp(TypeError, 'Invalid input_signature.*'): - function.defun(foo, input_signature=signature)(1, 2) + function.defun(foo, input_signature=signature) # Signatures must be either lists or tuples on their outermost levels. signature = {'t1': tensor_spec.TensorSpec([], dtypes.float32)} with self.assertRaisesRegexp(TypeError, 'input_signature must be either a ' 'tuple or a list.*'): - function.defun(foo, input_signature=signature)(1, 2) + function.defun(foo, input_signature=signature) def testInputsIncompatibleWithSignatureRaisesError(self): @@ -1318,22 +1319,22 @@ class FunctionTest(test.TestCase): integer = constant_op.constant(2, dtypes.int64) out1, out2 = foo(flt, integer) - self.assertEqual(len(foo._arguments_to_functions), 1) + self.assertEqual(len(foo._function_cache), 1) self.assertEqual(out1.numpy(), 1.0) self.assertEqual(out2.numpy(), 2) out1, out2 = foo(flt=flt, integer=integer) - self.assertEqual(len(foo._arguments_to_functions), 1) + self.assertEqual(len(foo._function_cache), 1) self.assertEqual(out1.numpy(), 1.0) self.assertEqual(out2.numpy(), 2) out1, out2 = foo(integer=integer, flt=flt) - self.assertEqual(len(foo._arguments_to_functions), 1) + self.assertEqual(len(foo._function_cache), 1) self.assertEqual(out1.numpy(), 1.0) self.assertEqual(out2.numpy(), 2) out1, out2 = foo(flt, integer=integer) - self.assertEqual(len(foo._arguments_to_functions), 1) + self.assertEqual(len(foo._function_cache), 1) self.assertEqual(out1.numpy(), 1.0) self.assertEqual(out2.numpy(), 2) @@ -1363,27 +1364,27 @@ class FunctionTest(test.TestCase): a = constant_op.constant(2.0) b = constant_op.constant([1.0, 2.0]) one = defined(a, b) - self.assertEqual(len(defined._arguments_to_functions), 1) + self.assertEqual(len(defined._function_cache), 1) two = defined(a=a, b=b) - self.assertEqual(len(defined._arguments_to_functions), 1) + self.assertEqual(len(defined._function_cache), 1) three = 
defined(b=b, a=a) - self.assertEqual(len(defined._arguments_to_functions), 1) + self.assertEqual(len(defined._function_cache), 1) four = defined(a, b=b) - self.assertEqual(len(defined._arguments_to_functions), 1) + self.assertEqual(len(defined._function_cache), 1) # The next call corresponds to a new input signature, hence # we expect another function to be defined. five = defined(b, a) - self.assertEqual(len(defined._arguments_to_functions), 2) + self.assertEqual(len(defined._function_cache), 2) six = defined(a=b, b=a) - self.assertEqual(len(defined._arguments_to_functions), 2) + self.assertEqual(len(defined._function_cache), 2) seven = defined(b=a, a=b) - self.assertEqual(len(defined._arguments_to_functions), 2) + self.assertEqual(len(defined._function_cache), 2) self.assertAllEqual(one, [1.0, 2.0]) self.assertAllEqual(two, [1.0, 2.0]) @@ -1468,7 +1469,7 @@ class FunctionTest(test.TestCase): self.assertAllEqual(state, [0]) # Whereas calling the python function directly should create a side-effect. - side_effecting_function.call_python_function() + side_effecting_function.python_function() self.assertAllEqual(state, [0, 0]) diff --git a/tensorflow/python/framework/function_def_to_graph_test.py b/tensorflow/python/framework/function_def_to_graph_test.py index 938814f1d0..e013fb6e4d 100644 --- a/tensorflow/python/framework/function_def_to_graph_test.py +++ b/tensorflow/python/framework/function_def_to_graph_test.py @@ -199,10 +199,11 @@ class FunctionDefToGraphDefTest(test.TestCase): return inner_fn() + @function.defun def fn2(): return 2 * fn() - fn2_defun = function.make_defun_op(fn2) + fn2_defun = fn2.get_concrete_function() # Call `fn2` to make sure `fn` is correctly instantiated so # `function_def_to_graph` can find it. 
@@ -221,6 +222,7 @@ class FunctionDefToGraphDefTest(test.TestCase): def testControlDependencies(self): + @function.defun def fn(inp): x = constant_op.constant(2.0, name="x") # TODO(b/79881896): Test external control dependency once that's @@ -230,7 +232,7 @@ class FunctionDefToGraphDefTest(test.TestCase): return 4.0 inp = constant_op.constant(1.0) - fdef = function.make_defun_op(fn, inp)._inference_function.definition + fdef = fn.get_concrete_function(inp).function_def func_graph = function_def_to_graph.function_def_to_graph(fdef) op = func_graph.get_operation_by_name("y") -- GitLab From 196c5b3450a31093eb5acc17df31300f1c3f56bd Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 24 Aug 2018 17:12:31 -0700 Subject: [PATCH 122/598] Changing KeyValueTensorInitializer to be more general. This is similar to the previous version but replaces the initialize_table_v2 call with gen_lookup_ops.lookup_table_import_v2. They are conceptually doing the same thing but initialize_table_v2 requires table to be a child of InitializableLookupTable whereas lookup_table_import_v2 does not. So the new intializer is more general than previous one. PiperOrigin-RevId: 210182320 --- .../kernels/initializable_lookup_table.cc | 8 ++- .../core/kernels/initializable_lookup_table.h | 57 +++++++++++++++++-- .../core/kernels/lookup_table_init_op.cc | 4 +- tensorflow/core/kernels/lookup_util.h | 51 ----------------- tensorflow/core/ops/lookup_ops.cc | 4 +- tensorflow/python/ops/lookup_ops.py | 11 +++- 6 files changed, 72 insertions(+), 63 deletions(-) diff --git a/tensorflow/core/kernels/initializable_lookup_table.cc b/tensorflow/core/kernels/initializable_lookup_table.cc index 06d53eba30..fcf468f5a8 100644 --- a/tensorflow/core/kernels/initializable_lookup_table.cc +++ b/tensorflow/core/kernels/initializable_lookup_table.cc @@ -14,7 +14,6 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/core/kernels/initializable_lookup_table.h" - #include "tensorflow/core/lib/core/errors.h" namespace tensorflow { @@ -32,6 +31,13 @@ Status InitializableLookupTable::Find(OpKernelContext* ctx, const Tensor& keys, return DoFind(keys, values, default_value); } +Status InitializableLookupTable::ImportValues(OpKernelContext* ctx, + const Tensor& keys, + const Tensor& values) { + lookup::KeyValueTensorIterator iter(&keys, &values); + return Initialize(iter); +} + Status InitializableLookupTable::Initialize(InitTableIterator& iter) { if (!iter.Valid()) { return iter.status(); diff --git a/tensorflow/core/kernels/initializable_lookup_table.h b/tensorflow/core/kernels/initializable_lookup_table.h index 9ff94c46c6..424fe5df3c 100644 --- a/tensorflow/core/kernels/initializable_lookup_table.h +++ b/tensorflow/core/kernels/initializable_lookup_table.h @@ -58,11 +58,7 @@ class InitializableLookupTable : public LookupInterface { } Status ImportValues(OpKernelContext* ctx, const Tensor& keys, - const Tensor& values) final { - return errors::Unimplemented( - "ImportValues not supported by InitializableLookupTable " - "implementations"); - } + const Tensor& values) final; TensorShape key_shape() const final { return TensorShape(); } @@ -155,6 +151,57 @@ class InitializableLookupTable : public LookupInterface { bool is_initialized_ = false; }; +// Iterator to initialize tables given 'keys' and 'values' tensors. +// +// The two tensors are returned in the first iteration. It doesn't loop +// over each element of the tensor since insertions in the lookup table can +// process batches. +class KeyValueTensorIterator + : public InitializableLookupTable::InitTableIterator { + public: + // keys and values are not owned by the iterator. 
+ explicit KeyValueTensorIterator(const Tensor* keys, const Tensor* values) + : keys_(keys), values_(values), valid_(true), status_(Status::OK()) { + TensorShape key_shape = keys_->shape(); + if (!key_shape.IsSameSize(values_->shape())) { + valid_ = false; + status_ = errors::InvalidArgument( + "keys and values should have the same dimension.", + key_shape.DebugString(), " vs ", values_->shape().DebugString()); + } + if (key_shape.num_elements() == 0) { + valid_ = false; + status_ = + errors::InvalidArgument("keys and values cannot be empty tensors."); + } + } + + bool Valid() const override { return valid_; } + + void Next() override { + valid_ = false; + status_ = errors::OutOfRange("No more data."); + } + + const Tensor& keys() const override { return *keys_; } + + const Tensor& values() const override { return *values_; } + + Status status() const override { return status_; } + + int64 total_size() const override { + return keys_ == nullptr ? -1 : keys_->NumElements(); + } + + private: + TF_DISALLOW_COPY_AND_ASSIGN(KeyValueTensorIterator); + + const Tensor* keys_; // Doesn't own it. + const Tensor* values_; // Doesn't own it. + bool valid_; // true if the iterator points to an existing range. 
+ Status status_; +}; + } // namespace lookup } // namespace tensorflow diff --git a/tensorflow/core/kernels/lookup_table_init_op.cc b/tensorflow/core/kernels/lookup_table_init_op.cc index b352dd257c..6e77e1ee01 100644 --- a/tensorflow/core/kernels/lookup_table_init_op.cc +++ b/tensorflow/core/kernels/lookup_table_init_op.cc @@ -74,13 +74,11 @@ class InitializeTableOp : public OpKernel { "Keys and values must have the same size ", keys.NumElements(), " vs ", values.NumElements())); - lookup::KeyValueTensorIterator iter(&keys, &values); - int memory_used_before = 0; if (ctx->track_allocations()) { memory_used_before = table->MemoryUsed(); } - OP_REQUIRES_OK(ctx, table->Initialize(iter)); + OP_REQUIRES_OK(ctx, table->ImportValues(ctx, keys, values)); if (ctx->track_allocations()) { ctx->record_persistent_memory_allocation(table->MemoryUsed() - memory_used_before); diff --git a/tensorflow/core/kernels/lookup_util.h b/tensorflow/core/kernels/lookup_util.h index 894769960a..ec28cf9fa7 100644 --- a/tensorflow/core/kernels/lookup_util.h +++ b/tensorflow/core/kernels/lookup_util.h @@ -46,57 +46,6 @@ Status InitializeTableFromTextFile(const string& filename, int64 vocab_size, int32 value_index, Env* env, InitializableLookupTable* table); -// Iterator to initialize tables given 'keys' and 'values' tensors. -// -// The two tensors are returned in the first iteration. It doesn't loop -// over each element of the tensor since insertions in the lookup table can -// process batches. -class KeyValueTensorIterator - : public InitializableLookupTable::InitTableIterator { - public: - // keys and values are not owned by the iterator. 
- explicit KeyValueTensorIterator(const Tensor* keys, const Tensor* values) - : keys_(keys), values_(values), valid_(true), status_(Status::OK()) { - TensorShape key_shape = keys_->shape(); - if (!key_shape.IsSameSize(values_->shape())) { - valid_ = false; - status_ = errors::InvalidArgument( - "keys and values should have the same dimension.", - key_shape.DebugString(), " vs ", values_->shape().DebugString()); - } - if (key_shape.num_elements() == 0) { - valid_ = false; - status_ = - errors::InvalidArgument("keys and values cannot be empty tensors."); - } - } - - bool Valid() const override { return valid_; } - - void Next() override { - valid_ = false; - status_ = errors::OutOfRange("No more data."); - } - - const Tensor& keys() const override { return *keys_; } - - const Tensor& values() const override { return *values_; } - - Status status() const override { return status_; } - - int64 total_size() const override { - return keys_ == nullptr ? -1 : keys_->NumElements(); - } - - private: - TF_DISALLOW_COPY_AND_ASSIGN(KeyValueTensorIterator); - - const Tensor* keys_; // Doesn't own it. - const Tensor* values_; // Doesn't own it. - bool valid_; // true if the iterator points to an existing range. - Status status_; -}; - } // namespace lookup } // namespace tensorflow diff --git a/tensorflow/core/ops/lookup_ops.cc b/tensorflow/core/ops/lookup_ops.cc index 7c71406c6b..72a77be70d 100644 --- a/tensorflow/core/ops/lookup_ops.cc +++ b/tensorflow/core/ops/lookup_ops.cc @@ -294,7 +294,9 @@ REGISTER_OP("LookupTableImportV2") ShapeHandle handle; TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle)); - // TODO: Validate keys and values shape. 
+ ShapeHandle keys; + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &keys)); + TF_RETURN_IF_ERROR(c->Merge(keys, c->input(2), &keys)); return Status::OK(); }); diff --git a/tensorflow/python/ops/lookup_ops.py b/tensorflow/python/ops/lookup_ops.py index fb51fbc626..561a341cf3 100644 --- a/tensorflow/python/ops/lookup_ops.py +++ b/tensorflow/python/ops/lookup_ops.py @@ -22,6 +22,7 @@ import collections import functools import six +from tensorflow.python.compat import compat as fwd_compat from tensorflow.python.eager import context from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes @@ -299,6 +300,7 @@ class HashTable(InitializableLookupTableBase): self._value_shape)) return exported_keys, exported_values + class TableInitializerBase(object): """Base class for lookup table initializers.""" @@ -370,8 +372,13 @@ class KeyValueTensorInitializer(TableInitializerBase): # Ensure a unique name when eager execution is enabled to avoid spurious # sharing issues. scope += str(ops.uid()) - init_op = gen_lookup_ops.initialize_table_v2( - table.table_ref, self._keys, self._values, name=scope) + if fwd_compat.forward_compatible(2018, 9, 19): + init_op = gen_lookup_ops.lookup_table_import_v2( + table.table_ref, self._keys, self._values, name=scope) + else: + # To maintain forward compatibiltiy, use the old implementation. + init_op = gen_lookup_ops.initialize_table_v2( + table.table_ref, self._keys, self._values, name=scope) ops.add_to_collection(ops.GraphKeys.TABLE_INITIALIZERS, init_op) return init_op -- GitLab From b42c222b19cde1a8a72fdd81c483bd5a2b1f674e Mon Sep 17 00:00:00 2001 From: Nupur Garg Date: Fri, 24 Aug 2018 17:35:05 -0700 Subject: [PATCH 123/598] Internal change. 
PiperOrigin-RevId: 210184689 --- tensorflow/contrib/lite/delegates/eager/BUILD | 1 + tensorflow/contrib/lite/delegates/eager/kernel.cc | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/tensorflow/contrib/lite/delegates/eager/BUILD b/tensorflow/contrib/lite/delegates/eager/BUILD index 8abc828578..88c70fbb8a 100644 --- a/tensorflow/contrib/lite/delegates/eager/BUILD +++ b/tensorflow/contrib/lite/delegates/eager/BUILD @@ -132,6 +132,7 @@ cc_library( ], "//conditions:default": [ "//tensorflow/core:protos_all_cc", + "//tensorflow/core:framework", ], }), ) diff --git a/tensorflow/contrib/lite/delegates/eager/kernel.cc b/tensorflow/contrib/lite/delegates/eager/kernel.cc index febf0b85a4..f8467c7cb2 100644 --- a/tensorflow/contrib/lite/delegates/eager/kernel.cc +++ b/tensorflow/contrib/lite/delegates/eager/kernel.cc @@ -26,6 +26,7 @@ limitations under the License. #include "tensorflow/core/common_runtime/eager/execute.h" #include "tensorflow/core/common_runtime/eager/tensor_handle.h" #include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/node_def_util.h" // Note: this is part of TF Lite's Eager delegation code which is to be // completed soon. @@ -189,6 +190,14 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) { } } + // Fill NodeDef with defaults if it's a valid op. + const tensorflow::OpRegistrationData* op_reg_data; + auto tf_status = tensorflow::OpRegistry::Global()->LookUp( + node_data.nodedef.op(), &op_reg_data); + if (tf_status.ok()) { + AddDefaultsToNodeDef(op_reg_data->op_def, &node_data.nodedef); + } + for (auto input_index : TfLiteIntArrayView(node->inputs)) { node_data.inputs.push_back(input_index); } -- GitLab From be7a5e4b6e50842bc3c841daaa8dadadc793dd5f Mon Sep 17 00:00:00 2001 From: Youlong Cheng Date: Fri, 24 Aug 2018 17:46:49 -0700 Subject: [PATCH 124/598] Support Input partition in Predict mode. 
PiperOrigin-RevId: 210185805 --- tensorflow/contrib/tpu/python/tpu/tpu_estimator.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py index 2e4050bd99..1ff04f5c26 100644 --- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py +++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py @@ -804,11 +804,14 @@ def generate_per_host_v2_enqueue_ops_fn_for_host( per_host_sharded_inputs.append(flattened_inputs) if inputs_structure_recorder.flattened_input_dims: + input_partition_dims = inputs_structure_recorder.flattened_input_dims + if signals: + input_partition_dims += [None] * len(signals) # pylint: disable=protected-access infeed_queue = tpu_feed._PartitionedInfeedQueue( number_of_tuple_elements=len(per_host_sharded_inputs[0]), host_id=host_id, - input_partition_dims=inputs_structure_recorder.flattened_input_dims, + input_partition_dims=input_partition_dims, device_assignment=ctx.device_assignment) per_host_enqueue_ops = infeed_queue.generate_enqueue_ops( per_host_sharded_inputs) @@ -2821,8 +2824,6 @@ def _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn): def _predict_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn): """Executes `model_fn_wrapper` multiple times on all TPU shards.""" - num_cores = ctx.num_cores - (single_tpu_predict_step, host_calls, captured_scaffold_fn, captured_predict_hooks ) = model_fn_wrapper.convert_to_single_tpu_predict_step(dequeue_fn) @@ -2841,7 +2842,7 @@ def _predict_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn): (dummy_predict_op,) = tpu.shard( multi_tpu_predict_steps_on_single_shard, inputs=[], - num_shards=num_cores, + num_shards=ctx.num_replicas, outputs_from_all_shards=False, device_assignment=ctx.device_assignment) -- GitLab From 481266fa991ca2b08221214f535c6a8c546e2350 Mon Sep 17 00:00:00 2001 From: Mark Heffernan Date: Fri, 24 Aug 2018 17:53:34 -0700 Subject: [PATCH 125/598] Run AddSpecialCaseCopies 
in HloRematerialization. RemoveUnnecessaryCopies which runs in rematerialization to take advantage of scheduling can sometimes remove copies which are needed to non-interference reasons. This requires running AddSpecialCaseCopies to add them back in. Furthermore, the schedule needs to be updated to account for the changes to the module, so add an UpdateSchedule function which can patch up a schedule in light a limited set of transformations to the module (addition and deletion of instructions). PiperOrigin-RevId: 210186375 --- tensorflow/compiler/xla/service/BUILD | 7 +- .../compiler/xla/service/copy_insertion.cc | 40 +-- .../compiler/xla/service/copy_insertion.h | 24 +- .../compiler/xla/service/hlo_instruction.h | 13 - .../xla/service/hlo_rematerialization.cc | 61 +++-- .../compiler/xla/service/hlo_scheduling.cc | 185 +++++++++++++ .../compiler/xla/service/hlo_scheduling.h | 37 +++ .../xla/service/hlo_scheduling_test.cc | 248 ++++++++++++++++++ 8 files changed, 550 insertions(+), 65 deletions(-) diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index aa826aa770..47d376c8ac 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -1152,6 +1152,7 @@ cc_library( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", ], ) @@ -1159,17 +1160,18 @@ tf_cc_test( name = "hlo_scheduling_test", srcs = ["hlo_scheduling_test.cc"], deps = [ - ":buffer_value", ":heap_simulator", ":hlo", + ":hlo_dce", ":hlo_ordering", + ":hlo_parser", ":hlo_scheduling", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/core:test", ], ) @@ -2320,6 +2322,7 @@ cc_library( "//tensorflow/compiler/xla:types", 
"//tensorflow/compiler/xla:util", "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", "@com_google_absl//absl/container:inlined_vector", "@com_google_absl//absl/strings", ], diff --git a/tensorflow/compiler/xla/service/copy_insertion.cc b/tensorflow/compiler/xla/service/copy_insertion.cc index 231d31d960..1b7a7b36ea 100644 --- a/tensorflow/compiler/xla/service/copy_insertion.cc +++ b/tensorflow/compiler/xla/service/copy_insertion.cc @@ -957,16 +957,11 @@ Status CopyInsertion::AddCopiesToResolveInterference(HloModule* module) { return Status::OK(); } -// Add copies to address special constraints on the roots of computations not -// related to live range interference: -// -// (1) Entry computation root must be unambiguous and distinct. -// -// (2) Any computation called by a kCall instruction must have an -// unambiguous root. -// -// (3) Constants and parameters cannot be live out of the entry computation -// +Status CopyInsertion::AddSpecialCaseCopies(HloModule* module) { + std::unique_ptr call_graph = CallGraph::Build(module); + return AddSpecialCaseCopies(*call_graph, module); +} + Status CopyInsertion::AddSpecialCaseCopies(const CallGraph& call_graph, HloModule* module) { TF_ASSIGN_OR_RETURN(std::unique_ptr alias_analysis, @@ -1062,15 +1057,6 @@ Status CopyInsertion::AddSpecialCaseCopies(const CallGraph& call_graph, for (HloInstruction* user : users) { TF_RETURN_IF_ERROR(instruction->ReplaceUseWith(user, deep_copy)); } - // Special case copies are not eligible for later copy elision passes. 
- indices_to_copy.ForEachElement([&](const ShapeIndex& index, bool has_copy) { - if (has_copy) { - HloInstruction* copy = *copies_added.mutable_element(index); - if (copy != nullptr) { - copy->SetCopyElisionAllowed(false); - } - } - }); if (instruction == instruction->parent()->root_instruction()) { instruction->parent()->set_root_instruction(deep_copy); } @@ -1078,10 +1064,10 @@ Status CopyInsertion::AddSpecialCaseCopies(const CallGraph& call_graph, return Status::OK(); } -Status CopyInsertion::VerifyNoLiveRangeInterference(HloModule* module) { +Status CopyInsertion::VerifyNoLiveRangeInterference(const HloOrdering& ordering, + HloModule* module) { TF_ASSIGN_OR_RETURN(std::unique_ptr alias_analysis, HloAliasAnalysis::Run(module, fusion_can_share_buffer_)); - DependencyHloOrdering ordering(module); TF_RET_CHECK(!alias_analysis->HasLiveRangeInterference(ordering)); return Status::OK(); } @@ -1098,8 +1084,7 @@ Status CopyInsertion::RemoveUnnecessaryCopies(const HloOrdering& ordering, std::unique_ptr call_graph = CallGraph::Build(module); for (HloComputation* computation : module->computations()) { for (HloInstruction* instruction : computation->instructions()) { - if (instruction->opcode() == HloOpcode::kCopy && - instruction->CopyElisionAllowed()) { + if (instruction->opcode() == HloOpcode::kCopy) { TF_RETURN_IF_ERROR(copy_remover.TryElideCopy(instruction).status()); } } @@ -1165,10 +1150,10 @@ StatusOr CopyInsertion::Run(HloModule* module) { TF_RETURN_IF_ERROR(tuple_simplifier.Run(module).status()); TF_RETURN_IF_ERROR(dce.Run(module).status()); - TF_DCHECK_OK(VerifyNoLiveRangeInterference(module)); + DependencyHloOrdering dep_ordering(module); + TF_DCHECK_OK(VerifyNoLiveRangeInterference(dep_ordering, module)); - DependencyHloOrdering ordering(module); - TF_RETURN_IF_ERROR(RemoveUnnecessaryCopies(ordering, module)); + TF_RETURN_IF_ERROR(RemoveUnnecessaryCopies(dep_ordering, module)); TF_RETURN_IF_ERROR(AddSpecialCaseCopies(*call_graph, module)); @@ -1176,7 +1161,8 
@@ StatusOr CopyInsertion::Run(HloModule* module) { TF_RETURN_IF_ERROR(tuple_simplifier.Run(module).status()); TF_RETURN_IF_ERROR(dce.Run(module).status()); - TF_DCHECK_OK(VerifyNoLiveRangeInterference(module)); + TF_DCHECK_OK( + VerifyNoLiveRangeInterference(DependencyHloOrdering(module), module)); MaybeDumpModule("after copy insertion", *module); diff --git a/tensorflow/compiler/xla/service/copy_insertion.h b/tensorflow/compiler/xla/service/copy_insertion.h index f797ee7e4d..d308f6bc84 100644 --- a/tensorflow/compiler/xla/service/copy_insertion.h +++ b/tensorflow/compiler/xla/service/copy_insertion.h @@ -77,15 +77,29 @@ class CopyInsertion : public HloPassInterface { Status RemoveUnnecessaryCopies(const HloOrdering& ordering, HloModule* module); - private: - // Verifies that no HLO values have interfering live ranged assuming the - // ordering used by copy insertion. - Status VerifyNoLiveRangeInterference(HloModule* module); + // Add copies to address special constraints on the roots of computations not + // related to live range interference: + // + // (1) Entry computation root must be unambiguous and distinct. + // + // (2) Any computation called by a kCall instruction must have an + // unambiguous root. + // + // (3) Constants and parameters cannot be live out of the entry computation + // + Status AddSpecialCaseCopies(HloModule* module); - Status AddCopiesToResolveInterference(HloModule* module); + // Verifies that no HLO values have interfering live ranges using the given + // ordering. + Status VerifyNoLiveRangeInterference(const HloOrdering& ordering, + HloModule* module); + private: + // Override which requires the caller to pass in a call graph. Status AddSpecialCaseCopies(const CallGraph& call_graph, HloModule* module); + Status AddCopiesToResolveInterference(HloModule* module); + // Backend specific function that decides whether a fusion can share buffer // with its operand. 
HloDataflowAnalysis::FusionCanShareBufferFunction fusion_can_share_buffer_; diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h index 566c1c449a..948e33a0a3 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.h +++ b/tensorflow/compiler/xla/service/hlo_instruction.h @@ -1092,19 +1092,6 @@ class HloInstruction { // instruction. void SetupDerivedInstruction(HloInstruction* derived_instruction) const; - // TODO(b/80249101): Remove these methods once HLO scheduling and copy - // insertion are integrated, and we don't need to run a separate pass - // of copy elision anymore. - bool CopyElisionAllowed() const { - CHECK_EQ(HloOpcode::kCopy, opcode_); - return copy_elision_allowed_; - } - - void SetCopyElisionAllowed(bool value) { - CHECK_EQ(HloOpcode::kCopy, opcode_); - copy_elision_allowed_ = value; - } - // Returns data on the dimension numbers used for a dot operation. const DotDimensionNumbers& dot_dimension_numbers() const { CHECK(dot_dimension_numbers_ != nullptr); diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.cc b/tensorflow/compiler/xla/service/hlo_rematerialization.cc index 9cc1f5a10e..6c6e7c6fec 100644 --- a/tensorflow/compiler/xla/service/hlo_rematerialization.cc +++ b/tensorflow/compiler/xla/service/hlo_rematerialization.cc @@ -1203,6 +1203,49 @@ StatusOr HloRematerialization::Run( VLOG(1) << "HloRematerialization() with memory limit of " << HumanReadableNumBytes(memory_limit_bytes); + XLA_VLOG_LINES(3, "Before HloRematerialization:\n" + module->ToString()); + + // Create initial sequence of HLO instructions. + TF_ASSIGN_OR_RETURN(*sequence, ScheduleComputationsInModule( + *module, + [this](const BufferValue& buffer) { + return size_function_(buffer.shape()); + }, + scheduler_algorithm_)); + if (copy_insertion) { + // We run a separate pass of copy elision here because the sequential + // ordering from the HLO schedule allows for more copies to be eliminated. 
+ // TODO(b/80249101): Instead of a separate copy elision pass, use the + // ordering from the HLO schedule directly for copy insertion. + + // First create a copy of the schedule which contains HloInstruction unique + // ids instead of HloInstruction*. This is necessary for updating the + // schedule below. + // TODO(b/113175018): Remove this when the HLO schedule is self-contained + // and can update itself. + tensorflow::gtl::FlatMap> + id_sequence = ComputeIdSchedule(*sequence); + + SequentialHloOrdering ordering(module, *sequence); + TF_RETURN_IF_ERROR( + copy_insertion->RemoveUnnecessaryCopies(ordering, module)); + + // RemoveUnnecessaryCopies only considers interference when determining + // whether it is legal to remove a copy. However, copies in the graph may be + // necessary for other reason such as preventing a constant from being live + // out of the graph. So run AddSpecialCaseCopies to re-insert these copies. + // TODO(b/80249101): Break copy insertion into several passes and run each + // one once in the regular HLO pipeline. + TF_RETURN_IF_ERROR(copy_insertion->AddSpecialCaseCopies(module)); + + // The passes above can add and remove copies, update the schedule to + // account for these transformations. Newly added instructions will be + // placed ASAP in the schedule. + TF_RETURN_IF_ERROR(UpdateSchedule(*module, id_sequence, sequence)); + + TF_DCHECK_OK(copy_insertion->VerifyNoLiveRangeInterference( + SequentialHloOrdering(module, *sequence), module)); + } TF_ASSIGN_OR_RETURN(points_to_analysis_, TuplePointsToAnalysis::Run(module)); @@ -1224,24 +1267,6 @@ StatusOr HloRematerialization::Run( << HumanReadableNumBytes(module_output_size) << "): " << HumanReadableNumBytes(adjusted_memory_limit_bytes); - XLA_VLOG_LINES(3, "Before HloRematerialization:\n" + module->ToString()); - // Create initial sequence of HLO instructions. 
- TF_ASSIGN_OR_RETURN(*sequence, ScheduleComputationsInModule( - *module, - [this](const BufferValue& buffer) { - return size_function_(buffer.shape()); - }, - scheduler_algorithm_)); - if (copy_insertion) { - // We run a separate pass of copy elision here because the sequential - // ordering from the HLO schedule allows for more copies to be eliminated. - // TODO(b/80249101): Instead of a separate copy elision pass, use the - // ordering from the HLO schedule directly for copy insertion. - SequentialHloOrdering ordering(module, *sequence); - TF_RETURN_IF_ERROR( - copy_insertion->RemoveUnnecessaryCopies(ordering, module)); - } - // Compute peak memory usage of all computations in the module called in a // sequential context. call_graph_ = CallGraph::Build(module); diff --git a/tensorflow/compiler/xla/service/hlo_scheduling.cc b/tensorflow/compiler/xla/service/hlo_scheduling.cc index 393824d920..56b14f9fef 100644 --- a/tensorflow/compiler/xla/service/hlo_scheduling.cc +++ b/tensorflow/compiler/xla/service/hlo_scheduling.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_scheduling.h" #include +#include #include #include @@ -28,6 +29,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/gtl/map_util.h" #include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" @@ -580,4 +582,187 @@ StatusOr> ScheduleOneComputation( size_function, nullptr, empty_map); } +tensorflow::gtl::FlatMap> +ComputeIdSchedule(const SequentialHloOrdering::HloModuleSequence& sequence) { + tensorflow::gtl::FlatMap> id_sequence; + for (const auto& computation_sequence : sequence) { + for (const HloInstruction* instruction : computation_sequence.second) { + id_sequence[computation_sequence.first].push_back( + instruction->unique_id()); + } + } + return id_sequence; +} + +Status UpdateSchedule( + const HloModule& module, + const tensorflow::gtl::FlatMap>& + id_sequence, + SequentialHloOrdering::HloModuleSequence* sequence) { + // Map from unique ID to HloInstruction pointer for instructions in the + // module. + tensorflow::gtl::FlatMap id_to_instruction; + // Set of all HloInstructions in the schedule. + tensorflow::gtl::FlatSet ids_in_schedule; + std::vector nonfusion_computations = + module.MakeNonfusionComputations(); + for (const HloComputation* computation : nonfusion_computations) { + for (const HloInstruction* instruction : computation->instructions()) { + TF_RET_CHECK( + id_to_instruction.insert({instruction->unique_id(), instruction}) + .second); + } + for (int id : id_sequence.at(computation)) { + ids_in_schedule.insert(id); + } + } + + // Map from HloInstruction X to newly added instructions (instruction is in + // module, but not in schedule) which use X. If an instruction is not in the + // map, then it has no users which are newly added instructions. + tensorflow::gtl::FlatMap> + new_instruction_uses; + + // For each newly added instruction, this is the count of the instruction's + // operands that have not yet been scheduled. 
When this value reaches zero, + // then the instruction may be placed in the schedule. + tensorflow::gtl::FlatMap + unscheduled_operand_count; + // For each computation, this is the set of newly added instructions which + // have no operands. These must be handled specially and are added to the + // beginning of the schedule. + tensorflow::gtl::FlatMap> + new_zero_operand_instructions; + for (const HloComputation* computation : nonfusion_computations) { + new_zero_operand_instructions[computation] = {}; + for (const HloInstruction* instruction : computation->instructions()) { + if (ids_in_schedule.count(instruction->unique_id()) == 0) { + // This is a newly added instruction which is not in the schedule. + for (const HloInstruction* operand : instruction->operands()) { + new_instruction_uses[operand].push_back(instruction); + } + if (instruction->operands().empty()) { + new_zero_operand_instructions[computation].push_back(instruction); + } + unscheduled_operand_count[instruction] = instruction->operand_count(); + } + } + } + + // Update the schedule with the newly added instructions, and remove any + // instructions no longer in the graph. + for (const HloComputation* computation : nonfusion_computations) { + std::vector old_computation_sequence = + std::move(sequence->at(computation)); + sequence->at(computation).clear(); + + // Create a worklist of newly added instructions which are ready to be added + // to the schedule. Initialize worklist with those that have zero operands. + std::queue worklist; + for (const HloInstruction* instruction : + new_zero_operand_instructions.at(computation)) { + worklist.push(instruction); + } + + // Lambda which schedules all instructions on the worklist. 
+ auto schedule_worklist = [&]() { + while (!worklist.empty()) { + const HloInstruction* instruction = worklist.front(); + worklist.pop(); + sequence->at(computation).push_back(instruction); + std::vector* new_users = + tensorflow::gtl::FindOrNull(new_instruction_uses, instruction); + if (new_users != nullptr) { + // This just-scheduled instruction has users which are newly added to + // the module. Update the number of unscheduled operands and push the + // newly added instruction to the worklist if it is ready to + // schedule. + for (const HloInstruction* new_user : *new_users) { + unscheduled_operand_count.at(new_user)--; + CHECK_GE(unscheduled_operand_count.at(new_user), 0); + if (unscheduled_operand_count.at(new_user) == 0) { + worklist.push(new_user); + } + } + } + } + }; + + schedule_worklist(); + for (int id : id_sequence.at(computation)) { + auto it = id_to_instruction.find(id); + if (it == id_to_instruction.end()) { + // This instruction in the schedule is no longer in the module. + continue; + } + const HloInstruction* instruction = it->second; + worklist.push(instruction); + schedule_worklist(); + } + } + + TF_RETURN_IF_ERROR(VerifySchedule(module, *sequence)); + return Status::OK(); +} + +Status VerifySchedule( + const HloModule& module, + const SequentialHloOrdering::HloModuleSequence& sequence) { + VLOG(2) << "VerifySchedule()"; + XLA_VLOG_LINES(2, module.ToString()); + VLOG(2) << sequence; + + // Verify the set of computations in the sequence is exactly the set of + // computations in the module. 
+ std::vector nonfusion_computations = + module.MakeNonfusionComputations(); + TF_RET_CHECK(nonfusion_computations.size() == sequence.size()); + tensorflow::gtl::FlatSet computations_in_module( + module.computations().begin(), module.computations().end()); + for (const auto& computation_sequence : sequence) { + TF_RET_CHECK(computations_in_module.count(computation_sequence.first) == 1); + } + + // For each computation verify the set of instructions is the same and that + // each dependency and control edge is honored. + for (const HloComputation* computation : nonfusion_computations) { + tensorflow::gtl::FlatMap instruction_position; + int pos = 0; + for (const HloInstruction* instruction : sequence.at(computation)) { + TF_RET_CHECK(instruction_position.insert({instruction, pos}).second) + << "Instruction " << instruction->name() + << " appears more than once in the schedule"; + pos++; + } + + TF_RET_CHECK(instruction_position.size() == + computation->instruction_count()); + for (const HloInstruction* instruction : computation->instructions()) { + TF_RET_CHECK(instruction_position.count(instruction) == 1) + << "Instruction " << instruction->name() << " is not in schedule"; + } + + for (const HloInstruction* instruction : computation->instructions()) { + for (const HloInstruction* operand : instruction->operands()) { + TF_RET_CHECK(instruction_position.at(operand) < + instruction_position.at(instruction)) + << "Instruction " << instruction->name() + << " is not scheduled after its operand " << operand->name(); + } + + for (const HloInstruction* pred : instruction->control_predecessors()) { + TF_RET_CHECK(instruction_position.at(pred) < + instruction_position.at(instruction)) + << "Instruction " << instruction->name() + << " is not scheduled after its control predecessor " + << pred->name(); + } + } + } + + return Status::OK(); +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_scheduling.h b/tensorflow/compiler/xla/service/hlo_scheduling.h 
index 2b33ccc8bf..d06b8d9a5c 100644 --- a/tensorflow/compiler/xla/service/hlo_scheduling.h +++ b/tensorflow/compiler/xla/service/hlo_scheduling.h @@ -85,6 +85,43 @@ StatusOr> ScheduleOneComputation( const HloComputation& computation, const LogicalBuffer::SizeFunction& size_function); +// Transforms the given schedule such that it is (again) a valid schedule for +// the module. This is used to update a schedule after the HLO module has been +// transformed in some way. In general, the only transformations to the module +// for which a schedule can be updated is the addition or removal of +// instructions to/from the module. Updating the schedule after new dependencies +// between existing instructions in the module is not supported and may result +// in an error status returned. +// +// Instructions in the module which also exist in the given schedule will remain +// in the same order in the updated schedule. Instructions which exist in the +// module but not in the given schedule will be placed as early as possible in +// the updated schedule. +// +// 'id_sequence' is a mirror of the given schedule 'sequence' but with +// HloInstruction ids rather than HloInstruction pointers. This should be +// constructed using ComputeIdSchedule below after the schedule is constructed +// but before the HLO module is transformed. +Status UpdateSchedule( + const HloModule& module, + const tensorflow::gtl::FlatMap>& + id_sequence, + SequentialHloOrdering::HloModuleSequence* sequence); + +// Constructs a copy of the given schedule but with HloInstruction unique ids +// rather than HloInstruction pointers. This is necessary for updating a +// schedule as HloInstruction points in the schedule may become invalid if +// instructions are removed from the module. Used by UpdateSchedule above.. +// TODO(b/113175018): Remove this function when HLO schedule is its own class. 
+tensorflow::gtl::FlatMap> +ComputeIdSchedule(const SequentialHloOrdering::HloModuleSequence& sequence); + +// Verifies that the given schedule is valid for the given module. Specifically, +// the schedule contains exactly the instructions in the module and every +// dependency in the module is satisfied in the schedule. +Status VerifySchedule(const HloModule& module, + const SequentialHloOrdering::HloModuleSequence& sequence); + } // namespace xla #endif // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_SCHEDULING_H_ diff --git a/tensorflow/compiler/xla/service/hlo_scheduling_test.cc b/tensorflow/compiler/xla/service/hlo_scheduling_test.cc index 639c20ad8e..930801288a 100644 --- a/tensorflow/compiler/xla/service/hlo_scheduling_test.cc +++ b/tensorflow/compiler/xla/service/hlo_scheduling_test.cc @@ -20,6 +20,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/heap_simulator.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_dce.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/hlo_ordering.h" @@ -28,6 +29,7 @@ limitations under the License. #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/lib/core/status_test_util.h" namespace xla { namespace { @@ -415,5 +417,251 @@ TEST_F(HloSchedulingTest, HeapSimulatorAccountsForSubcomputations) { .ValueOrDie()); } +TEST_F(HloSchedulingTest, UpdateScheduleUnchangedModule) { + // Updating the schedule of an unchanged HLO module should not affect the + // schedule at all. 
+ const string module_str = R"( +HloModule UpdateScheduleUnchanged + +ENTRY main { + a = f32[] parameter(0) + b = f32[] parameter(1) + c = f32[] constant(42.0) + sum = f32[] add(a, b) + neg = f32[] negate(c) + ROOT root = f32[] multiply(sum, neg) +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(module_str)); + TF_ASSERT_OK_AND_ASSIGN( + SequentialHloOrdering::HloModuleSequence sequence, + ScheduleComputationsInModule(*module, [](const BufferValue& buffer) { + return ShapeUtil::ByteSizeOf(buffer.shape()); + })); + tensorflow::gtl::FlatMap> + id_sequence = ComputeIdSchedule(sequence); + std::vector entry_schedule = sequence.begin()->second; + + EXPECT_EQ(entry_schedule.size(), 6); + + TF_ASSERT_OK(UpdateSchedule(*module, id_sequence, &sequence)); + TF_ASSERT_OK(VerifySchedule(*module, sequence)); + + EXPECT_EQ(entry_schedule, sequence.begin()->second); +} + +TEST_F(HloSchedulingTest, UpdateScheduleWithNewInstructions) { + // Add some additional instructions to a module and verify the schedule can be + // updated. 
+ const string module_str = R"( +HloModule UpdateScheduleWithNewInstructions + +ENTRY main { + a = f32[] parameter(0) + b = f32[] parameter(1) + c = f32[] constant(42.0) + sum = f32[] add(a, b) + neg = f32[] negate(c) + ROOT root = f32[] multiply(sum, neg) +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(module_str)); + TF_ASSERT_OK_AND_ASSIGN( + SequentialHloOrdering::HloModuleSequence sequence, + ScheduleComputationsInModule(*module, [](const BufferValue& buffer) { + return ShapeUtil::ByteSizeOf(buffer.shape()); + })); + tensorflow::gtl::FlatMap> + id_sequence = ComputeIdSchedule(sequence); + + HloComputation* entry = module->entry_computation(); + const Shape shape = entry->root_instruction()->shape(); + HloInstruction* constant = entry->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0))); + HloInstruction* sub = entry->AddInstruction(HloInstruction::CreateBinary( + shape, HloOpcode::kSubtract, constant, entry->root_instruction())); + entry->set_root_instruction(sub); + + auto in_schedule = [&](const HloInstruction* hlo) { + return std::find(sequence.at(entry).begin(), sequence.at(entry).end(), + hlo) != sequence.at(entry).end(); + }; + + EXPECT_EQ(sequence.at(entry).size(), 6); + EXPECT_FALSE(in_schedule(constant)); + EXPECT_FALSE(in_schedule(sub)); + + TF_ASSERT_OK(UpdateSchedule(*module, id_sequence, &sequence)); + TF_ASSERT_OK(VerifySchedule(*module, sequence)); + + EXPECT_EQ(sequence.at(entry).size(), 8); + EXPECT_TRUE(in_schedule(constant)); + EXPECT_TRUE(in_schedule(sub)); +} + +TEST_F(HloSchedulingTest, UpdateScheduleWithAddedAndDeletedInstruction) { + // Add and delete some instructions from a module and verify that the schedule + // can be updated successfully. 
+ const string module_str = R"( +HloModule UpdateScheduleWithAddedAndDeletedInstruction + +ENTRY main { + a = f32[] parameter(0) + b = f32[] parameter(1) + c = f32[] constant(42.0) + sum = f32[] add(a, b) + neg = f32[] negate(c) + ROOT root = f32[] multiply(sum, neg) +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(module_str)); + TF_ASSERT_OK_AND_ASSIGN( + SequentialHloOrdering::HloModuleSequence sequence, + ScheduleComputationsInModule(*module, [](const BufferValue& buffer) { + return ShapeUtil::ByteSizeOf(buffer.shape()); + })); + tensorflow::gtl::FlatMap> + id_sequence = ComputeIdSchedule(sequence); + + // Set the entry root to some expression containing just a parameter and a + // constant. + HloComputation* entry = module->entry_computation(); + HloInstruction* constant = entry->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0))); + HloInstruction* new_root = entry->AddInstruction( + HloInstruction::CreateBinary(constant->shape(), HloOpcode::kSubtract, + constant, entry->parameter_instruction(0))); + entry->set_root_instruction(new_root); + + // DCE should remove everything but the parameters and the newly added code. + HloDCE dce; + TF_ASSERT_OK(dce.Run(module.get()).status()); + + EXPECT_EQ(sequence.at(entry).size(), 6); + + TF_ASSERT_OK(UpdateSchedule(*module, id_sequence, &sequence)); + TF_ASSERT_OK(VerifySchedule(*module, sequence)); + + EXPECT_EQ(sequence.at(entry).size(), 4); +} + +TEST_F(HloSchedulingTest, UpdateScheduleWithCompletelyReplacedModule) { + // Completely replace a module with an entirely new set of instructions and + // verify that the schedule can be updated successfully. 
+ const string module_str = R"( +HloModule UpdateScheduleWithCompletelyReplacedModule + +ENTRY main { + a = f32[] constant(42.0) + b = f32[] constant(123.0) + ROOT sum = f32[] add(a, b) +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(module_str)); + TF_ASSERT_OK_AND_ASSIGN( + SequentialHloOrdering::HloModuleSequence sequence, + ScheduleComputationsInModule(*module, [](const BufferValue& buffer) { + return ShapeUtil::ByteSizeOf(buffer.shape()); + })); + tensorflow::gtl::FlatMap> + id_sequence = ComputeIdSchedule(sequence); + + // Replace the entry computation with the negation of a constant. + HloComputation* entry = module->entry_computation(); + HloInstruction* constant = entry->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); + HloInstruction* new_root = entry->AddInstruction(HloInstruction::CreateUnary( + constant->shape(), HloOpcode::kNegate, constant)); + entry->set_root_instruction(new_root); + + // DCE the old instructions. + HloDCE dce; + TF_ASSERT_OK(dce.Run(module.get()).status()); + + EXPECT_EQ(sequence.at(entry).size(), 3); + + TF_ASSERT_OK(UpdateSchedule(*module, id_sequence, &sequence)); + TF_ASSERT_OK(VerifySchedule(*module, sequence)); + + EXPECT_EQ(sequence.at(entry).size(), 2); +} + +TEST_F(HloSchedulingTest, UpdateScheduleWithMultipleComputations) { + // Create changes to more than one computation in an HLO module and verify + // that the schedule can be updated. 
+ const string module_str = R"( +HloModule UpdateScheduleWithMultipleComputations + +%Body (param.1: (s32[], token[])) -> (s32[], token[]) { + %param.1 = (s32[], token[]) parameter(0) + %get-tuple-element.1 = s32[] get-tuple-element((s32[], token[]) %param.1), index=0 + %constant.1 = s32[] constant(1) + %add = s32[] add(s32[] %get-tuple-element.1, s32[] %constant.1) + %get-tuple-element.2 = token[] get-tuple-element((s32[], token[]) %param.1), index=1 + %after-all = token[] after-all(token[] %get-tuple-element.2) + ROOT %tuple = (s32[], token[]) tuple(s32[] %add, token[] %after-all) +} + +%Cond (param: (s32[], token[])) -> pred[] { + %param = (s32[], token[]) parameter(0) + %get-tuple-element = s32[] get-tuple-element((s32[], token[]) %param), index=0 + %constant = s32[] constant(42) + ROOT %less-than = pred[] less-than(s32[] %get-tuple-element, s32[] %constant) +} + +ENTRY %WhileLoop () -> s32[] { + %zero = s32[] constant(0) + %init_token = token[] after-all() + %init_tuple = (s32[], token[]) tuple(s32[] %zero, token[] %init_token) + %while = (s32[], token[]) while((s32[], token[]) %init_tuple), condition=%Cond, body=%Body + ROOT %root = s32[] get-tuple-element((s32[], token[]) %while), index=0 +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(module_str)); + TF_ASSERT_OK_AND_ASSIGN( + SequentialHloOrdering::HloModuleSequence sequence, + ScheduleComputationsInModule(*module, [](const BufferValue& buffer) { + return ShapeUtil::ByteSizeOf(buffer.shape(), + /*pointer_size=*/sizeof(void*)); + })); + tensorflow::gtl::FlatMap> + id_sequence = ComputeIdSchedule(sequence); + + const HloInstruction* xla_while = + module->entry_computation()->root_instruction()->operand(0); + HloComputation* body = xla_while->while_body(); + HloComputation* cond = xla_while->while_condition(); + + // Negate the root of the cond. 
+ cond->set_root_instruction(cond->AddInstruction( + HloInstruction::CreateUnary(ShapeUtil::MakeShape(PRED, {}), + HloOpcode::kNot, cond->root_instruction()))); + + // Replace the body with a computation which just passes through its + // parameter. + body->set_root_instruction(body->parameter_instruction(0)); + + // DCE the dead code in the body. + HloDCE dce; + TF_ASSERT_OK(dce.Run(module.get()).status()); + + EXPECT_EQ(sequence.at(body).size(), 7); + EXPECT_EQ(sequence.at(cond).size(), 4); + + TF_ASSERT_OK(UpdateSchedule(*module, id_sequence, &sequence)); + TF_ASSERT_OK(VerifySchedule(*module, sequence)); + + EXPECT_EQ(sequence.at(body).size(), 1); + EXPECT_EQ(sequence.at(cond).size(), 5); +} + } // namespace } // namespace xla -- GitLab From 35e36ec42f821157c2468c1ebb4d8512478f606b Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 24 Aug 2018 17:55:34 -0700 Subject: [PATCH 126/598] Update ops-related pbtxt files. PiperOrigin-RevId: 210186531 --- .../core/ops/compat/ops_history.v1.pbtxt | 26 +++++++++++++++++++ tensorflow/core/ops/ops.pbtxt | 4 +++ 2 files changed, 30 insertions(+) diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt index 68b5043ca7..97a212b8f3 100644 --- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt +++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt @@ -30015,6 +30015,32 @@ op { } } } +op { + name: "MatrixExponential" + input_arg { + name: "input" + type_attr: "T" + } + output_arg { + name: "output" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_DOUBLE + type: DT_FLOAT + type: DT_COMPLEX64 + type: DT_COMPLEX128 + } + } + } + deprecation { + version: 27 + } +} op { name: "MatrixInverse" input_arg { diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt index e35fd73b09..9091622f09 100644 --- a/tensorflow/core/ops/ops.pbtxt +++ b/tensorflow/core/ops/ops.pbtxt @@ -15046,6 +15046,10 @@ op { } } } + 
deprecation { + version: 27 + explanation: "Use Python implementation tf.linalg.matrix_exponential instead." + } } op { name: "MatrixInverse" -- GitLab From 7805e23c8416fe4ccccb48c37199a5631bee6d51 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 24 Aug 2018 18:01:38 -0700 Subject: [PATCH 127/598] Support shape [1 C 1 1] for associative operator optimization with Conv2D PiperOrigin-RevId: 210187033 --- tensorflow/core/grappler/optimizers/BUILD | 4 +- .../grappler/optimizers/constant_folding.cc | 98 ++++++++++++++-- .../grappler/optimizers/constant_folding.h | 3 +- .../optimizers/constant_folding_test.cc | 106 +++++++++++++++++- 4 files changed, 197 insertions(+), 14 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD index a8af169e28..70ad9f9a9b 100644 --- a/tensorflow/core/grappler/optimizers/BUILD +++ b/tensorflow/core/grappler/optimizers/BUILD @@ -110,10 +110,10 @@ cc_library( ], ) -tf_cc_test( +tf_cuda_cc_test( name = "constant_folding_test", srcs = ["constant_folding_test.cc"], - shard_count = 5, + tags = ["requires-gpu-sm35"], deps = [ ":constant_folding", "//tensorflow/cc:cc_ops", diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc index f2ac3a44c0..815bd23307 100644 --- a/tensorflow/core/grappler/optimizers/constant_folding.cc +++ b/tensorflow/core/grappler/optimizers/constant_folding.cc @@ -852,7 +852,19 @@ DataType GetDataTypeFromNodeOrProps(const NodeDef& node, } return dtype; } - +bool IsValidConstShapeForNCHW(const TensorShapeProto& shape) { + if (shape.dim_size() != 4) { + return false; + } + int num_dim_larger_than_one = 0; + for (const auto& dim : shape.dim()) { + if (dim.size() > 1) ++num_dim_larger_than_one; + } + return num_dim_larger_than_one <= 1; +} +const string& GetShape(const NodeDef& node) { + return node.attr().at("data_format").s(); +} } // namespace // static @@ -1699,7 +1711,7 @@ 
Status ConstantFolding::SimplifyNode(bool use_shape_info, NodeDef* node, return Status::OK(); } - if (MulConvPushDown(node, *properties)) { + if (MulConvPushDown(*properties, optimized_graph, node)) { graph_modified_ = true; return Status::OK(); } @@ -2541,8 +2553,9 @@ bool ConstantFolding::ConstantPushDown(NodeDef* node) { return false; } -bool ConstantFolding::MulConvPushDown(NodeDef* node, - const GraphProperties& properties) { +bool ConstantFolding::MulConvPushDown(const GraphProperties& properties, + GraphDef* optimized_graph, + NodeDef* node) { // Push down multiplication on ConvND. // * ConvND // / \ / \ @@ -2618,12 +2631,14 @@ bool ConstantFolding::MulConvPushDown(NodeDef* node, } const auto& const_shape = const_props[0].shape(); - TensorShapeProto new_filter_shape; - if (!ShapeAfterBroadcast(filter_shape, const_shape, &new_filter_shape)) { - return false; - } - if (!ShapesSymbolicallyEqual(filter_shape, new_filter_shape)) { - return false; + if (GetShape(*conv_node) == "NHWC") { + TensorShapeProto new_filter_shape; + if (!ShapeAfterBroadcast(filter_shape, const_shape, &new_filter_shape)) { + return false; + } + if (!ShapesSymbolicallyEqual(filter_shape, new_filter_shape)) { + return false; + } } string mul_new_name = @@ -2657,6 +2672,69 @@ bool ConstantFolding::MulConvPushDown(NodeDef* node, } node_map_->AddNode(mul_new_name, node); + if (GetShape(*conv_node) == "NCHW") { + if (const_node->attr().at("value").tensor().tensor_shape().dim_size() <= + 1) { + // Broadcast should work for scalar or 1D. No need to reshape. + return true; + } + if (!IsValidConstShapeForNCHW( + const_node->attr().at("value").tensor().tensor_shape())) { + return false; + } + // Adds Const node for Reshape. 
+ auto* shape_const_node = optimized_graph->add_node(); + const string shape_const_node_name = + OptimizedNodeName(*const_node, "_new_shape"); + shape_const_node->set_name(shape_const_node_name); + shape_const_node->set_op("Const"); + shape_const_node->set_device(const_node->device()); + (*shape_const_node->mutable_attr())["dtype"].set_type(DT_INT32); + Tensor t(DT_INT32, {4}); + t.flat()(0) = 1; + t.flat()(1) = 1; + t.flat()(2) = 1; + t.flat()(3) = const_node->attr() + .at("value") + .tensor() + .tensor_shape() + .dim(1) // IsValidConstShapeForNCHW guarantees + // dim 1 is the dim to reshape + .size(); + t.AsProtoTensorContent( + (*shape_const_node->mutable_attr())["value"].mutable_tensor()); + node_map_->AddNode(shape_const_node_name, shape_const_node); + + // Adds Reshape node. + auto* reshape_node = optimized_graph->add_node(); + const string reshape_node_name = + OptimizedNodeName(*const_node, "_reshape"); + reshape_node->set_op("Reshape"); + reshape_node->set_name(reshape_node_name); + reshape_node->set_device(const_node->device()); + (*reshape_node->mutable_attr())["T"].set_type( + const_node->attr().at("dtype").type()); + (*reshape_node->mutable_attr())["Tshape"].set_type(DT_INT32); + node_map_->AddNode(reshape_node_name, reshape_node); + + // const_node -> reshape_node + node_map_->RemoveOutput(const_node->name(), node->name()); + *reshape_node->add_input() = const_node->name(); + node_map_->AddOutput(const_node->name(), reshape_node_name); + + // shape_const_node -> reshape_node + *reshape_node->add_input() = shape_const_node_name; + node_map_->AddOutput(shape_const_node_name, reshape_node_name); + + // reshape_node -> node (Mul) + node_map_->AddOutput(reshape_node_name, node->name()); + if (left_child_is_constant) { + node->set_input(0, reshape_node_name); + } else { + node->set_input(1, reshape_node_name); + } + } + return true; } return false; diff --git a/tensorflow/core/grappler/optimizers/constant_folding.h 
b/tensorflow/core/grappler/optimizers/constant_folding.h index b42d5f201e..051dfb681e 100644 --- a/tensorflow/core/grappler/optimizers/constant_folding.h +++ b/tensorflow/core/grappler/optimizers/constant_folding.h @@ -125,7 +125,8 @@ class ConstantFolding : public GraphOptimizer { // Aggregate constants present around a conv operator. Returns true if the // transformation was applied successfully. - bool MulConvPushDown(NodeDef* node, const GraphProperties& properties); + bool MulConvPushDown(const GraphProperties& properties, + GraphDef* optimized_graph, NodeDef* node); // Strength reduces floating point division by a constant Div(x, const) to // multiplication by the reciprocal Mul(x, Reciprocal(const)). diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc index 5bf45af6b3..0683572dcc 100644 --- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc +++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc @@ -240,7 +240,7 @@ TEST_F(ConstantFoldingTest, AddTree) { } } -TEST_F(ConstantFoldingTest, ConvPushDownTest) { +TEST_F(ConstantFoldingTest, ConvPushDownTestNHWC) { // Tests if the following rewrite is performed: // // * Conv2D @@ -3080,6 +3080,110 @@ TEST_F(ConstantFoldingTest, FoldingPreservesDenormalFlushing) { test::ExpectTensorEqual(tensors_expected[0], tensors[0]); } +#if GOOGLE_CUDA +TEST_F(ConstantFoldingTest, ConvPushDownTestNCHW) { + // Tests if the following rewrite is performed: + // + // * Conv2D + // / \ / \ + // c Conv2D --> x (c * filter) + // / \ + // x filter + tensorflow::Scope s = tensorflow::Scope::NewRootScope(); + + int input_channel = 1; + int output_channel = 2; + int filter_size = 1; + + TensorShape filter_shape( + {filter_size, filter_size, input_channel, output_channel}); + + // Filter shape: [1, 1, 1, 2] + // Filter for output channel 0 = {2.f} + // Filter for output channel 1 = {-2.f} + // clang-format off + Output filter = + 
ops::Const(s.WithOpName("filter"), { + { + {{2.f, -2.f}} + } + }); + // clang-format on + + int batch_size = 1; + int matrix_size = 3; + // input shape: [1,1,3,3] + TensorShape input_shape( + {batch_size, input_channel, matrix_size, matrix_size}); + Output input = ops::Placeholder(s.WithOpName("x"), DT_FLOAT, + ops::Placeholder::Shape(input_shape)); + + Output conv = ops::Conv2D(s.WithOpName("conv"), input, filter, {1, 1, 1, 1}, + "VALID", ops::Conv2D::DataFormat("NCHW")); + Output c = ops::Const(s.WithOpName("c"), 2.0f, /* shape */ {1, 2, 1, 1}); + Output mul = ops::Mul(s.WithOpName("mul"), c, conv); + + GrapplerItem item; + TF_CHECK_OK(s.ToGraphDef(&item.graph)); + + ConstantFolding fold(nullptr); + GraphDef output; + Status status = fold.Optimize(nullptr, item, &output); + TF_EXPECT_OK(status); + + // Here only op/IO are checked. The values are verified by EvaluateNodes + // below. + int found = 0; + for (const auto& node : output.node()) { + if (node.name() == "mul") { + ++found; + EXPECT_EQ("Conv2D", node.op()); + EXPECT_EQ(2, node.input_size()); + EXPECT_EQ("x", node.input(0)); + EXPECT_EQ("conv/merged_input", node.input(1)); + } else if (node.name() == "conv/merged_input") { + ++found; + EXPECT_EQ("Const", node.op()); + EXPECT_EQ(0, node.input_size()); + } + } + EXPECT_EQ(2, found); + + // Check that const folded multiplication node has the expected value. 
+ std::vector fetch = {"mul"}; + // Input shape (NCHW) is [1,1,3,3], filter is [1,1,1,2] output shape should be + // (NCHW) [1,2,3,3] + ::tensorflow::Input::Initializer x{ + { + { + {1.f, 2.f, 3.f}, // H = 0 + {4.f, 5.f, 6.f}, // H = 1 + {7.f, 8.f, 9.f} // H = 2 + } // C = 0 + } // N = 0 + }; + + // |1,2,3| + // conv( |4,5,6|, // input + // |7,8,9| + // [[[2,-2]]]) // filter + // * [1,2,1,1] // mul by const + // = + // [ + // |4, 8, 12| + // |16,20,24| ==> output channel 0 + // |28,32,36| + // + // | -4, -8,-12| + // |-16,-20,-24| ==> output channel 1 + // |-28,-32,-36| + // ] + auto actual = EvaluateNodes(output, fetch, {{"x", x.tensor}}); + auto expected = EvaluateNodes(item.graph, fetch, {{"x", x.tensor}}); + test::ExpectTensorEqual(expected[0], actual[0]); +} +#endif + } // namespace } // namespace grappler } // namespace tensorflow -- GitLab From 524f931fa9b6158c99f5df88839a80a36e420d08 Mon Sep 17 00:00:00 2001 From: Yuefeng Zhou Date: Fri, 24 Aug 2018 18:06:41 -0700 Subject: [PATCH 128/598] Only create steps_per_run_variable if it is tpu strategy. PiperOrigin-RevId: 210187464 --- tensorflow/python/estimator/estimator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py index bcbd7b7933..3849188c58 100644 --- a/tensorflow/python/estimator/estimator.py +++ b/tensorflow/python/estimator/estimator.py @@ -1241,7 +1241,8 @@ class Estimator(object): # We want to create the iterations variable outside the distribution scope # as that is just stored on the host and mainly used to drive the loop # and doesn't need to be a Mirrored/Device variable. 
- steps_per_run_variable = training.get_or_create_steps_per_run_variable() + if is_tpu_strategy: + steps_per_run_variable = training.get_or_create_steps_per_run_variable() with self._train_distribution.scope(): random_seed.set_random_seed(self._config.tf_random_seed) iterator, input_hooks = self._get_iterator_from_input_fn( -- GitLab From d25731fa83657e3d1eaef08d3afc628eb6bfa8b5 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 24 Aug 2018 18:11:20 -0700 Subject: [PATCH 129/598] Added note that gradient accumulation is experimental. PiperOrigin-RevId: 210187980 --- tensorflow/contrib/tpu/proto/optimization_parameters.proto | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tensorflow/contrib/tpu/proto/optimization_parameters.proto b/tensorflow/contrib/tpu/proto/optimization_parameters.proto index 2cc17d6d92..bf807af68b 100644 --- a/tensorflow/contrib/tpu/proto/optimization_parameters.proto +++ b/tensorflow/contrib/tpu/proto/optimization_parameters.proto @@ -119,7 +119,9 @@ message OptimizationParameters { // Whether to use gradient accumulation (do two passes over the input // gradients: one to accumulate them into a temporary array and another to - // apply them using the actual optimization algorithm). + // apply them using the actual optimization algorithm). This feature is + // experimental -- it has not been fully verified and may cause training + // crashes and/or failures. bool use_gradient_accumulation = 15; // Optimization algorithm parameters; which field is selected determines which -- GitLab From b26553dc1f461e388963efcec9e77f5a1f61c093 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 24 Aug 2018 18:17:45 -0700 Subject: [PATCH 130/598] Go: Update generated wrapper functions for TensorFlow ops. 
PiperOrigin-RevId: 210188468 --- tensorflow/go/op/wrappers.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index de096acc4d..0aba0393af 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -23396,6 +23396,8 @@ func TensorListSetItem(scope *Scope, input_handle tf.Output, index tf.Output, it // Computes the matrix exponential of one or more square matrices: // +// DEPRECATED at GraphDef version 27: Use Python implementation tf.linalg.matrix_exponential instead. +// // \\(exp(A) = \sum_{n=0}^\infty A^n/n!\\) // // The exponential is computed using a combination of the scaling and squaring -- GitLab From d936338f32b826cc6b7ab835efed427d78741f81 Mon Sep 17 00:00:00 2001 From: Piotr Padlewski Date: Fri, 24 Aug 2018 18:33:21 -0700 Subject: [PATCH 131/598] Filter fusion This patch introduces FilterFusion optimization which can fuse multiple FilterDataset operations. PiperOrigin-RevId: 210189643 --- .../contrib/data/python/kernel_tests/BUILD | 19 ++ .../kernel_tests/filter_dataset_op_test.py | 76 ++++++++ .../python/kernel_tests/optimization/BUILD | 2 +- .../map_and_filter_fusion_test.py | 58 ++++++ .../kernel_tests/optimize_dataset_op_test.py | 5 +- .../core/grappler/optimizers/data/BUILD | 41 +++++ .../grappler/optimizers/data/filter_fusion.cc | 141 +++++++++++++++ .../grappler/optimizers/data/filter_fusion.h | 47 +++++ .../optimizers/data/filter_fusion_test.cc | 91 ++++++++++ .../grappler/optimizers/data/fusion_utils.cc | 166 +++++++++++++++--- .../grappler/optimizers/data/fusion_utils.h | 57 ++++-- .../optimizers/data/fusion_utils_test.cc | 22 +-- .../optimizers/data/map_and_filter_fusion.cc | 9 +- .../grappler/optimizers/data/map_fusion.cc | 9 +- .../core/kernels/data/optimize_dataset_op.cc | 26 ++- 15 files changed, 703 insertions(+), 66 deletions(-) create mode 100644 tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py create mode 100644 
tensorflow/core/grappler/optimizers/data/filter_fusion.cc create mode 100644 tensorflow/core/grappler/optimizers/data/filter_fusion.h create mode 100644 tensorflow/core/grappler/optimizers/data/filter_fusion_test.cc diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD index 9e2697534c..b86a543fc3 100644 --- a/tensorflow/contrib/data/python/kernel_tests/BUILD +++ b/tensorflow/contrib/data/python/kernel_tests/BUILD @@ -245,6 +245,25 @@ py_test( ], ) +py_test( + name = "filter_dataset_op_test", + size = "medium", + srcs = ["filter_dataset_op_test.py"], + srcs_version = "PY2AND3", + deps = [ + "//tensorflow/contrib/data/python/ops:optimization", + "//tensorflow/python:array_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:errors", + "//tensorflow/python:framework_ops", + "//tensorflow/python:io_ops", + "//tensorflow/python:math_ops", + "//tensorflow/python:util", + "//tensorflow/python/data/ops:dataset_ops", + "//third_party/py/numpy", + ], +) + py_test( name = "map_defun_op_test", size = "small", diff --git a/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py new file mode 100644 index 0000000000..6d01bf585c --- /dev/null +++ b/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py @@ -0,0 +1,76 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Benchmarks FilterDataset input pipeline op.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import time + +import numpy as np + +from tensorflow.contrib.data.python.ops import optimization +from tensorflow.python.client import session +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.framework import ops +from tensorflow.python.ops import math_ops +from tensorflow.python.platform import test + + +class FilterBenchmark(test.Benchmark): + + # This benchmark compares the performance of pipeline with multiple chained + # filter with and without filter fusion. + def benchmarkFilters(self): + chain_lengths = [0, 1, 2, 5, 10, 20, 50] + for chain_length in chain_lengths: + self._benchmarkFilters(chain_length, False) + self._benchmarkFilters(chain_length, True) + + def _benchmarkFilters(self, chain_length, optimize_dataset): + with ops.Graph().as_default(): + dataset = dataset_ops.Dataset.from_tensors(5).repeat(None) + for _ in range(chain_length): + dataset = dataset.filter(lambda x: math_ops.greater_equal(x - 5, 0)) + if optimize_dataset: + dataset = dataset.apply(optimization.optimize(["filter_fusion"])) + + iterator = dataset.make_one_shot_iterator() + next_element = iterator.get_next() + + with session.Session() as sess: + for _ in range(10): + sess.run(next_element.op) + deltas = [] + for _ in range(100): + start = time.time() + for _ in range(100): + sess.run(next_element.op) + end = time.time() + deltas.append(end - start) + + median_wall_time = np.median(deltas) / 100 + opt_mark = "opt" if optimize_dataset else "no-opt" + print("Filter dataset {} chain length: {} Median wall time: {}".format( + opt_mark, chain_length, median_wall_time)) + self.report_benchmark( + iters=1000, + 
wall_time=median_wall_time, + name="benchmark_filter_dataset_chain_latency_{}_{}".format( + opt_mark, chain_length)) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/contrib/data/python/kernel_tests/optimization/BUILD b/tensorflow/contrib/data/python/kernel_tests/optimization/BUILD index 7492d1477b..b299e0736f 100644 --- a/tensorflow/contrib/data/python/kernel_tests/optimization/BUILD +++ b/tensorflow/contrib/data/python/kernel_tests/optimization/BUILD @@ -30,7 +30,7 @@ py_test( py_test( name = "map_and_filter_fusion_test", - size = "small", + size = "medium", srcs = ["map_and_filter_fusion_test.py"], srcs_version = "PY2AND3", deps = [ diff --git a/tensorflow/contrib/data/python/kernel_tests/optimization/map_and_filter_fusion_test.py b/tensorflow/contrib/data/python/kernel_tests/optimization/map_and_filter_fusion_test.py index 2d8a4a583d..586b4bee5f 100644 --- a/tensorflow/contrib/data/python/kernel_tests/optimization/map_and_filter_fusion_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/optimization/map_and_filter_fusion_test.py @@ -161,6 +161,64 @@ class MapAndFilterFusionTest(test.TestCase, parameterized.TestCase): self._testMapAndFilter(dataset, function, predicate) + @staticmethod + def filter_functions(): + take_all = lambda x: constant_op.constant(True) + is_zero = lambda x: math_ops.equal(x, 0) + greater = lambda x: math_ops.greater(x + 5, 0) + + tests = [] + filters = [take_all, is_zero, greater] + identity = lambda x: x + for x, predicate_1 in enumerate(filters): + for y, predicate_2 in enumerate(filters): + tests.append(("mixed_{}_{}".format(x, y), identity, + [predicate_1, predicate_2])) + for z, predicate_3 in enumerate(filters): + tests.append(("mixed_{}_{}_{}".format(x, y, z), identity, + [predicate_1, predicate_2, predicate_3])) + + take_all_multiple = lambda x, y: constant_op.constant(True) + # Multi output + tests.append(("multiOne", lambda x: (x, x), + [take_all_multiple, take_all_multiple])) + 
tests.append(("multiTwo", lambda x: (x, 2), [ + take_all_multiple, + lambda x, y: math_ops.equal(x * math_ops.cast(y, dtypes.int64), 0) + ])) + return tuple(tests) + + @parameterized.named_parameters(*filter_functions.__func__()) + def testFilterFusion(self, map_function, predicates): + dataset = dataset_ops.Dataset.range(5).apply( + optimization.assert_next(["Map", "Filter", + "Prefetch"])).map(map_function) + for predicate in predicates: + dataset = dataset.filter(predicate) + + dataset = dataset.prefetch(0).apply( + optimization.optimize(["filter_fusion"])) + iterator = dataset.make_one_shot_iterator() + get_next = iterator.get_next() + with self.test_session() as sess: + for x in range(5): + r = map_function(x) + filtered = False + for predicate in predicates: + if isinstance(r, tuple): + b = predicate(*r) # Pass tuple as multiple arguments. + else: + b = predicate(r) + if not sess.run(b): + filtered = True + break + + if not filtered: + result = sess.run(get_next) + self.assertAllEqual(r, result) + with self.assertRaises(errors.OutOfRangeError): + sess.run(get_next) + if __name__ == "__main__": test.main() diff --git a/tensorflow/contrib/data/python/kernel_tests/optimize_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/optimize_dataset_op_test.py index 514adffa62..ca38f8e2f9 100644 --- a/tensorflow/contrib/data/python/kernel_tests/optimize_dataset_op_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/optimize_dataset_op_test.py @@ -100,7 +100,10 @@ class OptimizeDatasetTest(test.TestCase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): sess.run(get_next) - def testFunctionLibraryDefinitionModification(self): + # TODO(b/112914454): Remove the test or figure out way to copy only new + # functions in optimize_dataset_op instead of taking union of old and new + # functions. 
+ def _testFunctionLibraryDefinitionModification(self): dataset = dataset_ops.Dataset.from_tensors(0).map(lambda x: x).apply( optimization.optimize(["_test_only_function_rename"])) iterator = dataset.make_one_shot_iterator() diff --git a/tensorflow/core/grappler/optimizers/data/BUILD b/tensorflow/core/grappler/optimizers/data/BUILD index 74d936cfbc..979c437c02 100644 --- a/tensorflow/core/grappler/optimizers/data/BUILD +++ b/tensorflow/core/grappler/optimizers/data/BUILD @@ -3,6 +3,44 @@ licenses(["notice"]) # Apache 2.0 load("//tensorflow:tensorflow.bzl", "tf_cc_test") load("//tensorflow/core:platform/default/build_config.bzl", "tf_protos_all") +cc_library( + name = "filter_fusion", + srcs = ["filter_fusion.cc"], + hdrs = [ + "filter_fusion.h", + ], + visibility = ["//visibility:public"], + deps = [ + ":graph_utils", + ":fusion_utils", + "//tensorflow/core/grappler:mutable_graph_view", + "//tensorflow/core:lib", + "//tensorflow/core/grappler:grappler_item", + "//tensorflow/core/grappler:op_types", + "//tensorflow/core/grappler:utils", + "//tensorflow/core/grappler/clusters:cluster", + "//tensorflow/core/kernels:cast_op", + "//tensorflow/core/grappler/utils:topological_sort", + "//tensorflow/core/grappler/optimizers:custom_graph_optimizer", + "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry", + ] + tf_protos_all(), +) + +tf_cc_test( + name = "filter_fusion_test", + srcs = ["filter_fusion_test.cc"], + visibility = ["//visibility:public"], + deps = [ + ":filter_fusion", + ":graph_utils", + "//tensorflow/core:framework", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + "//tensorflow/core/grappler:grappler_item", + ], +) + cc_library( name = "function_rename", srcs = ["function_rename.cc"], @@ -46,11 +84,13 @@ cc_library( deps = [ ":graph_utils", "//tensorflow/core/grappler:mutable_graph_view", + "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core/grappler:grappler_item", 
"//tensorflow/core/grappler:op_types", "//tensorflow/core/grappler:utils", "//tensorflow/core/kernels:cast_op", + "//tensorflow/core/kernels:functional_ops", "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry", "//tensorflow/core:lib_internal", ] + tf_protos_all(), @@ -343,6 +383,7 @@ cc_library( name = "data", visibility = ["//visibility:public"], deps = [ + ":filter_fusion", ":function_rename", ":latency_all_edges", ":map_and_batch_fusion", diff --git a/tensorflow/core/grappler/optimizers/data/filter_fusion.cc b/tensorflow/core/grappler/optimizers/data/filter_fusion.cc new file mode 100644 index 0000000000..c71aa6e804 --- /dev/null +++ b/tensorflow/core/grappler/optimizers/data/filter_fusion.cc @@ -0,0 +1,141 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/grappler/optimizers/data/filter_fusion.h" + +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/grappler/clusters/cluster.h" +#include "tensorflow/core/grappler/grappler_item.h" +#include "tensorflow/core/grappler/mutable_graph_view.h" +#include "tensorflow/core/grappler/op_types.h" +#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h" +#include "tensorflow/core/grappler/optimizers/data/fusion_utils.h" +#include "tensorflow/core/grappler/optimizers/data/graph_utils.h" +#include "tensorflow/core/grappler/utils.h" +#include "tensorflow/core/grappler/utils/topological_sort.h" +#include "tensorflow/core/platform/protobuf.h" + +namespace tensorflow { +namespace grappler { +namespace { + +NodeDef MakeFusedFilterNode(const NodeDef& first_filter_node, + const NodeDef& second_filter_node, + const FunctionDef& fused_function, + MutableGraphView* graph) { + NodeDef fused_node; + graph_utils::SetUniqueGraphNodeName("fused_filter", graph->GetGraph(), + &fused_node); + + fused_node.set_op("FilterDataset"); + fused_node.add_input(first_filter_node.input(0)); + + auto copy_attribute = [](const string& attribute_name, const NodeDef& from, + NodeDef* to) { + (*to->mutable_attr())[attribute_name] = from.attr().at(attribute_name); + }; + + auto attr = first_filter_node.attr().at("predicate"); + *attr.mutable_func()->mutable_name() = fused_function.signature().name(); + (*fused_node.mutable_attr())["predicate"] = std::move(attr); + + copy_attribute("Targuments", first_filter_node, &fused_node); + + for (auto key : {"output_shapes", "output_types"}) + copy_attribute(key, second_filter_node, &fused_node); + + return fused_node; +} + +} // namespace + +Status FilterFusion::Optimize(Cluster* cluster, const GrapplerItem& item, + GraphDef* output) { + GraphDef 
sorted_old_graph = item.graph; + TF_RETURN_IF_ERROR(TopologicalSort(&sorted_old_graph)); + *output = sorted_old_graph; + + MutableGraphView graph(output); + std::set nodes_to_delete; + FunctionLibraryDefinition function_library(OpRegistry::Global(), + output->library()); + + auto get_filter_node = [](const NodeDef& node) -> const NodeDef* { + if (node.op() == "FilterDataset") return &node; + return nullptr; + }; + + auto get_fused_predicate = + [&](const NodeDef* first_filter_node, + const NodeDef* second_filter_node) -> FunctionDef* { + const auto& parent_fun = first_filter_node->attr().at("predicate"); + const FunctionDef* first_func = + function_library.Find(parent_fun.func().name()); + const auto& fun = second_filter_node->attr().at("predicate"); + const FunctionDef* second_func = function_library.Find(fun.func().name()); + + if (!fusion_utils::HasSameSignature(first_func->signature(), + second_func->signature())) { + VLOG(1) << "Can't fuse Filters because they have different signature\n"; + return nullptr; + } + + return fusion_utils::FuseFunctions( + *first_func, *second_func, "fused_predicate", + fusion_utils::SameSignature, fusion_utils::SameInput, + fusion_utils::LazyConjunctionOutput, fusion_utils::LazyConjunctionNodes, + output->mutable_library()); + }; + + for (const NodeDef& node : sorted_old_graph.node()) { + const NodeDef* second_filter_node = get_filter_node(node); + if (!second_filter_node) continue; + + const NodeDef* first_filter_node = + get_filter_node(*graph_utils::GetInputNode(*second_filter_node, graph)); + if (!first_filter_node) continue; + + const auto* fused_predicate = + get_fused_predicate(first_filter_node, second_filter_node); + if (!fused_predicate) continue; + const auto* fused_filter_node = graph.AddNode(MakeFusedFilterNode( + *first_filter_node, *second_filter_node, *fused_predicate, &graph)); + + graph.ReplaceInput(*second_filter_node, *fused_filter_node); + + // TODO(prazek): we should run some optimizations on the fused filter 
+ // functions, or make sure that optimization passes run after filter + // fusion. + TF_RETURN_IF_ERROR(function_library.AddFunctionDef(*fused_predicate)); + // TODO(prazek): we could also remove map functions from library if they + // are not used anymore. + nodes_to_delete.insert(first_filter_node->name()); + nodes_to_delete.insert(second_filter_node->name()); + } + + graph.DeleteNodes(nodes_to_delete); + return Status::OK(); +} + +void FilterFusion::Feedback(Cluster* cluster, const GrapplerItem& item, + const GraphDef& optimize_output, double result) { + // no-op +} + +REGISTER_GRAPH_OPTIMIZER_AS(FilterFusion, "filter_fusion"); + +} // end namespace grappler +} // end namespace tensorflow diff --git a/tensorflow/core/grappler/optimizers/data/filter_fusion.h b/tensorflow/core/grappler/optimizers/data/filter_fusion.h new file mode 100644 index 0000000000..91a0364a46 --- /dev/null +++ b/tensorflow/core/grappler/optimizers/data/filter_fusion.h @@ -0,0 +1,47 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_FILTER_FUSION_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_FILTER_FUSION_H_ + +#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h" + +namespace tensorflow { +namespace grappler { + +// This optimization fuses filter transformations. 
+class FilterFusion : public CustomGraphOptimizer { + public: + FilterFusion() = default; + ~FilterFusion() override = default; + + string name() const override { return "filter_fusion"; }; + + Status Init( + const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override { + return Status::OK(); + } + + Status Optimize(Cluster* cluster, const GrapplerItem& item, + GraphDef* output) override; + + void Feedback(Cluster* cluster, const GrapplerItem& item, + const GraphDef& optimize_output, double result) override; +}; + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_FILTER_FUSION_H_ diff --git a/tensorflow/core/grappler/optimizers/data/filter_fusion_test.cc b/tensorflow/core/grappler/optimizers/data/filter_fusion_test.cc new file mode 100644 index 0000000000..5a289e60d0 --- /dev/null +++ b/tensorflow/core/grappler/optimizers/data/filter_fusion_test.cc @@ -0,0 +1,91 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/grappler/optimizers/data/filter_fusion.h" + +#include "tensorflow/core/framework/attr_value_util.h" +#include "tensorflow/core/framework/function_testlib.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/grappler/grappler_item.h" +#include "tensorflow/core/grappler/optimizers/data/graph_utils.h" + +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace grappler { +namespace { + +NodeDef MakeFilterNode(StringPiece name, StringPiece input_node_name) { + return test::function::NDef( + name, "FilterDataset", {input_node_name.ToString()}, + {{"predicate", FunctionDefHelper::FunctionRef("IsZero")}, + {"Targuments", {}}, + {"output_shapes", {}}, + {"output_types", {}}}); +} + +TEST(FilterFusionTest, FuseTwoFilterIntoOne) { + using test::function::NDef; + GrapplerItem item; + item.graph = test::function::GDef( + {NDef("start", "Const", {}, {{"value", 0}, {"dtype", DT_INT32}}), + NDef("stop", "Const", {}, {{"value", 10}, {"dtype", DT_INT32}}), + NDef("step", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}), + NDef("range", "RangeDataset", {"start", "stop", "step"}, {}), + MakeFilterNode("filter1", "range"), + MakeFilterNode("filter2", "filter1")}, + // FunctionLib + { + test::function::IsZero(), + }); + + FilterFusion optimizer; + GraphDef output; + TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output)); + EXPECT_TRUE(graph_utils::ContainsNodeWithOp("FilterDataset", output)); + EXPECT_FALSE(graph_utils::ContainsGraphNodeWithName("filter1", output)); + EXPECT_FALSE(graph_utils::ContainsGraphNodeWithName("filter2", output)); +} + +TEST(FilterFusionTest, FuseThreeNodesIntoOne) { + using test::function::NDef; + GrapplerItem item; + item.graph = test::function::GDef( + {NDef("start", "Const", {}, {{"value", 0}, {"dtype", DT_INT32}}), + NDef("stop", 
"Const", {}, {{"value", 10}, {"dtype", DT_INT32}}), + NDef("step", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}), + NDef("filename", "Const", {}, {{"value", ""}, {"dtype", DT_STRING}}), + NDef("range", "RangeDataset", {"start", "stop", "step"}, {}), + MakeFilterNode("filter1", "range"), MakeFilterNode("filter2", "filter1"), + MakeFilterNode("filter3", "filter2"), + NDef("cache", "CacheDataset", {"filter3", "filename"}, {})}, + // FunctionLib + { + test::function::IsZero(), + }); + + FilterFusion optimizer; + GraphDef output; + TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output)); + EXPECT_TRUE(graph_utils::ContainsNodeWithOp("FilterDataset", output)); + EXPECT_FALSE(graph_utils::ContainsGraphNodeWithName("filter1", output)); + EXPECT_FALSE(graph_utils::ContainsGraphNodeWithName("filter2", output)); + EXPECT_FALSE(graph_utils::ContainsGraphNodeWithName("filter3", output)); +} + +} // namespace +} // namespace grappler +} // namespace tensorflow diff --git a/tensorflow/core/grappler/optimizers/data/fusion_utils.cc b/tensorflow/core/grappler/optimizers/data/fusion_utils.cc index f84f109af6..01a78c04b0 100644 --- a/tensorflow/core/grappler/optimizers/data/fusion_utils.cc +++ b/tensorflow/core/grappler/optimizers/data/fusion_utils.cc @@ -16,8 +16,8 @@ limitations under the License. 
#include "tensorflow/core/grappler/optimizers/data/fusion_utils.h" #include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/node_def_builder.h" #include "tensorflow/core/framework/op_def.pb.h" - #include "tensorflow/core/grappler/grappler_item.h" #include "tensorflow/core/grappler/mutable_graph_view.h" #include "tensorflow/core/grappler/op_types.h" @@ -52,6 +52,12 @@ string GetOutputNode(const FunctionDef& function, int output_idx) { return function.ret().at(ret_output_name); } +string& GetMutableOutputNode(FunctionDef* function, int output_idx) { + const auto& ret_output_name = + function->signature().output_arg(output_idx).name(); + return function->mutable_ret()->at(ret_output_name); +} + template StringCollection GetNames(const Iterable& iterable, int allocate_size) { StringCollection names; @@ -106,7 +112,6 @@ gtl::FlatMap GetUniqueNames(const Iterable& first_iterable, // Nodes that will be added to the function can have the same name as the nodes // from parent function. void RenameFunctionNodes(const FunctionDef& first_function, - FunctionDef* fused_function, protobuf::RepeatedPtrField* nodes_to_fuse, protobuf::Map* rets_to_fuse) { const gtl::FlatMap changed_node_names = @@ -149,6 +154,7 @@ OpDef GetUniqueSignature(const OpDef& first_signature, const gtl::FlatMap changed_input_names = GetUniqueNames(first_signature.input_arg(), second_signature.input_arg()); OpDef signature; + signature.set_name(second_signature.name()); for (const auto& input_arg : second_signature.input_arg()) { auto& input = *signature.add_input_arg(); @@ -221,12 +227,13 @@ void FuseFunctionNodes(const StringCollection& first_inputs, } // This function looks for direct edges from input to return and rewrites -// them to the coresponding input of the return of `first_function`. +// them to the corresponding input of the return of `first_function`. 
void FuseReturns(const StringCollection& first_inputs, const StringCollection& second_inputs, const StringCollection& first_outputs, - const SetInputFn& set_input, FunctionDef* fused_function) { - for (auto& ret : *fused_function->mutable_ret()) { + const SetInputFn& set_input, + protobuf::Map* fused_ret) { + for (auto& ret : *fused_ret) { auto return_input = ParseNodeConnection(ret.second); auto input_it = std::find(second_inputs.begin(), second_inputs.end(), return_input); @@ -249,6 +256,33 @@ StringCollection GetFunctionOutputs(const FunctionDef& function) { return outputs; } +FunctionDef* CreateFalsePredicate( + const protobuf::RepeatedPtrField& fake_args, + FunctionDefLibrary* library) { + GraphDef graph; + MutableGraphView graph_view(&graph); + auto* node = graph_utils::AddScalarConstNode(false, &graph_view); + auto* false_predicate = library->add_function(); + graph_utils::SetUniqueGraphFunctionName("false_predicate", library, + false_predicate); + + int num = 0; + for (const auto& fake_arg : fake_args) { + auto* arg = false_predicate->mutable_signature()->add_input_arg(); + arg->set_type(fake_arg.type()); + arg->set_name(strings::StrCat("fake_arg", num)); + num++; + } + + auto* output = false_predicate->mutable_signature()->add_output_arg(); + output->set_name("false_out"); + output->set_type(DT_BOOL); + + (*false_predicate->mutable_ret())["false_out"] = node->name() + ":output:0"; + *false_predicate->mutable_node_def() = std::move(*graph.mutable_node()); + return false_predicate; +} + void CheckIfCanCompose(const OpDef& first_signature, const OpDef& second_signature) { CHECK(CanCompose(first_signature, second_signature)) @@ -259,6 +293,15 @@ void CheckIfCanCompose(const OpDef& first_signature, } // namespace +void MergeNodes(const FunctionDef& first_function, + const FunctionDef& second_function, FunctionDef* fused_function, + FunctionDefLibrary* library) { + // Copy all nodes from first_function. 
+ fused_function->mutable_node_def()->CopyFrom(first_function.node_def()); + // Copy transformed nodes from the second function. + fused_function->mutable_node_def()->MergeFrom(second_function.node_def()); +} + bool CanCompose(const OpDef& first_signature, const OpDef& second_signature) { // TODO(prazek): Functions can have additional inputs being placeholders // for a values used in function. We should be able to also fuse these @@ -285,8 +328,8 @@ void ComposeSignature(const OpDef& first_signature, void ComposeOutput(const protobuf::Map& first_ret, const protobuf::Map& second_ret, - FunctionDef* fused_function) { - *fused_function->mutable_ret() = second_ret; + protobuf::Map* fused_ret) { + *fused_ret = second_ret; } void CombineSignature(const OpDef& first_signature, @@ -302,41 +345,110 @@ void CombineSignature(const OpDef& first_signature, void CombineOutput(const protobuf::Map& first_ret, const protobuf::Map& second_ret, - FunctionDef* fused_function) { - *fused_function->mutable_ret() = first_ret; - fused_function->mutable_ret()->insert(second_ret.begin(), second_ret.end()); + protobuf::Map* fused_ret) { + *fused_ret = first_ret; + fused_ret->insert(second_ret.begin(), second_ret.end()); +} + +string SameInput(const StringCollection& first_inputs, + const StringCollection& second_inputs, + const StringCollection& first_outputs, int arg_num) { + return first_inputs.at(arg_num); +} + +bool HasSameSignature(const OpDef& first_signature, + const OpDef& second_signature) { + return first_signature.input_arg_size() == + second_signature.input_arg_size() && + first_signature.output_arg_size() == + second_signature.output_arg_size(); +} + +void SameSignature(const OpDef& first_signature, const OpDef& second_signature, + OpDef* fused_signature) { + CHECK(HasSameSignature(first_signature, second_signature)) + << "Functions do not have the same signature"; + // Copy signature from first function. 
+ *fused_signature = first_signature; +} + +void LazyConjunctionNodes(const FunctionDef& first_function, + const FunctionDef& second_function, + FunctionDef* fused_function, + FunctionDefLibrary* library) { + fused_function->mutable_node_def()->CopyFrom(first_function.node_def()); + + NodeDefBuilder if_builder("", "If"); + if_builder.Input(GetOutputNode(first_function, 0), 0, DT_BOOL); + DataTypeVector in_arg_types; + std::vector inputs; + for (const auto& input_arg : first_function.signature().input_arg()) { + inputs.push_back({input_arg.name(), 0, input_arg.type()}); + in_arg_types.push_back(input_arg.type()); + } + if_builder.Attr("Tin", in_arg_types); + + if_builder.Attr("Tcond", DT_BOOL); + if_builder.Attr("Tout", DataTypeVector{DT_BOOL}); + if_builder.Attr("_lower_using_switch_merge", true); + + NameAttrList then_branch; + then_branch.set_name(second_function.signature().name()); + if_builder.Attr("then_branch", then_branch); + + auto* false_predicate = + CreateFalsePredicate(first_function.signature().input_arg(), library); + + NameAttrList else_branch; + else_branch.set_name(false_predicate->signature().name()); + if_builder.Attr("else_branch", else_branch); + if_builder.Input(inputs); + + auto* if_node = fused_function->add_node_def(); + // This is guaranteed to succeed. + TF_CHECK_OK(if_builder.Finalize(if_node)); + graph_utils::SetUniqueFunctionNodeName("cond", fused_function, if_node); + + GetMutableOutputNode(fused_function, 0) = if_node->name() + ":output:0"; +} + +void LazyConjunctionOutput(const protobuf::Map& first_ret, + const protobuf::Map& second_ret, + protobuf::Map* fused_ret) { + CHECK_EQ(first_ret.size(), 1); + CHECK_EQ(second_ret.size(), 1); + // Temporarily copy returns from first_ret. We are going to change the + // output node after creating it. 
+ *fused_ret = first_ret; } -FunctionDef* FuseFunctions(const FunctionDef& first_function, - const FunctionDef& function, - StringPiece fused_name_prefix, - const SetFunctionSignatureFn& set_signature, - const SetInputFn& set_input, - const SetOutputFn& set_output, - FunctionDefLibrary* library) { - if (first_function.attr_size() != 0 || function.attr_size() != 0) +FunctionDef* FuseFunctions( + const FunctionDef& first_function, const FunctionDef& second_function, + StringPiece fused_name_prefix, const SetFunctionSignatureFn& set_signature, + const SetInputFn& set_input, const SetOutputFn& set_output, + const SetNodesFn& set_nodes, FunctionDefLibrary* library) { + if (first_function.attr_size() != 0 || second_function.attr_size() != 0) return nullptr; // Functions with attributes are currently not supported // This function will be used as a clone of second function, having unique // names. - FunctionDef setup_function = function; + FunctionDef setup_function = second_function; *setup_function.mutable_signature() = GetUniqueSignature( first_function.signature(), setup_function.signature(), setup_function.mutable_ret(), setup_function.mutable_node_def()); FunctionDef* fused_function = library->add_function(); - // Copy all nodes from first_function. 
- fused_function->mutable_node_def()->CopyFrom(first_function.node_def()); + set_signature(first_function.signature(), setup_function.signature(), fused_function->mutable_signature()); graph_utils::SetUniqueGraphFunctionName(fused_name_prefix, library, fused_function); - RenameFunctionNodes(first_function, fused_function, - setup_function.mutable_node_def(), + RenameFunctionNodes(first_function, setup_function.mutable_node_def(), setup_function.mutable_ret()); - set_output(first_function.ret(), setup_function.ret(), fused_function); + set_output(first_function.ret(), setup_function.ret(), + fused_function->mutable_ret()); CHECK(fused_function->signature().output_arg_size() == fused_function->ret_size()) @@ -351,10 +463,10 @@ FunctionDef* FuseFunctions(const FunctionDef& first_function, FuseFunctionNodes(first_inputs, second_inputs, first_outputs, set_input, setup_function.mutable_node_def()); FuseReturns(first_inputs, second_inputs, first_outputs, set_input, - fused_function); + fused_function->mutable_ret()); + + set_nodes(first_function, setup_function, fused_function, library); - // Copy transformed nodes from the second function. - fused_function->mutable_node_def()->MergeFrom(setup_function.node_def()); return fused_function; } diff --git a/tensorflow/core/grappler/optimizers/data/fusion_utils.h b/tensorflow/core/grappler/optimizers/data/fusion_utils.h index 41f13f6cb8..19b7002dcd 100644 --- a/tensorflow/core/grappler/optimizers/data/fusion_utils.h +++ b/tensorflow/core/grappler/optimizers/data/fusion_utils.h @@ -48,14 +48,20 @@ using SetInputFn = const StringCollection& second_function_inputs, const StringCollection& parent_outputs, int arg_num)>; -// This function is invoked with first function ret. It is used to set up -// returns of fused function. If you need to combine outputs -// of first and second function, then this is a right place to create a new -// nodes. +// This function is invoked with first and second function ret. 
It is used to +// set up returns of fused function. using SetOutputFn = std::function& parent_ret, const protobuf::Map& second_function_ret, - FunctionDef* fused_function)>; + protobuf::Map* fused_ret)>; + +using SetNodesFn = std::function; + +void MergeNodes(const FunctionDef& first_function, + const FunctionDef& second_function, FunctionDef* fused_function, + FunctionDefLibrary* library); // Returns true if functions can be composed. bool CanCompose(const OpDef& first_signature, const OpDef& second_signature); @@ -71,7 +77,7 @@ string ComposeInput(const StringCollection& first_inputs, // second_function(first_function(args...)). void ComposeOutput(const protobuf::Map& first_ret, const protobuf::Map& second_ret, - FunctionDef* fused_function); + protobuf::Map* fused_ret); // Set input signature to `first_function_signature` and output signature // to `first_function_signature` + `second_function_signature` @@ -83,7 +89,32 @@ void CombineSignature(const OpDef& first_signature, // return *first_function(...), *second_function(...) void CombineOutput(const protobuf::Map& first_ret, const protobuf::Map& second_ret, - FunctionDef* fused_function); + protobuf::Map* fused_ret); + +// Returns true if both signatures have the same number of input and output +// args. +bool HasSameSignature(const OpDef& first_signature, + const OpDef& second_signature); + +// Check if both signatures are same and copy it from `first_signature`. +void SameSignature(const OpDef& first_signature, const OpDef& second_signature, + OpDef* fused_signature); + +// Take the same input as first function. +string SameInput(const StringCollection& first_inputs, + const StringCollection& second_inputs, + const StringCollection& first_outputs, int arg_num); + +// Create a fused function that computes the short-circuit logical AND of the +// result of the first function and the result of the second function. 
+void LazyConjunctionOutput(const protobuf::Map& first_ret, + const protobuf::Map& second_ret, + protobuf::Map* fused_ret); + +void LazyConjunctionNodes(const FunctionDef& first_function, + const FunctionDef& second_function, + FunctionDef* fused_function, + FunctionDefLibrary* library); // Fuse `first_function` with `second_function`, setting `fused_name_prefix` as // a name prefix. The nodes from `first_function` are copied unmodified. All @@ -91,13 +122,11 @@ void CombineOutput(const protobuf::Map& first_ret, // that are not conflicting with first function. This means that copied nodes // from second function can end up having different names. For explanation of // set up functions see the documentation of the functions types. -FunctionDef* FuseFunctions(const FunctionDef& first_function, - const FunctionDef& second_function, - StringPiece fused_name_prefix, - const SetFunctionSignatureFn& set_signature, - const SetInputFn& set_input, - const SetOutputFn& set_output, - FunctionDefLibrary* library); +FunctionDef* FuseFunctions( + const FunctionDef& first_function, const FunctionDef& second_function, + StringPiece fused_name_prefix, const SetFunctionSignatureFn& set_signature, + const SetInputFn& set_input, const SetOutputFn& set_output, + const SetNodesFn& set_nodes, FunctionDefLibrary* library); } // namespace fusion_utils } // namespace grappler diff --git a/tensorflow/core/grappler/optimizers/data/fusion_utils_test.cc b/tensorflow/core/grappler/optimizers/data/fusion_utils_test.cc index 7ad5d63bf6..d5c6466080 100644 --- a/tensorflow/core/grappler/optimizers/data/fusion_utils_test.cc +++ b/tensorflow/core/grappler/optimizers/data/fusion_utils_test.cc @@ -57,10 +57,10 @@ TEST(FusionUtilsTest, FuseFunctionsByComposition) { auto *function = graph.mutable_library()->add_function(); *function = test::function::XTimesTwo(); - auto *fused_function = - FuseFunctions(*parent_function, *function, "fused_maps", - fusion_utils::ComposeSignature, fusion_utils::ComposeInput, 
- fusion_utils::ComposeOutput, graph.mutable_library()); + auto *fused_function = FuseFunctions( + *parent_function, *function, "fused_maps", fusion_utils::ComposeSignature, + fusion_utils::ComposeInput, fusion_utils::ComposeOutput, + fusion_utils::MergeNodes, graph.mutable_library()); EXPECT_EQ(fused_function->signature().name(), "fused_maps"); EXPECT_EQ(fused_function->signature().input_arg_size(), 1); @@ -98,7 +98,8 @@ TEST(FusionUtilsTest, FuseFunctionWithPredicate) { auto *fused_function = FuseFunctions(*xtimes_two, *is_zero, "fused_map_and_filter_function", fusion_utils::CombineSignature, fusion_utils::ComposeInput, - fusion_utils::CombineOutput, graph.mutable_library()); + fusion_utils::CombineOutput, fusion_utils::MergeNodes, + graph.mutable_library()); EXPECT_EQ(fused_function->signature().name(), "fused_map_and_filter_function"); @@ -134,10 +135,10 @@ TEST(FusionUtilsTest, FuseSameFunctionWithExtraOutput) { auto *function = graph.mutable_library()->add_function(); *function = test::function::XTimesTwo(); - auto *fused_function = - FuseFunctions(*parent_function, *function, "fused_maps", - fusion_utils::CombineSignature, fusion_utils::ComposeInput, - fusion_utils::CombineOutput, graph.mutable_library()); + auto *fused_function = FuseFunctions( + *parent_function, *function, "fused_maps", fusion_utils::CombineSignature, + fusion_utils::ComposeInput, fusion_utils::CombineOutput, + fusion_utils::MergeNodes, graph.mutable_library()); EXPECT_EQ(fused_function->signature().input_arg_size(), 1); EXPECT_EQ(fused_function->signature().output_arg_size(), 2); @@ -169,7 +170,8 @@ TEST(FusionUtilsTest, ZipFusion) { auto *fused_function = FuseFunctions(*function, *function, "zip_maps", zip_signature, zip_input, - fusion_utils::CombineOutput, graph.mutable_library()); + fusion_utils::CombineOutput, fusion_utils::MergeNodes, + graph.mutable_library()); EXPECT_EQ(fused_function->signature().input_arg_size(), 2); EXPECT_EQ(fused_function->signature().output_arg_size(), 2); 
diff --git a/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.cc b/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.cc index a411e641f7..f1844a141c 100644 --- a/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.cc +++ b/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.cc @@ -116,12 +116,17 @@ Status MapAndFilterFusion::Optimize(Cluster* cluster, const GrapplerItem& item, const auto& fun = filter_node->attr().at("predicate"); const FunctionDef* filter_func = function_library.Find(fun.func().name()); if (!fusion_utils::CanCompose(map_func->signature(), - filter_func->signature())) + filter_func->signature())) { + VLOG(1) << "Can't fuse map and filter because the output signature of " + "the map function does not match the input signature of the " + "filter function\n"; return nullptr; + } return fusion_utils::FuseFunctions( *map_func, *filter_func, "fused_map_and_filter_function", fusion_utils::CombineSignature, fusion_utils::ComposeInput, - fusion_utils::CombineOutput, output->mutable_library()); + fusion_utils::CombineOutput, fusion_utils::MergeNodes, + output->mutable_library()); }; for (const NodeDef& node : sorted_old_graph.node()) { diff --git a/tensorflow/core/grappler/optimizers/data/map_fusion.cc b/tensorflow/core/grappler/optimizers/data/map_fusion.cc index dc0509f7a5..a78ecb09f7 100644 --- a/tensorflow/core/grappler/optimizers/data/map_fusion.cc +++ b/tensorflow/core/grappler/optimizers/data/map_fusion.cc @@ -90,12 +90,17 @@ Status MapFusion::Optimize(Cluster* cluster, const GrapplerItem& item, const auto& fun = map_node->attr().at("f"); const FunctionDef* func = function_library.Find(fun.func().name()); - if (!fusion_utils::CanCompose(parent_func->signature(), func->signature())) + if (!fusion_utils::CanCompose(parent_func->signature(), + func->signature())) { + VLOG(1) << "Can't fuse two maps because the output signature of the " + "first map function does not match the input signature of the " + 
"second function\n"; return nullptr; + } return fusion_utils::FuseFunctions( *parent_func, *func, "fused_map", fusion_utils::ComposeSignature, fusion_utils::ComposeInput, fusion_utils::ComposeOutput, - output->mutable_library()); + fusion_utils::MergeNodes, output->mutable_library()); }; for (const NodeDef& node : sorted_old_graph.node()) { diff --git a/tensorflow/core/kernels/data/optimize_dataset_op.cc b/tensorflow/core/kernels/data/optimize_dataset_op.cc index b2d307ba8a..9b14078407 100644 --- a/tensorflow/core/kernels/data/optimize_dataset_op.cc +++ b/tensorflow/core/kernels/data/optimize_dataset_op.cc @@ -97,19 +97,27 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel { TF_RETURN_IF_ERROR( db.AddInputDataset(&serialization_ctx, input_, &input_node)); string output_node = input_node->name(); + GraphDef graph_def; TF_RETURN_IF_ERROR(b.ToGraphDef(&graph_def)); VLOG(3) << "Before optimization: " << graph_def.DebugString(); + TF_RETURN_IF_ERROR(ApplyOptimizations(ctx, &graph_def, &output_node)); VLOG(3) << "After optimization: " << graph_def.DebugString(); - flib_def_.reset(new FunctionLibraryDefinition(OpRegistry::Global(), - graph_def.library())); + + // Instantiate the optimized input pipeline by running the optimized graph + // using the optimized function library. 
+ TF_RETURN_IF_ERROR( + ctx->function_library()->Clone(&flib_def_, &pflr_, &lib_)); + TF_RETURN_IF_ERROR(flib_def_->AddLibrary(graph_def.library())); + Graph graph(OpRegistry::Global()); TF_RETURN_IF_ERROR(ImportGraphDef({}, graph_def, &graph, nullptr)); std::vector outputs; GraphRunner graph_runner(ctx->function_library()->device()); - TF_RETURN_IF_ERROR(graph_runner.Run(&graph, ctx->function_library(), {}, - {output_node}, &outputs)); + + TF_RETURN_IF_ERROR( + graph_runner.Run(&graph, lib_, {}, {output_node}, &outputs)); TF_RETURN_IF_ERROR( GetDatasetFromVariantTensor(outputs[0], &optimized_input_)); optimized_input_->Ref(); @@ -146,8 +154,7 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel { params.env = ctx->env(); params.runner = *(ctx->runner()); params.stats_aggregator_getter = ctx->stats_aggregator_getter(); - params.lib = ctx->lib(); - params.function_library = dataset()->flib_def_; + params.lib = dataset()->lib_; params.allocator_getter = ctx->allocator_getter(); return dataset()->optimized_input_->MakeIterator( IteratorContext(params), prefix(), &input_impl_); @@ -160,8 +167,7 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel { params.env = ctx->env(); params.runner = *(ctx->runner()); params.stats_aggregator_getter = ctx->stats_aggregator_getter(); - params.lib = ctx->lib(); - params.function_library = dataset()->flib_def_; + params.lib = dataset()->lib_; params.allocator_getter = ctx->allocator_getter(); IteratorContext iter_ctx(params); return input_impl_->GetNext(&iter_ctx, out_tensors, end_of_sequence); @@ -243,7 +249,9 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel { } DatasetBase* optimized_input_; - std::shared_ptr flib_def_; + FunctionLibraryRuntime* lib_ = nullptr; + std::unique_ptr pflr_ = nullptr; + std::unique_ptr flib_def_ = nullptr; const DatasetBase* input_; const std::vector optimizations_; const DataTypeVector output_types_; -- GitLab From 5afcc0fa269f9b41f1aecd029f4e3ebea35a420b Mon Sep 17 00:00:00 2001 From: 
Shanqing Cai Date: Fri, 24 Aug 2018 18:49:19 -0700 Subject: [PATCH 132/598] Update TensorFlow.js roadmap * remove old (completed) items * add new items PiperOrigin-RevId: 210190738 --- tensorflow/docs_src/community/roadmap.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tensorflow/docs_src/community/roadmap.md b/tensorflow/docs_src/community/roadmap.md index 0463ca05fe..d11b6ed467 100644 --- a/tensorflow/docs_src/community/roadmap.md +++ b/tensorflow/docs_src/community/roadmap.md @@ -58,10 +58,12 @@ across image recognition, speech, object detection, and * Increase support for devices beyond Android and iOS (eg. RPi, Cortex-M) #### TensorFlow.js: -* Release package for Node.js bindings to the TensorFlow C API through the TensorFlow.js backend interface -* Expand support for importing TensorFlow SavedModels and Keras models into browser with unified APIs supporting retraining in browser -* Improve Layers API and allow model exporting/saving +* Continue to expand support for importing TensorFlow SavedModels and Keras models into browser with unified APIs supporting retraining in browser +* Improve inference and training performance in both browser and Node.js environments +* Widen the collection of pre-built models in [tfjs-models](https://github.com/tensorflow/tfjs-models), + including but not limited to audio- and speech-oriented models * Release tfjs-data API for efficient data input pipelines +* Integration with [TF-Hub](https://www.tensorflow.org/hub/) #### TensorFlow with Swift: * Establish open source project including documentation, open design, and code availability. -- GitLab From 9599b473035fb9f38959608f32180e22216c7dbc Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Fri, 24 Aug 2018 19:12:41 -0700 Subject: [PATCH 133/598] Make registration macro namespace-agnostic PiperOrigin-RevId: 210192298 --- .../core/common_runtime/optimization_registry.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tensorflow/core/common_runtime/optimization_registry.h b/tensorflow/core/common_runtime/optimization_registry.h index f5d265aa24..6fcd2afd27 100644 --- a/tensorflow/core/common_runtime/optimization_registry.h +++ b/tensorflow/core/common_runtime/optimization_registry.h @@ -132,11 +132,12 @@ class OptimizationPassRegistration { #define REGISTER_OPTIMIZATION_UNIQ_HELPER(ctr, grouping, phase, optimization) \ REGISTER_OPTIMIZATION_UNIQ(ctr, grouping, phase, optimization) -#define REGISTER_OPTIMIZATION_UNIQ(ctr, grouping, phase, optimization) \ - static optimization_registration::OptimizationPassRegistration \ - register_optimization_##ctr( \ - grouping, phase, \ - std::unique_ptr(new optimization()), \ +#define REGISTER_OPTIMIZATION_UNIQ(ctr, grouping, phase, optimization) \ + static ::tensorflow::optimization_registration::OptimizationPassRegistration \ + register_optimization_##ctr( \ + grouping, phase, \ + ::std::unique_ptr<::tensorflow::GraphOptimizationPass>( \ + new optimization()), \ #optimization) } // namespace tensorflow -- GitLab From ca94990804cf5326c0f6f46d75c96e0f0e240366 Mon Sep 17 00:00:00 2001 From: Yuefeng Zhou Date: Fri, 24 Aug 2018 19:14:44 -0700 Subject: [PATCH 134/598] Add an option to RunConfig and train_and_evaluate to run distribute coordinator. This is necessary to run multi-worker MirroredStrategy and CollectiveAllReduceStrategy with estimator. 
PiperOrigin-RevId: 210192378 --- tensorflow/contrib/distribute/BUILD | 1 + tensorflow/contrib/distribute/__init__.py | 2 + tensorflow/contrib/distribute/python/BUILD | 26 + .../python/estimator_training_test.py | 659 ++++++++++++++++++ tensorflow/python/BUILD | 1 + tensorflow/python/distribute/BUILD | 33 + .../python/distribute/distribute_config.py | 45 ++ .../distribute/distribute_coordinator.py | 53 +- .../python/distribute/estimator_training.py | 264 +++++++ tensorflow/python/estimator/run_config.py | 20 +- tensorflow/python/estimator/training.py | 22 +- 11 files changed, 1100 insertions(+), 26 deletions(-) create mode 100644 tensorflow/contrib/distribute/python/estimator_training_test.py create mode 100644 tensorflow/python/distribute/distribute_config.py create mode 100644 tensorflow/python/distribute/estimator_training.py diff --git a/tensorflow/contrib/distribute/BUILD b/tensorflow/contrib/distribute/BUILD index c16f1d6035..02feeafb60 100644 --- a/tensorflow/contrib/distribute/BUILD +++ b/tensorflow/contrib/distribute/BUILD @@ -35,5 +35,6 @@ py_library( "//tensorflow/contrib/distribute/python:tpu_strategy", "//tensorflow/python:training", "//tensorflow/python:util", + "//tensorflow/python/distribute:distribute_config", ], ) diff --git a/tensorflow/contrib/distribute/__init__.py b/tensorflow/contrib/distribute/__init__.py index 588a4f2898..bf763215ba 100644 --- a/tensorflow/contrib/distribute/__init__.py +++ b/tensorflow/contrib/distribute/__init__.py @@ -27,6 +27,7 @@ from tensorflow.contrib.distribute.python.one_device_strategy import OneDeviceSt from tensorflow.contrib.distribute.python.parameter_server_strategy import ParameterServerStrategy from tensorflow.contrib.distribute.python.step_fn import * from tensorflow.contrib.distribute.python.tpu_strategy import TPUStrategy +from tensorflow.python.distribute.distribute_config import DistributeConfig from tensorflow.python.training.distribute import * from 
tensorflow.python.training.distribution_strategy_context import * @@ -37,6 +38,7 @@ _allowed_symbols = [ 'AllReduceCrossTowerOps', 'CollectiveAllReduceStrategy', 'CrossTowerOps', + 'DistributeConfig', 'DistributionStrategy', 'MirroredStrategy', 'Monitor', diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD index 8173b5d4ba..f5b236e35f 100644 --- a/tensorflow/contrib/distribute/python/BUILD +++ b/tensorflow/contrib/distribute/python/BUILD @@ -452,6 +452,32 @@ cuda_py_test( ], ) +cuda_py_test( + name = "estimator_training_test", + size = "large", + srcs = ["estimator_training_test.py"], + additional_deps = [ + ":combinations", + ":mirrored_strategy", + ":multi_worker_test_base", + ":parameter_server_strategy", + "//third_party/py/numpy", + "//tensorflow/contrib/optimizer_v2:training", + "//tensorflow/python/data/ops:dataset_ops", + "//tensorflow/python/distribute", + "//tensorflow/python/eager:test", + "//tensorflow/python/estimator:estimator_py", + "//tensorflow/python/feature_column", + "//tensorflow/python:framework_ops", + "//tensorflow/python:platform", + "//tensorflow/python:summary", + ], + tags = [ + "multi_and_single_gpu", + "no_pip", + ], +) + py_library( name = "single_loss_example", srcs = ["single_loss_example.py"], diff --git a/tensorflow/contrib/distribute/python/estimator_training_test.py b/tensorflow/contrib/distribute/python/estimator_training_test.py new file mode 100644 index 0000000000..5348512016 --- /dev/null +++ b/tensorflow/contrib/distribute/python/estimator_training_test.py @@ -0,0 +1,659 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests that show Distribute Coordinator works with Estimator.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import glob +import json +import os +import sys +import tempfile +import threading +from absl.testing import parameterized +import numpy as np +import six + +_portpicker_import_error = None +try: + import portpicker # pylint: disable=g-import-not-at-top +except ImportError as _error: # pylint: disable=invalid-name + _portpicker_import_error = _error + portpicker = None + +# pylint: disable=g-import-not-at-top +from tensorflow.contrib.distribute.python import combinations +from tensorflow.contrib.distribute.python import mirrored_strategy +from tensorflow.contrib.distribute.python import parameter_server_strategy +from tensorflow.contrib.optimizer_v2 import adagrad +from tensorflow.core.protobuf import config_pb2 +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.distribute import distribute_coordinator as dc +from tensorflow.python.distribute import estimator_training as dc_training +from tensorflow.python.distribute.distribute_config import DistributeConfig +from tensorflow.python.eager import context +from tensorflow.python.estimator import exporter as exporter_lib +from tensorflow.python.estimator import run_config as run_config_lib +from tensorflow.python.estimator import training as estimator_training +from tensorflow.python.estimator.canned import 
dnn_linear_combined +from tensorflow.python.estimator.canned import prediction_keys +from tensorflow.python.estimator.export import export as export_lib +from tensorflow.python.feature_column import feature_column +from tensorflow.python.platform import gfile +from tensorflow.python.platform import test +from tensorflow.python.summary import summary_iterator +from tensorflow.python.summary.writer import writer_cache +from tensorflow.python.training import server_lib + +BATCH_SIZE = 10 +LABEL_DIMENSION = 2 +DATA = np.linspace( + 0., 2., BATCH_SIZE * LABEL_DIMENSION, dtype=np.float32).reshape( + BATCH_SIZE, LABEL_DIMENSION) +EVAL_NAME = "foo" +EXPORTER_NAME = "saved_model_exporter" +MAX_STEPS = 10 + +CHIEF = dc._TaskType.CHIEF +EVALUATOR = dc._TaskType.EVALUATOR +WORKER = dc._TaskType.WORKER +PS = dc._TaskType.PS + +original_run_distribute_coordinator = dc.run_distribute_coordinator + + +# TODO(yuefengz): merge this method back to test_util. +def _create_local_cluster(num_workers, + num_ps, + has_eval=False, + protocol="grpc", + worker_config=None, + ps_config=None): + if _portpicker_import_error: + raise _portpicker_import_error # pylint: disable=raising-bad-type + worker_ports = [portpicker.pick_unused_port() for _ in range(num_workers)] + ps_ports = [portpicker.pick_unused_port() for _ in range(num_ps)] + + cluster_dict = { + "worker": ["localhost:%s" % port for port in worker_ports], + "ps": ["localhost:%s" % port for port in ps_ports] + } + if has_eval: + cluster_dict["evaluator"] = ["localhost:%s" % portpicker.pick_unused_port()] + + cs = server_lib.ClusterSpec(cluster_dict) + + workers = [ + server_lib.Server( + cs, + job_name="worker", + protocol=protocol, + task_index=ix, + config=worker_config, + start=True) for ix in range(num_workers) + ] + ps_servers = [ + server_lib.Server( + cs, + job_name="ps", + protocol=protocol, + task_index=ix, + config=ps_config, + start=True) for ix in range(num_ps) + ] + if has_eval: + evals = [ + server_lib.Server( + cs, + 
job_name="evaluator", + protocol=protocol, + task_index=0, + config=worker_config, + start=True) + ] + else: + evals = [] + + return workers, ps_servers, evals + + +def _create_in_process_cluster(num_workers, num_ps, has_eval=False): + """Create an in-process cluster that consists of only standard server.""" + # Leave some memory for cuda runtime. + if has_eval: + gpu_mem_frac = 0.7 / (num_workers + 1) + else: + gpu_mem_frac = 0.7 / num_workers + + worker_config = config_pb2.ConfigProto() + worker_config.gpu_options.per_process_gpu_memory_fraction = gpu_mem_frac + + # Enable collective ops which has no impact on non-collective ops. + # TODO(yuefengz, tucker): removing this after we move the initialization of + # collective mgr to the session level. + worker_config.experimental.collective_group_leader = ( + "/job:worker/replica:0/task:0") + + ps_config = config_pb2.ConfigProto() + ps_config.device_count["GPU"] = 0 + + return _create_local_cluster( + num_workers, + num_ps=num_ps, + has_eval=has_eval, + worker_config=worker_config, + ps_config=ps_config, + protocol="grpc") + + +def _create_cluster_spec(has_chief=False, + num_workers=1, + num_ps=0, + has_eval=False): + if _portpicker_import_error: + raise _portpicker_import_error # pylint: disable=raising-bad-type + + cluster_spec = {} + if has_chief: + cluster_spec[CHIEF] = ["localhost:%s" % portpicker.pick_unused_port()] + if num_workers: + cluster_spec[WORKER] = [ + "localhost:%s" % portpicker.pick_unused_port() + for _ in range(num_workers) + ] + if num_ps: + cluster_spec[PS] = [ + "localhost:%s" % portpicker.pick_unused_port() for _ in range(num_ps) + ] + if has_eval: + cluster_spec[EVALUATOR] = ["localhost:%s" % portpicker.pick_unused_port()] + return cluster_spec + + +def _bytes_to_str(maybe_bytes): + if isinstance(maybe_bytes, six.string_types): + return maybe_bytes + else: + return str(maybe_bytes, "utf-8") + + +def _strip_protocol(target): + # cluster_spec expects "host:port" strings. 
+ if "//" in target: + return target.split("//")[1] + else: + return target + + +class DistributeCoordinatorIntegrationTest(test.TestCase, + parameterized.TestCase): + + @classmethod + def setUpClass(cls): + """Create a local cluster with 3 workers, 2 ps and 1 evaluator.""" + cls._workers, cls._ps, cls._evals = _create_in_process_cluster( + num_workers=3, num_ps=2, has_eval=True) + cls._cluster_spec = { + "worker": [ + _strip_protocol(_bytes_to_str(w.target)) for w in cls._workers + ], + "ps": [_strip_protocol(_bytes_to_str(ps.target)) for ps in cls._ps], + "evaluator": [ + _strip_protocol(_bytes_to_str(e.target)) for e in cls._evals + ] + } + + def setUp(self): + self._model_dir = tempfile.mkdtemp() + self._event = threading.Event() + super(DistributeCoordinatorIntegrationTest, self).setUp() + + def dataset_input_fn(self, x, y, batch_size, shuffle): + + def input_fn(): + dataset = dataset_ops.Dataset.from_tensor_slices((x, y)) + if shuffle: + dataset = dataset.shuffle(batch_size) + dataset = dataset.repeat(100).batch(batch_size) + return dataset + + return input_fn + + def _get_exporter(self, name, fc): + feature_spec = feature_column.make_parse_example_spec(fc) + serving_input_receiver_fn = ( + export_lib.build_parsing_serving_input_receiver_fn(feature_spec)) + return exporter_lib.LatestExporter( + name, serving_input_receiver_fn=serving_input_receiver_fn) + + def _extract_loss_and_global_step(self, event_folder): + """Returns the loss and global step in last event.""" + event_paths = glob.glob(os.path.join(event_folder, "events*")) + + loss = None + global_step_count = None + + for e in summary_iterator.summary_iterator(event_paths[-1]): + current_loss = None + for v in e.summary.value: + if v.tag == "loss": + current_loss = v.simple_value + + # If loss is not found, global step is meaningless. 
+ if current_loss is None: + continue + + current_global_step = e.step + if global_step_count is None or current_global_step > global_step_count: + global_step_count = current_global_step + loss = current_loss + + return (loss, global_step_count) + + def _get_estimator(self, + train_distribute, + eval_distribute, + remote_cluster=None): + input_dimension = LABEL_DIMENSION + linear_feature_columns = [ + feature_column.numeric_column("x", shape=(input_dimension,)) + ] + dnn_feature_columns = [ + feature_column.numeric_column("x", shape=(input_dimension,)) + ] + + return dnn_linear_combined.DNNLinearCombinedRegressor( + linear_feature_columns=linear_feature_columns, + dnn_hidden_units=(2, 2), + dnn_feature_columns=dnn_feature_columns, + label_dimension=LABEL_DIMENSION, + model_dir=self._model_dir, + dnn_optimizer=adagrad.AdagradOptimizer(0.001), + linear_optimizer=adagrad.AdagradOptimizer(0.001), + config=run_config_lib.RunConfig( + experimental_distribute=DistributeConfig( + train_distribute=train_distribute, + eval_distribute=eval_distribute, + remote_cluster=remote_cluster))) + + def _complete_flow(self, + train_distribute, + eval_distribute, + remote_cluster=None): + estimator = self._get_estimator(train_distribute, eval_distribute, + remote_cluster) + + input_dimension = LABEL_DIMENSION + train_input_fn = self.dataset_input_fn( + x={"x": DATA}, + y=DATA, + batch_size=BATCH_SIZE // len(train_distribute.worker_devices), + shuffle=True) + if eval_distribute: + eval_batch_size = BATCH_SIZE // len(eval_distribute.worker_devices) + else: + eval_batch_size = BATCH_SIZE + eval_input_fn = self.dataset_input_fn( + x={"x": DATA}, y=DATA, batch_size=eval_batch_size, shuffle=False) + + linear_feature_columns = [ + feature_column.numeric_column("x", shape=(input_dimension,)) + ] + dnn_feature_columns = [ + feature_column.numeric_column("x", shape=(input_dimension,)) + ] + feature_columns = linear_feature_columns + dnn_feature_columns + + estimator_training.train_and_evaluate( 
+ estimator, + estimator_training.TrainSpec(train_input_fn, max_steps=MAX_STEPS), + estimator_training.EvalSpec( + name=EVAL_NAME, + input_fn=eval_input_fn, + steps=None, + exporters=self._get_exporter(EXPORTER_NAME, feature_columns), + start_delay_secs=0, + throttle_secs=1)) + return estimator + + def _inspect_train_and_eval_events(self, estimator): + # Make sure nothing is stuck in limbo. + writer_cache.FileWriterCache.clear() + + # Examine the training events. Use a range to check global step to avoid + # flakiness due to global step race condition. + training_loss, _ = self._extract_loss_and_global_step(self._model_dir) + self.assertIsNotNone(training_loss) + + # Examine the eval events. The global step should be accurate. + eval_dir = os.path.join(self._model_dir, "eval_" + EVAL_NAME) + eval_loss, eval_global_step = self._extract_loss_and_global_step( + event_folder=eval_dir) + self.assertIsNotNone(eval_loss) + self.assertGreaterEqual(eval_global_step, MAX_STEPS) + + # Examine the export folder. + export_dir = os.path.join( + os.path.join(self._model_dir, "export"), EXPORTER_NAME) + self.assertTrue(gfile.Exists(export_dir)) + + # Examine the ckpt for predict. 
+ def predict_input_fn(): + return dataset_ops.Dataset.from_tensor_slices({ + "x": DATA + }).batch(BATCH_SIZE) + + predicted_proba = np.array([ + x[prediction_keys.PredictionKeys.PREDICTIONS] + for x in estimator.predict(predict_input_fn) + ]) + self.assertAllEqual((BATCH_SIZE, LABEL_DIMENSION), predicted_proba.shape) + + @combinations.generate( + combinations.combine( + mode=["graph"], + train_distribute_cls=[ + mirrored_strategy.MirroredStrategy, + parameter_server_strategy.ParameterServerStrategy + ], + eval_distribute_cls=[ + None, mirrored_strategy.MirroredStrategy, + parameter_server_strategy.ParameterServerStrategy + ], + required_gpus=1)) + def test_complete_flow_standalone_client(self, train_distribute_cls, + eval_distribute_cls): + try: + train_distribute = train_distribute_cls(num_gpus=context.num_gpus()) + except TypeError: + train_distribute = train_distribute_cls(num_gpus_per_worker=2) + + if eval_distribute_cls: + eval_distribute = eval_distribute_cls() + else: + eval_distribute = None + + estimator = self._complete_flow( + train_distribute, eval_distribute, remote_cluster=self._cluster_spec) + self._inspect_train_and_eval_events(estimator) + + def _mock_run_distribute_coordinator( + self, + worker_fn, + strategy, + eval_fn, + eval_strategy, + mode=dc.CoordinatorMode.STANDALONE_CLIENT, + cluster_spec=None, + session_config=None): + # Calls the original `run_distribute_coordinator` method but gets task config + # from environment variables and then signals the caller. 
+ task_type = None + task_id = None + if not cluster_spec: + cluster_spec = None + tf_config = json.loads(os.environ.get("TF_CONFIG", "{}")) + if not cluster_spec: + cluster_spec = tf_config.get("cluster", {}) + task_env = tf_config.get("task", {}) + if task_env: + task_type = task_env.get("type", task_type) + task_id = int(task_env.get("index", task_id)) + self._event.set() + original_run_distribute_coordinator( + worker_fn, + strategy, + eval_fn, + eval_strategy, + mode=mode, + cluster_spec=cluster_spec, + task_type=task_type, + task_id=task_id, + session_config=session_config) + + def _task_thread(self, train_distribute, eval_distribute): + with test.mock.patch.object(dc, "run_distribute_coordinator", + self._mock_run_distribute_coordinator): + self._complete_flow(train_distribute, eval_distribute) + + def _run_task_in_thread(self, cluster_spec, task_type, task_id, + train_distribute, eval_distribute): + if task_type: + tf_config = { + "cluster": cluster_spec, + "task": { + "type": task_type, + "index": task_id + } + } + else: + tf_config = { + "cluster": cluster_spec, + "task": { + "type": task_type, + "index": task_id + } + } + self._event.clear() + t = threading.Thread( + target=self._task_thread, args=(train_distribute, eval_distribute)) + with test.mock.patch.dict("os.environ", + {"TF_CONFIG": json.dumps(tf_config)}): + t.start() + self._event.wait() + return t + + def _run_multiple_tasks_in_threads(self, cluster_spec, train_distribute, + eval_distribute): + threads = {} + for task_type in cluster_spec.keys(): + threads[task_type] = [] + for task_id in range(len(cluster_spec[task_type])): + t = self._run_task_in_thread(cluster_spec, task_type, task_id, + train_distribute, eval_distribute) + threads[task_type].append(t) + return threads + + @combinations.generate( + combinations.combine( + mode=["graph"], + train_distribute_cls=[ + parameter_server_strategy.ParameterServerStrategy, + ], + eval_distribute_cls=[ + None, mirrored_strategy.MirroredStrategy, + 
parameter_server_strategy.ParameterServerStrategy + ], + required_gpus=1)) + def test_complete_flow_indepedent_worker_between_graph( + self, train_distribute_cls, eval_distribute_cls): + train_distribute = train_distribute_cls( + num_gpus_per_worker=context.num_gpus()) + + if eval_distribute_cls: + eval_distribute = eval_distribute_cls() + else: + eval_distribute = None + + cluster_spec = _create_cluster_spec(num_workers=3, num_ps=2, has_eval=True) + threads = self._run_multiple_tasks_in_threads( + cluster_spec, train_distribute, eval_distribute) + for task_type, ts in threads.items(): + if task_type == PS: + continue + for t in ts: + t.join() + + estimator = self._get_estimator(train_distribute, eval_distribute) + self._inspect_train_and_eval_events(estimator) + + @combinations.generate( + combinations.combine( + mode=["graph"], + train_distribute_cls=[mirrored_strategy.MirroredStrategy], + eval_distribute_cls=[None, mirrored_strategy.MirroredStrategy], + required_gpus=1)) + def test_complete_flow_indepedent_worker_in_graph(self, train_distribute_cls, + eval_distribute_cls): + train_distribute = train_distribute_cls(num_gpus=context.num_gpus()) + + if eval_distribute_cls: + eval_distribute = eval_distribute_cls() + else: + eval_distribute = None + + cluster_spec = _create_cluster_spec(num_workers=3, num_ps=2, has_eval=True) + threads = self._run_multiple_tasks_in_threads( + cluster_spec, train_distribute, eval_distribute) + threads[WORKER][0].join() + threads[EVALUATOR][0].join() + + estimator = self._get_estimator(train_distribute, eval_distribute) + self._inspect_train_and_eval_events(estimator) + + +TF_CONFIG_WITH_CHIEF = { + "cluster": { + "chief": ["fake_chief"], + }, + "task": { + "type": "chief", + "index": 0 + } +} + +TF_CONFIG_WITH_MASTER = { + "cluster": { + "master": ["fake_master"], + }, + "task": { + "type": "master", + "index": 0 + } +} + +TF_CONFIG_WITHOUT_TASK = {"cluster": {"chief": ["fake_worker"]}} + + +class RunConfigTest(test.TestCase): + + 
def test_previously_unexpected_cluster_spec(self): + with test.mock.patch.dict( + "os.environ", {"TF_CONFIG": json.dumps(TF_CONFIG_WITHOUT_TASK)}): + run_config_lib.RunConfig( + experimental_distribute=DistributeConfig( + train_distribute=mirrored_strategy.MirroredStrategy(num_gpus=2))) + + def test_should_run_distribute_coordinator(self): + """Tests that should_run_distribute_coordinator return a correct value.""" + # We don't use distribute coordinator for local training. + self.assertFalse( + dc_training.should_run_distribute_coordinator( + run_config_lib.RunConfig())) + + # When `train_distribute` is not specified, don't use distribute + # coordinator. + with test.mock.patch.dict("os.environ", + {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_CHIEF)}): + self.assertFalse( + dc_training.should_run_distribute_coordinator( + run_config_lib.RunConfig())) + + # When `train_distribute` is specified and TF_CONFIG is detected, use + # distribute coordinator. + with test.mock.patch.dict("os.environ", + {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_CHIEF)}): + config_with_train_distribute = run_config_lib.RunConfig( + experimental_distribute=DistributeConfig( + train_distribute=mirrored_strategy.MirroredStrategy(num_gpus=2))) + config_with_eval_distribute = run_config_lib.RunConfig( + experimental_distribute=DistributeConfig( + eval_distribute=mirrored_strategy.MirroredStrategy(num_gpus=2))) + self.assertTrue( + dc_training.should_run_distribute_coordinator( + config_with_train_distribute)) + self.assertFalse( + dc_training.should_run_distribute_coordinator( + config_with_eval_distribute)) + + # With a master in the cluster, don't run distribute coordinator. 
+ with test.mock.patch.dict("os.environ", + {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_MASTER)}): + config = run_config_lib.RunConfig( + experimental_distribute=DistributeConfig( + train_distribute=mirrored_strategy.MirroredStrategy(num_gpus=2))) + self.assertFalse(dc_training.should_run_distribute_coordinator(config)) + + def test_init_run_config_duplicate_distribute(self): + with self.assertRaises(ValueError): + run_config_lib.RunConfig( + train_distribute=mirrored_strategy.MirroredStrategy(), + experimental_distribute=DistributeConfig( + train_distribute=mirrored_strategy.MirroredStrategy())) + + with self.assertRaises(ValueError): + run_config_lib.RunConfig( + eval_distribute=mirrored_strategy.MirroredStrategy(), + experimental_distribute=DistributeConfig( + eval_distribute=mirrored_strategy.MirroredStrategy())) + + def test_init_run_config_none_distribute_coordinator_mode(self): + # We don't use distribute coordinator for local training. + config = run_config_lib.RunConfig( + train_distribute=mirrored_strategy.MirroredStrategy()) + dc_training.init_run_config(config, {}) + self.assertIsNone(config._distribute_coordinator_mode) + + # With a master in the cluster, don't run distribute coordinator. + with test.mock.patch.dict("os.environ", + {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_MASTER)}): + config = run_config_lib.RunConfig( + train_distribute=mirrored_strategy.MirroredStrategy()) + self.assertIsNone(config._distribute_coordinator_mode) + + # When `train_distribute` is not specified, don't use distribute + # coordinator. + with test.mock.patch.dict("os.environ", + {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_CHIEF)}): + config = run_config_lib.RunConfig() + self.assertFalse(hasattr(config, "_distribute_coordinator_mode")) + + def test_init_run_config_independent_worker(self): + # When `train_distribute` is specified and TF_CONFIG is detected, use + # distribute coordinator with INDEPENDENT_WORKER mode. 
+ with test.mock.patch.dict("os.environ", + {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_CHIEF)}): + config = run_config_lib.RunConfig( + train_distribute=mirrored_strategy.MirroredStrategy()) + self.assertEqual(config._distribute_coordinator_mode, + dc.CoordinatorMode.INDEPENDENT_WORKER) + + def test_init_run_config_standalone_client(self): + # When `train_distribute` is specified, TF_CONFIG is detected and + # `experimental.remote_cluster` is set use distribute coordinator with + # STANDALONE_CLIENT mode. + config = run_config_lib.RunConfig( + train_distribute=mirrored_strategy.MirroredStrategy(), + experimental_distribute=DistributeConfig( + remote_cluster={"chief": ["fake_worker"]})) + self.assertEqual(config._distribute_coordinator_mode, + dc.CoordinatorMode.STANDALONE_CLIENT) + + +if __name__ == "__main__": + with test.mock.patch.object(sys, "exit", os._exit): + test.main() diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 40f98474b5..37af3d350e 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -134,6 +134,7 @@ py_library( "//tensorflow/core:protos_all_py", "//tensorflow/python/compat", "//tensorflow/python/data", + "//tensorflow/python/distribute:estimator_training", "//tensorflow/python/feature_column:feature_column_py", "//tensorflow/python/keras", "//tensorflow/python/ops/distributions", diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD index 98ef9bf492..ebfcd085e6 100644 --- a/tensorflow/python/distribute/BUILD +++ b/tensorflow/python/distribute/BUILD @@ -8,6 +8,25 @@ exports_files(["LICENSE"]) load("//tensorflow:tensorflow.bzl", "py_test") +py_library( + name = "distribute", + srcs_version = "PY2AND3", + visibility = ["//visibility:public"], + deps = [ + ":distribute_config", + ":distribute_coordinator", + ":distribute_coordinator_context", + ], +) + +py_library( + name = "distribute_config", + srcs = [ + "distribute_config.py", + ], + deps = [], +) + py_library( name = 
"distribute_coordinator", srcs = [ @@ -81,3 +100,17 @@ py_test( "@absl_py//absl/testing:parameterized", ], ) + +# Used only by estimator. +py_library( + name = "estimator_training", + srcs = [ + "estimator_training.py", + ], + srcs_version = "PY2AND3", + deps = [ + ":distribute_coordinator", + ":distribute_coordinator_context", + "//tensorflow/python:training", + ], +) diff --git a/tensorflow/python/distribute/distribute_config.py b/tensorflow/python/distribute/distribute_config.py new file mode 100644 index 0000000000..fac35742fe --- /dev/null +++ b/tensorflow/python/distribute/distribute_config.py @@ -0,0 +1,45 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""A configure tuple for high-level APIs for running distribution strategies.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections + + +class DistributeConfig( + collections.namedtuple( + 'DistributeConfig', + ['train_distribute', 'eval_distribute', 'remote_cluster'])): + """A config tuple for distribution strategies. + + Attributes: + train_distribute: a `DistributionStrategy` object for training. + eval_distribute: an optional `DistributionStrategy` object for + evaluation. + remote_cluster: a dict, `ClusterDef` or `ClusterSpec` object specifying + the cluster configurations. 
If this is given, the `train_and_evaluate` + method will be running as a standalone client which connects to the + cluster for training. + """ + + def __new__(cls, + train_distribute=None, + eval_distribute=None, + remote_cluster=None): + return super(DistributeConfig, cls).__new__(cls, train_distribute, + eval_distribute, remote_cluster) diff --git a/tensorflow/python/distribute/distribute_coordinator.py b/tensorflow/python/distribute/distribute_coordinator.py index eb081b65fc..9cf0b3b7a6 100644 --- a/tensorflow/python/distribute/distribute_coordinator.py +++ b/tensorflow/python/distribute/distribute_coordinator.py @@ -311,7 +311,11 @@ def _run_single_worker(worker_fn, worker_barrier=None): """Runs a single worker by calling `worker_fn` under context.""" strategy = copy.deepcopy(strategy) - strategy.configure(session_config, cluster_spec, task_type, task_id) + # If there is an EVALUATOR task, we run single-machine eval on that task. + if task_type == _TaskType.EVALUATOR: + strategy.configure(session_config) + else: + strategy.configure(session_config, cluster_spec, task_type, task_id) context = _WorkerContext( strategy, cluster_spec, @@ -340,14 +344,14 @@ def _run_std_server(cluster_spec=None, return server -def _run_between_graph_client(worker_fn, strategy, cluster_spec, session_config, - rpc_layer): +def _run_between_graph_client(worker_fn, strategy, eval_fn, eval_strategy, + cluster_spec, session_config, rpc_layer): """Runs a standalone client for between-graph replication.""" eval_thread = None if _TaskType.EVALUATOR in cluster_spec.jobs: eval_thread = threading.Thread( target=_run_single_worker, - args=(worker_fn, strategy, cluster_spec, _TaskType.EVALUATOR, 0, + args=(eval_fn, eval_strategy, None, _TaskType.EVALUATOR, 0, session_config), kwargs={ "rpc_layer": rpc_layer, @@ -378,14 +382,14 @@ def _run_between_graph_client(worker_fn, strategy, cluster_spec, session_config, eval_thread.join() -def _run_in_graph_client(worker_fn, strategy, cluster_spec, 
session_config, - rpc_layer): +def _run_in_graph_client(worker_fn, strategy, eval_fn, eval_strategy, + cluster_spec, session_config, rpc_layer): """Runs a standalone client for in-graph replication.""" eval_thread = None if _TaskType.EVALUATOR in cluster_spec.jobs: eval_thread = threading.Thread( target=_run_single_worker, - args=(worker_fn, strategy, cluster_spec, _TaskType.EVALUATOR, 0, + args=(eval_fn, eval_strategy, cluster_spec, _TaskType.EVALUATOR, 0, session_config), kwargs={ "rpc_layer": rpc_layer, @@ -408,6 +412,8 @@ def _run_in_graph_client(worker_fn, strategy, cluster_spec, session_config, # is the special task when we support cluster_spec propagation. def run_distribute_coordinator(worker_fn, strategy, + eval_fn=None, + eval_strategy=None, mode=CoordinatorMode.STANDALONE_CLIENT, cluster_spec=None, task_type=None, @@ -488,10 +494,12 @@ def run_distribute_coordinator(worker_fn, If `cluster_spec` is not given in any format, it becomes local training and this coordinator will connect to a local session. - For evaluation, if "evaluator" exist in the cluster_spec, a separate thread - will be created with its `task_type` set to "evaluator". If "evaluator" is not - set in the cluster_spec, it entirely depends on the `worker_fn` for how to do - evaluation. + For evaluation, if "evaluator" exists in the cluster_spec, a separate thread + will be created to call `eval_fn` with its `task_type` set to "evaluator". If + `eval_fn` is not defined, fall back to `worker_fn`. This implies that + evaluation will be done on a single machine if there is an "evaluator" task. + If "evaluator" doesn't exist in the cluster_spec, it entirely depends on the + `worker_fn` for how to do evaluation. Args: worker_fn: the function to be called. The function should accept a @@ -501,6 +509,8 @@ def run_distribute_coordinator(worker_fn, run between-graph replicated training or not, whether to run init ops, etc. 
This object will also be configured given `session_config`, `cluster_spc`, `task_type` and `task_id`. + eval_fn: optional function for "evaluator" task. + eval_strategy: optional DistributionStrategy object for "evaluator" task. mode: in which mode this distribute coordinator runs. cluster_spec: a dict, ClusterDef or ClusterSpec specifying servers and roles in a cluster. If not set or empty, fall back to local training. @@ -535,16 +545,22 @@ def run_distribute_coordinator(worker_fn, # `mode` is ignored in the local case. _run_single_worker(worker_fn, strategy, None, None, None, session_config, rpc_layer) + if eval_fn: + _run_single_worker(eval_fn, eval_strategy or strategy, None, None, None, + session_config, rpc_layer) elif mode == CoordinatorMode.STANDALONE_CLIENT: + eval_fn = eval_fn or worker_fn + eval_strategy = eval_strategy or strategy + # The client must know the cluster but servers in the cluster don't have to # know the client. if task_type in [_TaskType.CLIENT, None]: if strategy.between_graph: - _run_between_graph_client(worker_fn, strategy, cluster_spec, - session_config, rpc_layer) + _run_between_graph_client(worker_fn, strategy, eval_fn, eval_strategy, + cluster_spec, session_config, rpc_layer) else: - _run_in_graph_client(worker_fn, strategy, cluster_spec, session_config, - rpc_layer) + _run_in_graph_client(worker_fn, strategy, eval_fn, eval_strategy, + cluster_spec, session_config, rpc_layer) else: # If not a client job, run the standard server. server = _run_std_server( @@ -554,6 +570,9 @@ def run_distribute_coordinator(worker_fn, if mode != CoordinatorMode.INDEPENDENT_WORKER: raise ValueError("Unexpected coordinator mode: %r" % mode) + eval_fn = eval_fn or worker_fn + eval_strategy = eval_strategy or strategy + # Every one starts a standard server. 
server = _run_std_server( cluster_spec=cluster_spec, task_type=task_type, task_id=task_id) @@ -572,8 +591,8 @@ def run_distribute_coordinator(worker_fn, else: server.join() elif task_type == _TaskType.EVALUATOR: - _run_single_worker(worker_fn, strategy, cluster_spec, task_type, task_id, - session_config, rpc_layer) + _run_single_worker(eval_fn, eval_strategy, cluster_spec, task_type, + task_id, session_config, rpc_layer) else: if task_type != _TaskType.PS: raise ValueError("Unexpected task_type: %r" % task_type) diff --git a/tensorflow/python/distribute/estimator_training.py b/tensorflow/python/distribute/estimator_training.py new file mode 100644 index 0000000000..202e19c420 --- /dev/null +++ b/tensorflow/python/distribute/estimator_training.py @@ -0,0 +1,264 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Training utilities for Estimator to use Distribute Coordinator.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy + +import six + +from tensorflow.python.distribute import distribute_coordinator as dc +from tensorflow.python.distribute import distribute_coordinator_context as dc_context +from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.training import server_lib + +# pylint: disable=protected-access +CHIEF = dc._TaskType.CHIEF +EVALUATOR = dc._TaskType.EVALUATOR +PS = dc._TaskType.PS +WORKER = dc._TaskType.WORKER + +# pylint: enable=protected-access + + +def _count_ps(cluster_spec): + """Counts the number of parameter servers in cluster_spec.""" + if not cluster_spec: + raise RuntimeError( + 'Internal error: `_count_ps` does not expect empty cluster_spec.') + + return len(cluster_spec.as_dict().get(PS, [])) + + +def _count_worker(cluster_spec, chief_task_type): + """Counts the number of workers (including chief) in cluster_spec.""" + if not cluster_spec: + raise RuntimeError( + 'Internal error: `_count_worker` does not expect empty cluster_spec.') + + return (len(cluster_spec.as_dict().get(WORKER, [])) + len( + cluster_spec.as_dict().get(chief_task_type, []))) + + +def _get_global_id(cluster_spec, task_type, task_id, chief_task_type): + """Returns the global id of the given task type in a cluster.""" + if not task_type: + return 0 + + # Sort task names in cluster by "chief"/"master", "evaluator", "worker" + # and "ps". More details can be found at the documentation of + # @{tf.estimator.RunConfig.global_id_in_cluster}. 
+ task_type_ordered_list = [] + if chief_task_type in cluster_spec.jobs: + task_type_ordered_list = [chief_task_type] + task_type_ordered_list.extend([ + t for t in sorted(cluster_spec.jobs) if t != chief_task_type and t != PS + ]) + if PS in cluster_spec.jobs: + task_type_ordered_list.append(PS) + + # Find the right global_id for current task. + next_global_id = 0 + for t in task_type_ordered_list: + if t == task_type: + return next_global_id + task_id + # `cluster_spec.job_tasks` returns all task addresses of type `t`. + next_global_id += len(cluster_spec.job_tasks(t)) + + # It is unexpected that it passes through all task_types in + # `task_type_ordered_list`. + raise RuntimeError('Internal Error: `task_type` ({}) is not in ' + 'cluster_spec ({}).'.format(task_type, cluster_spec)) + + +def _init_run_config_from_worker_context(config, worker_context): + """Initializes run config from distribute coordinator's worker context.""" + + # pylint: disable=protected-access + config._service = None + config._cluster_spec = worker_context.cluster_spec + config._task_type = worker_context.task_type + config._task_id = worker_context.task_id + config._evaluation_master = worker_context.master_target + config._master = worker_context.master_target + config._is_chief = worker_context.is_chief + + if config._cluster_spec: + # Distributed mode. + if config._task_type != EVALUATOR: + + config._num_ps_replicas = _count_ps(config._cluster_spec) + config._num_worker_replicas = _count_worker( + config._cluster_spec, chief_task_type=CHIEF) + config._global_id_in_cluster = _get_global_id( + config._cluster_spec, + config._task_type, + config._task_id, + chief_task_type=CHIEF) + else: + # Evaluator task should not be aware of the other tasks. + config._cluster_spec = server_lib.ClusterSpec({}) + config._num_ps_replicas = 0 + config._num_worker_replicas = 0 + config._global_id_in_cluster = None # undefined + else: + # Local mode. 
+ config._global_id_in_cluster = 0 + config._num_ps_replicas = 0 + config._num_worker_replicas = 1 + + +def init_run_config(config, tf_config): + """Initializes RunConfig for distribution strategies.""" + # pylint: disable=protected-access + if (config._experimental_distribute and + config._experimental_distribute.train_distribute): + if config._train_distribute: + raise ValueError('Either `train_distribute` or' + '`experimental_distribute.train_distribute` can be set.') + config._train_distribute = config._experimental_distribute.train_distribute + + if (config._experimental_distribute and + config._experimental_distribute.eval_distribute): + if config._eval_distribute: + raise ValueError('Either `eval_distribute` or' + '`experimental_distribute.eval_distribute` can be set.') + config._eval_distribute = config._experimental_distribute.eval_distribute + + cluster_spec = server_lib.ClusterSpec(tf_config.get('cluster', {})) + config._init_distributed_setting_from_environment_var({}) + + # Use distribute coordinator with STANDALONE_CLIENT mode if + # `experimental_distribute.remote_cluster` is set. + if (config._train_distribute and config._experimental_distribute and + config._experimental_distribute.remote_cluster): + if tf_config: + raise ValueError('Cannot set both TF_CONFIG environment variable and ' + '`experimental_distribute.remote_cluster`') + config._distribute_coordinator_mode = dc.CoordinatorMode.STANDALONE_CLIENT + config._cluster_spec = config._experimental_distribute.remote_cluster + logging.info('RunConfig initialized for Distribute Coordinator with ' + 'STANDALONE_CLIENT mode') + return + + # Don't use distribute coordinator if it is local training or cluster has a + # MASTER job or `train_distribute` is not specified. 
+ if (not tf_config or 'master' in cluster_spec.jobs or + not config._train_distribute): + config._distribute_coordinator_mode = None + config._init_distributed_setting_from_environment_var(tf_config) + config._maybe_overwrite_session_config_for_distributed_training() + logging.info('Not using Distribute Coordinator.') + return + + # Use distribute coordinator with INDEPENDENT_WORKER mode otherwise. + assert tf_config + + # Set the cluster_spec only since the distributed setting will come from + # distribute coordinator. + config._cluster_spec = cluster_spec + config._distribute_coordinator_mode = dc.CoordinatorMode.INDEPENDENT_WORKER + logging.info('RunConfig initialized for Distribute Coordinator with ' + 'INDEPENDENT_WORKER mode') + + +def should_run_distribute_coordinator(config): + """Checks the config to see whether to run distribute coordinator.""" + # pylint: disable=protected-access + if (not hasattr(config, '_distribute_coordinator_mode') or + config._distribute_coordinator_mode is None): + return False + if (not isinstance(config._distribute_coordinator_mode, six.string_types) or + config._distribute_coordinator_mode not in [ + dc.CoordinatorMode.STANDALONE_CLIENT, + dc.CoordinatorMode.INDEPENDENT_WORKER + ]): + logging.warning('Unexpected distribute_coordinator_mode: %r', + config._distribute_coordinator_mode) + return False + if not config.cluster_spec: + logging.warning('Running `train_and_evaluate` locally, ignoring ' + '`experimental_distribute_coordinator_mode`.') + return False + return True + + +def train_and_evaluate(estimator, train_spec, eval_spec, executor_cls): + """Run distribute coordinator for Estimator's `train_and_evaluate`. + + Args: + estimator: An `Estimator` instance to train and evaluate. + train_spec: A `TrainSpec` instance to specify the training specification. + eval_spec: A `EvalSpec` instance to specify the evaluation and export + specification. + executor_cls: the evaluation executor class of Estimator. 
+ + Raises: + ValueError: if `distribute_coordinator_mode` is None in RunConfig. + """ + run_config = estimator.config + if not run_config._distribute_coordinator_mode: # pylint: disable=protected-access + raise ValueError( + 'Distribute coordinator mode is not specified in `RunConfig`.') + + def _worker_fn(strategy): + """Function for worker task.""" + local_estimator = copy.deepcopy(estimator) + # pylint: disable=protected-access + local_estimator._config._train_distribute = strategy + _init_run_config_from_worker_context( + local_estimator._config, dc_context.get_current_worker_context()) + local_estimator._train_distribution = strategy + # pylint: enable=protected-access + + local_estimator.train( + input_fn=train_spec.input_fn, + max_steps=train_spec.max_steps, + hooks=list(train_spec.hooks)) + + def _eval_fn(strategy): + """Function for evaluator task.""" + local_estimator = copy.deepcopy(estimator) + # pylint: disable=protected-access + local_estimator._config._eval_distribute = strategy + _init_run_config_from_worker_context( + local_estimator._config, dc_context.get_current_worker_context()) + local_estimator._eval_distribution = strategy + + executor = executor_cls(local_estimator, train_spec, eval_spec) + executor._start_continuous_evaluation() + # pylint: enable=protected-access + + # pylint: disable=protected-access + if (run_config._distribute_coordinator_mode == + dc.CoordinatorMode.STANDALONE_CLIENT): + cluster_spec = run_config.cluster_spec + assert cluster_spec + else: + # The cluster_spec comes from TF_CONFIG environment variable if it is + # INDEPENDENT_WORKER mode. 
+ cluster_spec = None + + dc.run_distribute_coordinator( + _worker_fn, + run_config.train_distribute, + _eval_fn, + run_config.eval_distribute, + mode=run_config._distribute_coordinator_mode, + cluster_spec=cluster_spec, + session_config=run_config.session_config) diff --git a/tensorflow/python/estimator/run_config.py b/tensorflow/python/estimator/run_config.py index 12daddb044..b1ca207b62 100644 --- a/tensorflow/python/estimator/run_config.py +++ b/tensorflow/python/estimator/run_config.py @@ -26,6 +26,7 @@ import six from tensorflow.core.protobuf import config_pb2 from tensorflow.core.protobuf import rewriter_config_pb2 +from tensorflow.python.distribute import estimator_training as distribute_coordinator_training from tensorflow.python.platform import tf_logging as logging from tensorflow.python.training import server_lib from tensorflow.python.util import compat_internal @@ -460,7 +461,8 @@ class RunConfig(object): train_distribute: An optional instance of `tf.contrib.distribute.DistributionStrategy`. If specified, then Estimator will distribute the user's model during training, - according to the policy specified by that strategy. + according to the policy specified by that strategy. Setting + `experimental_distribute.train_distribute` is preferred. device_fn: A callable invoked for every `Operation` that takes the `Operation` and returns the device string. If `None`, defaults to the device function returned by `tf.train.replica_device_setter` @@ -470,10 +472,13 @@ class RunConfig(object): eval_distribute: An optional instance of `tf.contrib.distribute.DistributionStrategy`. If specified, then Estimator will distribute the user's model during evaluation, - according to the policy specified by that strategy. + according to the policy specified by that strategy. Setting + `experimental_distribute.eval_distribute` is preferred. 
experimental_distribute: an optional `tf.contrib.distribute.DistributeConfig` object specifying - DistributionStrategy-related configuration. + DistributionStrategy-related configuration. The `train_distribute` and + `eval_distribute` can be passed as parameters to `RunConfig` or set in + `experimental_distribute` but not both. Raises: ValueError: If both `save_checkpoints_steps` and `save_checkpoints_secs` @@ -516,9 +521,12 @@ class RunConfig(object): eval_distribute=eval_distribute, experimental_distribute=experimental_distribute) - self._init_distributed_setting_from_environment_var(tf_config) - - self._maybe_overwrite_session_config_for_distributed_training() + if train_distribute or eval_distribute or experimental_distribute: + logging.info('Initializing RunConfig with distribution strategies.') + distribute_coordinator_training.init_run_config(self, tf_config) + else: + self._init_distributed_setting_from_environment_var(tf_config) + self._maybe_overwrite_session_config_for_distributed_training() def _maybe_overwrite_session_config_for_distributed_training(self): """Overwrites the session_config for distributed training. diff --git a/tensorflow/python/estimator/training.py b/tensorflow/python/estimator/training.py index e6bd263c80..240be5dabe 100644 --- a/tensorflow/python/estimator/training.py +++ b/tensorflow/python/estimator/training.py @@ -26,6 +26,7 @@ import time import six from tensorflow.core.protobuf import config_pb2 +from tensorflow.python.distribute import estimator_training as distribute_coordinator_training from tensorflow.python.estimator import estimator as estimator_lib from tensorflow.python.estimator import exporter as exporter_lib from tensorflow.python.estimator import run_config as run_config_lib @@ -274,8 +275,10 @@ def train_and_evaluate(estimator, train_spec, eval_spec): evaluation `input_fn`, steps, etc. This utility function provides consistent behavior for both local - (non-distributed) and distributed configurations. 
Currently, the only - supported distributed training configuration is between-graph replication. + (non-distributed) and distributed configurations. The default distribution + configuration is parameter server-based between-graph replication. For other + types of distribution configurations such as all-reduce training, please use + [DistributionStrategies](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/distribute). # pylint: disable=line-too-long Overfitting: In order to avoid overfitting, it is recommended to set up the training `input_fn` to shuffle the training data properly. @@ -426,6 +429,11 @@ def train_and_evaluate(estimator, train_spec, eval_spec): }' ``` + When `distribute` or `experimental_distribute.train_distribute` and + `experimental_distribute.remote_cluster` is set, this method will start a + client running on the current host which connects to the `remote_cluster` for + training and evaluation. + Args: estimator: An `Estimator` instance to train and evaluate. train_spec: A `TrainSpec` instance to specify the training specification. @@ -444,8 +452,16 @@ def train_and_evaluate(estimator, train_spec, eval_spec): executor = _TrainingExecutor( estimator=estimator, train_spec=train_spec, eval_spec=eval_spec) - config = estimator.config + + # If `distribute_coordinator_mode` is set and running in distributed + # environment, we run `train_and_evaluate` via distribute coordinator. 
+ if distribute_coordinator_training.should_run_distribute_coordinator(config): + logging.info('Running `train_and_evaluate` with Distribute Coordinator.') + distribute_coordinator_training.train_and_evaluate( + estimator, train_spec, eval_spec, _TrainingExecutor) + return + if (config.task_type == run_config_lib.TaskType.EVALUATOR and config.task_id > 0): raise ValueError( -- GitLab From 04ffe2f34957f02d5a2aa4ead1c75233dd1cb1b7 Mon Sep 17 00:00:00 2001 From: Yuefeng Zhou Date: Fri, 24 Aug 2018 20:49:01 -0700 Subject: [PATCH 135/598] Add environment and rpc_layer to the TF_CONFIG environment variable in distribute coordinator. PiperOrigin-RevId: 210197404 --- .../distribute/distribute_coordinator.py | 65 +++++++++++++++---- .../distribute/distribute_coordinator_test.py | 64 +++++++++++++++++- 2 files changed, 117 insertions(+), 12 deletions(-) diff --git a/tensorflow/python/distribute/distribute_coordinator.py b/tensorflow/python/distribute/distribute_coordinator.py index 9cf0b3b7a6..46cdd64a6e 100644 --- a/tensorflow/python/distribute/distribute_coordinator.py +++ b/tensorflow/python/distribute/distribute_coordinator.py @@ -22,9 +22,12 @@ import copy import json import os import threading +import time from tensorflow.core.protobuf import cluster_pb2 +from tensorflow.python.client import session from tensorflow.python.distribute import distribute_coordinator_context +from tensorflow.python.platform import tf_logging as logging from tensorflow.python.training import monitored_session from tensorflow.python.training import server_lib @@ -332,16 +335,38 @@ def _run_std_server(cluster_spec=None, task_type=None, task_id=None, session_config=None, - rpc_layer=None): + rpc_layer=None, + environment=None): """Runs a standard server.""" - server = server_lib.Server( - cluster_spec, - job_name=task_type, - task_index=task_id, - config=session_config, - protocol=rpc_layer) - server.start() - return server + + class _FakeServer(object): + """A fake server that runs a master 
session.""" + + def start(self): + assert cluster_spec + target = cluster_spec.task_address(task_type, task_id) + if rpc_layer: + target = rpc_layer + "://" + target + # A tensorflow server starts when a remote session is created. + session.Session(target=target, config=session_config) + + def join(self): + while True: + time.sleep(5) + + if environment == "google": + server = _FakeServer() + server.start() + return server + else: + server = server_lib.Server( + cluster_spec, + job_name=task_type, + task_index=task_id, + config=session_config, + protocol=rpc_layer) + server.start() + return server def _run_between_graph_client(worker_fn, strategy, eval_fn, eval_strategy, @@ -541,8 +566,18 @@ def run_distribute_coordinator(worker_fn, "`tf.train.ClusterDef` object") # TODO(yuefengz): validate cluster_spec. + rpc_layer = tf_config.get("rpc_layer", rpc_layer) + environment = tf_config.get("environment", None) + + if cluster_spec: + logging.info( + "Running Distribute Coordinator with mode = %r, cluster_spec = %r, " + "task_type = %r, task_id = %r, environment = %r, rpc_layer = %r", mode, + cluster_spec.as_dict(), task_type, task_id, environment, rpc_layer) + if not cluster_spec: # `mode` is ignored in the local case. + logging.info("Running local Distribute Coordinator.") _run_single_worker(worker_fn, strategy, None, None, None, session_config, rpc_layer) if eval_fn: @@ -564,7 +599,11 @@ def run_distribute_coordinator(worker_fn, else: # If not a client job, run the standard server. server = _run_std_server( - cluster_spec=cluster_spec, task_type=task_type, task_id=task_id) + cluster_spec=cluster_spec, + task_type=task_type, + task_id=task_id, + rpc_layer=rpc_layer, + environment=environment) server.join() else: if mode != CoordinatorMode.INDEPENDENT_WORKER: @@ -575,7 +614,11 @@ def run_distribute_coordinator(worker_fn, # Every one starts a standard server. 
server = _run_std_server( - cluster_spec=cluster_spec, task_type=task_type, task_id=task_id) + cluster_spec=cluster_spec, + task_type=task_type, + task_id=task_id, + rpc_layer=rpc_layer, + environment=environment) if task_type in [_TaskType.CHIEF, _TaskType.WORKER]: if strategy.between_graph: diff --git a/tensorflow/python/distribute/distribute_coordinator_test.py b/tensorflow/python/distribute/distribute_coordinator_test.py index 97c6bdd15a..5dd57fa134 100644 --- a/tensorflow/python/distribute/distribute_coordinator_test.py +++ b/tensorflow/python/distribute/distribute_coordinator_test.py @@ -20,8 +20,10 @@ from __future__ import print_function import contextlib import copy +import json import os import sys +import time import threading import six @@ -59,6 +61,8 @@ INDEPENDENT_WORKER = distribute_coordinator.CoordinatorMode.INDEPENDENT_WORKER NUM_WORKERS = 3 NUM_PS = 2 +original_sys_exit = sys.exit + def _bytes_to_str(maybe_bytes): if isinstance(maybe_bytes, six.string_types): @@ -369,7 +373,8 @@ class DistributeCoordinatorTestBase(test.TestCase): cluster_spec=None, task_type=None, task_id=None, - rpc_layer=None): + rpc_layer=None, + environment=None): task_type = str(task_type) task_id = task_id or 0 with self._lock: @@ -730,6 +735,63 @@ class DistributeCoordinatorTestInpendentWorkerMode( self.assertTrue(self._std_servers[WORKER][2].joined) self.assertFalse(self._std_servers[EVALUATOR][0].joined) + def testRunStdServerInGoogleEnvironment(self): + cluster_spec = {"worker": ["fake_worker"], "ps": ["localhost:0"]} + tf_config = {"cluster": cluster_spec, "environment": "google"} + + joined = [False] + + def _fake_sleep(_): + joined[0] = True + original_sys_exit(0) + + def _thread_fn(cluster_spec): + distribute_coordinator.run_distribute_coordinator( + None, + None, + mode=INDEPENDENT_WORKER, + cluster_spec=cluster_spec, + task_type="ps", + task_id=0) + + with test.mock.patch.dict( + "os.environ", + {"TF_CONFIG": json.dumps(tf_config)}), test.mock.patch.object( + 
time, "sleep", _fake_sleep): + t = threading.Thread(target=_thread_fn, args=(cluster_spec,)) + t.start() + t.join() + self.assertTrue(joined[0]) + + def testRpcLayerEnvironmentVariable(self): + cluster_spec = {"worker": ["fake_worker"], "ps": ["fake_ps"]} + tf_config = {"cluster": cluster_spec, "rpc_layer": "cake"} + + rpc_layer_from_coordinator = [None] + + def _run_mock_server(cluster_spec=None, + task_type=None, + task_id=None, + session_config=None, + rpc_layer=None, + environment=None): + del cluster_spec, task_type, task_id, session_config, environment + rpc_layer_from_coordinator[0] = rpc_layer + return MockServer() + + with test.mock.patch.dict( + "os.environ", + {"TF_CONFIG": json.dumps(tf_config)}), test.mock.patch.object( + distribute_coordinator, "_run_std_server", _run_mock_server): + distribute_coordinator.run_distribute_coordinator( + None, + None, + mode=INDEPENDENT_WORKER, + cluster_spec=cluster_spec, + task_type="ps", + task_id=0) + self.assertEqual(rpc_layer_from_coordinator[0], "cake") + if __name__ == "__main__": # TODO(yuefengz): find a smart way to terminite std server threads. -- GitLab From b07f8211409f2b2e46ab539291e824f2b7865885 Mon Sep 17 00:00:00 2001 From: Jonathan Date: Sat, 25 Aug 2018 14:12:48 +0800 Subject: [PATCH 136/598] remove unused sparse_ops import --- tensorflow/python/ops/nn_grad.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/python/ops/nn_grad.py b/tensorflow/python/ops/nn_grad.py index 59ba0091c8..ef4dfd7fd1 100644 --- a/tensorflow/python/ops/nn_grad.py +++ b/tensorflow/python/ops/nn_grad.py @@ -27,7 +27,6 @@ from tensorflow.python.ops import gen_nn_ops from tensorflow.python.ops import gradients_impl from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn_ops -from tensorflow.python.ops import sparse_ops @ops.RegisterGradient("Conv2DBackpropInput") -- GitLab From 5c86ae3c8487e776cc2f32c8a99410e729a3ddaf Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Sat, 25 Aug 2018 02:05:29 -0700 Subject: [PATCH 137/598] compat: Update forward compatibility horizon to 2018-08-25 PiperOrigin-RevId: 210211804 --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index e0826a7945..d6d9080e8e 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -26,7 +26,7 @@ import datetime from tensorflow.python.util import tf_contextlib from tensorflow.python.util.tf_export import tf_export -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 8, 24) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 8, 25) @tf_export("compat.forward_compatible") -- GitLab From 941da3f759fa7232e11603e49744788d2e86bf1c Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sat, 25 Aug 2018 05:55:13 -0700 Subject: [PATCH 138/598] Adds support of float16 and bfloat16 weights for embedding_lookup_sparse. 
PiperOrigin-RevId: 210220427 --- .../python/kernel_tests/embedding_ops_test.py | 18 +++++++++++++++--- tensorflow/python/ops/embedding_ops.py | 2 ++ 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/kernel_tests/embedding_ops_test.py b/tensorflow/python/kernel_tests/embedding_ops_test.py index 55d75cb474..0e83726760 100644 --- a/tensorflow/python/kernel_tests/embedding_ops_test.py +++ b/tensorflow/python/kernel_tests/embedding_ops_test.py @@ -663,8 +663,9 @@ class EmbeddingLookupSparseTest(test.TestCase): np.ones(np.sum(vals_per_batch_entry)), vals_per_batch_entry) for num_shards, combiner, dtype, ignore_weights in itertools.product( - [1, 5], ["sum", "mean", "sqrtn"], [dtypes.float32, - dtypes.float64], [True, False]): + [1, 5], ["sum", "mean", "sqrtn"], + [dtypes.float16, dtypes.bfloat16, dtypes.float32, dtypes.float64], + [True, False]): with self.test_session(): p, params, feed_dict = _EmbeddingParams( @@ -677,6 +678,10 @@ class EmbeddingLookupSparseTest(test.TestCase): self.assertEqual(embedding_sum.get_shape().as_list(), expected_lookup_result_shape) + if dtype in (dtypes.float16, dtypes.bfloat16): + self.assertEqual(embedding_sum.dtype, dtypes.float32) + else: + self.assertEqual(embedding_sum.dtype, dtype) tf_embedding_sum = embedding_sum.eval(feed_dict=feed_dict) @@ -692,7 +697,14 @@ class EmbeddingLookupSparseTest(test.TestCase): if combiner == "sqrtn": np_embedding_sum /= np.reshape( np.sqrt(np_weight_sq_sum), (batch_size, 1, 1)) - self.assertAllClose(np_embedding_sum, tf_embedding_sum) + + rtol = 1e-6 + if dtype == dtypes.bfloat16: + rtol = 1e-2 + elif dtype == dtypes.float16: + rtol = 1e-3 + atol = rtol + self.assertAllClose(np_embedding_sum, tf_embedding_sum, rtol, atol) def testGradientsEmbeddingLookupSparse(self): vocab_size = 12 diff --git a/tensorflow/python/ops/embedding_ops.py b/tensorflow/python/ops/embedding_ops.py index 7b9e7de145..f97fca47ea 100644 --- a/tensorflow/python/ops/embedding_ops.py +++ 
b/tensorflow/python/ops/embedding_ops.py @@ -427,6 +427,8 @@ def embedding_lookup_sparse(params, embeddings = embedding_lookup( params, ids, partition_strategy=partition_strategy, max_norm=max_norm) + if embeddings.dtype in (dtypes.float16, dtypes.bfloat16): + embeddings = math_ops.to_float(embeddings) if not ignore_weights: weights = sp_weights.values if weights.dtype != embeddings.dtype: -- GitLab From 8a05bdf333f34603b33c0f3a029e023deb27ae04 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sat, 25 Aug 2018 20:53:46 -0700 Subject: [PATCH 139/598] Expose the RegAdagradOptimizer (which allows the user to specify whether a loss should update the accumulator) through tf.contrib.opt. PiperOrigin-RevId: 210253451 --- tensorflow/contrib/opt/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/contrib/opt/__init__.py b/tensorflow/contrib/opt/__init__.py index 781621dba0..ad7d7cfa6e 100644 --- a/tensorflow/contrib/opt/__init__.py +++ b/tensorflow/contrib/opt/__init__.py @@ -31,6 +31,7 @@ from tensorflow.contrib.opt.python.training.model_average_optimizer import * from tensorflow.contrib.opt.python.training.moving_average_optimizer import * from tensorflow.contrib.opt.python.training.multitask_optimizer_wrapper import * from tensorflow.contrib.opt.python.training.nadam_optimizer import * +from tensorflow.contrib.opt.python.training.reg_adagrad_optimizer import * from tensorflow.contrib.opt.python.training.shampoo import * from tensorflow.contrib.opt.python.training.weight_decay_optimizers import * from tensorflow.contrib.opt.python.training.powersign import * @@ -65,6 +66,7 @@ _allowed_symbols = [ 'ModelAverageCustomGetter', 'GGTOptimizer', 'ShampooOptimizer', + 'RegAdagradOptimizer', ] remove_undocumented(__name__, _allowed_symbols) -- GitLab From 09792df012c22622324f085f46edde33006c7355 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Sun, 26 Aug 2018 02:07:11 -0700 Subject: [PATCH 140/598] compat: Update forward compatibility horizon to 2018-08-26 PiperOrigin-RevId: 210266798 --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index d6d9080e8e..dd3b957da9 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -26,7 +26,7 @@ import datetime from tensorflow.python.util import tf_contextlib from tensorflow.python.util.tf_export import tf_export -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 8, 25) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 8, 26) @tf_export("compat.forward_compatible") -- GitLab From 32d4ffeb95a344fde6a1b956a4a8d6792432bf15 Mon Sep 17 00:00:00 2001 From: Ben Date: Sun, 26 Aug 2018 15:41:10 -0400 Subject: [PATCH 141/598] comments --- tensorflow/python/pywrap_tfe.i | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tensorflow/python/pywrap_tfe.i b/tensorflow/python/pywrap_tfe.i index bc02e9a35c..e1c233cdd9 100755 --- a/tensorflow/python/pywrap_tfe.i +++ b/tensorflow/python/pywrap_tfe.i @@ -105,10 +105,16 @@ limitations under the License. } } +// For const parameters in a function, SWIG pretty much ignores the const. +// See: http://www.swig.org/Doc2.0/SWIG.html#SWIG_nn13 +// Hence the 'const_cast'. %typemap(in) const char* serialized_function_def { $1 = const_cast(TFE_GetPythonString($input)); } +// For const parameters in a function, SWIG pretty much ignores the const. +// See: http://www.swig.org/Doc2.0/SWIG.html#SWIG_nn13 +// Hence the 'const_cast'. %typemap(in) const char* device_name { if ($input == Py_None) { $1 = nullptr; @@ -117,6 +123,9 @@ limitations under the License. } } +// For const parameters in a function, SWIG pretty much ignores the const. +// See: http://www.swig.org/Doc2.0/SWIG.html#SWIG_nn13 +// Hence the 'const_cast'. 
%typemap(in) const char* op_name { $1 = const_cast(TFE_GetPythonString($input)); } -- GitLab From 6f469061b8f52b4d6b9c9b47d46f97f135df01b0 Mon Sep 17 00:00:00 2001 From: Ben Date: Sun, 26 Aug 2018 15:46:54 -0400 Subject: [PATCH 142/598] revert protobuf version --- tensorflow/workspace.bzl | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 7cd9246b78..5c82285749 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -366,11 +366,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): ) PROTOBUF_urls =[ - "https://mirror.bazel.build/github.com/google/protobuf/archive/fe2eef4bf414ebb352cf11bcec633f1fd46ec876.tar.gz", - "https://github.com/google/protobuf/archive/fe2eef4bf414ebb352cf11bcec633f1fd46ec876.tar.gz", + "https://mirror.bazel.build/github.com/google/protobuf/archive/v3.6.0.tar.gz", + "https://github.com/google/protobuf/archive/v3.6.0.tar.gz", ] - PROTOBUF_sha256 = "b91b0ac9907af983877c960809dcad7a6dc8e4b06e34d32b7d66a12b9ea1fa17" - PROTOBUF_strip_prefix = "protobuf-fe2eef4bf414ebb352cf11bcec633f1fd46ec876" + PROTOBUF_sha256 = "50a5753995b3142627ac55cfd496cebc418a2e575ca0236e29033c67bd5665f4" + PROTOBUF_strip_prefix = "protobuf-3.6.0" tf_http_archive( name = "protobuf_archive", @@ -396,16 +396,6 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): strip_prefix = PROTOBUF_strip_prefix, ) - tf_http_archive( - name = "bazel_skylib", - urls = [ - "https://mirror.bazel.build/github.com/bazelbuild/bazel-skylib/archive/2169ae1c374aab4a09aa90e65efe1a3aad4e279b.tar.gz", - "https://github.com/bazelbuild/bazel-skylib/archive/2169ae1c374aab4a09aa90e65efe1a3aad4e279b.tar.gz" - ], - sha256 = "bbccf674aa441c266df9894182d80de104cabd19be98be002f6d478aaa31574d", - strip_prefix = "bazel-skylib-2169ae1c374aab4a09aa90e65efe1a3aad4e279b", - ) - tf_http_archive( name = "nsync", urls = [ -- GitLab From 7bccde15ce0dd29dce62092a5e9d48ffdc772963 Mon Sep 17 00:00:00 
2001 From: Derek Murray Date: Sun, 26 Aug 2018 22:31:25 -0700 Subject: [PATCH 143/598] Reduce the overhead of invoking an OpKernel when tracing is not enabled. PiperOrigin-RevId: 210317627 --- tensorflow/compiler/jit/xla_device.cc | 6 +- tensorflow/core/BUILD | 1 + .../core/common_runtime/local_device.cc | 2 +- tensorflow/core/common_runtime/local_device.h | 3 +- .../core/common_runtime/threadpool_device.cc | 11 ---- .../core/common_runtime/threadpool_device.h | 1 - .../core/common_runtime/tracing_device.h | 57 +++++++++++++++++++ 7 files changed, 62 insertions(+), 19 deletions(-) create mode 100644 tensorflow/core/common_runtime/tracing_device.h diff --git a/tensorflow/compiler/jit/xla_device.cc b/tensorflow/compiler/jit/xla_device.cc index 70e6d0be0f..50c902fdfc 100644 --- a/tensorflow/compiler/jit/xla_device.cc +++ b/tensorflow/compiler/jit/xla_device.cc @@ -365,11 +365,7 @@ Status XlaDevice::FillContextMap(const Graph* graph, void XlaDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) { VLOG(2) << "XlaDevice::Compute " << op_kernel->name() << ":" << op_kernel->type_string(); - // When Xprof profiling is off (which is the default), constructing the - // activity is simple enough that its overhead is negligible. 
- tracing::ScopedActivity activity(op_kernel->name(), op_kernel->type_string(), - op_kernel->IsExpensive()); - op_kernel->Compute(context); + TracingDevice::Compute(op_kernel, context); } void XlaDevice::ComputeAsync(AsyncOpKernel* op_kernel, OpKernelContext* context, diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 0882cc3c8b..44662ea79e 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -2743,6 +2743,7 @@ CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [ "common_runtime/stats_publisher_interface.h", "common_runtime/step_stats_collector.h", "common_runtime/threadpool_device.h", + "common_runtime/tracing_device.h", "common_runtime/visitable_allocator.h", "common_runtime/process_state.h", "common_runtime/pool_allocator.h", diff --git a/tensorflow/core/common_runtime/local_device.cc b/tensorflow/core/common_runtime/local_device.cc index 873182371e..db5022d56e 100644 --- a/tensorflow/core/common_runtime/local_device.cc +++ b/tensorflow/core/common_runtime/local_device.cc @@ -62,7 +62,7 @@ struct LocalDevice::EigenThreadPoolInfo { LocalDevice::LocalDevice(const SessionOptions& options, const DeviceAttributes& attributes) - : Device(options.env, attributes), owned_tp_info_(nullptr) { + : TracingDevice(options.env, attributes), owned_tp_info_(nullptr) { // Log info messages if TensorFlow is not compiled with instructions that // could speed up performance and are available on the current CPU. port::InfoAboutUnusedCPUFeatures(); diff --git a/tensorflow/core/common_runtime/local_device.h b/tensorflow/core/common_runtime/local_device.h index 226f121bf3..9a82fb7204 100644 --- a/tensorflow/core/common_runtime/local_device.h +++ b/tensorflow/core/common_runtime/local_device.h @@ -17,6 +17,7 @@ limitations under the License. 
#define TENSORFLOW_CORE_COMMON_RUNTIME_LOCAL_DEVICE_H_ #include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/tracing_device.h" #include "tensorflow/core/framework/device_attributes.pb.h" #include "tensorflow/core/platform/macros.h" @@ -31,7 +32,7 @@ struct SessionOptions; // initializes a shared Eigen compute device used by both. This // should eventually be removed once we refactor ThreadPoolDevice and // GPUDevice into more 'process-wide' abstractions. -class LocalDevice : public Device { +class LocalDevice : public TracingDevice { public: LocalDevice(const SessionOptions& options, const DeviceAttributes& attributes); diff --git a/tensorflow/core/common_runtime/threadpool_device.cc b/tensorflow/core/common_runtime/threadpool_device.cc index 7406ecf4f8..0fbc20b34b 100644 --- a/tensorflow/core/common_runtime/threadpool_device.cc +++ b/tensorflow/core/common_runtime/threadpool_device.cc @@ -70,17 +70,6 @@ ThreadPoolDevice::ThreadPoolDevice(const SessionOptions& options, ThreadPoolDevice::~ThreadPoolDevice() {} -void ThreadPoolDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) { - // When Xprof/ThreadScape profiling is off (which is the default), the - // following code is simple enough that its overhead is negligible. 
- tracing::ScopedActivity activity(op_kernel->name(), op_kernel->type_string(), - op_kernel->IsExpensive()); - tracing::ScopedRegion region(tracing::EventCategory::kCompute, - op_kernel->name()); - - op_kernel->Compute(context); -} - Allocator* ThreadPoolDevice::GetAllocator(AllocatorAttributes attr) { return allocator_; } diff --git a/tensorflow/core/common_runtime/threadpool_device.h b/tensorflow/core/common_runtime/threadpool_device.h index afc5d15ebc..51bd038a1c 100644 --- a/tensorflow/core/common_runtime/threadpool_device.h +++ b/tensorflow/core/common_runtime/threadpool_device.h @@ -29,7 +29,6 @@ class ThreadPoolDevice : public LocalDevice { Allocator* allocator); ~ThreadPoolDevice() override; - void Compute(OpKernel* op_kernel, OpKernelContext* context) override; Allocator* GetAllocator(AllocatorAttributes attr) override; Allocator* GetScopedAllocator(AllocatorAttributes attr, int64 step_id) override; diff --git a/tensorflow/core/common_runtime/tracing_device.h b/tensorflow/core/common_runtime/tracing_device.h new file mode 100644 index 0000000000..39215efa35 --- /dev/null +++ b/tensorflow/core/common_runtime/tracing_device.h @@ -0,0 +1,57 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_TRACING_DEVICE_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_TRACING_DEVICE_H_ + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/tracing.h" + +namespace tensorflow { + +namespace test { +class Benchmark; +} +struct SessionOptions; + +// This class implements tracing functionality that is shared by its subclasses +// (including ThreadPoolDevice and XlaDevice). +class TracingDevice : public Device { + public: + TracingDevice(Env* env, const DeviceAttributes& attributes) + : Device(env, attributes) {} + + void Compute(OpKernel* op_kernel, OpKernelContext* context) override { + if (TF_PREDICT_FALSE( + tracing::GetTraceCollector() || + tracing::GetEventCollector(tracing::EventCategory::kCompute))) { + const string& op_name = op_kernel->name(); + tracing::ScopedActivity activity(op_name, op_kernel->type_string(), + op_kernel->IsExpensive()); + tracing::ScopedRegion region(tracing::EventCategory::kCompute, op_name); + op_kernel->Compute(context); + } else { + op_kernel->Compute(context); + } + } + + private: + TF_DISALLOW_COPY_AND_ASSIGN(TracingDevice); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_TRACING_DEVICE_H_ -- GitLab From 632e3d66334ac3718a0fd41524c7dfc499363cab Mon Sep 17 00:00:00 2001 From: Gunhan Gulsoy Date: Sun, 26 Aug 2018 23:15:48 -0700 Subject: [PATCH 144/598] Disable flaky coordinator_test. 
PiperOrigin-RevId: 210320269 --- tensorflow/python/distribute/BUILD | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD index ebfcd085e6..a081c30781 100644 --- a/tensorflow/python/distribute/BUILD +++ b/tensorflow/python/distribute/BUILD @@ -44,7 +44,11 @@ py_test( size = "large", srcs = ["distribute_coordinator_test.py"], srcs_version = "PY2AND3", - tags = ["no_pip"], + tags = [ + "manual", + "no_pip", + "notap", + ], deps = [ ":distribute_coordinator", "//tensorflow/core:protos_all_py", -- GitLab From 86abbaa083beaca05ee32675ac7bfafb58a4557d Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 27 Aug 2018 02:01:42 -0700 Subject: [PATCH 145/598] [TFGAN] StarGAN Estimator Implementation PiperOrigin-RevId: 210334354 --- tensorflow/contrib/gan/BUILD | 52 +++ .../contrib/gan/python/estimator/__init__.py | 5 +- .../estimator/python/stargan_estimator.py | 28 ++ .../python/stargan_estimator_impl.py | 363 ++++++++++++++++++ .../python/stargan_estimator_test.py | 306 +++++++++++++++ 5 files changed, 753 insertions(+), 1 deletion(-) create mode 100644 tensorflow/contrib/gan/python/estimator/python/stargan_estimator.py create mode 100644 tensorflow/contrib/gan/python/estimator/python/stargan_estimator_impl.py create mode 100644 tensorflow/contrib/gan/python/estimator/python/stargan_estimator_test.py diff --git a/tensorflow/contrib/gan/BUILD b/tensorflow/contrib/gan/BUILD index 9866fccfba..9d0e6e1335 100644 --- a/tensorflow/contrib/gan/BUILD +++ b/tensorflow/contrib/gan/BUILD @@ -105,6 +105,7 @@ py_library( deps = [ ":gan_estimator", ":head", + ":stargan_estimator", "//tensorflow/python:util", ], ) @@ -533,6 +534,57 @@ py_test( ], ) +py_library( + name = "stargan_estimator", + srcs = [ + "python/estimator/python/stargan_estimator.py", + "python/estimator/python/stargan_estimator_impl.py", + ], + srcs_version = "PY2AND3", + deps = [ + ":namedtuples", + ":summaries", + 
":train", + "//tensorflow/contrib/framework:framework_py", + "//tensorflow/python:framework_ops", + "//tensorflow/python:metrics", + "//tensorflow/python:util", + "//tensorflow/python:variable_scope", + "//tensorflow/python/estimator:estimator_py", + ], +) + +py_test( + name = "stargan_estimator_test", + srcs = ["python/estimator/python/stargan_estimator_test.py"], + shard_count = 1, + srcs_version = "PY2AND3", + tags = ["notsan"], + deps = [ + ":namedtuples", + ":stargan_estimator", + ":tuple_losses", + "//tensorflow/contrib/layers:layers_py", + "//tensorflow/contrib/learn", + "//tensorflow/core:protos_all_py", + "//tensorflow/python:array_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:dtypes", + "//tensorflow/python:framework_ops", + "//tensorflow/python:math_ops", + "//tensorflow/python:metrics", + "//tensorflow/python:parsing_ops", + "//tensorflow/python:summary", + "//tensorflow/python:training", + "//tensorflow/python:training_util", + "//tensorflow/python:variable_scope", + "//tensorflow/python/estimator:estimator_py", + "//third_party/py/numpy", + "@absl_py//absl/testing:parameterized", + "@six_archive//:six", + ], +) + py_library( name = "sliced_wasserstein", srcs = [ diff --git a/tensorflow/contrib/gan/python/estimator/__init__.py b/tensorflow/contrib/gan/python/estimator/__init__.py index c9f7bc61b2..99d38011ba 100644 --- a/tensorflow/contrib/gan/python/estimator/__init__.py +++ b/tensorflow/contrib/gan/python/estimator/__init__.py @@ -26,15 +26,18 @@ from __future__ import print_function # pylint: disable=unused-import,wildcard-import from tensorflow.contrib.gan.python.estimator.python import gan_estimator from tensorflow.contrib.gan.python.estimator.python import head +from tensorflow.contrib.gan.python.estimator.python import stargan_estimator from tensorflow.contrib.gan.python.estimator.python.gan_estimator import * from tensorflow.contrib.gan.python.estimator.python.head import * +from 
tensorflow.contrib.gan.python.estimator.python.stargan_estimator import * # pylint: enable=unused-import,wildcard-import from tensorflow.python.util.all_util import remove_undocumented _allowed_symbols = [ 'gan_estimator', + 'stargan_estimator', 'head', -] + gan_estimator.__all__ + head.__all__ +] + gan_estimator.__all__ + stargan_estimator.__all__ + head.__all__ remove_undocumented(__name__, _allowed_symbols) diff --git a/tensorflow/contrib/gan/python/estimator/python/stargan_estimator.py b/tensorflow/contrib/gan/python/estimator/python/stargan_estimator.py new file mode 100644 index 0000000000..341bdf9fbb --- /dev/null +++ b/tensorflow/contrib/gan/python/estimator/python/stargan_estimator.py @@ -0,0 +1,28 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""`tf.Learn` components for `GANEstimator`.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.gan.python.estimator.python import stargan_estimator_impl +# pylint: disable=wildcard-import +from tensorflow.contrib.gan.python.estimator.python.stargan_estimator_impl import * +# pylint: enable=wildcard-import +from tensorflow.python.util.all_util import remove_undocumented + +__all__ = stargan_estimator_impl.__all__ +remove_undocumented(__name__, __all__) diff --git a/tensorflow/contrib/gan/python/estimator/python/stargan_estimator_impl.py b/tensorflow/contrib/gan/python/estimator/python/stargan_estimator_impl.py new file mode 100644 index 0000000000..f60e16bc04 --- /dev/null +++ b/tensorflow/contrib/gan/python/estimator/python/stargan_estimator_impl.py @@ -0,0 +1,363 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""A TFGAN-backed StarGAN Estimator.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import functools +import enum + +from tensorflow.contrib.framework.python.ops import variables as variable_lib +from tensorflow.contrib.gan.python import namedtuples as tfgan_tuples +from tensorflow.contrib.gan.python import train as tfgan_train +from tensorflow.contrib.gan.python.eval.python import summaries as tfgan_summaries +from tensorflow.python.estimator import estimator +from tensorflow.python.estimator import model_fn as model_fn_lib +from tensorflow.python.framework import ops +from tensorflow.python.ops import metrics as metrics_lib +from tensorflow.python.ops import variable_scope +from tensorflow.python.util import tf_inspect as inspect + +__all__ = ['StarGANEstimator', 'SummaryType'] + + +class SummaryType(enum.IntEnum): + NONE = 0 + VARIABLES = 1 + IMAGES = 2 + IMAGE_COMPARISON = 3 + + +_summary_type_map = { + SummaryType.VARIABLES: tfgan_summaries.add_gan_model_summaries, + SummaryType.IMAGES: tfgan_summaries.add_stargan_image_summaries, +} + + +class StarGANEstimator(estimator.Estimator): + """An estimator for Generative Adversarial Networks (GANs). + + This Estimator is backed by TFGAN. The network functions follow the TFGAN API + except for one exception: if either `generator_fn` or `discriminator_fn` have + an argument called `mode`, then the tf.Estimator mode is passed in for that + argument. This helps with operations like batch normalization, which have + different train and evaluation behavior. + + Example: + + ```python + import tensorflow as tf + tfgan = tf.contrib.gan + + # See TFGAN's `train.py` for a description of the generator and + # discriminator API. + def generator_fn(generator_inputs): + ... + return generated_data + + def discriminator_fn(data, conditioning): + ... 
+ return logits + + # Create GAN estimator. + stargan_estimator = tfgan.estimator.StarGANEstimator( + model_dir, + generator_fn=generator_fn, + discriminator_fn=discriminator_fn, + loss_fn=loss_fn, + generator_optimizer=tf.train.AdamOptimizer(0.1, 0.5), + discriminator_optimizer=tf.train.AdamOptimizer(0.1, 0.5)) + + # Train estimator. + stargan_estimator.train(train_input_fn, steps) + + # Evaluate resulting estimator. + stargan_estimator.evaluate(eval_input_fn) + + # Generate samples from generator. + stargan_estimator = np.array([ + x for x in stargan_estimator.predict(predict_input_fn)]) + ``` + """ + + def __init__(self, + model_dir=None, + generator_fn=None, + discriminator_fn=None, + loss_fn=None, + generator_optimizer=None, + discriminator_optimizer=None, + get_hooks_fn=None, + get_eval_metric_ops_fn=None, + add_summaries=None, + use_loss_summaries=True, + config=None): + """Initializes a StarGANEstimator instance. + + Args: + model_dir: Directory to save model parameters, graph and etc. This can + also be used to load checkpoints from the directory into a estimator to + continue training a previously saved model. + generator_fn: A python function that takes a Tensor, Tensor list, or + Tensor dictionary as inputs and returns the outputs of the GAN + generator. See `TFGAN` for more details and examples. Additionally, if + it has an argument called `mode`, the Estimator's `mode` will be passed + in (ex TRAIN, EVAL, PREDICT). This is useful for things like batch + normalization. + discriminator_fn: A python function that takes the output of + `generator_fn` or real data in the GAN setup, and `input_data`. Outputs + a Tensor in the range [-inf, inf]. See `TFGAN` for more details and + examples. + loss_fn: The loss function on the generator. Takes a `StarGANModel` + namedtuple and return a `GANLoss` namedtuple. + generator_optimizer: The optimizer for generator updates, or a function + that takes no arguments and returns an optimizer. 
This function will be + called when the default graph is the `StarGANEstimator`'s graph, so + utilities like `tf.contrib.framework.get_or_create_global_step` will + work. + discriminator_optimizer: Same as `generator_optimizer`, but for the + discriminator updates. + get_hooks_fn: A function that takes a `GANTrainOps` tuple and returns a + list of hooks. These hooks are run on the generator and discriminator + train ops, and can be used to implement the GAN training scheme. + Defaults to `train.get_sequential_train_hooks()`. + get_eval_metric_ops_fn: A function that takes a `GANModel`, and returns a + dict of metric results keyed by name. The output of this function is + passed into `tf.estimator.EstimatorSpec` during evaluation. + add_summaries: `None`, a single `SummaryType`, or a list of `SummaryType`. + use_loss_summaries: If `True`, add loss summaries. If `False`, does not. + If `None`, uses defaults. + config: `RunConfig` object to configure the runtime settings. + + Raises: + ValueError: If loss functions aren't callable. + ValueError: If `use_loss_summaries` isn't boolean or `None`. + ValueError: If `get_hooks_fn` isn't callable or `None`. 
+ """ + if not callable(loss_fn): + raise ValueError('loss_fn must be callable.') + if use_loss_summaries not in [True, False, None]: + raise ValueError('use_loss_summaries must be True, False or None.') + if get_hooks_fn is not None and not callable(get_hooks_fn): + raise TypeError('get_hooks_fn must be callable.') + + def _model_fn(features, labels, mode): + """StarGANEstimator model function.""" + if mode not in [ + model_fn_lib.ModeKeys.TRAIN, model_fn_lib.ModeKeys.EVAL, + model_fn_lib.ModeKeys.PREDICT + ]: + raise ValueError('Mode not recognized: %s' % mode) + + if mode == model_fn_lib.ModeKeys.PREDICT: + input_data = features[0] + input_data_domain_label = features[1] + else: + input_data = features # rename inputs for clarity + input_data_domain_label = labels # rename inputs for clarity + + # Make StarGANModel, which encapsulates the GAN model architectures. + gan_model = _get_gan_model(mode, generator_fn, discriminator_fn, + input_data, input_data_domain_label, + add_summaries) + + # Make the EstimatorSpec, which incorporates the StarGANModel, losses, + # eval, metrics, and optimizers (if required). 
+ return _get_estimator_spec(mode, gan_model, loss_fn, + get_eval_metric_ops_fn, generator_optimizer, + discriminator_optimizer, get_hooks_fn) + + super(StarGANEstimator, self).__init__( + model_fn=_model_fn, model_dir=model_dir, config=config) + + +def _get_gan_model(mode, + generator_fn, + discriminator_fn, + input_data, + input_data_domain_label, + add_summaries, + generator_scope='Generator'): + """Makes the StarGANModel tuple.""" + if mode == model_fn_lib.ModeKeys.PREDICT: + gan_model = _make_prediction_gan_model(input_data, input_data_domain_label, + generator_fn, generator_scope) + else: # model_fn_lib.ModeKeys.TRAIN or model_fn_lib.ModeKeys.EVAL + gan_model = _make_gan_model(generator_fn, discriminator_fn, input_data, + input_data_domain_label, generator_scope, + add_summaries, mode) + + return gan_model + + +def _get_estimator_spec(mode, + gan_model, + loss_fn, + get_eval_metric_ops_fn, + generator_optimizer, + discriminator_optimizer, + get_hooks_fn=None): + """Get the EstimatorSpec for the current mode.""" + if mode == model_fn_lib.ModeKeys.PREDICT: + estimator_spec = model_fn_lib.EstimatorSpec( + mode=mode, predictions=gan_model.generated_data) + else: + gan_loss = loss_fn(gan_model) + if mode == model_fn_lib.ModeKeys.EVAL: + estimator_spec = _get_eval_estimator_spec(gan_model, gan_loss, + get_eval_metric_ops_fn) + else: # model_fn_lib.ModeKeys.TRAIN: + gopt = ( + generator_optimizer() + if callable(generator_optimizer) else generator_optimizer) + dopt = ( + discriminator_optimizer() + if callable(discriminator_optimizer) else discriminator_optimizer) + get_hooks_fn = get_hooks_fn or tfgan_train.get_sequential_train_hooks() + estimator_spec = _get_train_estimator_spec(gan_model, gan_loss, gopt, + dopt, get_hooks_fn) + + return estimator_spec + + +def _make_gan_model(generator_fn, discriminator_fn, input_data, + input_data_domain_label, generator_scope, add_summaries, + mode): + """Construct a `StarGANModel`, and optionally pass in `mode`.""" + # If 
network functions have an argument `mode`, pass mode to it. + if 'mode' in inspect.getargspec(generator_fn).args: + generator_fn = functools.partial(generator_fn, mode=mode) + if 'mode' in inspect.getargspec(discriminator_fn).args: + discriminator_fn = functools.partial(discriminator_fn, mode=mode) + gan_model = tfgan_train.stargan_model( + generator_fn, + discriminator_fn, + input_data, + input_data_domain_label, + generator_scope=generator_scope) + if add_summaries: + if not isinstance(add_summaries, (tuple, list)): + add_summaries = [add_summaries] + with ops.name_scope(None): + for summary_type in add_summaries: + _summary_type_map[summary_type](gan_model) + + return gan_model + + +def _make_prediction_gan_model(input_data, input_data_domain_label, + generator_fn, generator_scope): + """Make a `StarGANModel` from just the generator.""" + # If `generator_fn` has an argument `mode`, pass mode to it. + if 'mode' in inspect.getargspec(generator_fn).args: + generator_fn = functools.partial( + generator_fn, mode=model_fn_lib.ModeKeys.PREDICT) + with variable_scope.variable_scope(generator_scope) as gen_scope: + # pylint:disable=protected-access + input_data = tfgan_train._convert_tensor_or_l_or_d(input_data) + input_data_domain_label = tfgan_train._convert_tensor_or_l_or_d( + input_data_domain_label) + # pylint:enable=protected-access + generated_data = generator_fn(input_data, input_data_domain_label) + generator_variables = variable_lib.get_trainable_variables(gen_scope) + + return tfgan_tuples.StarGANModel( + input_data=input_data, + input_data_domain_label=None, + generated_data=generated_data, + generated_data_domain_target=input_data_domain_label, + reconstructed_data=None, + discriminator_input_data_source_predication=None, + discriminator_generated_data_source_predication=None, + discriminator_input_data_domain_predication=None, + discriminator_generated_data_domain_predication=None, + generator_variables=generator_variables, + 
generator_scope=generator_scope, + generator_fn=generator_fn, + discriminator_variables=None, + discriminator_scope=None, + discriminator_fn=None) + + +def _get_eval_estimator_spec(gan_model, + gan_loss, + get_eval_metric_ops_fn=None, + name=None): + """Return an EstimatorSpec for the eval case.""" + scalar_loss = gan_loss.generator_loss + gan_loss.discriminator_loss + with ops.name_scope(None, 'metrics', + [gan_loss.generator_loss, gan_loss.discriminator_loss]): + + def _summary_key(head_name, val): + return '%s/%s' % (val, head_name) if head_name else val + + eval_metric_ops = { + _summary_key(name, 'generator_loss'): + metrics_lib.mean(gan_loss.generator_loss), + _summary_key(name, 'discriminator_loss'): + metrics_lib.mean(gan_loss.discriminator_loss) + } + if get_eval_metric_ops_fn is not None: + custom_eval_metric_ops = get_eval_metric_ops_fn(gan_model) + if not isinstance(custom_eval_metric_ops, dict): + raise TypeError('get_eval_metric_ops_fn must return a dict, ' + 'received: {}'.format(custom_eval_metric_ops)) + eval_metric_ops.update(custom_eval_metric_ops) + return model_fn_lib.EstimatorSpec( + mode=model_fn_lib.ModeKeys.EVAL, + predictions=gan_model.generated_data, + loss=scalar_loss, + eval_metric_ops=eval_metric_ops) + + +def _get_train_estimator_spec(gan_model, + gan_loss, + generator_optimizer, + discriminator_optimizer, + get_hooks_fn, + train_op_fn=tfgan_train.gan_train_ops): + """Return an EstimatorSpec for the train case.""" + scalar_loss = gan_loss.generator_loss + gan_loss.discriminator_loss + train_ops = train_op_fn(gan_model, gan_loss, generator_optimizer, + discriminator_optimizer) + training_hooks = get_hooks_fn(train_ops) + return model_fn_lib.EstimatorSpec( + loss=scalar_loss, + mode=model_fn_lib.ModeKeys.TRAIN, + train_op=train_ops.global_step_inc_op, + training_hooks=training_hooks) + + +def stargan_prediction_input_fn_wrapper(fn): + """StarGAN Estimator prediction input_fn wrapper. 
+ + Since estimator will disregard the "label" variable pass to the model, we will + use a wrapper to pack the (feature, label) tuple as feature passed to the + model. + + Args: + fn: input_fn for the prediction. + + Returns: + A tuple ((feature, label), None) where the second element is the dummy label + to be disregarded and the first element is the true input to the estimator. + """ + + def new_fn(): + return fn(), None + + return new_fn diff --git a/tensorflow/contrib/gan/python/estimator/python/stargan_estimator_test.py b/tensorflow/contrib/gan/python/estimator/python/stargan_estimator_test.py new file mode 100644 index 0000000000..2ec7938c7c --- /dev/null +++ b/tensorflow/contrib/gan/python/estimator/python/stargan_estimator_test.py @@ -0,0 +1,306 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for TFGAN's stargan_estimator.py.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import shutil +import tempfile + +from absl.testing import parameterized +import numpy as np +import six + +from tensorflow.contrib import layers +from tensorflow.contrib.gan.python import namedtuples as tfgan_tuples +from tensorflow.contrib.gan.python.estimator.python import stargan_estimator_impl as estimator +from tensorflow.python.estimator import model_fn as model_fn_lib +from tensorflow.python.estimator.inputs import numpy_io +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import metrics as metrics_lib +from tensorflow.python.ops import variable_scope +from tensorflow.python.platform import test +from tensorflow.python.summary.writer import writer_cache +from tensorflow.python.training import learning_rate_decay +from tensorflow.python.training import training +from tensorflow.python.training import training_util + + +def dummy_generator_fn(input_data, input_data_domain_label, mode): + del input_data_domain_label, mode + + return variable_scope.get_variable('dummy_g', initializer=0.5) * input_data + + +def dummy_discriminator_fn(input_data, num_domains, mode): + del mode + + hidden = layers.flatten(input_data) + output_src = math_ops.reduce_mean(hidden, axis=1) + output_cls = layers.fully_connected( + inputs=hidden, num_outputs=num_domains, scope='debug') + + return output_src, output_cls + + +class StarGetGANModelTest(test.TestCase, parameterized.TestCase): + """Tests that `StarGetGANModel` produces the correct model.""" + + @parameterized.named_parameters(('train', model_fn_lib.ModeKeys.TRAIN), + ('eval', model_fn_lib.ModeKeys.EVAL), + ('predict', model_fn_lib.ModeKeys.PREDICT)) + def 
test_get_gan_model(self, mode): + with ops.Graph().as_default(): + input_data = array_ops.ones([6, 4, 4, 3]) + input_data_domain_label = array_ops.one_hot([0] * 6, 5) + gan_model = estimator._get_gan_model( + mode, + dummy_generator_fn, + dummy_discriminator_fn, + input_data, + input_data_domain_label, + add_summaries=False) + + self.assertEqual(input_data, gan_model.input_data) + self.assertIsNotNone(gan_model.generated_data) + self.assertIsNotNone(gan_model.generated_data_domain_target) + self.assertEqual(1, len(gan_model.generator_variables)) + self.assertIsNotNone(gan_model.generator_scope) + self.assertIsNotNone(gan_model.generator_fn) + if mode == model_fn_lib.ModeKeys.PREDICT: + self.assertIsNone(gan_model.input_data_domain_label) + self.assertEqual(input_data_domain_label, + gan_model.generated_data_domain_target) + self.assertIsNone(gan_model.reconstructed_data) + self.assertIsNone(gan_model.discriminator_input_data_source_predication) + self.assertIsNone( + gan_model.discriminator_generated_data_source_predication) + self.assertIsNone(gan_model.discriminator_input_data_domain_predication) + self.assertIsNone( + gan_model.discriminator_generated_data_domain_predication) + self.assertIsNone(gan_model.discriminator_variables) + self.assertIsNone(gan_model.discriminator_scope) + self.assertIsNone(gan_model.discriminator_fn) + else: + self.assertEqual(input_data_domain_label, + gan_model.input_data_domain_label) + self.assertIsNotNone(gan_model.reconstructed_data.shape) + self.assertIsNotNone( + gan_model.discriminator_input_data_source_predication) + self.assertIsNotNone( + gan_model.discriminator_generated_data_source_predication) + self.assertIsNotNone( + gan_model.discriminator_input_data_domain_predication) + self.assertIsNotNone( + gan_model.discriminator_generated_data_domain_predication) + self.assertEqual(2, len(gan_model.discriminator_variables)) # 1 FC layer + self.assertIsNotNone(gan_model.discriminator_scope) + 
self.assertIsNotNone(gan_model.discriminator_fn) + + +def get_dummy_gan_model(): + """Similar to get_gan_model().""" + # TODO(joelshor): Find a better way of creating a variable scope. + with variable_scope.variable_scope('generator') as gen_scope: + gen_var = variable_scope.get_variable('dummy_var', initializer=0.0) + with variable_scope.variable_scope('discriminator') as dis_scope: + dis_var = variable_scope.get_variable('dummy_var', initializer=0.0) + return tfgan_tuples.StarGANModel( + input_data=array_ops.ones([1, 2, 2, 3]), + input_data_domain_label=array_ops.ones([1, 2]), + generated_data=array_ops.ones([1, 2, 2, 3]), + generated_data_domain_target=array_ops.ones([1, 2]), + reconstructed_data=array_ops.ones([1, 2, 2, 3]), + discriminator_input_data_source_predication=array_ops.ones([1]) * dis_var, + discriminator_generated_data_source_predication=array_ops.ones( + [1]) * gen_var * dis_var, + discriminator_input_data_domain_predication=array_ops.ones([1, 2 + ]) * dis_var, + discriminator_generated_data_domain_predication=array_ops.ones([1, 2]) * + gen_var * dis_var, + generator_variables=[gen_var], + generator_scope=gen_scope, + generator_fn=None, + discriminator_variables=[dis_var], + discriminator_scope=dis_scope, + discriminator_fn=None) + + +def dummy_loss_fn(gan_model): + loss = math_ops.reduce_sum( + gan_model.discriminator_input_data_domain_predication - + gan_model.discriminator_generated_data_domain_predication) + loss += math_ops.reduce_sum(gan_model.input_data - gan_model.generated_data) + return tfgan_tuples.GANLoss(loss, loss) + + +def get_metrics(gan_model): + return { + 'mse_custom_metric': + metrics_lib.mean_squared_error(gan_model.input_data, + gan_model.generated_data) + } + + +class GetEstimatorSpecTest(test.TestCase, parameterized.TestCase): + """Tests that the EstimatorSpec is constructed appropriately.""" + + @classmethod + def setUpClass(cls): + cls._generator_optimizer = training.GradientDescentOptimizer(1.0) + 
cls._discriminator_optimizer = training.GradientDescentOptimizer(1.0) + + @parameterized.named_parameters(('train', model_fn_lib.ModeKeys.TRAIN), + ('eval', model_fn_lib.ModeKeys.EVAL), + ('predict', model_fn_lib.ModeKeys.PREDICT)) + def test_get_estimator_spec(self, mode): + with ops.Graph().as_default(): + self._gan_model = get_dummy_gan_model() + spec = estimator._get_estimator_spec( + mode, + self._gan_model, + loss_fn=dummy_loss_fn, + get_eval_metric_ops_fn=get_metrics, + generator_optimizer=self._generator_optimizer, + discriminator_optimizer=self._discriminator_optimizer) + + self.assertEqual(mode, spec.mode) + if mode == model_fn_lib.ModeKeys.PREDICT: + self.assertEqual(self._gan_model.generated_data, spec.predictions) + elif mode == model_fn_lib.ModeKeys.TRAIN: + self.assertShapeEqual(np.array(0), spec.loss) # must be a scalar + self.assertIsNotNone(spec.train_op) + self.assertIsNotNone(spec.training_hooks) + elif mode == model_fn_lib.ModeKeys.EVAL: + self.assertEqual(self._gan_model.generated_data, spec.predictions) + self.assertShapeEqual(np.array(0), spec.loss) # must be a scalar + self.assertIsNotNone(spec.eval_metric_ops) + + +# TODO(joelshor): Add pandas test. 
+class StarGANEstimatorIntegrationTest(test.TestCase): + + def setUp(self): + self._model_dir = tempfile.mkdtemp() + + def tearDown(self): + if self._model_dir: + writer_cache.FileWriterCache.clear() + shutil.rmtree(self._model_dir) + + def _test_complete_flow(self, + train_input_fn, + eval_input_fn, + predict_input_fn, + prediction_size, + lr_decay=False): + + def make_opt(): + gstep = training_util.get_or_create_global_step() + lr = learning_rate_decay.exponential_decay(1.0, gstep, 10, 0.9) + return training.GradientDescentOptimizer(lr) + + gopt = make_opt if lr_decay else training.GradientDescentOptimizer(1.0) + dopt = make_opt if lr_decay else training.GradientDescentOptimizer(1.0) + est = estimator.StarGANEstimator( + generator_fn=dummy_generator_fn, + discriminator_fn=dummy_discriminator_fn, + loss_fn=dummy_loss_fn, + generator_optimizer=gopt, + discriminator_optimizer=dopt, + get_eval_metric_ops_fn=get_metrics, + model_dir=self._model_dir) + + # TRAIN + num_steps = 10 + est.train(train_input_fn, steps=num_steps) + + # EVALUTE + scores = est.evaluate(eval_input_fn) + self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP]) + self.assertIn('loss', six.iterkeys(scores)) + self.assertEqual(scores['discriminator_loss'] + scores['generator_loss'], + scores['loss']) + self.assertIn('mse_custom_metric', six.iterkeys(scores)) + + # PREDICT + predictions = np.array([x for x in est.predict(predict_input_fn)]) + + self.assertAllEqual(prediction_size, predictions.shape) + + @staticmethod + def _numpy_input_fn_wrapper(numpy_input_fn, batch_size, label_size): + """Wrapper to remove the dictionary in numpy_input_fn. + + NOTE: + We create the domain_label here because the model expect a fully define + batch_size from the input. 
+ + Args: + numpy_input_fn: input_fn created from numpy_io + batch_size: (int) number of items for each batch + label_size: (int) number of domains + + Returns: + a new input_fn + """ + + def new_input_fn(): + features = numpy_input_fn() + return features['x'], array_ops.one_hot([0] * batch_size, label_size) + + return new_input_fn + + def test_numpy_input_fn(self): + """Tests complete flow with numpy_input_fn.""" + batch_size = 5 + img_size = 8 + channel_size = 3 + label_size = 3 + image_data = np.zeros( + [batch_size, img_size, img_size, channel_size], dtype=np.float32) + train_input_fn = numpy_io.numpy_input_fn( + x={'x': image_data}, + batch_size=batch_size, + num_epochs=None, + shuffle=True) + eval_input_fn = numpy_io.numpy_input_fn( + x={'x': image_data}, batch_size=batch_size, shuffle=False) + predict_input_fn = numpy_io.numpy_input_fn( + x={'x': image_data}, shuffle=False) + + train_input_fn = self._numpy_input_fn_wrapper(train_input_fn, batch_size, + label_size) + eval_input_fn = self._numpy_input_fn_wrapper(eval_input_fn, batch_size, + label_size) + predict_input_fn = self._numpy_input_fn_wrapper(predict_input_fn, + batch_size, label_size) + + predict_input_fn = estimator.stargan_prediction_input_fn_wrapper( + predict_input_fn) + + self._test_complete_flow( + train_input_fn=train_input_fn, + eval_input_fn=eval_input_fn, + predict_input_fn=predict_input_fn, + prediction_size=[batch_size, img_size, img_size, channel_size]) + + +if __name__ == '__main__': + test.main() -- GitLab From ccd8cfe1fcce8fb197f5020e36714d3526e1b1f6 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Mon, 27 Aug 2018 02:01:42 -0700 Subject: [PATCH 146/598] compat: Update forward compatibility horizon to 2018-08-27 PiperOrigin-RevId: 210334356 --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index dd3b957da9..d9f92c3eda 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -26,7 +26,7 @@ import datetime from tensorflow.python.util import tf_contextlib from tensorflow.python.util.tf_export import tf_export -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 8, 26) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 8, 27) @tf_export("compat.forward_compatible") -- GitLab From 8359c1d982ba6efafae4a1855f2e3c05c97cca16 Mon Sep 17 00:00:00 2001 From: Tom Hennigan Date: Mon, 27 Aug 2018 02:59:28 -0700 Subject: [PATCH 147/598] Migrate Keras from `variable_scope` imports to `variables`. Apart from one instance of `get_variable` in `training_test.py` this is just a cosmetic change since the symbols from `variable_scope` simply forward to ones in `variables`. 
PiperOrigin-RevId: 210340379 --- tensorflow/python/keras/BUILD | 1 - tensorflow/python/keras/engine/base_layer.py | 17 ++++++++--------- tensorflow/python/keras/engine/training_test.py | 4 ++-- tensorflow/python/keras/layers/normalization.py | 14 +++++++------- tensorflow/python/keras/metrics.py | 6 +++--- 5 files changed, 20 insertions(+), 22 deletions(-) diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD index f70da75610..5523d70a8d 100755 --- a/tensorflow/python/keras/BUILD +++ b/tensorflow/python/keras/BUILD @@ -102,7 +102,6 @@ py_library( "//tensorflow/python:tensor_array_ops", "//tensorflow/python:tensor_shape", "//tensorflow/python:util", - "//tensorflow/python:variable_scope", "//tensorflow/python:variables", ], ) diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py index 2c7f1036fb..b6b05c0311 100644 --- a/tensorflow/python/keras/engine/base_layer.py +++ b/tensorflow/python/keras/engine/base_layer.py @@ -42,7 +42,6 @@ from tensorflow.python.keras.utils.generic_utils import to_snake_case # pylint: from tensorflow.python.keras.utils.tf_utils import is_tensor_or_tensor_list # pylint: disable=unused-import from tensorflow.python.ops import array_ops from tensorflow.python.ops import init_ops -from tensorflow.python.ops import variable_scope as vs from tensorflow.python.ops import variables as tf_variables from tensorflow.python.training.checkpointable import base as checkpointable from tensorflow.python.util import function_utils @@ -483,8 +482,8 @@ class Layer(checkpointable.CheckpointableBase): constraint=None, partitioner=None, use_resource=None, - synchronization=vs.VariableSynchronization.AUTO, - aggregation=vs.VariableAggregation.NONE, + synchronization=tf_variables.VariableSynchronization.AUTO, + aggregation=tf_variables.VariableAggregation.NONE, **kwargs): """Adds a new variable to the layer, or gets an existing one; returns it. 
@@ -541,7 +540,7 @@ class Layer(checkpointable.CheckpointableBase): regularizer = regularizers.get(regularizer) constraint = constraints.get(constraint) - if synchronization == vs.VariableSynchronization.ON_READ: + if synchronization == tf_variables.VariableSynchronization.ON_READ: if trainable: raise ValueError( 'Synchronization value can be set to ' @@ -1906,8 +1905,8 @@ def make_variable(name, constraint=None, use_resource=None, collections=None, - synchronization=vs.VariableSynchronization.AUTO, - aggregation=vs.VariableAggregation.NONE, + synchronization=tf_variables.VariableSynchronization.AUTO, + aggregation=tf_variables.VariableAggregation.NONE, partitioner=None): # pylint: disable=unused-argument """Temporary util to create a variable (relies on `variable_scope.variable`). @@ -1935,8 +1934,8 @@ def make_variable(name, then this parameter is ignored and any added variables are also marked as non-trainable. `trainable` defaults to `True` unless `synchronization` is set to `ON_READ`. - caching_device: Passed to `vs.variable`. - validate_shape: Passed to `vs.variable`. + caching_device: Passed to `tf.Variable`. + validate_shape: Passed to `tf.Variable`. constraint: Constraint instance (callable). use_resource: Whether to use a `ResourceVariable`. collections: List of graph collections keys. 
The new variable is added to @@ -1973,7 +1972,7 @@ def make_variable(name, if use_resource is None: use_resource = True - v = vs.variable( + v = tf_variables.Variable( initial_value=init_val, name=name, trainable=trainable, diff --git a/tensorflow/python/keras/engine/training_test.py b/tensorflow/python/keras/engine/training_test.py index bf2d231861..bf5c7fd7f8 100644 --- a/tensorflow/python/keras/engine/training_test.py +++ b/tensorflow/python/keras/engine/training_test.py @@ -36,7 +36,6 @@ from tensorflow.python.keras.engine.training_utils import weighted_masked_object from tensorflow.python.keras.utils.generic_utils import slice_arrays from tensorflow.python.ops import array_ops from tensorflow.python.ops import sparse_ops -from tensorflow.python.ops import variable_scope from tensorflow.python.ops import variables as variables_lib from tensorflow.python.platform import test from tensorflow.python.platform import tf_logging as logging @@ -392,7 +391,8 @@ class TrainingTest(test.TestCase): def test_compile_with_sparse_placeholders(self): with self.test_session(): input_layer = keras.layers.Input(shape=(10,), sparse=True) - weights = variable_scope.get_variable(name='weights', shape=(10, 1)) + weights = variables_lib.Variable( + np.ones((10, 1)).astype(np.float32), name='weights') weights_mult = lambda x: sparse_ops.sparse_tensor_dense_matmul(x, weights) output_layer = keras.layers.Lambda(weights_mult)(input_layer) model = keras.Model([input_layer], output_layer) diff --git a/tensorflow/python/keras/layers/normalization.py b/tensorflow/python/keras/layers/normalization.py index cd26e04c39..013d572088 100644 --- a/tensorflow/python/keras/layers/normalization.py +++ b/tensorflow/python/keras/layers/normalization.py @@ -34,7 +34,7 @@ from tensorflow.python.ops import init_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn from tensorflow.python.ops import state_ops -from tensorflow.python.ops import variable_scope +from 
tensorflow.python.ops import variables as tf_variables from tensorflow.python.platform import tf_logging as logging from tensorflow.python.training import distribution_strategy_context from tensorflow.python.util.tf_export import tf_export @@ -313,18 +313,18 @@ class BatchNormalization(Layer): shape=param_shape, dtype=param_dtype, initializer=self.moving_mean_initializer, - synchronization=variable_scope.VariableSynchronization.ON_READ, + synchronization=tf_variables.VariableSynchronization.ON_READ, trainable=False, - aggregation=variable_scope.VariableAggregation.MEAN) + aggregation=tf_variables.VariableAggregation.MEAN) self.moving_variance = self.add_weight( name='moving_variance', shape=param_shape, dtype=param_dtype, initializer=self.moving_variance_initializer, - synchronization=variable_scope.VariableSynchronization.ON_READ, + synchronization=tf_variables.VariableSynchronization.ON_READ, trainable=False, - aggregation=variable_scope.VariableAggregation.MEAN) + aggregation=tf_variables.VariableAggregation.MEAN) if self.renorm: # Create variables to maintain the moving mean and standard deviation. 
@@ -340,9 +340,9 @@ class BatchNormalization(Layer): shape=shape, dtype=param_dtype, initializer=init_ops.zeros_initializer(), - synchronization=variable_scope.VariableSynchronization.ON_READ, + synchronization=tf_variables.VariableSynchronization.ON_READ, trainable=False, - aggregation=variable_scope.VariableAggregation.MEAN) + aggregation=tf_variables.VariableAggregation.MEAN) return var with distribution_strategy_context.get_distribution_strategy( diff --git a/tensorflow/python/keras/metrics.py b/tensorflow/python/keras/metrics.py index 44ae6c5b1f..14cf1ce2af 100644 --- a/tensorflow/python/keras/metrics.py +++ b/tensorflow/python/keras/metrics.py @@ -53,7 +53,7 @@ from tensorflow.python.ops import init_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn from tensorflow.python.ops import state_ops -from tensorflow.python.ops import variable_scope as vs +from tensorflow.python.ops import variables as tf_variables from tensorflow.python.ops import weights_broadcast_ops from tensorflow.python.training import distribution_strategy_context from tensorflow.python.util import tf_decorator @@ -393,8 +393,8 @@ class Metric(Layer): def add_weight(self, name, shape=(), - aggregation=vs.VariableAggregation.SUM, - synchronization=vs.VariableSynchronization.ON_READ, + aggregation=tf_variables.VariableAggregation.SUM, + synchronization=tf_variables.VariableSynchronization.ON_READ, initializer=None): """Adds state variable. Only for use by subclasses.""" return super(Metric, self).add_weight( -- GitLab From e2c6ec9e86dd86e0dd56e0f11302a5bf5d9ed440 Mon Sep 17 00:00:00 2001 From: Tom Hennigan Date: Mon, 27 Aug 2018 03:58:42 -0700 Subject: [PATCH 148/598] Allow `tf.enable_eager_execution()` to be run multiple times. Subsequent calls after a successful call to enable eager execution are a no op. This is mostly to support colab where I might accidentally re-execute this and have to clear a large stack trace / split my import cell. 
PiperOrigin-RevId: 210344960 --- tensorflow/python/BUILD | 13 +++++++ tensorflow/python/framework/ops.py | 11 +++--- .../python/framework/ops_enable_eager_test.py | 38 +++++++++++++++++++ 3 files changed, 57 insertions(+), 5 deletions(-) create mode 100644 tensorflow/python/framework/ops_enable_eager_test.py diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 37af3d350e..f62a927925 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -1347,6 +1347,19 @@ py_test( ], ) +py_test( + name = "framework_ops_enable_eager_test", + size = "small", + srcs = ["framework/ops_enable_eager_test.py"], + main = "framework/ops_enable_eager_test.py", + srcs_version = "PY2AND3", + deps = [ + ":framework", + ":platform_test", + "//tensorflow/python/eager:context", + ], +) + py_test( name = "framework_tensor_shape_test", size = "small", diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py index 8c85a422e7..ae86d55d3e 100644 --- a/tensorflow/python/framework/ops.py +++ b/tensorflow/python/framework/ops.py @@ -5376,11 +5376,12 @@ def enable_eager_execution(config=None, TensorFlow graph, or if options provided conflict with a previous call to this function. """ - return enable_eager_execution_internal( - config=config, - device_policy=device_policy, - execution_mode=execution_mode, - server_def=None) + if context._default_mode != context.EAGER_MODE: # pylint: disable=protected-access + return enable_eager_execution_internal( + config=config, + device_policy=device_policy, + execution_mode=execution_mode, + server_def=None) def enable_eager_execution_internal(config=None, diff --git a/tensorflow/python/framework/ops_enable_eager_test.py b/tensorflow/python/framework/ops_enable_eager_test.py new file mode 100644 index 0000000000..99d06f1c2d --- /dev/null +++ b/tensorflow/python/framework/ops_enable_eager_test.py @@ -0,0 +1,38 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests enabling eager execution at process level.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.eager import context +from tensorflow.python.framework import ops +from tensorflow.python.platform import googletest + + +class OpsEnableEagerTest(googletest.TestCase): + + def test_enable_eager_execution_multiple_times(self): + ops.enable_eager_execution() + self.assertTrue(context.executing_eagerly()) + + # Calling enable eager execution a second time should not cause an error. 
+ ops.enable_eager_execution() + self.assertTrue(context.executing_eagerly()) + + +if __name__ == '__main__': + googletest.main() -- GitLab From 6db10487de1aabfd547aaf8a2cb72379ea9a4a8d Mon Sep 17 00:00:00 2001 From: Keishi Hattori Date: Mon, 27 Aug 2018 20:15:10 +0900 Subject: [PATCH 149/598] Fix pix2pix_eager.ipynb image resize method 2 was BICUBIC not NEAREST_NEIGHBOR https://github.com/tensorflow/tensorflow/blob/r1.10/tensorflow/python/ops/image_ops_impl.py#L936 --- .../eager/python/examples/pix2pix/pix2pix_eager.ipynb | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tensorflow/contrib/eager/python/examples/pix2pix/pix2pix_eager.ipynb b/tensorflow/contrib/eager/python/examples/pix2pix/pix2pix_eager.ipynb index ee25d25b52..d60ee18586 100644 --- a/tensorflow/contrib/eager/python/examples/pix2pix/pix2pix_eager.ipynb +++ b/tensorflow/contrib/eager/python/examples/pix2pix/pix2pix_eager.ipynb @@ -147,11 +147,12 @@ " # random jittering\n", " \n", " # resizing to 286 x 286 x 3\n", - " # method = 2 indicates using \"ResizeMethod.NEAREST_NEIGHBOR\"\n", " input_image = tf.image.resize_images(input_image, [286, 286], \n", - " align_corners=True, method=2)\n", + " align_corners=True, \n", + " method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)\n", " real_image = tf.image.resize_images(real_image, [286, 286], \n", - " align_corners=True, method=2)\n", + " align_corners=True, \n", + " method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)\n", " \n", " # randomly cropping to 256 x 256 x 3\n", " stacked_image = tf.stack([input_image, real_image], axis=0)\n", -- GitLab From 514f65a0cab6fb98bba6d69904ba930ff1c46247 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Mon, 27 Aug 2018 08:06:33 -0700 Subject: [PATCH 150/598] Fix compiler warning about unused variable PiperOrigin-RevId: 210368536 --- tensorflow/core/framework/function.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tensorflow/core/framework/function.h b/tensorflow/core/framework/function.h index 56e2017a61..03296a7761 100644 --- a/tensorflow/core/framework/function.h +++ b/tensorflow/core/framework/function.h @@ -710,9 +710,10 @@ Status ArgNumType(AttrSlice attrs, const OpDef::ArgDef& arg_def, #define REGISTER_OP_GRADIENT_UNIQ_HELPER(ctr, name, fn) \ REGISTER_OP_GRADIENT_UNIQ(ctr, name, fn) -#define REGISTER_OP_GRADIENT_UNIQ(ctr, name, fn) \ - static bool unused_grad_##ctr = SHOULD_REGISTER_OP_GRADIENT && \ - ::tensorflow::gradient::RegisterOp(name, fn) +#define REGISTER_OP_GRADIENT_UNIQ(ctr, name, fn) \ + static bool unused_grad_##ctr TF_ATTRIBUTE_UNUSED = \ + SHOULD_REGISTER_OP_GRADIENT && \ + ::tensorflow::gradient::RegisterOp(name, fn) namespace gradient { // Register a gradient creator for the "op". -- GitLab From a4ca39f0bc581c952193c75b3a1ad39d2f616996 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Mon, 27 Aug 2018 08:47:13 -0700 Subject: [PATCH 151/598] [XLA] Unify spelling of 'fusible' Of {fusable, fusile, fusible} my dictionary only knows about fusible. 
PiperOrigin-RevId: 210373347 --- .../compiler/jit/xla_fusion_optimizer.cc | 8 +++---- .../compiler/jit/xla_fusion_optimizer_test.cc | 2 +- .../xla/service/cpu/cpu_instruction_fusion.cc | 4 ++-- .../cpu/cpu_instruction_fusion_test.cc | 2 +- .../xla/service/cpu/tests/cpu_fusion_test.cc | 4 ++-- .../xla/service/gpu/instruction_fusion.cc | 4 ++-- .../xla/service/gpu/ir_emitter_unnested.cc | 2 +- .../xla/service/gpu/multi_output_fusion.cc | 12 +++++----- .../compiler/xla/service/hlo_instruction.cc | 2 +- .../compiler/xla/service/hlo_instruction.h | 2 +- .../compiler/xla/service/hlo_instructions.cc | 2 +- .../xla/service/instruction_fusion.cc | 24 +++++++++---------- .../compiler/xla/service/instruction_fusion.h | 2 +- .../xla/service/instruction_fusion_test.cc | 6 ++--- 14 files changed, 38 insertions(+), 38 deletions(-) diff --git a/tensorflow/compiler/jit/xla_fusion_optimizer.cc b/tensorflow/compiler/jit/xla_fusion_optimizer.cc index 915c5afa79..07cfab6151 100644 --- a/tensorflow/compiler/jit/xla_fusion_optimizer.cc +++ b/tensorflow/compiler/jit/xla_fusion_optimizer.cc @@ -41,8 +41,8 @@ static bool IsShapeConsumerOp(const Node& node) { } // Returns true if the op can be decomposed into XLA ops for which -// there are fusable elemental implementations. -bool IsXlaFusable(const NodeDef& node) { +// there are fusible elemental implementations. +static bool IsXlaFusible(const NodeDef& node) { static const std::unordered_set* elementwise_ops = new std::unordered_set( {// tf2xla/kernels/aggregate_ops.cc @@ -176,9 +176,9 @@ Status XlaFusionOptimizer::Optimize(grappler::Cluster* cluster, TF_RETURN_IF_ERROR(DeviceToDeviceType(node->def().device(), &device_type)); if (device_type.type_string().find("XLA") != string::npos) continue; - // Assume all fusable ops are registered. + // Assume all fusible ops are registered. // TODO(hpucha): Check for registration if possible. 
- if (!IsXlaFusable(node->def())) { + if (!IsXlaFusible(node->def())) { continue; } diff --git a/tensorflow/compiler/jit/xla_fusion_optimizer_test.cc b/tensorflow/compiler/jit/xla_fusion_optimizer_test.cc index b77b207908..68e19c8a13 100644 --- a/tensorflow/compiler/jit/xla_fusion_optimizer_test.cc +++ b/tensorflow/compiler/jit/xla_fusion_optimizer_test.cc @@ -73,7 +73,7 @@ TEST_F(XlaFusionOptimizerTest, Chains) { EXPECT_TRUE(clusters.find("D") == clusters.cend()); } -TEST_F(XlaFusionOptimizerTest, FusableOps) { +TEST_F(XlaFusionOptimizerTest, FusibleOps) { GraphDef graph; { GraphDefBuilder builder(GraphDefBuilder::kFailImmediately); diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc index b40d264c03..7f867fa149 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc @@ -78,7 +78,7 @@ bool CpuInstructionFusion::ShouldFuse(HloInstruction* consumer, } if (!CanBeLoopFused(*producer)) { - VLOG(2) << "Producer is not fusile."; + VLOG(2) << "Producer is not fusible."; return false; } @@ -140,7 +140,7 @@ bool CpuInstructionFusion::ShouldFuse(HloInstruction* consumer, } if (CanBeLoopFused(*consumer)) { - VLOG(2) << "Fusing: consumer is elementwise or fusile."; + VLOG(2) << "Fusing: consumer is elementwise or fusible."; return true; } diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc index c3e03056f0..28aaa28cdb 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc @@ -567,7 +567,7 @@ TEST_F(OpcodeFusionTest, DynamicSliceWithDynamicUpdateSlice) { HloOpcode::kParameter, HloOpcode::kParameter}); } -TEST_F(OpcodeFusionTest, MessOfFusileNodes) { +TEST_F(OpcodeFusionTest, MessOfFusibleNodes) { auto module = 
CreateNewModule(); HloComputation::Builder builder(TestName()); diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_fusion_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_fusion_test.cc index b68ac67574..22721051e5 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/cpu_fusion_test.cc +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_fusion_test.cc @@ -129,8 +129,8 @@ TEST_F(CpuFusionTest, FuseElementwiseOpChain) { error_spec_); } -TEST_F(CpuFusionTest, ElementwiseOpChainWithNonfusableInstruction) { - // Test a chain of fusable ops with a non-fusable op (a reduce) thrown in the +TEST_F(CpuFusionTest, ElementwiseOpChainWithNonfusibleInstruction) { + // Test a chain of fusible ops with a non-fusible op (a reduce) thrown in the // middle. auto module = CreateNewModule(); auto builder = HloComputation::Builder(TestName()); diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc index 0f2c83aeb2..0bcaaee2b7 100644 --- a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc @@ -26,7 +26,7 @@ namespace gpu { namespace { -bool IsFusile(const HloInstruction& hlo) { +bool IsFusible(const HloInstruction& hlo) { // Don't fuse get-tuple-element on GPU: We can, but it's slower than not // fusing. We never generate kernels for unfused GTEs. 
Instead, if an // unfused GTE is an input to a kernel (including a fusion kernel), we @@ -245,7 +245,7 @@ bool GpuInstructionFusion::ShouldFuse(HloInstruction* consumer, return true; } - if (!IsFusile(*producer) || !IsFusile(*consumer) || + if (!IsFusible(*producer) || !IsFusible(*consumer) || !InstructionFusion::ShouldFuse(consumer, operand_index)) { return false; } diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 84043689bd..9c7b508e10 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -3345,7 +3345,7 @@ bool IrEmitterUnnested::CheckAndEmitHloWithTile021(HloInstruction* hlo) { // if there's a Right Choice. // // This is only sound if tiled transposes are the only place where we use - // shared memory in fusions. If in the future other fusile ops use shared + // shared memory in fusions. If in the future other fusible ops use shared // memory, we'll have to adjust this heuristic. constexpr int kMinBlocksPerCore = 3; constexpr int64 kShmemPerCore = 48 * 1024; diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc index 9fb6f569ae..7a43f0be54 100644 --- a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc +++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc @@ -145,7 +145,7 @@ bool GpuMultiOutputFusion::IsFusible(HloInstruction* instr) { // with any other instruction. // TODO(b/112957171): This should use the same isFusible logic as // instruction_fusion. 
- return instr->IsFusable() && + return instr->IsFusible() && (IsInputFusibleReduction(instr) || (instr->opcode() == HloOpcode::kFusion && instr->fusion_kind() == HloInstruction::FusionKind::kLoop) || @@ -204,7 +204,7 @@ bool GpuMultiOutputFusion::DoProducerConsumerMultiOutputFusion() { tensorflow::gtl::FlatSet to_fuse; // Keep a list of the instructions to fuse after making all the fusion // decisions. We first aggressively add instructions to potential_fusion_list, - // then filter out instructions that will be no longer fusable because of + // then filter out instructions that will be no longer fusible because of // reachability change. This avoids recalculating reachability on a large set // of instructions. std::vector> @@ -220,7 +220,7 @@ bool GpuMultiOutputFusion::DoProducerConsumerMultiOutputFusion() { continue; } if (!IsInputFusibleReduction(consumer)) { - VLOG(3) << consumer->name() << " is not an input-fusable reduction."; + VLOG(3) << consumer->name() << " is not an input-fusible reduction."; continue; } VLOG(3) << consumer->name() @@ -229,8 +229,8 @@ bool GpuMultiOutputFusion::DoProducerConsumerMultiOutputFusion() { auto consumer_operands = consumer->operands(); for (size_t i = 0; i < consumer_operands.size(); ++i) { HloInstruction* producer = consumer_operands[i]; - if (!producer->IsFusable()) { - VLOG(3) << producer->name() << " is not fusable."; + if (!producer->IsFusible()) { + VLOG(3) << producer->name() << " is not fusible."; continue; } const bool is_loop_fusion = @@ -270,7 +270,7 @@ bool GpuMultiOutputFusion::DoProducerConsumerMultiOutputFusion() { } } - // Filter out pairs that will be no longer fusable because of reachability + // Filter out pairs that will be no longer fusible because of reachability // change. 
for (auto& fusion_pair : potential_fusion_list) { HloInstruction* producer = fusion_pair.first; diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc index 2bb9de686f..3e077d8aec 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction.cc @@ -2169,7 +2169,7 @@ void HloInstruction::set_tracing(HloInstruction* trace_instruction) { bool HloInstruction::IsFused() const { return parent_->IsFusionComputation(); } -bool HloInstruction::IsFusable() const { +bool HloInstruction::IsFusible() const { // Instructions which are traced should not be fused. if (tracing()) { return false; diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h index 948e33a0a3..01437f66cd 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.h +++ b/tensorflow/compiler/xla/service/hlo_instruction.h @@ -1029,7 +1029,7 @@ class HloInstruction { // Returns true if this instruction can be legally fused into a fusion // instruction. - bool IsFusable() const; + bool IsFusible() const; // Returns the sharding applied to this operator. // REQUIRES: has_sharding() is true. 
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc index a0de253eda..b407cfeb50 100644 --- a/tensorflow/compiler/xla/service/hlo_instructions.cc +++ b/tensorflow/compiler/xla/service/hlo_instructions.cc @@ -1152,7 +1152,7 @@ HloInstruction* HloFusionInstruction::FuseInstructionInternal( HloInstruction* HloFusionInstruction::CloneAndFuseInternal( HloInstruction* instruction_to_fuse, bool add_output) { - CHECK(instruction_to_fuse->IsFusable()) << instruction_to_fuse->ToString(); + CHECK(instruction_to_fuse->IsFusible()) << instruction_to_fuse->ToString(); VLOG(3) << "CloneAndFuseInternal:\n" << instruction_to_fuse->ToString(); HloInstruction* clone = nullptr; if (called_computations().empty()) { diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc index be59ce8281..6207cdfb0d 100644 --- a/tensorflow/compiler/xla/service/instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/instruction_fusion.cc @@ -189,13 +189,13 @@ bool InstructionFusion::CanFuseOnAllPaths( if (consumer == producer) { return true; } - if (!consumer->IsFusable()) { + if (!consumer->IsFusible()) { return false; } for (int64 i = 0, e = consumer->operand_count(); i < e; ++i) { auto* consumer_operand = consumer->mutable_operand(i); // If the operand is not on a path to the producer, it doesn't matter - // whether it's fusable. + // whether it's fusible. if (!reachability_->IsReachable(producer, consumer_operand)) { continue; } @@ -205,7 +205,7 @@ bool InstructionFusion::CanFuseOnAllPaths( } // The producer is reachable from consumer_operand which means we need // to be able to fuse consumer_operand into consumer in order for - // producer to be fusable into consumer on all paths. + // producer to be fusible into consumer on all paths. // Perform the recursive step: make sure producer can be fused into // consumer_operand on all paths. 
if (!CanFuseOnAllPaths(producer, consumer_operand, do_not_duplicate)) { @@ -216,7 +216,7 @@ bool InstructionFusion::CanFuseOnAllPaths( } InstructionFusion::HloInstructionSet -InstructionFusion::ComputeGloballyUnfusable( +InstructionFusion::ComputeGloballyUnfusible( tensorflow::gtl::ArraySlice post_order) { // Forbid fusion of producers that: // a) Need to be duplicated, unless they can be fused into all consumers @@ -270,19 +270,19 @@ InstructionFusion::ComputeGloballyUnfusable( // all of its consumers on all paths. // // That means, that for: - // A --> B (fusable) - // \-> C (non-fusable) + // A --> B (fusible) + // \-> C (non-fusible) // A will be not allowed to be fused into B, as it cannot be fused into C. // // Similarly, for: // A -------------> B // \-> C -> D -/ // If: - // - A is fusable into B and C, and D is fusable into B - // - C is *not* fusable into D + // - A is fusible into B and C, and D is fusible into B + // - C is *not* fusible into D // A will be not allowed to be fused into B, as it cannot be fused via // all paths. - if (producer->IsFusable() && + if (producer->IsFusible() && CanFuseOnAllPaths(producer, consumer, do_not_duplicate)) { continue; } @@ -318,7 +318,7 @@ StatusOr InstructionFusion::Run(HloModule* module) { InsertOrDie(&post_order_index, post_order[i], i); } - HloInstructionSet do_not_duplicate = ComputeGloballyUnfusable(post_order); + HloInstructionSet do_not_duplicate = ComputeGloballyUnfusible(post_order); // Instruction fusion effectively fuses edges in the computation graph // (producer instruction -> consumer instruction) so we iterate over all @@ -341,7 +341,7 @@ StatusOr InstructionFusion::Run(HloModule* module) { // consistent. 
post_order_index.erase(instruction); - if (!instruction->IsFusable() && + if (!instruction->IsFusible() && instruction->opcode() != HloOpcode::kFusion) { continue; } @@ -413,7 +413,7 @@ StatusOr InstructionFusion::Run(HloModule* module) { for (int64 i : sorted_operand_numbers) { HloInstruction* operand = instruction->mutable_operand(i); - if (!operand->IsFusable()) { + if (!operand->IsFusible()) { continue; } diff --git a/tensorflow/compiler/xla/service/instruction_fusion.h b/tensorflow/compiler/xla/service/instruction_fusion.h index 8489c3d9ad..9802d4cfc1 100644 --- a/tensorflow/compiler/xla/service/instruction_fusion.h +++ b/tensorflow/compiler/xla/service/instruction_fusion.h @@ -122,7 +122,7 @@ class InstructionFusion : public HloPassInterface { // Computes the set of nodes that we do not want to fuse into any of their // consumers based on a global analysis of the HLO graph. - HloInstructionSet ComputeGloballyUnfusable( + HloInstructionSet ComputeGloballyUnfusible( tensorflow::gtl::ArraySlice post_order); // Used to determine if an HLO is expensive. 
Expensive operations will not be diff --git a/tensorflow/compiler/xla/service/instruction_fusion_test.cc b/tensorflow/compiler/xla/service/instruction_fusion_test.cc index 9e7a15f033..da1ad90959 100644 --- a/tensorflow/compiler/xla/service/instruction_fusion_test.cc +++ b/tensorflow/compiler/xla/service/instruction_fusion_test.cc @@ -158,7 +158,7 @@ TEST_F(InstructionFusionTest, PotentialBitcastTransposeOfParameterUnfused) { .ValueOrDie()); } -TEST_F(InstructionFusionTest, AvoidDuplicationIfNotAllFusable) { +TEST_F(InstructionFusionTest, AvoidDuplicationIfNotAllFusible) { HloComputation::Builder builder(TestName()); auto shape = ShapeUtil::MakeShape(F32, {16, 16}); auto param0 = @@ -216,7 +216,7 @@ TEST_F(InstructionFusionTest, FuseCheapNonDuplicatableOps) { EXPECT_EQ(Count(*module, HloOpcode::kAdd), 1) << module->ToString(); } -TEST_F(InstructionFusionTest, AvoidDuplicationIfNotAllFusableRecursively) { +TEST_F(InstructionFusionTest, AvoidDuplicationIfNotAllFusibleRecursively) { // Make sure we do not duplicate the add, as we cannot fuse through the rng. // // p0 -> add -------------------------> sub @@ -309,7 +309,7 @@ TEST_F(InstructionFusionTest, AvoidDuplicationIfNotAllFusableRecursively) { EXPECT_EQ(Count(*module, HloOpcode::kAdd), 2) << module->ToString(); // A variant of the above that allows the algorithm to put add2 into the set - // of unfusable ops to short-circuit the decision whether add1 should be fused + // of unfusible ops to short-circuit the decision whether add1 should be fused // into sub2. // // /---------------\ -- GitLab From 3848e7ce3e2fa80029d3d8aa762b2f5a47c44c7a Mon Sep 17 00:00:00 2001 From: Tom Hennigan Date: Mon, 27 Aug 2018 08:52:03 -0700 Subject: [PATCH 152/598] Remove some useless pylint statements. 
PiperOrigin-RevId: 210373905 --- tensorflow/python/eager/context.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py index 6a327bd010..f75ea6c265 100644 --- a/tensorflow/python/eager/context.py +++ b/tensorflow/python/eager/context.py @@ -504,9 +504,7 @@ class Context(object): Args: fn: A wrapped TF_Function (returned from TF_GraphToFunction_wrapper). """ - pywrap_tensorflow.TFE_ContextAddFunction( - self._handle, # pylint: disable=protected-access - fn) + pywrap_tensorflow.TFE_ContextAddFunction(self._handle, fn) def add_function_def(self, fdef): """Add a function definition to the context. @@ -519,9 +517,7 @@ class Context(object): """ fdef_string = fdef.SerializeToString() pywrap_tensorflow.TFE_ContextAddFunctionDef( - self._handle, # pylint: disable=protected-access - fdef_string, - len(fdef_string)) + self._handle, fdef_string, len(fdef_string)) def add_post_execution_callback(self, callback): """Add a post-execution callback to the context. -- GitLab From 1022de8a575718f8421549572684f879ffa33bce Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 27 Aug 2018 09:21:39 -0700 Subject: [PATCH 153/598] This CL makes it so several of the assertAll... tests in test_util_test.py run in both graph and eager mode. 
PiperOrigin-RevId: 210377869 --- tensorflow/python/framework/test_util_test.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tensorflow/python/framework/test_util_test.py b/tensorflow/python/framework/test_util_test.py index f68c0ddecb..a0939f98b2 100644 --- a/tensorflow/python/framework/test_util_test.py +++ b/tensorflow/python/framework/test_util_test.py @@ -121,6 +121,7 @@ class TestUtilTest(test_util.TensorFlowTestCase): else: print("MKL is disabled") + @test_util.run_in_graph_and_eager_modes def testAssertProtoEqualsStr(self): graph_str = "node { name: 'w1' op: 'params' }" @@ -133,6 +134,7 @@ class TestUtilTest(test_util.TensorFlowTestCase): # test original comparison self.assertProtoEquals(graph_def, graph_def) + @test_util.run_in_graph_and_eager_modes def testAssertProtoEqualsAny(self): # Test assertProtoEquals with a protobuf.Any field. meta_graph_def_str = """ @@ -161,6 +163,7 @@ class TestUtilTest(test_util.TensorFlowTestCase): r'meta_graph_version: "inner"'): self.assertProtoEquals("", meta_graph_def_outer) + @test_util.run_in_graph_and_eager_modes def testNDArrayNear(self): a1 = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) a2 = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) @@ -168,6 +171,7 @@ class TestUtilTest(test_util.TensorFlowTestCase): self.assertTrue(self._NDArrayNear(a1, a2, 1e-5)) self.assertFalse(self._NDArrayNear(a1, a3, 1e-5)) + @test_util.run_in_graph_and_eager_modes def testCheckedThreadSucceeds(self): def noop(ev): @@ -181,6 +185,7 @@ class TestUtilTest(test_util.TensorFlowTestCase): t.join() self.assertTrue(event_arg.is_set()) + @test_util.run_in_graph_and_eager_modes def testCheckedThreadFails(self): def err_func(): @@ -192,6 +197,7 @@ class TestUtilTest(test_util.TensorFlowTestCase): t.join() self.assertTrue("integer division or modulo by zero" in str(fe.exception)) + @test_util.run_in_graph_and_eager_modes def testCheckedThreadWithWrongAssertionFails(self): x = 37 @@ -204,6 +210,7 @@ class 
TestUtilTest(test_util.TensorFlowTestCase): t.join() self.assertTrue("False is not true" in str(fe.exception)) + @test_util.run_in_graph_and_eager_modes def testMultipleThreadsWithOneFailure(self): def err_func(i): @@ -232,6 +239,7 @@ class TestUtilTest(test_util.TensorFlowTestCase): original_op=op_orig) raise errors.UnauthenticatedError(node_def, op, "true_err") + @test_util.run_in_graph_and_eager_modes def testAssertRaisesOpErrorDoesNotPassMessageDueToLeakedStack(self): with self.assertRaises(AssertionError): self._WeMustGoDeeper("this_is_not_the_error_you_are_looking_for") @@ -240,6 +248,7 @@ class TestUtilTest(test_util.TensorFlowTestCase): self._WeMustGoDeeper("name") self._WeMustGoDeeper("orig") + @test_util.run_in_graph_and_eager_modes def testAllCloseTensors(self): a_raw_data = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] a = constant_op.constant(a_raw_data) @@ -255,17 +264,20 @@ class TestUtilTest(test_util.TensorFlowTestCase): y_list = [a_raw_data, b] self.assertAllClose(x_list, y_list) + @test_util.run_in_graph_and_eager_modes def testAllCloseScalars(self): self.assertAllClose(7, 7 + 1e-8) with self.assertRaisesRegexp(AssertionError, r"Not equal to tolerance"): self.assertAllClose(7, 7 + 1e-5) + @test_util.run_in_graph_and_eager_modes def testAllCloseDictToNonDict(self): with self.assertRaisesRegexp(ValueError, r"Can't compare dict to non-dict"): self.assertAllClose(1, {"a": 1}) with self.assertRaisesRegexp(ValueError, r"Can't compare dict to non-dict"): self.assertAllClose({"a": 1}, 1) + @test_util.run_in_graph_and_eager_modes def testAllCloseNamedtuples(self): a = 7 b = (2., 3.) @@ -278,6 +290,7 @@ class TestUtilTest(test_util.TensorFlowTestCase): self.assertAllClose( my_named_tuple(a=a, b=b, c=c), my_named_tuple(a=a, b=b, c=c)) + @test_util.run_in_graph_and_eager_modes def testAllCloseDicts(self): a = 7 b = (2., 3.) 
@@ -305,6 +318,7 @@ class TestUtilTest(test_util.TensorFlowTestCase): with self.assertRaisesRegexp(AssertionError, r"Not equal to tolerance"): self.assertAllClose(expected, {"a": a, "b": b, "c": c_copy}) + @test_util.run_in_graph_and_eager_modes def testAllCloseListOfNamedtuples(self): my_named_tuple = collections.namedtuple("MyNamedTuple", ["x", "y"]) l1 = [ @@ -317,6 +331,7 @@ class TestUtilTest(test_util.TensorFlowTestCase): ] self.assertAllClose(l1, l2) + @test_util.run_in_graph_and_eager_modes def testAllCloseNestedStructure(self): a = {"x": np.ones((3, 2, 4)) * 7, "y": (2, [{"nested": {"m": 3, "n": 4}}])} self.assertAllClose(a, a) @@ -330,6 +345,7 @@ class TestUtilTest(test_util.TensorFlowTestCase): r"\[y\]\[1\]\[0\]\[nested\]\[n\]"): self.assertAllClose(a, b) + @test_util.run_in_graph_and_eager_modes def testArrayNear(self): a = [1, 2] b = [1, 2, 5] @@ -352,6 +368,7 @@ class TestUtilTest(test_util.TensorFlowTestCase): y = [15] control_flow_ops.Assert(x, y).run() + @test_util.run_in_graph_and_eager_modes def testAssertAllCloseAccordingToType(self): # test plain int self.assertAllCloseAccordingToType(1, 1, rtol=1e-8, atol=1e-8) @@ -428,6 +445,7 @@ class TestUtilTest(test_util.TensorFlowTestCase): half_rtol=1e-4, half_atol=1e-4 ) + @test_util.run_in_graph_and_eager_modes def testAssertAllEqual(self): i = variables.Variable([100] * 3, dtype=dtypes.int32, name="i") j = constant_op.constant([20] * 3, dtype=dtypes.int32, name="j") @@ -437,6 +455,7 @@ class TestUtilTest(test_util.TensorFlowTestCase): self.assertAllEqual([120] * 3, k) self.assertAllEqual([20] * 3, j) + @test_util.run_in_graph_and_eager_modes def testAssertNotAllClose(self): # Test with arrays self.assertNotAllClose([0.1], [0.2]) @@ -453,6 +472,7 @@ class TestUtilTest(test_util.TensorFlowTestCase): with self.assertRaises(AssertionError): self.assertNotAllClose([1.0, 1.0], x) + @test_util.run_in_graph_and_eager_modes def testAssertNotAllCloseRTol(self): # Test with arrays with 
self.assertRaises(AssertionError): @@ -467,6 +487,7 @@ class TestUtilTest(test_util.TensorFlowTestCase): with self.assertRaises(AssertionError): self.assertNotAllClose([0.9, 1.0], x, rtol=0.2) + @test_util.run_in_graph_and_eager_modes def testAssertNotAllCloseATol(self): # Test with arrays with self.assertRaises(AssertionError): @@ -481,6 +502,7 @@ class TestUtilTest(test_util.TensorFlowTestCase): with self.assertRaises(AssertionError): self.assertNotAllClose([0.9, 1.0], x, atol=0.2) + @test_util.run_in_graph_and_eager_modes def testAssertAllGreaterLess(self): x = constant_op.constant([100.0, 110.0, 120.0], dtype=dtypes.float32) y = constant_op.constant([10.0] * 3, dtype=dtypes.float32) @@ -501,6 +523,7 @@ class TestUtilTest(test_util.TensorFlowTestCase): with self.assertRaises(AssertionError): self.assertAllLess(x, 95.0) + @test_util.run_in_graph_and_eager_modes def testAssertAllGreaterLessEqual(self): x = constant_op.constant([100.0, 110.0, 120.0], dtype=dtypes.float32) y = constant_op.constant([10.0] * 3, dtype=dtypes.float32) @@ -533,6 +556,7 @@ class TestUtilTest(test_util.TensorFlowTestCase): with self.assertRaises(AssertionError): self.assertAllInRange(b, 0, 1) + @test_util.run_in_graph_and_eager_modes def testAssertAllInRange(self): x = constant_op.constant([10.0, 15.0], name="x") self.assertAllInRange(x, 10, 15) @@ -545,24 +569,28 @@ class TestUtilTest(test_util.TensorFlowTestCase): self.assertAllInRange( x, 10, 15, open_lower_bound=True, open_upper_bound=True) + @test_util.run_in_graph_and_eager_modes def testAssertAllInRangeErrorMessageEllipses(self): x_init = np.array([[10.0, 15.0]] * 12) x = constant_op.constant(x_init, name="x") with self.assertRaises(AssertionError): self.assertAllInRange(x, 5, 10) + @test_util.run_in_graph_and_eager_modes def testAssertAllInRangeDetectsNaNs(self): x = constant_op.constant( [[np.nan, 0.0], [np.nan, np.inf], [np.inf, np.nan]], name="x") with self.assertRaises(AssertionError): self.assertAllInRange(x, 0.0, 2.0) + 
@test_util.run_in_graph_and_eager_modes def testAssertAllInRangeWithInfinities(self): x = constant_op.constant([10.0, np.inf], name="x") self.assertAllInRange(x, 10, np.inf) with self.assertRaises(AssertionError): self.assertAllInRange(x, 10, np.inf, open_upper_bound=True) + @test_util.run_in_graph_and_eager_modes def testAssertAllInSet(self): b = constant_op.constant([True, False], name="b") x = constant_op.constant([13, 37], name="x") -- GitLab From 176781dca8a310095d00658679a9e4b132bc92a7 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 27 Aug 2018 09:34:14 -0700 Subject: [PATCH 154/598] [TF:XLA] Test zero element slice and update documentation. Documentation previously disallowed slices where start and limit indices were the same, but it was allowed by the implementation. Updated the documentation to support the implementation. PiperOrigin-RevId: 210379434 --- tensorflow/compiler/tests/slice_ops_test.py | 13 +++++++++++++ .../docs_src/performance/xla/operation_semantics.md | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/tests/slice_ops_test.py b/tensorflow/compiler/tests/slice_ops_test.py index 8f10c2fe86..2c611a959e 100644 --- a/tensorflow/compiler/tests/slice_ops_test.py +++ b/tensorflow/compiler/tests/slice_ops_test.py @@ -40,6 +40,19 @@ class SliceTest(xla_test.XLATestCase): self.assertAllEqual([2, 3, 4, 5], result) + def testZeroSlice(self): + for dtype in self.numeric_types: + with self.cached_session(): + i = array_ops.placeholder(dtype, shape=[2]) + with self.test_scope(): + o = array_ops.slice(i, [0], [0]) + params = { + i: [0, 1], + } + result = o.eval(feed_dict=params) + + self.assertAllEqual([], result) + def test3D(self): for dtype in self.numeric_types: with self.cached_session(): diff --git a/tensorflow/docs_src/performance/xla/operation_semantics.md b/tensorflow/docs_src/performance/xla/operation_semantics.md index c23a7ad9e2..96d269bec4 100644 --- 
a/tensorflow/docs_src/performance/xla/operation_semantics.md +++ b/tensorflow/docs_src/performance/xla/operation_semantics.md @@ -2266,7 +2266,7 @@ arguments to the slice operation. | `limit_indices` | `ArraySlice` | List of N integers containing the | : : : ending indices (exclusive) for the : : : : slice for each dimension. Each value : -: : : must be strictly greater than the : +: : : must be greater than or equal to the : : : : respective `start_indices` value for : : : : the dimension and less than or equal : : : : to the size of the dimension. : -- GitLab From dfa007e4562fb85fd5320a0c7ca8a00e50e8b34d Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 27 Aug 2018 10:36:53 -0700 Subject: [PATCH 155/598] Support scalar tensors in collective ops. PiperOrigin-RevId: 210390324 --- tensorflow/core/common_runtime/base_collective_executor.cc | 2 +- tensorflow/python/ops/collective_ops_test.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/common_runtime/base_collective_executor.cc b/tensorflow/core/common_runtime/base_collective_executor.cc index 637b43c844..425a628a49 100644 --- a/tensorflow/core/common_runtime/base_collective_executor.cc +++ b/tensorflow/core/common_runtime/base_collective_executor.cc @@ -83,7 +83,7 @@ class CollectiveAdapterImpl : public CollectiveAdapter { // If necessary, flatten output. 
void Flatten() { - if (old_shape_.dims() > 1) { + if (old_shape_.dims() != 1) { TensorShape new_shape = TensorShape({old_shape_.num_elements()}); DMAHelper::UnsafeSetShape(&output_, new_shape); } diff --git a/tensorflow/python/ops/collective_ops_test.py b/tensorflow/python/ops/collective_ops_test.py index 9cc64ef9f6..6f3cd74406 100644 --- a/tensorflow/python/ops/collective_ops_test.py +++ b/tensorflow/python/ops/collective_ops_test.py @@ -53,6 +53,9 @@ class CollectiveOpTest(test.TestCase): [0.3, 1.3, 2.3, 3.3, 4.3, 5.3, 6.3, 7.3], [0.2, 1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2]) + def testCollectiveReduceScalar(self): + self._testCollectiveReduce(0.1, 0.3, 0.2) + def _testCollectiveBroadcast(self, t0): group_key = 1 instance_key = 1 -- GitLab From 8b05705ce58d264d09330531eea0c0701cc07ae2 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 27 Aug 2018 11:01:09 -0700 Subject: [PATCH 156/598] Replaced calls to tensorflow::StringPiece::ToString with std::string conversions. That is, instances of sp.ToString() are replaced with string(sp). This will allow tensorflow::StringPiece::ToString to be removed, which is necessary before it can be replaced with absl::string_view. 
PiperOrigin-RevId: 210394878 --- tensorflow/cc/saved_model/loader.cc | 4 ++-- .../contrib/android/asset_manager_filesystem.cc | 4 ++-- tensorflow/contrib/data/kernels/csv_dataset_op.cc | 2 +- .../core/common_runtime/eager/attr_builder.h | 4 ++-- .../core/common_runtime/graph_execution_state.cc | 4 ++-- tensorflow/core/framework/function_testlib.cc | 4 ++-- tensorflow/core/graph/tensor_id.cc | 2 +- .../core/grappler/optimizers/data/graph_utils.cc | 14 +++++++------- .../optimizers/data/map_and_filter_fusion_test.cc | 4 ++-- .../grappler/optimizers/data/map_fusion_test.cc | 2 +- .../optimizers/data/map_vectorization_test.cc | 10 +++++----- .../optimizers/scoped_allocator_optimizer.cc | 2 +- tensorflow/core/lib/io/path_test.cc | 6 +++--- .../platform/cloud/compute_engine_zone_provider.cc | 2 +- .../tools/graph_transforms/fold_constants_lib.cc | 2 +- 15 files changed, 33 insertions(+), 33 deletions(-) diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc index 222e769881..c6abe2f41b 100644 --- a/tensorflow/cc/saved_model/loader.cc +++ b/tensorflow/cc/saved_model/loader.cc @@ -148,7 +148,7 @@ Status RunMainOp(const RunOptions& run_options, const string& export_dir, AddAssetsTensorsToInputs(export_dir, asset_file_defs, &inputs); RunMetadata run_metadata; const StringPiece main_op_name = main_op_it->second.node_list().value(0); - return RunOnce(run_options, inputs, {}, {main_op_name.ToString()}, + return RunOnce(run_options, inputs, {}, {string(main_op_name)}, nullptr /* outputs */, &run_metadata, session); } return Status::OK(); @@ -187,7 +187,7 @@ Status RunRestore(const RunOptions& run_options, const string& export_dir, AddAssetsTensorsToInputs(export_dir, asset_file_defs, &inputs); RunMetadata run_metadata; - return RunOnce(run_options, inputs, {}, {restore_op_name.ToString()}, + return RunOnce(run_options, inputs, {}, {string(restore_op_name)}, nullptr /* outputs */, &run_metadata, session); } diff --git 
a/tensorflow/contrib/android/asset_manager_filesystem.cc b/tensorflow/contrib/android/asset_manager_filesystem.cc index 513d519eab..d14b2126a0 100644 --- a/tensorflow/contrib/android/asset_manager_filesystem.cc +++ b/tensorflow/contrib/android/asset_manager_filesystem.cc @@ -28,7 +28,7 @@ string RemoveSuffix(const string& name, const string& suffix) { string output(name); StringPiece piece(output); str_util::ConsumeSuffix(&piece, suffix); - return piece.ToString(); + return string(piece); } // Closes the given AAsset when variable is destructed. @@ -231,7 +231,7 @@ string AssetManagerFileSystem::NormalizeDirectoryPath(const string& fname) { string AssetManagerFileSystem::RemoveAssetPrefix(const string& name) { StringPiece piece(name); str_util::ConsumePrefix(&piece, prefix_); - return piece.ToString(); + return string(piece); } bool AssetManagerFileSystem::DirectoryExists(const std::string& fname) { diff --git a/tensorflow/contrib/data/kernels/csv_dataset_op.cc b/tensorflow/contrib/data/kernels/csv_dataset_op.cc index d242cfdf49..0ba905b92e 100644 --- a/tensorflow/contrib/data/kernels/csv_dataset_op.cc +++ b/tensorflow/contrib/data/kernels/csv_dataset_op.cc @@ -713,7 +713,7 @@ class CSVDatasetOp : public DatasetOpKernel { component.scalar()() = dataset()->record_defaults_[output_idx].flat()(0); } else { - component.scalar()() = field.ToString(); + component.scalar()() = string(field); } break; } diff --git a/tensorflow/core/common_runtime/eager/attr_builder.h b/tensorflow/core/common_runtime/eager/attr_builder.h index ccc95a35e5..cbe6a1cb50 100644 --- a/tensorflow/core/common_runtime/eager/attr_builder.h +++ b/tensorflow/core/common_runtime/eager/attr_builder.h @@ -122,12 +122,12 @@ class AttrBuilder { AttrValue attr_value; if (found == nullptr) { SetAttrValue(value, &attr_value); - m->insert(AttrValueMap::value_type(attr_name.ToString(), attr_value)); + m->insert(AttrValueMap::value_type(string(attr_name), attr_value)); } else { // TODO(ashankar): Do what is done 
in // NodeDefBuilder::CheckInconsistency(attr_name, *found, attr_value); SetAttrValue(std::forward(value), &attr_value); - (*m)[attr_name.ToString()] = attr_value; + (*m)[string(attr_name)] = attr_value; } } diff --git a/tensorflow/core/common_runtime/graph_execution_state.cc b/tensorflow/core/common_runtime/graph_execution_state.cc index c23b7d3699..346befc255 100644 --- a/tensorflow/core/common_runtime/graph_execution_state.cc +++ b/tensorflow/core/common_runtime/graph_execution_state.cc @@ -581,7 +581,7 @@ Status GraphExecutionState::OptimizeGraph( if (id.second != 0) { return errors::InvalidArgument("Unsupported feed: ", feed); } - feeds.insert(id.first.ToString()); + feeds.emplace(id.first); } for (const TensorConnection& tensor_connection : options.callable_options.tensor_connection()) { @@ -590,7 +590,7 @@ Status GraphExecutionState::OptimizeGraph( return errors::InvalidArgument("Unsupported feed: ", tensor_connection.to_tensor()); } - feeds.insert(id.first.ToString()); + feeds.emplace(id.first); } for (const NodeDef& node : original_graph_def_.node()) { if (feeds.find(node.name()) == feeds.end()) { diff --git a/tensorflow/core/framework/function_testlib.cc b/tensorflow/core/framework/function_testlib.cc index 41270b8e5e..6e38256ba8 100644 --- a/tensorflow/core/framework/function_testlib.cc +++ b/tensorflow/core/framework/function_testlib.cc @@ -49,8 +49,8 @@ NodeDef NDef(StringPiece name, StringPiece op, gtl::ArraySlice inputs, gtl::ArraySlice> attrs, const string& device) { NodeDef n; - n.set_name(name.ToString()); - n.set_op(op.ToString()); + n.set_name(string(name)); + n.set_op(string(op)); for (const auto& in : inputs) n.add_input(in); n.set_device(device); for (auto na : attrs) n.mutable_attr()->insert({na.first, na.second.proto}); diff --git a/tensorflow/core/graph/tensor_id.cc b/tensorflow/core/graph/tensor_id.cc index 80c76df255..5a5b85e727 100644 --- a/tensorflow/core/graph/tensor_id.cc +++ b/tensorflow/core/graph/tensor_id.cc @@ -25,7 +25,7 @@ 
namespace tensorflow { TensorId::TensorId(const SafeTensorId& id) : TensorId(id.first, id.second) {} SafeTensorId::SafeTensorId(const TensorId& id) - : SafeTensorId(id.first.ToString(), id.second) {} + : SafeTensorId(string(id.first), id.second) {} TensorId ParseTensorName(const string& name) { return ParseTensorName(StringPiece(name.data(), name.size())); diff --git a/tensorflow/core/grappler/optimizers/data/graph_utils.cc b/tensorflow/core/grappler/optimizers/data/graph_utils.cc index 883037173b..5a7fe19265 100644 --- a/tensorflow/core/grappler/optimizers/data/graph_utils.cc +++ b/tensorflow/core/grappler/optimizers/data/graph_utils.cc @@ -94,11 +94,11 @@ NodeDef* AddNode(StringPiece name, StringPiece op, MutableGraphView* graph) { NodeDef node; if (!name.empty()) { - node.set_name(name.ToString()); + node.set_name(string(name)); } else { SetUniqueGraphNodeName(op, graph->GetGraph(), &node); } - node.set_op(op.ToString()); + node.set_op(string(op)); for (const string& input : inputs) { node.add_input(input); } @@ -114,11 +114,11 @@ NodeDef* AddNode(StringPiece name, StringPiece op, FunctionDef* fd) { NodeDef* node = fd->add_node_def(); if (!name.empty()) { - node->set_name(name.ToString()); + node->set_name(string(name)); } else { SetUniqueFunctionNodeName(op, fd, node); } - node->set_op(op.ToString()); + node->set_op(string(op)); for (const string& input : inputs) { node->add_input(input); } @@ -270,7 +270,7 @@ NodeDef* GetInputNode(const NodeDef& node, const MutableGraphView& graph) { void SetUniqueGraphNodeName(StringPiece prefix, GraphDef* graph, NodeDef* node) { - string name = prefix.ToString(); + string name = string(prefix); int id = graph->node_size(); while (ContainsGraphNodeWithName(name, *graph)) { if (name.rfind("_generated") != std::string::npos && @@ -286,7 +286,7 @@ void SetUniqueGraphNodeName(StringPiece prefix, GraphDef* graph, void SetUniqueFunctionNodeName(StringPiece prefix, FunctionDef* function, NodeDef* node) { - string name = 
prefix.ToString(); + string name = string(prefix); int id = function->node_def_size(); while (ContainsFunctionNodeWithName(name, *function)) { name = strings::StrCat(prefix, "/_", id); @@ -297,7 +297,7 @@ void SetUniqueFunctionNodeName(StringPiece prefix, FunctionDef* function, void SetUniqueGraphFunctionName(StringPiece prefix, FunctionDefLibrary* library, FunctionDef* function) { - string name = prefix.ToString(); + string name = string(prefix); int id = library->function_size(); while (ContainsGraphFunctionWithName(name, *library)) { name = strings::StrCat(prefix, "/_", id); diff --git a/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion_test.cc b/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion_test.cc index 3b6829ade3..f029a093fa 100644 --- a/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion_test.cc +++ b/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion_test.cc @@ -30,7 +30,7 @@ namespace { NodeDef MakeMapNode(StringPiece name, StringPiece input_node_name) { return test::function::NDef( - name, "MapDataset", {input_node_name.ToString()}, + name, "MapDataset", {string(input_node_name)}, {{"f", FunctionDefHelper::FunctionRef("XTimesTwo")}, {"Targuments", {}}, {"output_shapes", {}}, @@ -39,7 +39,7 @@ NodeDef MakeMapNode(StringPiece name, StringPiece input_node_name) { NodeDef MakeFilterNode(StringPiece name, StringPiece input_node_name) { return test::function::NDef( - name, "FilterDataset", {input_node_name.ToString()}, + name, "FilterDataset", {string(input_node_name)}, {{"predicate", FunctionDefHelper::FunctionRef("IsZero")}, {"Targuments", {}}, {"output_shapes", {}}, diff --git a/tensorflow/core/grappler/optimizers/data/map_fusion_test.cc b/tensorflow/core/grappler/optimizers/data/map_fusion_test.cc index df6c19dc7c..b25dfbd0b8 100644 --- a/tensorflow/core/grappler/optimizers/data/map_fusion_test.cc +++ b/tensorflow/core/grappler/optimizers/data/map_fusion_test.cc @@ -30,7 +30,7 @@ namespace { NodeDef 
MakeMapNode(StringPiece name, StringPiece input_node_name) { return test::function::NDef( - name, "MapDataset", {input_node_name.ToString()}, + name, "MapDataset", {string(input_node_name)}, {{"f", FunctionDefHelper::FunctionRef("XTimesTwo")}, {"Targuments", {}}, {"output_shapes", {}}, diff --git a/tensorflow/core/grappler/optimizers/data/map_vectorization_test.cc b/tensorflow/core/grappler/optimizers/data/map_vectorization_test.cc index be2475bae8..ed1bd6bc97 100644 --- a/tensorflow/core/grappler/optimizers/data/map_vectorization_test.cc +++ b/tensorflow/core/grappler/optimizers/data/map_vectorization_test.cc @@ -55,8 +55,8 @@ NodeDef MakeMapNodeHelper( const gtl::ArraySlice>& output_shapes, const gtl::ArraySlice& output_types) { return test::function::NDef( - name, map_op_name, {input_node_name.ToString()}, - {{"f", FunctionDefHelper::FunctionRef(function_name.ToString())}, + name, map_op_name, {string(input_node_name)}, + {{"f", FunctionDefHelper::FunctionRef(string(function_name))}, {"Targuments", {}}, {"output_shapes", MakeShapeListAttr(output_shapes)}, {"output_types", output_types}}); @@ -76,7 +76,7 @@ NodeDef MakeBatchNode( const gtl::ArraySlice>& output_shapes, const gtl::ArraySlice& output_types) { return NDef(name, "BatchDataset", - {input_node_name.ToString(), input_batch_size_name.ToString()}, + {string(input_node_name), string(input_batch_size_name)}, {{"output_types", output_types}, {"output_shapes", MakeShapeListAttr(output_shapes)}}); } @@ -87,8 +87,8 @@ NodeDef MakeBatchV2Node( const gtl::ArraySlice>& output_shapes, const gtl::ArraySlice& output_types) { return NDef(name, "BatchDatasetV2", - {input_node_name.ToString(), input_batch_size_name.ToString(), - input_drop_remainder_name.ToString()}, + {string(input_node_name), string(input_batch_size_name), + string(input_drop_remainder_name)}, {{"output_types", output_types}, {"output_shapes", MakeShapeListAttr(output_shapes)}}); } diff --git 
a/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.cc b/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.cc index 275568e464..0d4aaf6462 100644 --- a/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.cc @@ -203,7 +203,7 @@ void ScopedAllocatorOptimizer::ExtendNodeAttr(StringPiece name, NodeDef* node_def) { if (HasNodeAttr(*node_def, name)) { VLOG(2) << "extending"; - AttrValue* existing = &(*node_def->mutable_attr())[name.ToString()]; + AttrValue* existing = &(*node_def->mutable_attr())[string(name)]; for (int32 i : values) { existing->mutable_list()->add_i(i); } diff --git a/tensorflow/core/lib/io/path_test.cc b/tensorflow/core/lib/io/path_test.cc index e3275b93b6..0090b9100c 100644 --- a/tensorflow/core/lib/io/path_test.cc +++ b/tensorflow/core/lib/io/path_test.cc @@ -104,9 +104,9 @@ TEST(PathTest, CleanPath) { StringPiece u(uri); \ StringPiece s, h, p; \ ParseURI(u, &s, &h, &p); \ - EXPECT_EQ(scheme, s.ToString()); \ - EXPECT_EQ(host, h.ToString()); \ - EXPECT_EQ(path, p.ToString()); \ + EXPECT_EQ(scheme, s); \ + EXPECT_EQ(host, h); \ + EXPECT_EQ(path, p); \ EXPECT_EQ(uri, CreateURI(scheme, host, path)); \ EXPECT_LE(u.begin(), s.begin()); \ EXPECT_GE(u.end(), s.begin()); \ diff --git a/tensorflow/core/platform/cloud/compute_engine_zone_provider.cc b/tensorflow/core/platform/cloud/compute_engine_zone_provider.cc index dacf56187c..e147d88371 100644 --- a/tensorflow/core/platform/cloud/compute_engine_zone_provider.cc +++ b/tensorflow/core/platform/cloud/compute_engine_zone_provider.cc @@ -43,7 +43,7 @@ Status ComputeEngineZoneProvider::GetZone(string* zone) { *zone = cached_zone; } else { LOG(ERROR) << "Failed to parse the zone name from location: " - << location.ToString(); + << string(location); } return Status::OK(); diff --git a/tensorflow/tools/graph_transforms/fold_constants_lib.cc b/tensorflow/tools/graph_transforms/fold_constants_lib.cc index 
f858411876..6df2718e61 100644 --- a/tensorflow/tools/graph_transforms/fold_constants_lib.cc +++ b/tensorflow/tools/graph_transforms/fold_constants_lib.cc @@ -121,7 +121,7 @@ Status RewriteInputsAsPlaceholders(const TransformFuncContext& context, GraphDef* graph_def) { std::unordered_set input_names; for (const string& input_name : context.input_names) { - input_names.insert(ParseTensorName(input_name).first.ToString()); + input_names.emplace(ParseTensorName(input_name).first); } for (NodeDef& node : *graph_def->mutable_node()) { -- GitLab From 476f65230982842fdd7fabe2ed8d80ee719c20dc Mon Sep 17 00:00:00 2001 From: "William D. Irons" Date: Mon, 27 Aug 2018 13:29:52 -0500 Subject: [PATCH 157/598] Disable GPU test for scatter_add_ndim_op_test As scatter_add_ndim doesn't have implementation for GPU, the test needs to be excluded from GPU test to prevent it from failing. Currently fails on both x86_64 and ppc64le. Fixes #21833 --- tensorflow/contrib/tensor_forest/BUILD | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tensorflow/contrib/tensor_forest/BUILD b/tensorflow/contrib/tensor_forest/BUILD index cf55fec488..4008699dda 100644 --- a/tensorflow/contrib/tensor_forest/BUILD +++ b/tensorflow/contrib/tensor_forest/BUILD @@ -462,7 +462,10 @@ py_test( size = "small", srcs = ["python/kernel_tests/scatter_add_ndim_op_test.py"], srcs_version = "PY2AND3", - tags = ["no_pip_gpu"], + tags = [ + "no_pip_gpu", + "no_gpu", + ], deps = [ ":tensor_forest_ops_py", "//tensorflow/python:framework_test_lib", -- GitLab From 5b08d751575c577f4ec5f241e5d8b3a6e356c07a Mon Sep 17 00:00:00 2001 From: Dustin Tran Date: Mon, 27 Aug 2018 11:33:46 -0700 Subject: [PATCH 158/598] Minor: Apply tfe.defun as decorator on self.call example. 
PiperOrigin-RevId: 210401396 --- tensorflow/python/eager/function.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py index 9dc5648861..9b50f54eb8 100644 --- a/tensorflow/python/eager/function.py +++ b/tensorflow/python/eager/function.py @@ -1213,6 +1213,7 @@ def defun(func=None, input_signature=None): self.dense2 = tf.keras.layers.Dense(5, activation=tf.nn.softmax) self.keep_probability = keep_probability + @tf.contrib.eager.defun def call(self, inputs, training=True): x = self.dense2(self.dense1(inputs)) if training: @@ -1221,7 +1222,6 @@ def defun(func=None, input_signature=None): return x model = MyModel() - model.call = tf.contrib.eager.defun(model.call) model(x, training=True) # executes a graph, with dropout model(x, training=False) # executes a graph, without dropout -- GitLab From 8199873c4460baec443bade20b121c1516081fc6 Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Mon, 27 Aug 2018 11:34:33 -0700 Subject: [PATCH 159/598] These files have moved to the docs repo. 
https://github.com/tensorflow/docs/tree/master/site/en/tutorials/eager PiperOrigin-RevId: 210401536 --- .../eager/python/examples/notebooks/README.md | 14 +- .../notebooks/automatic_differentiation.ipynb | 298 +---------- .../examples/notebooks/custom_layers.ipynb | 389 ++------------ .../examples/notebooks/custom_training.ipynb | 467 ++--------------- .../examples/notebooks/eager_basics.ipynb | 485 ++---------------- 5 files changed, 127 insertions(+), 1526 deletions(-) diff --git a/tensorflow/contrib/eager/python/examples/notebooks/README.md b/tensorflow/contrib/eager/python/examples/notebooks/README.md index 0d5ed84894..2778b228e9 100644 --- a/tensorflow/contrib/eager/python/examples/notebooks/README.md +++ b/tensorflow/contrib/eager/python/examples/notebooks/README.md @@ -1,11 +1,3 @@ -## Research and experimentation - -Eager execution provides an imperative, define-by-run interface for advanced -operations. Write custom layers, forward passes, and training loops with auto -differentiation. Start with these notebooks, then read the -[eager execution guide](https://www.tensorflow.org/guide/eager). - -1. [Eager execution basics](./eager_basics.ipynb) -2. [Automatic differentiation and gradient tapes](./automatic_differentiation.ipynb) -3. [Custom training: basics](./custom_training.ipynb) -4. [Custom layers](./custom_layers.ipynb) +The notebooks have been moved to the +[tensorflow/docs](https://github.com/tensorflow/docs/tree/master/site/en/tutorials/eager) +repository. 
diff --git a/tensorflow/contrib/eager/python/examples/notebooks/automatic_differentiation.ipynb b/tensorflow/contrib/eager/python/examples/notebooks/automatic_differentiation.ipynb index 51b7ffc4de..8fae622e12 100644 --- a/tensorflow/contrib/eager/python/examples/notebooks/automatic_differentiation.ipynb +++ b/tensorflow/contrib/eager/python/examples/notebooks/automatic_differentiation.ipynb @@ -15,12 +15,7 @@ "execution_count": 0, "metadata": { "cellView": "form", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, + "colab": {}, "colab_type": "code", "id": "GCCk8_dHpuNf" }, @@ -53,308 +48,35 @@ "cell_type": "markdown", "metadata": { "colab_type": "text", - "id": "idv0bPeCp325" - }, - "source": [ - "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\u003ctd\u003e\n", - "\u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/notebooks/automatic_differentiation.ipynb\"\u003e\n", - " \u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n", - "\u003c/td\u003e\u003ctd\u003e\n", - "\u003ca target=\"_blank\" href=\"https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/notebooks/automatic_differentiation.ipynb\"\u003e\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\u003c/table\u003e" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "vDJ4XzMqodTy" - }, - "source": [ - "In the previous tutorial we introduced `Tensor`s and operations on them. In this tutorial we will cover [automatic differentiation](https://en.wikipedia.org/wiki/Automatic_differentiation), a key technique for optimizing machine learning models." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "GQJysDM__Qb0" - }, - "source": [ - "## Setup\n" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, - "colab_type": "code", - "id": "OiMPZStlibBv" - }, - "outputs": [], - "source": [ - "import tensorflow as tf\n", - "tf.enable_eager_execution()\n", - "\n", - "tfe = tf.contrib.eager # Shorthand for some symbols" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "1CLWJl0QliB0" - }, - "source": [ - "## Derivatives of a function\n", - "\n", - "TensorFlow provides APIs for automatic differentiation - computing the derivative of a function. The way that more closely mimics the math is to encapsulate the computation in a Python function, say `f`, and use `tfe.gradients_function` to create a function that computes the derivatives of `f` with respect to its arguments. If you're familiar with [autograd](https://github.com/HIPS/autograd) for differentiating numpy functions, this will be familiar. For example: " - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, - "colab_type": "code", - "id": "9FViq92UX7P8" - }, - "outputs": [], - "source": [ - "from math import pi\n", - "\n", - "def f(x):\n", - " return tf.square(tf.sin(x))\n", - "\n", - "assert f(pi/2).numpy() == 1.0\n", - "\n", - "\n", - "# grad_f will return a list of derivatives of f\n", - "# with respect to its arguments. 
Since f() has a single argument,\n", - "# grad_f will return a list with a single element.\n", - "grad_f = tfe.gradients_function(f)\n", - "assert tf.abs(grad_f(pi/2)[0]).numpy() \u003c 1e-7" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "v9fPs8RyopCf" - }, - "source": [ - "### Higher-order gradients\n", - "\n", - "The same API can be used to differentiate as many times as you like:\n" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, - "colab_type": "code", - "id": "3D0ZvnGYo0rW" - }, - "outputs": [], - "source": [ - "def f(x):\n", - " return tf.square(tf.sin(x))\n", - "\n", - "def grad(f):\n", - " return lambda x: tfe.gradients_function(f)(x)[0]\n", - "\n", - "x = tf.lin_space(-2*pi, 2*pi, 100) # 100 points between -2π and +2π\n", - "\n", - "import matplotlib.pyplot as plt\n", - "\n", - "plt.plot(x, f(x), label=\"f\")\n", - "plt.plot(x, grad(f)(x), label=\"first derivative\")\n", - "plt.plot(x, grad(grad(f))(x), label=\"second derivative\")\n", - "plt.plot(x, grad(grad(grad(f)))(x), label=\"third derivative\")\n", - "plt.legend()\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "-39gouo7mtgu" - }, - "source": [ - "## Gradient tapes\n", - "\n", - "Every differentiable TensorFlow operation has an associated gradient function. For example, the gradient function of `tf.square(x)` would be a function that returns `2.0 * x`. To compute the gradient of a user-defined function (like `f(x)` in the example above), TensorFlow first \"records\" all the operations applied to compute the output of the function. We call this record a \"tape\". 
It then uses that tape and the gradients functions associated with each primitive operation to compute the gradients of the user-defined function using [reverse mode differentiation](https://en.wikipedia.org/wiki/Automatic_differentiation).\n", - "\n", - "Since operations are recorded as they are executed, Python control flow (using `if`s and `while`s for example) is naturally handled:\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, - "colab_type": "code", - "id": "MH0UfjympWf7" - }, - "outputs": [], - "source": [ - "def f(x, y):\n", - " output = 1\n", - " # Must use range(int(y)) instead of range(y) in Python 3 when\n", - " # using TensorFlow 1.10 and earlier. Can use range(y) in 1.11+\n", - " for i in range(int(y)):\n", - " output = tf.multiply(output, x)\n", - " return output\n", - "\n", - "def g(x, y):\n", - " # Return the gradient of `f` with respect to it's first parameter\n", - " return tfe.gradients_function(f)(x, y)[0]\n", - "\n", - "assert f(3.0, 2).numpy() == 9.0 # f(x, 2) is essentially x * x\n", - "assert g(3.0, 2).numpy() == 6.0 # And its gradient will be 2 * x\n", - "assert f(4.0, 3).numpy() == 64.0 # f(x, 3) is essentially x * x * x\n", - "assert g(4.0, 3).numpy() == 48.0 # And its gradient will be 3 * x * x" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "aNmR5-jhpX2t" - }, - "source": [ - "At times it may be inconvenient to encapsulate computation of interest into a function. For example, if you want the gradient of the output with respect to intermediate values computed in the function. In such cases, the slightly more verbose but explicit [tf.GradientTape](https://www.tensorflow.org/api_docs/python/tf/GradientTape) context is useful. 
All computation inside the context of a `tf.GradientTape` is \"recorded\".\n", - "\n", - "For example:" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, - "colab_type": "code", - "id": "bAFeIE8EuVIq" + "id": "clNGnJ3u8Rl6" }, - "outputs": [], "source": [ - "x = tf.ones((2, 2))\n", - " \n", - "# TODO(b/78880779): Remove the 'persistent=True' argument and use\n", - "# a single t.gradient() call when the bug is resolved.\n", - "with tf.GradientTape(persistent=True) as t:\n", - " # TODO(ashankar): Explain with \"watch\" argument better?\n", - " t.watch(x)\n", - " y = tf.reduce_sum(x)\n", - " z = tf.multiply(y, y)\n", - "\n", - "# Use the same tape to compute the derivative of z with respect to the\n", - "# intermediate value y.\n", - "dz_dy = t.gradient(z, y)\n", - "assert dz_dy.numpy() == 8.0\n", - "\n", - "# Derivative of z with respect to the original input tensor x\n", - "dz_dx = t.gradient(z, x)\n", - "for i in [0, 1]:\n", - " for j in [0, 1]:\n", - " assert dz_dx[i][j].numpy() == 8.0" + "This file has moved." ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", - "id": "DK05KXrAAld3" - }, - "source": [ - "### Higher-order gradients\n", - "\n", - "Operations inside of the `GradientTape` context manager are recorded for automatic differentiation. If gradients are computed in that context, then the gradient computation is recorded as well. As a result, the exact same API works for higher-order gradients as well. For example:" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, - "colab_type": "code", - "id": "cPQgthZ7ugRJ" - }, - "outputs": [], - "source": [ - "# TODO(ashankar): Should we use the persistent tape here instead? 
Follow up on Tom and Alex's discussion\n", - "\n", - "x = tf.constant(1.0) # Convert the Python 1.0 to a Tensor object\n", - "\n", - "with tf.GradientTape() as t:\n", - " with tf.GradientTape() as t2:\n", - " t2.watch(x)\n", - " y = x * x * x\n", - " # Compute the gradient inside the 't' context manager\n", - " # which means the gradient computation is differentiable as well.\n", - " dy_dx = t2.gradient(y, x)\n", - "d2y_dx2 = t.gradient(dy_dx, x)\n", - "\n", - "assert dy_dx.numpy() == 3.0\n", - "assert d2y_dx2.numpy() == 6.0" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "4U1KKzUpNl58" + "id": "idv0bPeCp325" }, "source": [ - "## Next Steps\n", - "\n", - "In this tutorial we covered gradient computation in TensorFlow. With that we have enough of the primitives required to build an train neural networks, which we will cover in the [next tutorial](https://github.com/tensorflow/models/tree/master/official/contrib/eager/python/examples/notebooks/3_neural_networks.ipynb)." 
+ "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\u003ctd\u003e\n", + "\u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/tutorials/eager/automatic_differentiation.ipynb\"\u003e\n", + " \u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n", + "\u003c/td\u003e\u003ctd\u003e\n", + "\u003ca target=\"_blank\" href=\"https://github.com/tensorflow/docs/blob/master/site/en/tutorials/eager/automatic_differentiation.ipynb\"\u003e\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\u003c/table\u003e" ] } ], "metadata": { "colab": { "collapsed_sections": [], - "default_view": {}, "name": "automatic_differentiation.ipynb", "private_outputs": true, "provenance": [], "toc_visible": true, - "version": "0.3.2", - "views": {} + "version": "0.3.2" }, "kernelspec": { "display_name": "Python 3", diff --git a/tensorflow/contrib/eager/python/examples/notebooks/custom_layers.ipynb b/tensorflow/contrib/eager/python/examples/notebooks/custom_layers.ipynb index a0bbbb6123..d89774c45e 100644 --- a/tensorflow/contrib/eager/python/examples/notebooks/custom_layers.ipynb +++ b/tensorflow/contrib/eager/python/examples/notebooks/custom_layers.ipynb @@ -1,46 +1,25 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "custom_layers.ipynb", - "version": "0.3.2", - "views": {}, - "default_view": {}, - "provenance": [], - "private_outputs": true, - "collapsed_sections": [], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, "cells": [ { + "cell_type": "markdown", "metadata": { - "id": "tDnwEv8FtJm7", - "colab_type": "text" + "colab_type": "text", + "id": "tDnwEv8FtJm7" }, - "cell_type": "markdown", "source": [ "##### Copyright 2018 The TensorFlow Authors."
] }, { + "cell_type": "code", + "execution_count": 0, "metadata": { - "id": "JlknJBWQtKkI", + "cellView": "form", + "colab": {}, "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, - "cellView": "form" + "id": "JlknJBWQtKkI" }, - "cell_type": "code", + "outputs": [], "source": [ "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", "# you may not use this file except in compliance with the License.\n", @@ -53,347 +32,57 @@ "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", "# See the License for the specific language governing permissions and\n", "# limitations under the License." - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "id": "60RdWsg1tETW", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "# Custom layers" - ] - }, - { - "metadata": { - "id": "BcJg7Enms86w", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "
\n", - "\n", - " Run in Google Colab\n", - "\n", - "View source on GitHub
" - ] - }, - { - "metadata": { - "id": "UEu3q4jmpKVT", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "We recommend using `tf.keras` as a high-level API for building neural networks. That said, most TensorFlow APIs are usable with eager execution.\n" ] }, { - "metadata": { - "id": "pwX7Fii1rwsJ", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "import tensorflow as tf\n", - "tfe = tf.contrib.eager\n", - "\n", - "tf.enable_eager_execution()" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "id": "zSFfVVjkrrsI", - "colab_type": "text" - }, "cell_type": "markdown", - "source": [ - "## Layers: common sets of useful operations\n", - "\n", - "Most of the time when writing code for machine learning models you want to operate at a higher level of abstraction than individual operations and manipulation of individual variables.\n", - "\n", - "Many machine learning models are expressible as the composition and stacking of relatively simple layers, and TensorFlow provides both a set of many common layers as a well as easy ways for you to write your own application-specific layers either from scratch or as the composition of existing layers.\n", - "\n", - "TensorFlow includes the full [Keras](https://keras.io) API in the tf.keras package, and the Keras layers are very useful when building your own models.\n" - ] - }, - { "metadata": { - "id": "8PyXlPl-4TzQ", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } + "colab_type": "text", + "id": "60RdWsg1tETW" }, - "cell_type": "code", - "source": [ - "# In the tf.keras.layers package, layers are objects. To construct a layer,\n", - "# simply construct the object. 
Most layers take as a first argument the number\n", - "# of output dimensions / channels.\n", - "layer = tf.keras.layers.Dense(100)\n", - "# The number of input dimensions is often unnecessary, as it can be inferred\n", - "# the first time the layer is used, but it can be provided if you want to \n", - "# specify it manually, which is useful in some complex models.\n", - "layer = tf.keras.layers.Dense(10, input_shape=(None, 5))" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "id": "Fn69xxPO5Psr", - "colab_type": "text" - }, - "cell_type": "markdown", "source": [ - "The full list of pre-existing layers can be seen in [the documentation](https://www.tensorflow.org/api_docs/python/tf/keras/layers). It includes Dense (a fully-connected layer),\n", - "Conv2D, LSTM, BatchNormalization, Dropout, and many others." + "# Custom layers" ] }, { - "metadata": { - "id": "E3XKNknP5Mhb", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "# To use a layer, simply call it.\n", - "layer(tf.zeros([10, 5]))" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "id": "Wt_Nsv-L5t2s", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "# Layers have many useful methods. For example, you can inspect all variables\n", - "# in a layer by calling layer.variables. 
In this case a fully-connected layer\n", - "# will have variables for weights and biases.\n", - "layer.variables" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "id": "6ilvKjz8_4MQ", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "# The variables are also accessible through nice accessors\n", - "layer.kernel, layer.bias" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "id": "O0kDbE54-5VS", - "colab_type": "text" - }, "cell_type": "markdown", - "source": [ - "## Implementing custom layers\n", - "The best way to implement your own layer is extending the tf.keras.Layer class and implementing:\n", - " * `__init__` , where you can do all input-independent initialization\n", - " * `build`, where you know the shapes of the input tensors and can do the rest of the initialization\n", - " * `call`, where you do the forward computation\n", - "\n", - "Note that you don't have to wait until `build` is called to create your variables, you can also create them in `__init__`. However, the advantage of creating them in `build` is that it enables late variable creation based on the shape of the inputs the layer will operate on. On the other hand, creating variables in `__init__` would mean that shapes required to create the variables will need to be explicitly specified." 
- ] - }, - { - "metadata": { - "id": "5Byl3n1k5kIy", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "class MyDenseLayer(tf.keras.layers.Layer):\n", - " def __init__(self, num_outputs):\n", - " super(MyDenseLayer, self).__init__()\n", - " self.num_outputs = num_outputs\n", - " \n", - " def build(self, input_shape):\n", - " self.kernel = self.add_variable(\"kernel\", \n", - " shape=[input_shape[-1].value, \n", - " self.num_outputs])\n", - " \n", - " def call(self, input):\n", - " return tf.matmul(input, self.kernel)\n", - " \n", - "layer = MyDenseLayer(10)\n", - "print(layer(tf.zeros([10, 5])))\n", - "print(layer.variables)" - ], - "execution_count": 0, - "outputs": [] - }, - { "metadata": { - "id": "tk8E2vY0-z4Z", - "colab_type": "text" + "colab_type": "text", + "id": "9sFn_RV_8zM-" }, - "cell_type": "markdown", "source": [ - "Note that you don't have to wait until `build` is called to create your variables, you can also create them in `__init__`.\n", - "\n", - "Overall code is easier to read and maintain if it uses standard layers whenever possible, as other readers will be familiar with the behavior of standard layers. If you want to use a layer which is not present in tf.keras.layers or tf.contrib.layers, consider filing a [github issue](http://github.com/tensorflow/tensorflow/issues/new) or, even better, sending us a pull request!" + "This file has moved." ] }, { - "metadata": { - "id": "Qhg4KlbKrs3G", - "colab_type": "text" - }, "cell_type": "markdown", - "source": [ - "## Models: composing layers\n", - "\n", - "Many interesting layer-like things in machine learning models are implemented by composing existing layers. For example, each residual block in a resnet is a composition of convolutions, batch normalizations, and a shortcut.\n", - "\n", - "The main class used when creating a layer-like thing which contains other layers is tf.keras.Model. 
Implementing one is done by inheriting from tf.keras.Model." - ] - }, - { - "metadata": { - "id": "N30DTXiRASlb", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "class ResnetIdentityBlock(tf.keras.Model):\n", - " def __init__(self, kernel_size, filters):\n", - " super(ResnetIdentityBlock, self).__init__(name='')\n", - " filters1, filters2, filters3 = filters\n", - "\n", - " self.conv2a = tf.keras.layers.Conv2D(filters1, (1, 1))\n", - " self.bn2a = tf.keras.layers.BatchNormalization()\n", - "\n", - " self.conv2b = tf.keras.layers.Conv2D(filters2, kernel_size, padding='same')\n", - " self.bn2b = tf.keras.layers.BatchNormalization()\n", - "\n", - " self.conv2c = tf.keras.layers.Conv2D(filters3, (1, 1))\n", - " self.bn2c = tf.keras.layers.BatchNormalization()\n", - "\n", - " def call(self, input_tensor, training=False):\n", - " x = self.conv2a(input_tensor)\n", - " x = self.bn2a(x, training=training)\n", - " x = tf.nn.relu(x)\n", - "\n", - " x = self.conv2b(x)\n", - " x = self.bn2b(x, training=training)\n", - " x = tf.nn.relu(x)\n", - "\n", - " x = self.conv2c(x)\n", - " x = self.bn2c(x, training=training)\n", - "\n", - " x += input_tensor\n", - " return tf.nn.relu(x)\n", - "\n", - " \n", - "block = ResnetIdentityBlock(1, [1, 2, 3])\n", - "print(block(tf.zeros([1, 2, 3, 3])))\n", - "print([x.name for x in block.variables])" - ], - "execution_count": 0, - "outputs": [] - }, - { "metadata": { - "id": "wYfucVw65PMj", - "colab_type": "text" + "colab_type": "text", + "id": "BcJg7Enms86w" }, - "cell_type": "markdown", "source": [ - "Much of the time, however, models which compose many layers simply call one layer after the other. 
This can be done in very little code using tf.keras.Sequential" + "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\u003ctd\u003e\n", + "\u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/tutorials/eager/custom_layers.ipynb\"\u003e\n", + " \u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n", + "\u003c/td\u003e\u003ctd\u003e\n", + "\u003ca target=\"_blank\" href=\"https://github.com/tensorflow/docs/blob/master/site/en/tutorials/eager/custom_layers.ipynb\"\u003e\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\u003c/table\u003e" ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "custom_layers.ipynb", + "private_outputs": true, + "provenance": [], + "toc_visible": true, + "version": "0.3.2" }, - { - "metadata": { - "id": "L9frk7Ur4uvJ", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - " my_seq = tf.keras.Sequential([tf.keras.layers.Conv2D(1, (1, 1)),\n", - " tf.keras.layers.BatchNormalization(),\n", - " tf.keras.layers.Conv2D(2, 1, \n", - " padding='same'),\n", - " tf.keras.layers.BatchNormalization(),\n", - " tf.keras.layers.Conv2D(3, (1, 1)),\n", - " tf.keras.layers.BatchNormalization()])\n", - "my_seq(tf.zeros([1, 2, 3, 3]))" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "id": "c5YwYcnuK-wc", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "# Next steps\n", - "\n", - "Now you can go back to the previous notebook and adapt the linear regression example to use layers and models to be better structured." 
- ] + "kernelspec": { + "display_name": "Python 3", + "name": "python3" } - ] -} \ No newline at end of file + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/tensorflow/contrib/eager/python/examples/notebooks/custom_training.ipynb b/tensorflow/contrib/eager/python/examples/notebooks/custom_training.ipynb index 5f1b48fa0d..86dca0b423 100644 --- a/tensorflow/contrib/eager/python/examples/notebooks/custom_training.ipynb +++ b/tensorflow/contrib/eager/python/examples/notebooks/custom_training.ipynb @@ -1,46 +1,25 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "Custom training: basics", - "version": "0.3.2", - "views": {}, - "default_view": {}, - "provenance": [], - "private_outputs": true, - "collapsed_sections": [], - "toc_visible": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - } - }, "cells": [ { + "cell_type": "markdown", "metadata": { - "id": "5rmpybwysXGV", - "colab_type": "text" + "colab_type": "text", + "id": "5rmpybwysXGV" }, - "cell_type": "markdown", "source": [ "##### Copyright 2018 The TensorFlow Authors." ] }, { + "cell_type": "code", + "execution_count": 0, "metadata": { - "id": "m8y3rGtQsYP2", + "cellView": "form", + "colab": {}, "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, - "cellView": "form" + "id": "m8y3rGtQsYP2" }, - "cell_type": "code", + "outputs": [], "source": [ "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", "# you may not use this file except in compliance with the License.\n", @@ -53,425 +32,57 @@ "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", "# See the License for the specific language governing permissions and\n", "# limitations under the License." 
- ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "id": "hrXv0rU9sIma", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "# Custom training: basics" - ] - }, - { - "metadata": { - "id": "7S0BwJ_8sLu7", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "
\n", - "\n", - " Run in Google Colab\n", - "\n", - "View source on GitHub
" - ] - }, - { - "metadata": { - "id": "k2o3TTG4TFpt", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "In the previous tutorial we covered the TensorFlow APIs for automatic differentiation, a basic building block for machine learning.\n", - "In this tutorial we will use the TensorFlow primitives introduced in the prior tutorials to do some simple machine learning.\n", - "\n", - "TensorFlow also includes a higher-level neural networks API (`tf.keras`) which provides useful abstractions to reduce boilerplate. We strongly recommend those higher level APIs for people working with neural networks. However, in this short tutorial we cover neural network training from first principles to establish a strong foundation." - ] - }, - { - "metadata": { - "id": "3LXMVuV0VhDr", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "## Setup" - ] - }, - { - "metadata": { - "id": "PJ64L90aVir3", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "import tensorflow as tf\n", - "\n", - "tf.enable_eager_execution()" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "id": "eMAWbDJFVmMk", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "## Variables\n", - "\n", - "Tensors in TensorFlow are immutable stateless objects. Machine learning models, however, need to have changing state: as your model trains, the same code to compute predictions should behave differently over time (hopefully with a lower loss!). 
To represent this state which needs to change over the course of your computation, you can choose to rely on the fact that Python is a stateful programming language:\n" - ] - }, - { - "metadata": { - "id": "VkJwtLS_Jbn8", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "# Using python state\n", - "x = tf.zeros([10, 10])\n", - "x += 2 # This is equivalent to x = x + 2, which does not mutate the original\n", - " # value of x\n", - "print(x)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "id": "wfneTXy7JcUz", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "TensorFlow, however, has stateful operations built in, and these are often more pleasant to use than low-level Python representations of your state. To represent weights in a model, for example, it's often convenient and efficient to use TensorFlow variables.\n", - "\n", - "A Variable is an object which stores a value and, when used in a TensorFlow computation, will implicitly read from this stored value. There are operations (`tf.assign_sub`, `tf.scatter_update`, etc) which manipulate the value stored in a TensorFlow variable." ] }, { - "metadata": { - "id": "itxmrMil6DQi", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "v = tf.Variable(1.0)\n", - "assert v.numpy() == 1.0\n", - "\n", - "# Re-assign the value\n", - "v.assign(3.0)\n", - "assert v.numpy() == 3.0\n", - "\n", - "# Use `v` in a TensorFlow operation like tf.square() and reassign\n", - "v.assign(tf.square(v))\n", - "assert v.numpy() == 9.0" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "id": "-paSaeq1JzwC", - "colab_type": "text" - }, "cell_type": "markdown", - "source": [ - "Computations using Variables are automatically traced when computing gradients. 
For Variables representing embeddings TensorFlow will do sparse updates by default, which are more computation and memory efficient.\n", - "\n", - "Using Variables is also a way to quickly let a reader of your code know that this piece of state is mutable." - ] - }, - { "metadata": { - "id": "BMiFcDzE7Qu3", - "colab_type": "text" + "colab_type": "text", + "id": "hrXv0rU9sIma" }, - "cell_type": "markdown", "source": [ - "## Example: Fitting a linear model\n", - "\n", - "Let's now put the few concepts we have so far ---`Tensor`, `GradientTape`, `Variable` --- to build and train a simple model. This typically involves a few steps:\n", - "\n", - "1. Define the model.\n", - "2. Define a loss function.\n", - "3. Obtain training data.\n", - "4. Run through the training data and use an \"optimizer\" to adjust the variables to fit the data.\n", - "\n", - "In this tutorial, we'll walk through a trivial example of a simple linear model: `f(x) = x * W + b`, which has two variables - `W` and `b`. Furthermore, we'll synthesize data such that a well trained model would have `W = 3.0` and `b = 2.0`." - ] - }, - { - "metadata": { - "id": "gFzH64Jn9PIm", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "### Define the model\n", - "\n", - "Let's define a simple class to encapsulate the variables and the computation." 
+ "# Custom training: basics" ] }, { - "metadata": { - "id": "_WRu7Pze7wk8", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "class Model(object):\n", - " def __init__(self):\n", - " # Initialize variable to (5.0, 0.0)\n", - " # In practice, these should be initialized to random values.\n", - " self.W = tf.Variable(5.0)\n", - " self.b = tf.Variable(0.0)\n", - " \n", - " def __call__(self, x):\n", - " return self.W * x + self.b\n", - " \n", - "model = Model()\n", - "\n", - "assert model(3.0).numpy() == 15.0" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "id": "xa6j_yXa-j79", - "colab_type": "text" - }, "cell_type": "markdown", - "source": [ - "### Define a loss function\n", - "\n", - "A loss function measures how well the output of a model for a given input matches the desired output. Let's use the standard L2 loss." - ] - }, - { - "metadata": { - "id": "Y0ysUFGY924U", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "def loss(predicted_y, desired_y):\n", - " return tf.reduce_mean(tf.square(predicted_y - desired_y))" - ], - "execution_count": 0, - "outputs": [] - }, - { "metadata": { - "id": "qutT_fkl_CBc", - "colab_type": "text" + "colab_type": "text", + "id": "IGPZTmwn9IT4" }, - "cell_type": "markdown", "source": [ - "### Obtain training data\n", - "\n", - "Let's synthesize the training data with some noise." + "This file has moved." 
] }, { - "metadata": { - "id": "gxPTb-kt_N5m", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "TRUE_W = 3.0\n", - "TRUE_b = 2.0\n", - "NUM_EXAMPLES = 1000\n", - "\n", - "inputs = tf.random_normal(shape=[NUM_EXAMPLES])\n", - "noise = tf.random_normal(shape=[NUM_EXAMPLES])\n", - "outputs = inputs * TRUE_W + TRUE_b + noise" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "id": "-50nq-wPBsAW", - "colab_type": "text" - }, "cell_type": "markdown", - "source": [ - "Before we train the model let's visualize where the model stands right now. We'll plot the model's predictions in red and the training data in blue." - ] - }, - { "metadata": { - "id": "_eb83LtrB4nt", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } + "colab_type": "text", + "id": "7S0BwJ_8sLu7" }, - "cell_type": "code", "source": [ - "import matplotlib.pyplot as plt\n", - "\n", - "plt.scatter(inputs, outputs, c='b')\n", - "plt.scatter(inputs, model(inputs), c='r')\n", - "plt.show()\n", - "\n", - "print('Current loss: '),\n", - "print(loss(model(inputs), outputs).numpy())" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "id": "sSDP-yeq_4jE", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "### Define a training loop\n", - "\n", - "We now have our network and our training data. Let's train it, i.e., use the training data to update the model's variables (`W` and `b`) so that the loss goes down using [gradient descent](https://en.wikipedia.org/wiki/Gradient_descent). There are many variants of the gradient descent scheme that are captured in `tf.train.Optimizer` implementations. We'd highly recommend using those implementations, but in the spirit of building from first principles, in this particular example we will implement the basic math ourselves." 
+ "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\u003ctd\u003e\n", + "\u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/tutorials/eager/custom_training.ipynb\"\u003e\n", + " \u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n", + "\u003c/td\u003e\u003ctd\u003e\n", + "\u003ca target=\"_blank\" href=\"https://github.com/tensorflow/docs/blob/master/site/en/tutorials/eager/custom_training.ipynb\"\u003e\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\u003c/table\u003e" ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "Custom training: basics", + "private_outputs": true, + "provenance": [], + "toc_visible": true, + "version": "0.3.2" }, - { - "metadata": { - "id": "MBIACgdnA55X", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "def train(model, inputs, outputs, learning_rate):\n", - " with tf.GradientTape() as t:\n", - " current_loss = loss(model(inputs), outputs)\n", - " dW, db = t.gradient(current_loss, [model.W, model.b])\n", - " model.W.assign_sub(learning_rate * dW)\n", - " model.b.assign_sub(learning_rate * db)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "id": "RwWPaJryD2aN", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "Finally, let's repeatedly run through the training data and see how `W` and `b` evolve." 
- ] - }, - { - "metadata": { - "id": "XdfkR223D9dW", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "model = Model()\n", - "\n", - "# Collect the history of W-values and b-values to plot later\n", - "Ws, bs = [], []\n", - "epochs = range(10)\n", - "for epoch in epochs:\n", - " Ws.append(model.W.numpy())\n", - " bs.append(model.b.numpy())\n", - " current_loss = loss(model(inputs), outputs)\n", - "\n", - " train(model, inputs, outputs, learning_rate=0.1)\n", - " print('Epoch %2d: W=%1.2f b=%1.2f, loss=%2.5f' %\n", - " (epoch, Ws[-1], bs[-1], current_loss))\n", - "\n", - "# Let's plot it all\n", - "plt.plot(epochs, Ws, 'r',\n", - " epochs, bs, 'b')\n", - "plt.plot([TRUE_W] * len(epochs), 'r--',\n", - " [TRUE_b] * len(epochs), 'b--')\n", - "plt.legend(['W', 'b', 'true W', 'true_b'])\n", - "plt.show()\n", - " " - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "id": "vPnIVuaSJwWz", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "## Next Steps\n", - "\n", - "In this tutorial we covered `Variable`s and built and trained a simple linear model using the TensorFlow primitives discussed so far.\n", - "\n", - "In theory, this is pretty much all you need to use TensorFlow for your machine learning research.\n", - "In practice, particularly for neural networks, the higher level APIs like `tf.keras` will be much more convenient since it provides higher level building blocks (called \"layers\"), utilities to save and restore state, a suite of loss functions, a suite of optimization strategies etc. \n", - "\n", - "The [next tutorial](TODO) will cover these higher level APIs." 
- ] + "kernelspec": { + "display_name": "Python 3", + "name": "python3" } - ] -} \ No newline at end of file + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/tensorflow/contrib/eager/python/examples/notebooks/eager_basics.ipynb b/tensorflow/contrib/eager/python/examples/notebooks/eager_basics.ipynb index f1e13de5de..c6d1a56604 100644 --- a/tensorflow/contrib/eager/python/examples/notebooks/eager_basics.ipynb +++ b/tensorflow/contrib/eager/python/examples/notebooks/eager_basics.ipynb @@ -1,46 +1,25 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "eager_basics.ipynb", - "version": "0.3.2", - "views": {}, - "default_view": {}, - "provenance": [], - "private_outputs": true, - "collapsed_sections": [], - "toc_visible": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - } - }, "cells": [ { + "cell_type": "markdown", "metadata": { - "id": "iPpI7RaYoZuE", - "colab_type": "text" + "colab_type": "text", + "id": "iPpI7RaYoZuE" }, - "cell_type": "markdown", "source": [ "##### Copyright 2018 The TensorFlow Authors." ] }, { + "cell_type": "code", + "execution_count": 0, "metadata": { - "id": "hro2InpHobKk", + "cellView": "form", + "colab": {}, "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, - "cellView": "form" + "id": "hro2InpHobKk" }, - "cell_type": "code", + "outputs": [], "source": [ "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", "# you may not use this file except in compliance with the License.\n", @@ -53,439 +32,47 @@ "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", "# See the License for the specific language governing permissions and\n", "# limitations under the License." 
- ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "id": "U9i2Dsh-ziXr", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "# Eager execution basics" - ] - }, - { - "metadata": { - "id": "Hndw-YcxoOJK", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "
\n", - "\n", - " Run in Google Colab\n", - "\n", - "View source on GitHub
" - ] - }, - { - "metadata": { - "id": "6sILUVbHoSgH", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "This is an introductory tutorial for using TensorFlow. It will cover:\n", - "\n", - "* Importing required packages\n", - "* Creating and using Tensors\n", - "* Using GPU acceleration\n", - "* Datasets" - ] - }, - { - "metadata": { - "id": "z1JcS5iBXMRO", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "## Import TensorFlow\n", - "\n", - "To get started, import the `tensorflow` module and enable eager execution.\n", - "Eager execution enables a more interactive frontend to TensorFlow, the details of which we will discuss much later." - ] - }, - { - "metadata": { - "id": "RlIWhyeLoYnG", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, - "cellView": "code" - }, - "cell_type": "code", - "source": [ - "import tensorflow as tf\n", - "\n", - "tf.enable_eager_execution()" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "id": "H9UySOPLXdaw", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "## Tensors\n", - "\n", - "A Tensor is a multi-dimensional array. Similar to NumPy `ndarray` objects, `Tensor` objects have a data type and a shape. Additionally, Tensors can reside in accelerator (like GPU) memory. TensorFlow offers a rich library of operations ([tf.add](https://www.tensorflow.org/api_docs/python/tf/add), [tf.matmul](https://www.tensorflow.org/api_docs/python/tf/matmul), [tf.linalg.inv](https://www.tensorflow.org/api_docs/python/tf/linalg/inv) etc.) that consume and produce Tensors. These operations automatically convert native Python types. 
For example:\n" - ] - }, - { - "metadata": { - "id": "ngUe237Wt48W", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, - "cellView": "code" - }, - "cell_type": "code", - "source": [ - "print(tf.add(1, 2))\n", - "print(tf.add([1, 2], [3, 4]))\n", - "print(tf.square(5))\n", - "print(tf.reduce_sum([1, 2, 3]))\n", - "print(tf.encode_base64(\"hello world\"))\n", - "\n", - "# Operator overloading is also supported\n", - "print(tf.square(2) + tf.square(3))" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "id": "IDY4WsYRhP81", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "Each Tensor has a shape and a datatype" - ] - }, - { - "metadata": { - "id": "srYWH1MdJNG7", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "x = tf.matmul([[1]], [[2, 3]])\n", - "print(x.shape)\n", - "print(x.dtype)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "id": "eBPw8e8vrsom", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "The most obvious differences between NumPy arrays and TensorFlow Tensors are:\n", - "\n", - "1. Tensors can be backed by accelerator memory (like GPU, TPU).\n", - "2. Tensors are immutable." - ] - }, - { - "metadata": { - "id": "Dwi1tdW3JBw6", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "### NumPy Compatibility\n", - "\n", - "Conversion between TensorFlow Tensors and NumPy ndarrays is quite simple as:\n", - "* TensorFlow operations automatically convert NumPy ndarrays to Tensors.\n", - "* NumPy operations automatically convert Tensors to NumPy ndarrays.\n", - "\n", - "Tensors can be explicitly converted to NumPy ndarrays by invoking the `.numpy()` method on them.\n", - "These conversions are typically cheap as the array and Tensor share the underlying memory representation if possible. 
However, sharing the underlying representation isn't always possible since the Tensor may be hosted in GPU memory while NumPy arrays are always backed by host memory, and the conversion will thus involve a copy from GPU to host memory." - ] - }, - { - "metadata": { - "id": "lCUWzso6mbqR", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "import numpy as np\n", - "\n", - "ndarray = np.ones([3, 3])\n", - "\n", - "print(\"TensorFlow operations convert numpy arrays to Tensors automatically\")\n", - "tensor = tf.multiply(ndarray, 42)\n", - "print(tensor)\n", - "\n", - "\n", - "print(\"And NumPy operations convert Tensors to numpy arrays automatically\")\n", - "print(np.add(tensor, 1))\n", - "\n", - "print(\"The .numpy() method explicitly converts a Tensor to a numpy array\")\n", - "print(tensor.numpy())" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "id": "PBNP8yTRfu_X", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "## GPU acceleration\n", - "\n", - "Many TensorFlow operations can be accelerated by using the GPU for computation. Without any annotations, TensorFlow automatically decides whether to use the GPU or CPU for an operation (and copies the tensor between CPU and GPU memory if necessary). Tensors produced by an operation are typically backed by the memory of the device on which the operation executed. 
For example:" - ] - }, - { - "metadata": { - "id": "3Twf_Rw-gQFM", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, - "cellView": "code" - }, - "cell_type": "code", - "source": [ - "x = tf.random_uniform([3, 3])\n", - "\n", - "print(\"Is there a GPU available: \"),\n", - "print(tf.test.is_gpu_available())\n", - "\n", - "print(\"Is the Tensor on GPU #0: \"),\n", - "print(x.device.endswith('GPU:0'))" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "id": "vpgYzgVXW2Ud", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "### Device Names\n", - "\n", - "The `Tensor.device` property provides a fully qualified string name of the device hosting the contents of the Tensor. This name encodes a bunch of details, such as an identifier of the network address of the host on which this program is executing and the device within that host. This is required for distributed execution of TensorFlow programs, but we'll skip that for now. The string will end with `GPU:` if the tensor is placed on the `N`-th tensor on the host." ] }, { - "metadata": { - "id": "ZWZQCimzuqyP", - "colab_type": "text" - }, "cell_type": "markdown", - "source": [ - "\n", - "\n", - "### Explicit Device Placement\n", - "\n", - "The term \"placement\" in TensorFlow refers to how individual operations are assigned (placed on) a device for execution. As mentioned above, when there is no explicit guidance provided, TensorFlow automatically decides which device to execute an operation, and copies Tensors to that device if needed. However, TensorFlow operations can be explicitly placed on specific devices using the `tf.device` context manager. 
For example:" - ] - }, - { - "metadata": { - "id": "RjkNZTuauy-Q", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "def time_matmul(x):\n", - " %timeit tf.matmul(x, x)\n", - "\n", - "# Force execution on CPU\n", - "print(\"On CPU:\")\n", - "with tf.device(\"CPU:0\"):\n", - " x = tf.random_uniform([1000, 1000])\n", - " assert x.device.endswith(\"CPU:0\")\n", - " time_matmul(x)\n", - "\n", - "# Force execution on GPU #0 if available\n", - "if tf.test.is_gpu_available():\n", - " with tf.device(\"GPU:0\"): # Or GPU:1 for the 2nd GPU, GPU:2 for the 3rd etc.\n", - " x = tf.random_uniform([1000, 1000])\n", - " assert x.device.endswith(\"GPU:0\")\n", - " time_matmul(x)" - ], - "execution_count": 0, - "outputs": [] - }, - { "metadata": { - "id": "o1K4dlhhHtQj", - "colab_type": "text" + "colab_type": "text", + "id": "U9i2Dsh-ziXr" }, - "cell_type": "markdown", "source": [ - "## Datasets\n", - "\n", - "This section demonstrates the use of the [`tf.data.Dataset` API](https://www.tensorflow.org/guide/datasets) to build pipelines to feed data to your model. 
It covers:\n", - "\n", - "* Creating a `Dataset`.\n", - "* Iteration over a `Dataset` with eager execution enabled.\n", - "\n", - "We recommend using the `Dataset`s API for building performant, complex input pipelines from simple, re-usable pieces that will feed your model's training or evaluation loops.\n", - "\n", - "If you're familiar with TensorFlow graphs, the API for constructing the `Dataset` object remains exactly the same when eager execution is enabled, but the process of iterating over elements of the dataset is slightly simpler.\n", - "You can use Python iteration over the `tf.data.Dataset` object and do not need to explicitly create an `tf.data.Iterator` object.\n", - "As a result, the discussion on iterators in the [TensorFlow Guide](https://www.tensorflow.org/guide/datasets) is not relevant when eager execution is enabled." - ] - }, - { - "metadata": { - "id": "zI0fmOynH-Ne", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "### Create a source `Dataset`\n", - "\n", - "Create a _source_ dataset using one of the factory functions like [`Dataset.from_tensors`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#from_tensors), [`Dataset.from_tensor_slices`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#from_tensor_slices) or using objects that read from files like [`TextLineDataset`](https://www.tensorflow.org/api_docs/python/tf/data/TextLineDataset) or [`TFRecordDataset`](https://www.tensorflow.org/api_docs/python/tf/data/TFRecordDataset). See the [TensorFlow Guide](https://www.tensorflow.org/guide/datasets#reading_input_data) for more information." 
+ "# Eager execution basics" ] }, { - "metadata": { - "id": "F04fVOHQIBiG", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "ds_tensors = tf.data.Dataset.from_tensor_slices([1, 2, 3, 4, 5, 6])\n", - "\n", - "# Create a CSV file\n", - "import tempfile\n", - "_, filename = tempfile.mkstemp()\n", - "\n", - "with open(filename, 'w') as f:\n", - " f.write(\"\"\"Line 1\n", - "Line 2\n", - "Line 3\n", - " \"\"\")\n", - "\n", - "ds_file = tf.data.TextLineDataset(filename)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "id": "vbxIhC-5IPdf", - "colab_type": "text" - }, "cell_type": "markdown", - "source": [ - "### Apply transformations\n", - "\n", - "Use the transformations functions like [`map`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#map), [`batch`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#batch), [`shuffle`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#shuffle) etc. to apply transformations to the records of the dataset. See the [API documentation for `tf.data.Dataset`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset) for details." 
- ] - }, - { "metadata": { - "id": "uXSDZWE-ISsd", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } + "colab_type": "text", + "id": "Hndw-YcxoOJK" }, - "cell_type": "code", "source": [ - "ds_tensors = ds_tensors.map(tf.square).shuffle(2).batch(2)\n", - "\n", - "ds_file = ds_file.batch(2)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "id": "A8X1GNfoIZKJ", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "### Iterate\n", - "\n", - "When eager execution is enabled `Dataset` objects support iteration.\n", - "If you're familiar with the use of `Dataset`s in TensorFlow graphs, note that there is no need for calls to `Dataset.make_one_shot_iterator()` or `get_next()` calls." + "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\u003ctd\u003e\n", + "\u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/tutorials/eager/eager_basics.ipynb\"\u003e\n", + " \u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n", + "\u003c/td\u003e\u003ctd\u003e\n", + "\u003ca target=\"_blank\" href=\"https://github.com/tensorflow/docs/blob/master/site/en/tutorials/eager/eager_basics.ipynb\"\u003e\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\u003c/table\u003e" ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "eager_basics.ipynb", + "private_outputs": true, + "provenance": [], + "toc_visible": true, + "version": "0.3.2" }, - { - "metadata": { - "id": "ws-WKRk5Ic6-", - "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } - }, - "cell_type": "code", - "source": [ - "print('Elements of ds_tensors:')\n", - "for x in ds_tensors:\n", - " print(x)\n", - "\n", - "print('\\nElements in ds_file:')\n", - "for x 
in ds_file:\n", - " print(x)" - ], - "execution_count": 0, - "outputs": [] + "kernelspec": { + "display_name": "Python 3", + "name": "python3" } - ] -} \ No newline at end of file + }, + "nbformat": 4, + "nbformat_minor": 0 +} -- GitLab From 910443c6f2bb5eac3797f513abab1ae41f56efb1 Mon Sep 17 00:00:00 2001 From: Jiri Simsa Date: Mon, 27 Aug 2018 11:37:34 -0700 Subject: [PATCH 160/598] [tf.data] Minor cleanup. PiperOrigin-RevId: 210402159 --- .../optimizers/data/latency_all_edges.cc | 4 ++-- .../optimizers/data/map_and_batch_fusion.cc | 7 +++--- .../optimizers/data/map_vectorization.cc | 22 +++++++++---------- 3 files changed, 16 insertions(+), 17 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/data/latency_all_edges.cc b/tensorflow/core/grappler/optimizers/data/latency_all_edges.cc index 0b25b1ea9d..9e382aeef9 100644 --- a/tensorflow/core/grappler/optimizers/data/latency_all_edges.cc +++ b/tensorflow/core/grappler/optimizers/data/latency_all_edges.cc @@ -33,7 +33,7 @@ namespace { constexpr char kInsertOpName[] = "LatencyStatsDataset"; -NodeDef make_latency_node(const NodeDef& node, MutableGraphView* graph) { +NodeDef MakeLatencyNode(const NodeDef& node, MutableGraphView* graph) { NodeDef new_node; new_node.set_op(kInsertOpName); graph_utils::SetUniqueGraphNodeName( @@ -96,7 +96,7 @@ Status LatencyAllEdges::Optimize(Cluster* cluster, const GrapplerItem& item, } } - graph.InsertNode(node, make_latency_node(node, &graph)); + graph.InsertNode(node, MakeLatencyNode(node, &graph)); } return Status::OK(); } diff --git a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc index e9ad6f1b8a..63945b8b9e 100644 --- a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc +++ b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc @@ -32,9 +32,8 @@ namespace { constexpr char kFusedOpName[] = "MapAndBatchDatasetV2"; -NodeDef make_map_and_batch_node(const NodeDef& 
map_node, - const NodeDef& batch_node, - MutableGraphView* graph) { +NodeDef MakeMapAndBatchNode(const NodeDef& map_node, const NodeDef& batch_node, + MutableGraphView* graph) { NodeDef new_node; new_node.set_op(kFusedOpName); graph_utils::SetUniqueGraphNodeName(kFusedOpName, graph->GetGraph(), @@ -113,7 +112,7 @@ Status MapAndBatchFusion::Optimize(Cluster* cluster, const GrapplerItem& item, NodeDef* map_node = node2; auto* new_node = - graph.AddNode(make_map_and_batch_node(*map_node, batch_node, &graph)); + graph.AddNode(MakeMapAndBatchNode(*map_node, batch_node, &graph)); graph.ReplaceInput(batch_node, *new_node); // Mark the `Map` and `Batch` nodes for removal. diff --git a/tensorflow/core/grappler/optimizers/data/map_vectorization.cc b/tensorflow/core/grappler/optimizers/data/map_vectorization.cc index 92551a0459..a019b77eb7 100644 --- a/tensorflow/core/grappler/optimizers/data/map_vectorization.cc +++ b/tensorflow/core/grappler/optimizers/data/map_vectorization.cc @@ -112,10 +112,10 @@ bool HasCapturedInputs(const NodeDef& map_node) { return map_node.attr().at("Targuments").list().type_size() > 0; } -NodeDef make_new_batch_node(const NodeDef& old_batch_node, - const NodeDef& input_node, - const FunctionDef& vectorized_func, - MutableGraphView* graph) { +NodeDef MakeNewBatchNode(const NodeDef& old_batch_node, + const NodeDef& input_node, + const FunctionDef& vectorized_func, + MutableGraphView* graph) { NodeDef batch_node; batch_node.set_op(old_batch_node.op()); graph_utils::SetUniqueGraphNodeName(batch_node.op(), graph->GetGraph(), @@ -151,11 +151,11 @@ NodeDef make_new_batch_node(const NodeDef& old_batch_node, return batch_node; } -NodeDef make_new_map_node(const NodeDef& old_map_node, - const NodeDef& old_batch_node, - const NodeDef& new_batch_node, - const FunctionDef& vectorized_func, - MutableGraphView* graph) { +NodeDef MakeNewMapNode(const NodeDef& old_map_node, + const NodeDef& old_batch_node, + const NodeDef& new_batch_node, + const FunctionDef& 
vectorized_func, + MutableGraphView* graph) { NodeDef map_node; map_node.set_op(old_map_node.op()); graph_utils::SetUniqueGraphNodeName(map_node.op(), graph->GetGraph(), @@ -232,9 +232,9 @@ Status MapVectorization::Optimize(Cluster* cluster, const GrapplerItem& item, CHECK_NOTNULL(vectorized_func); auto* new_batch_node = graph.AddNode( - make_new_batch_node(batch_node, *input_node, *vectorized_func, &graph)); + MakeNewBatchNode(batch_node, *input_node, *vectorized_func, &graph)); - auto* new_map_node = graph.AddNode(make_new_map_node( + auto* new_map_node = graph.AddNode(MakeNewMapNode( *map_node, batch_node, *new_batch_node, *vectorized_func, &graph)); graph.ReplaceInput(batch_node, *new_map_node); -- GitLab From abc8452394aeeecc1f3fef27f7098a5924bdd0e9 Mon Sep 17 00:00:00 2001 From: Jiri Simsa Date: Mon, 27 Aug 2018 11:51:24 -0700 Subject: [PATCH 161/598] [tf.data] removing test for obsolete functionality PiperOrigin-RevId: 210404649 --- .../kernel_tests/optimize_dataset_op_test.py | 14 ----- .../core/grappler/optimizers/data/BUILD | 34 ------------- .../optimizers/data/function_rename.cc | 51 ------------------- .../optimizers/data/function_rename.h | 46 ----------------- .../optimizers/data/function_rename_test.cc | 42 --------------- 5 files changed, 187 deletions(-) delete mode 100644 tensorflow/core/grappler/optimizers/data/function_rename.cc delete mode 100644 tensorflow/core/grappler/optimizers/data/function_rename.h delete mode 100644 tensorflow/core/grappler/optimizers/data/function_rename_test.cc diff --git a/tensorflow/contrib/data/python/kernel_tests/optimize_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/optimize_dataset_op_test.py index ca38f8e2f9..ec43bc3653 100644 --- a/tensorflow/contrib/data/python/kernel_tests/optimize_dataset_op_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/optimize_dataset_op_test.py @@ -100,20 +100,6 @@ class OptimizeDatasetTest(test.TestCase, parameterized.TestCase): with 
self.assertRaises(errors.OutOfRangeError): sess.run(get_next) - # TODO(b/112914454): Remove the test or figure out way to copy only new - # functions in optimize_dataset_op instead of taking union of old and new - # functions. - def _testFunctionLibraryDefinitionModification(self): - dataset = dataset_ops.Dataset.from_tensors(0).map(lambda x: x).apply( - optimization.optimize(["_test_only_function_rename"])) - iterator = dataset.make_one_shot_iterator() - get_next = iterator.get_next() - - with self.test_session() as sess: - with self.assertRaisesRegexp(errors.NotFoundError, - "Function .* is not defined."): - sess.run(get_next) - if __name__ == "__main__": test.main() diff --git a/tensorflow/core/grappler/optimizers/data/BUILD b/tensorflow/core/grappler/optimizers/data/BUILD index 979c437c02..530c957068 100644 --- a/tensorflow/core/grappler/optimizers/data/BUILD +++ b/tensorflow/core/grappler/optimizers/data/BUILD @@ -41,39 +41,6 @@ tf_cc_test( ], ) -cc_library( - name = "function_rename", - srcs = ["function_rename.cc"], - hdrs = [ - "function_rename.h", - ], - visibility = ["//visibility:public"], - deps = [ - ":graph_utils", - "//tensorflow/core:lib", - "//tensorflow/core/grappler:graph_view", - "//tensorflow/core/grappler:grappler_item", - "//tensorflow/core/grappler:op_types", - "//tensorflow/core/grappler:utils", - "//tensorflow/core/grappler/clusters:cluster", - "//tensorflow/core/grappler/optimizers:custom_graph_optimizer", - "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry", - ] + tf_protos_all(), -) - -tf_cc_test( - name = "function_rename_test", - srcs = ["function_rename_test.cc"], - visibility = ["//visibility:public"], - deps = [ - ":function_rename", - "//tensorflow/core:framework", - "//tensorflow/core:test", - "//tensorflow/core:test_main", - "//tensorflow/core/grappler:grappler_item", - ] + tf_protos_all(), -) - cc_library( name = "fusion_utils", srcs = ["fusion_utils.cc"], @@ -384,7 +351,6 @@ cc_library( visibility = 
["//visibility:public"], deps = [ ":filter_fusion", - ":function_rename", ":latency_all_edges", ":map_and_batch_fusion", ":map_and_filter_fusion", diff --git a/tensorflow/core/grappler/optimizers/data/function_rename.cc b/tensorflow/core/grappler/optimizers/data/function_rename.cc deleted file mode 100644 index 8cf044d1bd..0000000000 --- a/tensorflow/core/grappler/optimizers/data/function_rename.cc +++ /dev/null @@ -1,51 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "tensorflow/core/grappler/optimizers/data/function_rename.h" - -#include "tensorflow/core/grappler/clusters/cluster.h" -#include "tensorflow/core/grappler/graph_view.h" -#include "tensorflow/core/grappler/grappler_item.h" -#include "tensorflow/core/grappler/op_types.h" -#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h" -#include "tensorflow/core/grappler/optimizers/data/graph_utils.h" -#include "tensorflow/core/grappler/utils.h" -#include "tensorflow/core/platform/protobuf.h" - -namespace tensorflow { -namespace grappler { - -Status FunctionRename::Optimize(Cluster* cluster, const GrapplerItem& item, - GraphDef* output) { - *output = item.graph; - GraphView graph(output); - int n = output->mutable_library()->function_size(); - for (int i = 0; i < n; ++i) { - FunctionDef* fn = output->mutable_library()->mutable_function(i); - fn->mutable_signature()->set_name(fn->signature().name() + "world"); - } - - return Status::OK(); -} - -void FunctionRename::Feedback(Cluster* cluster, const GrapplerItem& item, - const GraphDef& optimize_output, double result) { - // no-op -} - -REGISTER_GRAPH_OPTIMIZER_AS(FunctionRename, "_test_only_function_rename"); - -} // end namespace grappler -} // end namespace tensorflow diff --git a/tensorflow/core/grappler/optimizers/data/function_rename.h b/tensorflow/core/grappler/optimizers/data/function_rename.h deleted file mode 100644 index 23ad9470ff..0000000000 --- a/tensorflow/core/grappler/optimizers/data/function_rename.h +++ /dev/null @@ -1,46 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_FUNCTION_RENAME_H_ -#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_FUNCTION_RENAME_H_ - -#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h" - -namespace tensorflow { -namespace grappler { - -class FunctionRename : public CustomGraphOptimizer { - public: - FunctionRename() = default; - ~FunctionRename() override = default; - - string name() const override { return "_test_only_function_rename"; }; - - Status Init( - const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override { - return Status::OK(); - } - - Status Optimize(Cluster* cluster, const GrapplerItem& item, - GraphDef* output) override; - - void Feedback(Cluster* cluster, const GrapplerItem& item, - const GraphDef& optimize_output, double result) override; -}; - -} // end namespace grappler -} // end namespace tensorflow - -#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_FUNCTION_RENAME_H_ diff --git a/tensorflow/core/grappler/optimizers/data/function_rename_test.cc b/tensorflow/core/grappler/optimizers/data/function_rename_test.cc deleted file mode 100644 index 56b8a960a7..0000000000 --- a/tensorflow/core/grappler/optimizers/data/function_rename_test.cc +++ /dev/null @@ -1,42 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/core/grappler/optimizers/data/function_rename.h" - -#include "tensorflow/core/framework/function.pb.h" -#include "tensorflow/core/framework/op_def.pb.h" -#include "tensorflow/core/grappler/grappler_item.h" -#include "tensorflow/core/lib/core/status_test_util.h" -#include "tensorflow/core/platform/test.h" - -namespace tensorflow { -namespace grappler { -namespace { - -TEST(FunctionRenameTest, RenameFunction) { - GrapplerItem item; - GraphDef *graph = &item.graph; - FunctionDef *fn = graph->mutable_library()->add_function(); - fn->mutable_signature()->set_name("hello"); - - FunctionRename optimizer; - GraphDef output; - TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output)); - EXPECT_EQ(output.library().function(0).signature().name(), "helloworld"); -} - -} // namespace -} // namespace grappler -} // namespace tensorflow -- GitLab From 6c2bf6576321ad53ff1eb6d66b6efae2c93ef4e0 Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Mon, 27 Aug 2018 11:57:42 -0700 Subject: [PATCH 162/598] Moved tensorflow/docs_src to https://github.com/tensorflow/docs PiperOrigin-RevId: 210405729 --- tensorflow/docs_src/README.md | 3 + tensorflow/docs_src/about/attribution.md | 9 - tensorflow/docs_src/about/bib.md | 131 - tensorflow/docs_src/about/index.md | 11 - tensorflow/docs_src/about/leftnav_files | 4 - tensorflow/docs_src/about/uses.md | 68 - tensorflow/docs_src/api_guides/cc/guide.md | 301 -- .../docs_src/api_guides/python/array_ops.md | 87 - .../docs_src/api_guides/python/check_ops.md 
| 19 - .../docs_src/api_guides/python/client.md | 36 - .../docs_src/api_guides/python/constant_op.md | 87 - .../docs_src/api_guides/python/contrib.crf.md | 11 - .../api_guides/python/contrib.ffmpeg.md | 23 - .../api_guides/python/contrib.framework.md | 64 - .../api_guides/python/contrib.graph_editor.md | 177 -- .../api_guides/python/contrib.integrate.md | 41 - .../api_guides/python/contrib.layers.md | 109 - .../api_guides/python/contrib.learn.md | 63 - .../api_guides/python/contrib.linalg.md | 30 - .../api_guides/python/contrib.losses.md | 125 - .../api_guides/python/contrib.metrics.md | 133 - .../docs_src/api_guides/python/contrib.rnn.md | 61 - .../api_guides/python/contrib.seq2seq.md | 138 - .../api_guides/python/contrib.signal.md | 172 -- .../api_guides/python/contrib.staging.md | 6 - .../api_guides/python/contrib.training.md | 50 - .../api_guides/python/contrib.util.md | 12 - .../api_guides/python/control_flow_ops.md | 57 - .../docs_src/api_guides/python/framework.md | 51 - .../api_guides/python/functional_ops.md | 18 - .../docs_src/api_guides/python/image.md | 144 - .../docs_src/api_guides/python/index.md | 52 - .../api_guides/python/input_dataset.md | 85 - .../docs_src/api_guides/python/io_ops.md | 130 - .../docs_src/api_guides/python/math_ops.md | 200 -- .../docs_src/api_guides/python/meta_graph.md | 277 -- tensorflow/docs_src/api_guides/python/nn.md | 418 --- .../docs_src/api_guides/python/python_io.md | 29 - .../api_guides/python/reading_data.md | 522 ---- .../api_guides/python/regression_examples.md | 232 -- .../docs_src/api_guides/python/session_ops.md | 15 - .../docs_src/api_guides/python/sparse_ops.md | 45 - .../api_guides/python/spectral_ops.md | 26 - .../docs_src/api_guides/python/state_ops.md | 110 - .../docs_src/api_guides/python/string_ops.md | 39 - .../docs_src/api_guides/python/summary.md | 23 - tensorflow/docs_src/api_guides/python/test.md | 47 - .../docs_src/api_guides/python/tfdbg.md | 50 - .../api_guides/python/threading_and_queues.md | 270 
-- .../docs_src/api_guides/python/train.md | 139 - tensorflow/docs_src/community/benchmarks.md | 108 - tensorflow/docs_src/community/contributing.md | 49 - .../docs_src/community/documentation.md | 673 ----- tensorflow/docs_src/community/groups.md | 38 - tensorflow/docs_src/community/index.md | 85 - tensorflow/docs_src/community/leftnav_files | 8 - tensorflow/docs_src/community/lists.md | 53 - tensorflow/docs_src/community/roadmap.md | 123 - tensorflow/docs_src/community/style_guide.md | 136 - tensorflow/docs_src/deploy/deploy_to_js.md | 4 - tensorflow/docs_src/deploy/distributed.md | 354 --- tensorflow/docs_src/deploy/hadoop.md | 65 - tensorflow/docs_src/deploy/index.md | 21 - tensorflow/docs_src/deploy/leftnav_files | 5 - tensorflow/docs_src/deploy/s3.md | 93 - tensorflow/docs_src/extend/add_filesys.md | 260 -- tensorflow/docs_src/extend/adding_an_op.md | 1460 ---------- tensorflow/docs_src/extend/architecture.md | 217 -- tensorflow/docs_src/extend/index.md | 34 - .../docs_src/extend/language_bindings.md | 231 -- tensorflow/docs_src/extend/leftnav_files | 7 - .../docs_src/extend/new_data_formats.md | 305 --- .../docs_src/extend/tool_developers/index.md | 186 -- tensorflow/docs_src/extras/README.txt | 3 - tensorflow/docs_src/guide/autograph.md | 3 - tensorflow/docs_src/guide/checkpoints.md | 238 -- .../docs_src/guide/custom_estimators.md | 602 ---- tensorflow/docs_src/guide/datasets.md | 823 ------ .../docs_src/guide/datasets_for_estimators.md | 387 --- tensorflow/docs_src/guide/debugger.md | 814 ------ tensorflow/docs_src/guide/eager.md | 854 ------ tensorflow/docs_src/guide/embedding.md | 262 -- tensorflow/docs_src/guide/estimators.md | 196 -- tensorflow/docs_src/guide/faq.md | 296 -- tensorflow/docs_src/guide/feature_columns.md | 572 ---- tensorflow/docs_src/guide/graph_viz.md | 317 --- tensorflow/docs_src/guide/graphs.md | 558 ---- tensorflow/docs_src/guide/index.md | 82 - tensorflow/docs_src/guide/keras.md | 623 ----- tensorflow/docs_src/guide/leftnav_files | 
41 - tensorflow/docs_src/guide/low_level_intro.md | 604 ---- .../docs_src/guide/premade_estimators.md | 432 --- tensorflow/docs_src/guide/saved_model.md | 999 ------- .../guide/summaries_and_tensorboard.md | 225 -- .../docs_src/guide/tensorboard_histograms.md | 245 -- tensorflow/docs_src/guide/tensors.md | 330 --- tensorflow/docs_src/guide/using_gpu.md | 215 -- tensorflow/docs_src/guide/using_tpu.md | 395 --- tensorflow/docs_src/guide/variables.md | 319 --- tensorflow/docs_src/guide/version_compat.md | 327 --- tensorflow/docs_src/install/index.md | 39 - tensorflow/docs_src/install/install_c.md | 118 - tensorflow/docs_src/install/install_go.md | 142 - tensorflow/docs_src/install/install_java.md | 268 -- tensorflow/docs_src/install/install_linux.md | 714 ----- tensorflow/docs_src/install/install_mac.md | 529 ---- .../docs_src/install/install_raspbian.md | 313 --- .../docs_src/install/install_sources.md | 579 ---- .../install/install_sources_windows.md | 320 --- .../docs_src/install/install_windows.md | 227 -- tensorflow/docs_src/install/leftnav_files | 18 - tensorflow/docs_src/install/migration.md | 336 --- tensorflow/docs_src/mobile/README.md | 3 - tensorflow/docs_src/performance/benchmarks.md | 412 --- .../performance/datasets_performance.md | 331 --- tensorflow/docs_src/performance/index.md | 52 - tensorflow/docs_src/performance/leftnav_files | 14 - .../docs_src/performance/performance_guide.md | 733 ----- .../performance/performance_models.md | 422 --- .../docs_src/performance/quantization.md | 253 -- .../docs_src/performance/xla/broadcasting.md | 204 -- .../performance/xla/developing_new_backend.md | 77 - tensorflow/docs_src/performance/xla/index.md | 98 - tensorflow/docs_src/performance/xla/jit.md | 169 -- .../performance/xla/operation_semantics.md | 2426 ----------------- tensorflow/docs_src/performance/xla/shapes.md | 150 - .../docs_src/performance/xla/tfcompile.md | 281 -- tensorflow/docs_src/tutorials/_index.yaml | 202 -- 
tensorflow/docs_src/tutorials/_toc.yaml | 128 - .../eager/custom_training_walkthrough.md | 3 - tensorflow/docs_src/tutorials/eager/index.md | 12 - .../docs_src/tutorials/estimators/cnn.md | 694 ----- .../docs_src/tutorials/estimators/linear.md | 3 - .../docs_src/tutorials/images/deep_cnn.md | 446 --- .../tutorials/images/image_recognition.md | 455 ---- .../tutorials/keras/basic_classification.md | 3 - .../tutorials/keras/basic_regression.md | 3 - .../keras/basic_text_classification.md | 3 - tensorflow/docs_src/tutorials/keras/index.md | 22 - .../tutorials/keras/overfit_and_underfit.md | 3 - .../keras/save_and_restore_models.md | 3 - tensorflow/docs_src/tutorials/next_steps.md | 36 - .../docs_src/tutorials/non-ml/mandelbrot.md | 116 - tensorflow/docs_src/tutorials/non-ml/pdes.md | 140 - .../representation/kernel_methods.md | 303 -- .../tutorials/representation/linear.md | 239 -- .../tutorials/representation/word2vec.md | 405 --- .../tutorials/sequences/audio_recognition.md | 631 ----- .../docs_src/tutorials/sequences/recurrent.md | 230 -- .../sequences/recurrent_quickdraw.md | 410 --- 150 files changed, 3 insertions(+), 33942 deletions(-) create mode 100644 tensorflow/docs_src/README.md delete mode 100644 tensorflow/docs_src/about/attribution.md delete mode 100644 tensorflow/docs_src/about/bib.md delete mode 100644 tensorflow/docs_src/about/index.md delete mode 100644 tensorflow/docs_src/about/leftnav_files delete mode 100644 tensorflow/docs_src/about/uses.md delete mode 100644 tensorflow/docs_src/api_guides/cc/guide.md delete mode 100644 tensorflow/docs_src/api_guides/python/array_ops.md delete mode 100644 tensorflow/docs_src/api_guides/python/check_ops.md delete mode 100644 tensorflow/docs_src/api_guides/python/client.md delete mode 100644 tensorflow/docs_src/api_guides/python/constant_op.md delete mode 100644 tensorflow/docs_src/api_guides/python/contrib.crf.md delete mode 100644 tensorflow/docs_src/api_guides/python/contrib.ffmpeg.md delete mode 100644 
tensorflow/docs_src/api_guides/python/contrib.framework.md delete mode 100644 tensorflow/docs_src/api_guides/python/contrib.graph_editor.md delete mode 100644 tensorflow/docs_src/api_guides/python/contrib.integrate.md delete mode 100644 tensorflow/docs_src/api_guides/python/contrib.layers.md delete mode 100644 tensorflow/docs_src/api_guides/python/contrib.learn.md delete mode 100644 tensorflow/docs_src/api_guides/python/contrib.linalg.md delete mode 100644 tensorflow/docs_src/api_guides/python/contrib.losses.md delete mode 100644 tensorflow/docs_src/api_guides/python/contrib.metrics.md delete mode 100644 tensorflow/docs_src/api_guides/python/contrib.rnn.md delete mode 100644 tensorflow/docs_src/api_guides/python/contrib.seq2seq.md delete mode 100644 tensorflow/docs_src/api_guides/python/contrib.signal.md delete mode 100644 tensorflow/docs_src/api_guides/python/contrib.staging.md delete mode 100644 tensorflow/docs_src/api_guides/python/contrib.training.md delete mode 100644 tensorflow/docs_src/api_guides/python/contrib.util.md delete mode 100644 tensorflow/docs_src/api_guides/python/control_flow_ops.md delete mode 100644 tensorflow/docs_src/api_guides/python/framework.md delete mode 100644 tensorflow/docs_src/api_guides/python/functional_ops.md delete mode 100644 tensorflow/docs_src/api_guides/python/image.md delete mode 100644 tensorflow/docs_src/api_guides/python/index.md delete mode 100644 tensorflow/docs_src/api_guides/python/input_dataset.md delete mode 100644 tensorflow/docs_src/api_guides/python/io_ops.md delete mode 100644 tensorflow/docs_src/api_guides/python/math_ops.md delete mode 100644 tensorflow/docs_src/api_guides/python/meta_graph.md delete mode 100644 tensorflow/docs_src/api_guides/python/nn.md delete mode 100644 tensorflow/docs_src/api_guides/python/python_io.md delete mode 100644 tensorflow/docs_src/api_guides/python/reading_data.md delete mode 100644 tensorflow/docs_src/api_guides/python/regression_examples.md delete mode 100644 
tensorflow/docs_src/api_guides/python/session_ops.md delete mode 100644 tensorflow/docs_src/api_guides/python/sparse_ops.md delete mode 100644 tensorflow/docs_src/api_guides/python/spectral_ops.md delete mode 100644 tensorflow/docs_src/api_guides/python/state_ops.md delete mode 100644 tensorflow/docs_src/api_guides/python/string_ops.md delete mode 100644 tensorflow/docs_src/api_guides/python/summary.md delete mode 100644 tensorflow/docs_src/api_guides/python/test.md delete mode 100644 tensorflow/docs_src/api_guides/python/tfdbg.md delete mode 100644 tensorflow/docs_src/api_guides/python/threading_and_queues.md delete mode 100644 tensorflow/docs_src/api_guides/python/train.md delete mode 100644 tensorflow/docs_src/community/benchmarks.md delete mode 100644 tensorflow/docs_src/community/contributing.md delete mode 100644 tensorflow/docs_src/community/documentation.md delete mode 100644 tensorflow/docs_src/community/groups.md delete mode 100644 tensorflow/docs_src/community/index.md delete mode 100644 tensorflow/docs_src/community/leftnav_files delete mode 100644 tensorflow/docs_src/community/lists.md delete mode 100644 tensorflow/docs_src/community/roadmap.md delete mode 100644 tensorflow/docs_src/community/style_guide.md delete mode 100644 tensorflow/docs_src/deploy/deploy_to_js.md delete mode 100644 tensorflow/docs_src/deploy/distributed.md delete mode 100644 tensorflow/docs_src/deploy/hadoop.md delete mode 100644 tensorflow/docs_src/deploy/index.md delete mode 100644 tensorflow/docs_src/deploy/leftnav_files delete mode 100644 tensorflow/docs_src/deploy/s3.md delete mode 100644 tensorflow/docs_src/extend/add_filesys.md delete mode 100644 tensorflow/docs_src/extend/adding_an_op.md delete mode 100644 tensorflow/docs_src/extend/architecture.md delete mode 100644 tensorflow/docs_src/extend/index.md delete mode 100644 tensorflow/docs_src/extend/language_bindings.md delete mode 100644 tensorflow/docs_src/extend/leftnav_files delete mode 100644 
tensorflow/docs_src/extend/new_data_formats.md delete mode 100644 tensorflow/docs_src/extend/tool_developers/index.md delete mode 100644 tensorflow/docs_src/extras/README.txt delete mode 100644 tensorflow/docs_src/guide/autograph.md delete mode 100644 tensorflow/docs_src/guide/checkpoints.md delete mode 100644 tensorflow/docs_src/guide/custom_estimators.md delete mode 100644 tensorflow/docs_src/guide/datasets.md delete mode 100644 tensorflow/docs_src/guide/datasets_for_estimators.md delete mode 100644 tensorflow/docs_src/guide/debugger.md delete mode 100644 tensorflow/docs_src/guide/eager.md delete mode 100644 tensorflow/docs_src/guide/embedding.md delete mode 100644 tensorflow/docs_src/guide/estimators.md delete mode 100644 tensorflow/docs_src/guide/faq.md delete mode 100644 tensorflow/docs_src/guide/feature_columns.md delete mode 100644 tensorflow/docs_src/guide/graph_viz.md delete mode 100644 tensorflow/docs_src/guide/graphs.md delete mode 100644 tensorflow/docs_src/guide/index.md delete mode 100644 tensorflow/docs_src/guide/keras.md delete mode 100644 tensorflow/docs_src/guide/leftnav_files delete mode 100644 tensorflow/docs_src/guide/low_level_intro.md delete mode 100644 tensorflow/docs_src/guide/premade_estimators.md delete mode 100644 tensorflow/docs_src/guide/saved_model.md delete mode 100644 tensorflow/docs_src/guide/summaries_and_tensorboard.md delete mode 100644 tensorflow/docs_src/guide/tensorboard_histograms.md delete mode 100644 tensorflow/docs_src/guide/tensors.md delete mode 100644 tensorflow/docs_src/guide/using_gpu.md delete mode 100644 tensorflow/docs_src/guide/using_tpu.md delete mode 100644 tensorflow/docs_src/guide/variables.md delete mode 100644 tensorflow/docs_src/guide/version_compat.md delete mode 100644 tensorflow/docs_src/install/index.md delete mode 100644 tensorflow/docs_src/install/install_c.md delete mode 100644 tensorflow/docs_src/install/install_go.md delete mode 100644 tensorflow/docs_src/install/install_java.md delete mode 100644 
tensorflow/docs_src/install/install_linux.md delete mode 100644 tensorflow/docs_src/install/install_mac.md delete mode 100644 tensorflow/docs_src/install/install_raspbian.md delete mode 100644 tensorflow/docs_src/install/install_sources.md delete mode 100644 tensorflow/docs_src/install/install_sources_windows.md delete mode 100644 tensorflow/docs_src/install/install_windows.md delete mode 100644 tensorflow/docs_src/install/leftnav_files delete mode 100644 tensorflow/docs_src/install/migration.md delete mode 100644 tensorflow/docs_src/mobile/README.md delete mode 100644 tensorflow/docs_src/performance/benchmarks.md delete mode 100644 tensorflow/docs_src/performance/datasets_performance.md delete mode 100644 tensorflow/docs_src/performance/index.md delete mode 100644 tensorflow/docs_src/performance/leftnav_files delete mode 100644 tensorflow/docs_src/performance/performance_guide.md delete mode 100644 tensorflow/docs_src/performance/performance_models.md delete mode 100644 tensorflow/docs_src/performance/quantization.md delete mode 100644 tensorflow/docs_src/performance/xla/broadcasting.md delete mode 100644 tensorflow/docs_src/performance/xla/developing_new_backend.md delete mode 100644 tensorflow/docs_src/performance/xla/index.md delete mode 100644 tensorflow/docs_src/performance/xla/jit.md delete mode 100644 tensorflow/docs_src/performance/xla/operation_semantics.md delete mode 100644 tensorflow/docs_src/performance/xla/shapes.md delete mode 100644 tensorflow/docs_src/performance/xla/tfcompile.md delete mode 100644 tensorflow/docs_src/tutorials/_index.yaml delete mode 100644 tensorflow/docs_src/tutorials/_toc.yaml delete mode 100644 tensorflow/docs_src/tutorials/eager/custom_training_walkthrough.md delete mode 100644 tensorflow/docs_src/tutorials/eager/index.md delete mode 100644 tensorflow/docs_src/tutorials/estimators/cnn.md delete mode 100644 tensorflow/docs_src/tutorials/estimators/linear.md delete mode 100644 tensorflow/docs_src/tutorials/images/deep_cnn.md 
delete mode 100644 tensorflow/docs_src/tutorials/images/image_recognition.md delete mode 100644 tensorflow/docs_src/tutorials/keras/basic_classification.md delete mode 100644 tensorflow/docs_src/tutorials/keras/basic_regression.md delete mode 100644 tensorflow/docs_src/tutorials/keras/basic_text_classification.md delete mode 100644 tensorflow/docs_src/tutorials/keras/index.md delete mode 100644 tensorflow/docs_src/tutorials/keras/overfit_and_underfit.md delete mode 100644 tensorflow/docs_src/tutorials/keras/save_and_restore_models.md delete mode 100644 tensorflow/docs_src/tutorials/next_steps.md delete mode 100644 tensorflow/docs_src/tutorials/non-ml/mandelbrot.md delete mode 100644 tensorflow/docs_src/tutorials/non-ml/pdes.md delete mode 100644 tensorflow/docs_src/tutorials/representation/kernel_methods.md delete mode 100644 tensorflow/docs_src/tutorials/representation/linear.md delete mode 100644 tensorflow/docs_src/tutorials/representation/word2vec.md delete mode 100644 tensorflow/docs_src/tutorials/sequences/audio_recognition.md delete mode 100644 tensorflow/docs_src/tutorials/sequences/recurrent.md delete mode 100644 tensorflow/docs_src/tutorials/sequences/recurrent_quickdraw.md diff --git a/tensorflow/docs_src/README.md b/tensorflow/docs_src/README.md new file mode 100644 index 0000000000..bcd896c5ba --- /dev/null +++ b/tensorflow/docs_src/README.md @@ -0,0 +1,3 @@ +# This directory has moved + +The new location is: https://github.com/tensorflow/docs/site/en diff --git a/tensorflow/docs_src/about/attribution.md b/tensorflow/docs_src/about/attribution.md deleted file mode 100644 index a4858b400a..0000000000 --- a/tensorflow/docs_src/about/attribution.md +++ /dev/null @@ -1,9 +0,0 @@ -# Attribution - -Please only use the TensorFlow name and marks when accurately referencing this -software distribution, and do not use our marks in a way that suggests you are -endorsed by or otherwise affiliated with Google. 
When referring to our marks, -please include the following attribution statement: "TensorFlow, the TensorFlow -logo and any related marks are trademarks of Google Inc." - - diff --git a/tensorflow/docs_src/about/bib.md b/tensorflow/docs_src/about/bib.md deleted file mode 100644 index 5593a3d95c..0000000000 --- a/tensorflow/docs_src/about/bib.md +++ /dev/null @@ -1,131 +0,0 @@ -# TensorFlow White Papers - -This document identifies white papers about TensorFlow. - -## Large-Scale Machine Learning on Heterogeneous Distributed Systems - -[Access this white paper.](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45166.pdf) - -**Abstract:** TensorFlow is an interface for expressing machine learning -algorithms, and an implementation for executing such algorithms. -A computation expressed using TensorFlow can be -executed with little or no change on a wide variety of heterogeneous -systems, ranging from mobile devices such as phones -and tablets up to large-scale distributed systems of hundreds -of machines and thousands of computational devices such as -GPU cards. The system is flexible and can be used to express -a wide variety of algorithms, including training and inference -algorithms for deep neural network models, and it has been -used for conducting research and for deploying machine learning -systems into production across more than a dozen areas of -computer science and other fields, including speech recognition, -computer vision, robotics, information retrieval, natural -language processing, geographic information extraction, and -computational drug discovery. This paper describes the TensorFlow -interface and an implementation of that interface that -we have built at Google. The TensorFlow API and a reference -implementation were released as an open-source package under -the Apache 2.0 license in November, 2015 and are available at -www.tensorflow.org. 
- - -### In BibTeX format - -If you use TensorFlow in your research and would like to cite the TensorFlow -system, we suggest you cite this whitepaper. - -
-@misc{tensorflow2015-whitepaper,
-title={ {TensorFlow}: Large-Scale Machine Learning on Heterogeneous Systems},
-url={https://www.tensorflow.org/},
-note={Software available from tensorflow.org},
-author={
-    Mart\'{\i}n~Abadi and
-    Ashish~Agarwal and
-    Paul~Barham and
-    Eugene~Brevdo and
-    Zhifeng~Chen and
-    Craig~Citro and
-    Greg~S.~Corrado and
-    Andy~Davis and
-    Jeffrey~Dean and
-    Matthieu~Devin and
-    Sanjay~Ghemawat and
-    Ian~Goodfellow and
-    Andrew~Harp and
-    Geoffrey~Irving and
-    Michael~Isard and
-    Yangqing Jia and
-    Rafal~Jozefowicz and
-    Lukasz~Kaiser and
-    Manjunath~Kudlur and
-    Josh~Levenberg and
-    Dandelion~Man\'{e} and
-    Rajat~Monga and
-    Sherry~Moore and
-    Derek~Murray and
-    Chris~Olah and
-    Mike~Schuster and
-    Jonathon~Shlens and
-    Benoit~Steiner and
-    Ilya~Sutskever and
-    Kunal~Talwar and
-    Paul~Tucker and
-    Vincent~Vanhoucke and
-    Vijay~Vasudevan and
-    Fernanda~Vi\'{e}gas and
-    Oriol~Vinyals and
-    Pete~Warden and
-    Martin~Wattenberg and
-    Martin~Wicke and
-    Yuan~Yu and
-    Xiaoqiang~Zheng},
-  year={2015},
-}
-
- -Or in textual form: - -
-Martín Abadi, Ashish Agarwal, Paul Barham, Eugene Brevdo,
-Zhifeng Chen, Craig Citro, Greg S. Corrado, Andy Davis,
-Jeffrey Dean, Matthieu Devin, Sanjay Ghemawat, Ian Goodfellow,
-Andrew Harp, Geoffrey Irving, Michael Isard, Rafal Jozefowicz, Yangqing Jia,
-Lukasz Kaiser, Manjunath Kudlur, Josh Levenberg, Dan Mané, Mike Schuster,
-Rajat Monga, Sherry Moore, Derek Murray, Chris Olah, Jonathon Shlens,
-Benoit Steiner, Ilya Sutskever, Kunal Talwar, Paul Tucker,
-Vincent Vanhoucke, Vijay Vasudevan, Fernanda Viégas,
-Oriol Vinyals, Pete Warden, Martin Wattenberg, Martin Wicke,
-Yuan Yu, and Xiaoqiang Zheng.
-TensorFlow: Large-scale machine learning on heterogeneous systems,
-2015. Software available from tensorflow.org.
-
- - - -## TensorFlow: A System for Large-Scale Machine Learning - -[Access this white paper.](https://www.usenix.org/system/files/conference/osdi16/osdi16-abadi.pdf) - -**Abstract:** TensorFlow is a machine learning system that operates at -large scale and in heterogeneous environments. TensorFlow -uses dataflow graphs to represent computation, -shared state, and the operations that mutate that state. It -maps the nodes of a dataflow graph across many machines -in a cluster, and within a machine across multiple computational -devices, including multicore CPUs, general-purpose -GPUs, and custom-designed ASICs known as -Tensor Processing Units (TPUs). This architecture gives -flexibility to the application developer: whereas in previous -“parameter server” designs the management of shared -state is built into the system, TensorFlow enables developers -to experiment with novel optimizations and training algorithms. -TensorFlow supports a variety of applications, -with a focus on training and inference on deep neural networks. -Several Google services use TensorFlow in production, -we have released it as an open-source project, and -it has become widely used for machine learning research. -In this paper, we describe the TensorFlow dataflow model -and demonstrate the compelling performance that TensorFlow -achieves for several real-world applications. - diff --git a/tensorflow/docs_src/about/index.md b/tensorflow/docs_src/about/index.md deleted file mode 100644 index c3c13ff329..0000000000 --- a/tensorflow/docs_src/about/index.md +++ /dev/null @@ -1,11 +0,0 @@ -# About TensorFlow - -This section provides a few documents about TensorFlow itself, -including the following: - - * [TensorFlow in Use](../about/uses.md), which provides a link to our model zoo and - lists some popular ways that TensorFlow is being used. - * [TensorFlow White Papers](../about/bib.md), which provides abstracts of white papers - about TensorFlow.
- * [Attribution](../about/attribution.md), which specifies how to attribute and refer - to TensorFlow. diff --git a/tensorflow/docs_src/about/leftnav_files b/tensorflow/docs_src/about/leftnav_files deleted file mode 100644 index 63763b9d9c..0000000000 --- a/tensorflow/docs_src/about/leftnav_files +++ /dev/null @@ -1,4 +0,0 @@ -index.md -uses.md -bib.md -attribution.md diff --git a/tensorflow/docs_src/about/uses.md b/tensorflow/docs_src/about/uses.md deleted file mode 100644 index d3db98203e..0000000000 --- a/tensorflow/docs_src/about/uses.md +++ /dev/null @@ -1,68 +0,0 @@ -# TensorFlow In Use - -This page highlights TensorFlow models in real world use. - - -## Model zoo - -Please visit our collection of TensorFlow models in the -[TensorFlow Zoo](https://github.com/tensorflow/models). - -If you have built a model with TensorFlow, please consider publishing it in -the Zoo. - - -## Current uses - -This section describes some of the current uses of the TensorFlow system. - -> If you are using TensorFlow for research, for education, or for production -> usage in some product, we would love to add something about your usage here. -> Please feel free to [email us](mailto:usecases@tensorflow.org) a brief -> description of how you're using TensorFlow, or even better, send us a -> pull request to add an entry to this file. - -* **Deep Speech** -
    -
  • **Organization**: Mozilla
  • -
  • **Domain**: Speech Recognition
  • -
  • **Description**: A TensorFlow implementation motivated by Baidu's Deep Speech architecture.
  • -
  • **More info**: [GitHub Repo](https://github.com/mozilla/deepspeech)
  • -
- -* **RankBrain** -
    -
  • **Organization**: Google
  • -
  • **Domain**: Information Retrieval
  • -
  • **Description**: A large-scale deployment of deep neural nets for search ranking on www.google.com.
  • -
  • **More info**: ["Google Turning Over Its Lucrative Search to AI Machines"](http://www.bloomberg.com/news/articles/2015-10-26/google-turning-its-lucrative-web-search-over-to-ai-machines)
  • -
- -* **Inception Image Classification Model** -
    -
  • **Organization**: Google
  • -
  • **Description**: Baseline model and follow on research into highly accurate computer vision models, starting with the model that won the 2014 Imagenet image classification challenge
  • -
  • **More Info**: Baseline model described in [Arxiv paper](http://arxiv.org/abs/1409.4842)
  • -
- -* **SmartReply** -
    -
  • **Organization**: Google
  • -
  • **Description**: Deep LSTM model to automatically generate email responses
  • -
  • **More Info**: [Google research blog post](http://googleresearch.blogspot.com/2015/11/computer-respond-to-this-email.html)
  • -
- -* **Massively Multitask Networks for Drug Discovery** -
    -
  • **Organization**: Google and Stanford University
  • -
  • **Domain**: Drug discovery
  • -
  • **Description**: A deep neural network model for identifying promising drug candidates.
  • -
  • **More info**: [Arxiv paper](http://arxiv.org/abs/1502.02072)
  • -
- -* **On-Device Computer Vision for OCR** -
    -
  • **Organization**: Google
  • -
  • **Description**: On-device computer vision model to do optical character recognition to enable real-time translation.
  • -
  • **More info**: [Google Research blog post](http://googleresearch.blogspot.com/2015/07/how-google-translate-squeezes-deep.html)
  • -
diff --git a/tensorflow/docs_src/api_guides/cc/guide.md b/tensorflow/docs_src/api_guides/cc/guide.md deleted file mode 100644 index 2cd645afa7..0000000000 --- a/tensorflow/docs_src/api_guides/cc/guide.md +++ /dev/null @@ -1,301 +0,0 @@ -# C++ API - -Note: By default [tensorflow.org](https://www.tensorflow.org) shows docs for the -most recent stable version. The instructions in this doc require building from -source. You will probably want to build from the `master` version of tensorflow. -You should, as a result, be sure you are following the -[`master` version of this doc](https://www.tensorflow.org/versions/master/api_guides/cc/guide), -in case there have been any changes. - -Note: The C++ API is only designed to work with TensorFlow `bazel build`. -If you need a stand-alone option use the [C-api](../../install/install_c.md). -See [these instructions](https://docs.bazel.build/versions/master/external.html) -for details on how to include TensorFlow as a subproject (instead of building -your project from inside TensorFlow, as in this example). - -[TOC] - -TensorFlow's C++ API provides mechanisms for constructing and executing a data -flow graph. The API is designed to be simple and concise: graph operations are -clearly expressed using a "functional" construction style, including easy -specification of names, device placement, etc., and the resulting graph can be -efficiently run and the desired outputs fetched in a few lines of code. This -guide explains the basic concepts and data structures needed to get started with -TensorFlow graph construction and execution in C++. - -## The Basics - -Let's start with a simple example that illustrates graph construction and -execution using the C++ API. 
- -```c++ -// tensorflow/cc/example/example.cc - -#include "tensorflow/cc/client/client_session.h" -#include "tensorflow/cc/ops/standard_ops.h" -#include "tensorflow/core/framework/tensor.h" - -int main() { - using namespace tensorflow; - using namespace tensorflow::ops; - Scope root = Scope::NewRootScope(); - // Matrix A = [3 2; -1 0] - auto A = Const(root, { {3.f, 2.f}, {-1.f, 0.f} }); - // Vector b = [3 5] - auto b = Const(root, { {3.f, 5.f} }); - // v = Ab^T - auto v = MatMul(root.WithOpName("v"), A, b, MatMul::TransposeB(true)); - std::vector<Tensor> outputs; - ClientSession session(root); - // Run and fetch v - TF_CHECK_OK(session.Run({v}, &outputs)); - // Expect outputs[0] == [19; -3] - LOG(INFO) << outputs[0].matrix<float>(); - return 0; -} -``` - -Place this example code in the file `tensorflow/cc/example/example.cc` inside a -clone of the -TensorFlow -[github repository](http://www.github.com/tensorflow/tensorflow). Also place a -`BUILD` file in the same directory with the following contents: - -```python -load("//tensorflow:tensorflow.bzl", "tf_cc_binary") - -tf_cc_binary( - name = "example", - srcs = ["example.cc"], - deps = [ - "//tensorflow/cc:cc_ops", - "//tensorflow/cc:client_session", - "//tensorflow/core:tensorflow", - ], -) -``` - -Use `tf_cc_binary` rather than Bazel's native `cc_binary` to link in necessary -symbols from `libtensorflow_framework.so`. You should be able to build and run -the example using the following command (be sure to run `./configure` in your -build sandbox first): - -```shell -bazel run -c opt //tensorflow/cc/example:example -``` - -This example shows some of the important features of the C++ API such as the -following: - -* Constructing tensor constants from C++ nested initializer lists -* Constructing and naming of TensorFlow operations -* Specifying optional attributes to operation constructors -* Executing and fetching the tensor values from the TensorFlow session. - -We will delve into the details of each below.
- -## Graph Construction - -### Scope - -`tensorflow::Scope` is the main data structure that holds the current state -of graph construction. A `Scope` acts as a handle to the graph being -constructed, as well as storing TensorFlow operation properties. The `Scope` -object is the first argument to operation constructors, and operations that use -a given `Scope` as their first argument inherit that `Scope`'s properties, such -as a common name prefix. Multiple `Scope`s can refer to the same graph, as -explained further below. - -Create a new `Scope` object by calling `Scope::NewRootScope`. This creates -some resources such as a graph to which operations are added. It also creates a -`tensorflow::Status` object which will be used to indicate errors encountered -when constructing operations. The `Scope` class has value semantics, thus, a -`Scope` object can be freely copied and passed around. - -The `Scope` object returned by `Scope::NewRootScope` is referred -to as the root scope. "Child" scopes can be constructed from the root scope by -calling various member functions of the `Scope` class, thus forming a hierarchy -of scopes. A child scope inherits all of the properties of the parent scope and -typically has one property added or changed. For instance, `NewSubScope(name)` -appends `name` to the prefix of names for operations created using the returned -`Scope` object. - -Here are some of the properties controlled by a `Scope` object: - -* Operation names -* Set of control dependencies for an operation -* Device placement for an operation -* Kernel attribute for an operation - -Please refer to `tensorflow::Scope` for the complete list of member functions -that let you create child scopes with new properties. - -### Operation Constructors - -You can create graph operations with operation constructors, one C++ class per -TensorFlow operation. 
Unlike the Python API which uses snake-case to name the -operation constructors, the C++ API uses camel-case to conform to C++ coding -style. For instance, the `MatMul` operation has a C++ class with the same name. - -Using this class-per-operation method, it is possible, though not recommended, -to construct an operation as follows: - -```c++ -// Not recommended -MatMul m(scope, a, b); -``` - -Instead, we recommend the following "functional" style for constructing -operations: - -```c++ -// Recommended -auto m = MatMul(scope, a, b); -``` - -The first parameter for all operation constructors is always a `Scope` object. -Tensor inputs and mandatory attributes form the rest of the arguments. - -For optional arguments, constructors have an optional parameter that allows -optional attributes. For operations with optional arguments, the constructor's -last optional parameter is a `struct` type called `[operation]::Attrs` that -contains data members for each optional attribute. You can construct such -`Attrs` in multiple ways: - -* You can specify a single optional attribute by constructing an `Attrs` object -using the `static` functions provided in the C++ class for the operation. For -example: - -```c++ -auto m = MatMul(scope, a, b, MatMul::TransposeA(true)); -``` - -* You can specify multiple optional attributes by chaining together functions - available in the `Attrs` struct. For example: - -```c++ -auto m = MatMul(scope, a, b, MatMul::TransposeA(true).TransposeB(true)); - -// Or, alternatively -auto m = MatMul(scope, a, b, MatMul::Attrs().TransposeA(true).TransposeB(true)); -``` - -The arguments and return values of operations are handled in different ways -depending on their type: - -* For operations that return single tensors, the object returned by - the operation object can be passed directly to other operation - constructors.
For example: - -```c++ -auto m = MatMul(scope, x, W); -auto sum = Add(scope, m, bias); -``` - -* For operations producing multiple outputs, the object returned by the - operation constructor has a member for each of the outputs. The names of those - members are identical to the names present in the `OpDef` for the - operation. For example: - -```c++ -auto u = Unique(scope, a); -// u.y has the unique values and u.idx has the unique indices -auto m = Add(scope, u.y, b); -``` - -* Operations producing a list-typed output return an object that can - be indexed using the `[]` operator. That object can also be directly passed to - other constructors that expect list-typed inputs. For example: - -```c++ -auto s = Split(scope, 0, a, 2); -// Access elements of the returned list. -auto b = Add(scope, s[0], s[1]); -// Pass the list as a whole to other constructors. -auto c = Concat(scope, s, 0); -``` - -### Constants - -You may pass many different types of C++ values directly to tensor -constants. You may explicitly create a tensor constant by calling the -`tensorflow::ops::Const` function from various kinds of C++ values. For -example: - -* Scalars - -```c++ -auto f = Const(scope, 42.0f); -auto s = Const(scope, "hello world!"); -``` - -* Nested initializer lists - -```c++ -// 2x2 matrix -auto c1 = Const(scope, { {1, 2}, {2, 4} }); -// 1x3x1 tensor -auto c2 = Const(scope, { { {1}, {2}, {3} } }); -// 1x2x0 tensor -auto c3 = ops::Const<float>(scope, { { {}, {} } }); -``` - -* Shapes explicitly specified - -```c++ -// 2x2 matrix with all elements = 10 -auto c1 = Const(scope, 10, /* shape */ {2, 2}); -// 1x3x2x1 tensor -auto c2 = Const(scope, {1, 2, 3, 4, 5, 6}, /* shape */ {1, 3, 2, 1}); -``` - -You may directly pass constants to other operation constructors, either by -explicitly constructing one using the `Const` function, or implicitly as any of -the above types of C++ values.
For example: - -```c++ -// [1 1] * [41; 1] -auto x = MatMul(scope, { {1, 1} }, { {41}, {1} }); -// [1 2 3 4] + 10 -auto y = Add(scope, {1, 2, 3, 4}, 10); -``` - -## Graph Execution - -When executing a graph, you will need a session. The C++ API provides a -`tensorflow::ClientSession` class that will execute ops created by the -operation constructors. TensorFlow will automatically determine which parts of -the graph need to be executed, and what values need feeding. For example: - -```c++ -Scope root = Scope::NewRootScope(); -auto c = Const(root, { {1, 1} }); -auto m = MatMul(root, c, { {42}, {1} }); - -ClientSession session(root); -std::vector<Tensor> outputs; -session.Run({m}, &outputs); -// outputs[0] == {43} -``` - -Similarly, the object returned by the operation constructor can be used as the -argument to specify a value being fed when executing the graph. Furthermore, the -value to feed can be specified with the different kinds of C++ values used to -specify tensor constants. For example: - -```c++ -Scope root = Scope::NewRootScope(); -auto a = Placeholder(root, DT_INT32); -// [3 3; 3 3] -auto b = Const(root, 3, {2, 2}); -auto c = Add(root, a, b); -ClientSession session(root); -std::vector<Tensor> outputs; - -// Feed a <- [1 2; 3 4] -session.Run({ {a, { {1, 2}, {3, 4} } } }, {c}, &outputs); -// outputs[0] == [4 5; 6 7] -``` - -Please see the `tensorflow::Tensor` documentation for more information on how -to use the execution output. diff --git a/tensorflow/docs_src/api_guides/python/array_ops.md b/tensorflow/docs_src/api_guides/python/array_ops.md deleted file mode 100644 index ddeea80c56..0000000000 --- a/tensorflow/docs_src/api_guides/python/array_ops.md +++ /dev/null @@ -1,87 +0,0 @@ -# Tensor Transformations - -Note: Functions taking `Tensor` arguments can also take anything accepted by -`tf.convert_to_tensor`. - -[TOC] - -## Casting - -TensorFlow provides several operations that you can use to cast tensor data -types in your graph.
- -* `tf.string_to_number` -* `tf.to_double` -* `tf.to_float` -* `tf.to_bfloat16` -* `tf.to_int32` -* `tf.to_int64` -* `tf.cast` -* `tf.bitcast` -* `tf.saturate_cast` - -## Shapes and Shaping - -TensorFlow provides several operations that you can use to determine the shape -of a tensor and change the shape of a tensor. - -* `tf.broadcast_dynamic_shape` -* `tf.broadcast_static_shape` -* `tf.shape` -* `tf.shape_n` -* `tf.size` -* `tf.rank` -* `tf.reshape` -* `tf.squeeze` -* `tf.expand_dims` -* `tf.meshgrid` - -## Slicing and Joining - -TensorFlow provides several operations to slice or extract parts of a tensor, -or join multiple tensors together. - -* `tf.slice` -* `tf.strided_slice` -* `tf.split` -* `tf.tile` -* `tf.pad` -* `tf.concat` -* `tf.stack` -* `tf.parallel_stack` -* `tf.unstack` -* `tf.reverse_sequence` -* `tf.reverse` -* `tf.reverse_v2` -* `tf.transpose` -* `tf.extract_image_patches` -* `tf.space_to_batch_nd` -* `tf.space_to_batch` -* `tf.required_space_to_batch_paddings` -* `tf.batch_to_space_nd` -* `tf.batch_to_space` -* `tf.space_to_depth` -* `tf.depth_to_space` -* `tf.gather` -* `tf.gather_nd` -* `tf.unique_with_counts` -* `tf.scatter_nd` -* `tf.dynamic_partition` -* `tf.dynamic_stitch` -* `tf.boolean_mask` -* `tf.one_hot` -* `tf.sequence_mask` -* `tf.dequantize` -* `tf.quantize_v2` -* `tf.quantized_concat` -* `tf.setdiff1d` - -## Fake quantization -Operations used to help train for better quantization accuracy. 
- -* `tf.fake_quant_with_min_max_args` -* `tf.fake_quant_with_min_max_args_gradient` -* `tf.fake_quant_with_min_max_vars` -* `tf.fake_quant_with_min_max_vars_gradient` -* `tf.fake_quant_with_min_max_vars_per_channel` -* `tf.fake_quant_with_min_max_vars_per_channel_gradient` diff --git a/tensorflow/docs_src/api_guides/python/check_ops.md b/tensorflow/docs_src/api_guides/python/check_ops.md deleted file mode 100644 index b52fdaa3ab..0000000000 --- a/tensorflow/docs_src/api_guides/python/check_ops.md +++ /dev/null @@ -1,19 +0,0 @@ -# Asserts and boolean checks - -* `tf.assert_negative` -* `tf.assert_positive` -* `tf.assert_proper_iterable` -* `tf.assert_non_negative` -* `tf.assert_non_positive` -* `tf.assert_equal` -* `tf.assert_integer` -* `tf.assert_less` -* `tf.assert_less_equal` -* `tf.assert_greater` -* `tf.assert_greater_equal` -* `tf.assert_rank` -* `tf.assert_rank_at_least` -* `tf.assert_type` -* `tf.is_non_decreasing` -* `tf.is_numeric_tensor` -* `tf.is_strictly_increasing` diff --git a/tensorflow/docs_src/api_guides/python/client.md b/tensorflow/docs_src/api_guides/python/client.md deleted file mode 100644 index fdd48e66dc..0000000000 --- a/tensorflow/docs_src/api_guides/python/client.md +++ /dev/null @@ -1,36 +0,0 @@ -# Running Graphs -[TOC] - -This library contains classes for launching graphs and executing operations. - -[This guide](../../guide/low_level_intro.md) has examples of how a graph -is launched in a `tf.Session`. 
- -## Session management - -* `tf.Session` -* `tf.InteractiveSession` -* `tf.get_default_session` - -## Error classes and convenience functions - -* `tf.OpError` -* `tf.errors.CancelledError` -* `tf.errors.UnknownError` -* `tf.errors.InvalidArgumentError` -* `tf.errors.DeadlineExceededError` -* `tf.errors.NotFoundError` -* `tf.errors.AlreadyExistsError` -* `tf.errors.PermissionDeniedError` -* `tf.errors.UnauthenticatedError` -* `tf.errors.ResourceExhaustedError` -* `tf.errors.FailedPreconditionError` -* `tf.errors.AbortedError` -* `tf.errors.OutOfRangeError` -* `tf.errors.UnimplementedError` -* `tf.errors.InternalError` -* `tf.errors.UnavailableError` -* `tf.errors.DataLossError` -* `tf.errors.exception_type_from_error_code` -* `tf.errors.error_code_from_exception_type` -* `tf.errors.raise_exception_on_not_ok_status` diff --git a/tensorflow/docs_src/api_guides/python/constant_op.md b/tensorflow/docs_src/api_guides/python/constant_op.md deleted file mode 100644 index 9ba95b0f55..0000000000 --- a/tensorflow/docs_src/api_guides/python/constant_op.md +++ /dev/null @@ -1,87 +0,0 @@ -# Constants, Sequences, and Random Values - -Note: Functions taking `Tensor` arguments can also take anything accepted by -`tf.convert_to_tensor`. - -[TOC] - -## Constant Value Tensors - -TensorFlow provides several operations that you can use to generate constants. - -* `tf.zeros` -* `tf.zeros_like` -* `tf.ones` -* `tf.ones_like` -* `tf.fill` -* `tf.constant` - -## Sequences - -* `tf.linspace` -* `tf.range` - -## Random Tensors - -TensorFlow has several ops that create random tensors with different -distributions. The random ops are stateful, and create new random values each -time they are evaluated. - -The `seed` keyword argument in these functions acts in conjunction with -the graph-level random seed. Changing either the graph-level seed using -`tf.set_random_seed` or the -op-level seed will change the underlying seed of these operations. 
Setting -neither graph-level nor op-level seed, results in a random seed for all -operations. -See `tf.set_random_seed` -for details on the interaction between operation-level and graph-level random -seeds. - -### Examples: - -```python -# Create a tensor of shape [2, 3] consisting of random normal values, with mean -# -1 and standard deviation 4. -norm = tf.random_normal([2, 3], mean=-1, stddev=4) - -# Shuffle the first dimension of a tensor -c = tf.constant([[1, 2], [3, 4], [5, 6]]) -shuff = tf.random_shuffle(c) - -# Each time we run these ops, different results are generated -sess = tf.Session() -print(sess.run(norm)) -print(sess.run(norm)) - -# Set an op-level seed to generate repeatable sequences across sessions. -norm = tf.random_normal([2, 3], seed=1234) -sess = tf.Session() -print(sess.run(norm)) -print(sess.run(norm)) -sess = tf.Session() -print(sess.run(norm)) -print(sess.run(norm)) -``` - -Another common use of random values is the initialization of variables. Also see -the [Variables How To](../../guide/variables.md). - -```python -# Use random uniform values in [0, 1) as the initializer for a variable of shape -# [2, 3]. The default type is float32. -var = tf.Variable(tf.random_uniform([2, 3]), name="var") -init = tf.global_variables_initializer() - -sess = tf.Session() -sess.run(init) -print(sess.run(var)) -``` - -* `tf.random_normal` -* `tf.truncated_normal` -* `tf.random_uniform` -* `tf.random_shuffle` -* `tf.random_crop` -* `tf.multinomial` -* `tf.random_gamma` -* `tf.set_random_seed` diff --git a/tensorflow/docs_src/api_guides/python/contrib.crf.md b/tensorflow/docs_src/api_guides/python/contrib.crf.md deleted file mode 100644 index a544f136b3..0000000000 --- a/tensorflow/docs_src/api_guides/python/contrib.crf.md +++ /dev/null @@ -1,11 +0,0 @@ -# CRF (contrib) - -Linear-chain CRF layer. 
- -* `tf.contrib.crf.crf_sequence_score` -* `tf.contrib.crf.crf_log_norm` -* `tf.contrib.crf.crf_log_likelihood` -* `tf.contrib.crf.crf_unary_score` -* `tf.contrib.crf.crf_binary_score` -* `tf.contrib.crf.CrfForwardRnnCell` -* `tf.contrib.crf.viterbi_decode` diff --git a/tensorflow/docs_src/api_guides/python/contrib.ffmpeg.md b/tensorflow/docs_src/api_guides/python/contrib.ffmpeg.md deleted file mode 100644 index 7df7547131..0000000000 --- a/tensorflow/docs_src/api_guides/python/contrib.ffmpeg.md +++ /dev/null @@ -1,23 +0,0 @@ -# FFmpeg (contrib) -[TOC] - -## Encoding and decoding audio using FFmpeg - -TensorFlow provides Ops to decode and encode audio files using the -[FFmpeg](https://www.ffmpeg.org/) library. FFmpeg must be -locally [installed](https://ffmpeg.org/download.html) for these Ops to succeed. - -Example: - -```python -from tensorflow.contrib import ffmpeg - -audio_binary = tf.read_file('song.mp3') -waveform = ffmpeg.decode_audio( - audio_binary, file_format='mp3', samples_per_second=44100, channel_count=2) -uncompressed_binary = ffmpeg.encode_audio( - waveform, file_format='wav', samples_per_second=44100) -``` - -* `tf.contrib.ffmpeg.decode_audio` -* `tf.contrib.ffmpeg.encode_audio` diff --git a/tensorflow/docs_src/api_guides/python/contrib.framework.md b/tensorflow/docs_src/api_guides/python/contrib.framework.md deleted file mode 100644 index 00fb8b0ac3..0000000000 --- a/tensorflow/docs_src/api_guides/python/contrib.framework.md +++ /dev/null @@ -1,64 +0,0 @@ -# Framework (contrib) -[TOC] - -Framework utilities. 
- -* `tf.contrib.framework.assert_same_float_dtype` -* `tf.contrib.framework.assert_scalar` -* `tf.contrib.framework.assert_scalar_int` -* `tf.convert_to_tensor_or_sparse_tensor` -* `tf.contrib.framework.get_graph_from_inputs` -* `tf.is_numeric_tensor` -* `tf.is_non_decreasing` -* `tf.is_strictly_increasing` -* `tf.contrib.framework.is_tensor` -* `tf.contrib.framework.reduce_sum_n` -* `tf.contrib.framework.remove_squeezable_dimensions` -* `tf.contrib.framework.with_shape` -* `tf.contrib.framework.with_same_shape` - -## Deprecation - -* `tf.contrib.framework.deprecated` -* `tf.contrib.framework.deprecated_args` -* `tf.contrib.framework.deprecated_arg_values` - -## Arg_Scope - -* `tf.contrib.framework.arg_scope` -* `tf.contrib.framework.add_arg_scope` -* `tf.contrib.framework.has_arg_scope` -* `tf.contrib.framework.arg_scoped_arguments` - -## Variables - -* `tf.contrib.framework.add_model_variable` -* `tf.train.assert_global_step` -* `tf.contrib.framework.assert_or_get_global_step` -* `tf.contrib.framework.assign_from_checkpoint` -* `tf.contrib.framework.assign_from_checkpoint_fn` -* `tf.contrib.framework.assign_from_values` -* `tf.contrib.framework.assign_from_values_fn` -* `tf.contrib.framework.create_global_step` -* `tf.contrib.framework.filter_variables` -* `tf.train.get_global_step` -* `tf.contrib.framework.get_or_create_global_step` -* `tf.contrib.framework.get_local_variables` -* `tf.contrib.framework.get_model_variables` -* `tf.contrib.framework.get_unique_variable` -* `tf.contrib.framework.get_variables_by_name` -* `tf.contrib.framework.get_variables_by_suffix` -* `tf.contrib.framework.get_variables_to_restore` -* `tf.contrib.framework.get_variables` -* `tf.contrib.framework.local_variable` -* `tf.contrib.framework.model_variable` -* `tf.contrib.framework.variable` -* `tf.contrib.framework.VariableDeviceChooser` -* `tf.contrib.framework.zero_initializer` - -## Checkpoint utilities - -* `tf.contrib.framework.load_checkpoint` -* 
`tf.contrib.framework.list_variables` -* `tf.contrib.framework.load_variable` -* `tf.contrib.framework.init_from_checkpoint` diff --git a/tensorflow/docs_src/api_guides/python/contrib.graph_editor.md b/tensorflow/docs_src/api_guides/python/contrib.graph_editor.md deleted file mode 100644 index 8ce49b952b..0000000000 --- a/tensorflow/docs_src/api_guides/python/contrib.graph_editor.md +++ /dev/null @@ -1,177 +0,0 @@ -# Graph Editor (contrib) -[TOC] - -TensorFlow Graph Editor. - -The TensorFlow Graph Editor library allows for modification of an existing -`tf.Graph` instance in-place. - -The author's github username is [purpledog](https://github.com/purpledog). - -## Library overview - -Appending new nodes is the only graph editing operation allowed by the -TensorFlow core library. The Graph Editor library is an attempt to allow for -other kinds of editing operations, namely, *rerouting* and *transforming*. - -* *rerouting* is a local operation consisting in re-plugging existing tensors - (the edges of the graph). Operations (the nodes) are not modified by this - operation. For example, rerouting can be used to insert an operation adding - noise in place of an existing tensor. -* *transforming* is a global operation consisting in transforming a graph into - another. By default, a transformation is a simple copy but it can be - customized to achieve other goals. For instance, a graph can be transformed - into another one in which noise is added after all the operations of a - specific type. - -**Important: modifying a graph in-place with the Graph Editor must be done -`offline`, that is, without any active sessions.** - -Of course new operations can be appended online but Graph Editor specific -operations like rerouting and transforming can currently only be done offline. - -Here is an example of what you **cannot** do: - -* Build a graph. -* Create a session and run the graph. -* Modify the graph with the Graph Editor.
-* Re-run the graph with the `same` previously created session. - -To edit an already running graph, follow these steps: - -* Build a graph. -* Create a session and run the graph. -* Save the graph state and terminate the session -* Modify the graph with the Graph Editor. -* create a new session and restore the graph state -* Re-run the graph with the newly created session. - -Note that this procedure is very costly because a new session must be created -after any modifications. Among other things, it takes time because the entire -graph state must be saved and restored again. - -## Sub-graph - -Most of the functions in the Graph Editor library operate on *sub-graph*. -More precisely, they take as input arguments instances of the SubGraphView class -(or anything which can be converted to it). Doing so allows the same function -to transparently operate on single operations as well as sub-graph of any size. - -A subgraph can be created in several ways: - -* using a list of ops: - - ```python - my_sgv = ge.sgv(ops) - ``` - -* from a name scope: - - ```python - my_sgv = ge.sgv_scope("foo/bar", graph=tf.get_default_graph()) - ``` - -* using regular expression: - - ```python - my_sgv = ge.sgv("foo/.*/.*read$", graph=tf.get_default_graph()) - ``` - -Note that the Graph Editor is meant to manipulate several graphs at the same -time, typically during transform or copy operation. For that reason, -to avoid any confusion, the default graph is never used and the graph on -which to operate must always be given explicitly. This is the reason why -*`graph=tf.get_default_graph()`* is used in the code snippets above. - -## Modules overview - -* util: utility functions. -* select: various selection methods of TensorFlow tensors and operations. -* match: TensorFlow graph matching. Think of this as regular expressions for - graphs (but not quite yet). -* reroute: various ways of rerouting tensors to different consuming ops like - *swap* or *reroute_a2b*. 
-* subgraph: the SubGraphView class, which enables subgraph manipulations in a - TensorFlow `tf.Graph`. -* edit: various editing functions operating on subgraphs like *detach*, - *connect* or *bypass*. -* transform: the Transformer class, which enables transforming - (or simply copying) a subgraph into another one. - -## Module: util - -* `tf.contrib.graph_editor.make_list_of_op` -* `tf.contrib.graph_editor.get_tensors` -* `tf.contrib.graph_editor.make_list_of_t` -* `tf.contrib.graph_editor.get_generating_ops` -* `tf.contrib.graph_editor.get_consuming_ops` -* `tf.contrib.graph_editor.ControlOutputs` -* `tf.contrib.graph_editor.placeholder_name` -* `tf.contrib.graph_editor.make_placeholder_from_tensor` -* `tf.contrib.graph_editor.make_placeholder_from_dtype_and_shape` - -## Module: select - -* `tf.contrib.graph_editor.filter_ts` -* `tf.contrib.graph_editor.filter_ts_from_regex` -* `tf.contrib.graph_editor.filter_ops` -* `tf.contrib.graph_editor.filter_ops_from_regex` -* `tf.contrib.graph_editor.get_name_scope_ops` -* `tf.contrib.graph_editor.check_cios` -* `tf.contrib.graph_editor.get_ops_ios` -* `tf.contrib.graph_editor.compute_boundary_ts` -* `tf.contrib.graph_editor.get_within_boundary_ops` -* `tf.contrib.graph_editor.get_forward_walk_ops` -* `tf.contrib.graph_editor.get_backward_walk_ops` -* `tf.contrib.graph_editor.get_walks_intersection_ops` -* `tf.contrib.graph_editor.get_walks_union_ops` -* `tf.contrib.graph_editor.select_ops` -* `tf.contrib.graph_editor.select_ts` -* `tf.contrib.graph_editor.select_ops_and_ts` - -## Module: subgraph - -* `tf.contrib.graph_editor.SubGraphView` -* `tf.contrib.graph_editor.make_view` -* `tf.contrib.graph_editor.make_view_from_scope` - -## Module: reroute - -* `tf.contrib.graph_editor.swap_ts` -* `tf.contrib.graph_editor.reroute_ts` -* `tf.contrib.graph_editor.swap_inputs` -* `tf.contrib.graph_editor.reroute_inputs` -* `tf.contrib.graph_editor.swap_outputs` -* `tf.contrib.graph_editor.reroute_outputs` -* 
`tf.contrib.graph_editor.swap_ios` -* `tf.contrib.graph_editor.reroute_ios` -* `tf.contrib.graph_editor.remove_control_inputs` -* `tf.contrib.graph_editor.add_control_inputs` - -## Module: edit - -* `tf.contrib.graph_editor.detach_control_inputs` -* `tf.contrib.graph_editor.detach_control_outputs` -* `tf.contrib.graph_editor.detach_inputs` -* `tf.contrib.graph_editor.detach_outputs` -* `tf.contrib.graph_editor.detach` -* `tf.contrib.graph_editor.connect` -* `tf.contrib.graph_editor.bypass` - -## Module: transform - -* `tf.contrib.graph_editor.replace_t_with_placeholder_handler` -* `tf.contrib.graph_editor.keep_t_if_possible_handler` -* `tf.contrib.graph_editor.assign_renamed_collections_handler` -* `tf.contrib.graph_editor.transform_op_if_inside_handler` -* `tf.contrib.graph_editor.copy_op_handler` -* `tf.contrib.graph_editor.Transformer` -* `tf.contrib.graph_editor.copy` -* `tf.contrib.graph_editor.copy_with_input_replacements` -* `tf.contrib.graph_editor.graph_replace` - -## Useful aliases - -* `tf.contrib.graph_editor.ph` -* `tf.contrib.graph_editor.sgv` -* `tf.contrib.graph_editor.sgv_scope` diff --git a/tensorflow/docs_src/api_guides/python/contrib.integrate.md b/tensorflow/docs_src/api_guides/python/contrib.integrate.md deleted file mode 100644 index a70d202ab5..0000000000 --- a/tensorflow/docs_src/api_guides/python/contrib.integrate.md +++ /dev/null @@ -1,41 +0,0 @@ -# Integrate (contrib) -[TOC] - -Integration and ODE solvers for TensorFlow. 
- -## Example: Lorenz attractor - -We can use `odeint` to solve the -[Lorenz system](https://en.wikipedia.org/wiki/Lorenz_system) of ordinary -differential equations, a prototypical example of chaotic dynamics: - -```python -rho = 28.0 -sigma = 10.0 -beta = 8.0/3.0 - -def lorenz_equation(state, t): - x, y, z = tf.unstack(state) - dx = sigma * (y - x) - dy = x * (rho - z) - y - dz = x * y - beta * z - return tf.stack([dx, dy, dz]) - -init_state = tf.constant([0, 2, 20], dtype=tf.float64) -t = np.linspace(0, 50, num=5000) -tensor_state, tensor_info = tf.contrib.integrate.odeint( - lorenz_equation, init_state, t, full_output=True) - -sess = tf.Session() -state, info = sess.run([tensor_state, tensor_info]) -x, y, z = state.T -plt.plot(x, z) -``` - -
- -
- -## Ops - -* `tf.contrib.integrate.odeint` diff --git a/tensorflow/docs_src/api_guides/python/contrib.layers.md b/tensorflow/docs_src/api_guides/python/contrib.layers.md deleted file mode 100644 index 4c176a129c..0000000000 --- a/tensorflow/docs_src/api_guides/python/contrib.layers.md +++ /dev/null @@ -1,109 +0,0 @@ -# Layers (contrib) -[TOC] - -Ops for building neural network layers, regularizers, summaries, etc. - -## Higher level ops for building neural network layers - -This package provides several ops that take care of creating variables that are -used internally in a consistent way and provide the building blocks for many -common machine learning algorithms. - -* `tf.contrib.layers.avg_pool2d` -* `tf.contrib.layers.batch_norm` -* `tf.contrib.layers.convolution2d` -* `tf.contrib.layers.conv2d_in_plane` -* `tf.contrib.layers.convolution2d_in_plane` -* `tf.nn.conv2d_transpose` -* `tf.contrib.layers.convolution2d_transpose` -* `tf.nn.dropout` -* `tf.contrib.layers.flatten` -* `tf.contrib.layers.fully_connected` -* `tf.contrib.layers.layer_norm` -* `tf.contrib.layers.max_pool2d` -* `tf.contrib.layers.one_hot_encoding` -* `tf.nn.relu` -* `tf.nn.relu6` -* `tf.contrib.layers.repeat` -* `tf.contrib.layers.safe_embedding_lookup_sparse` -* `tf.nn.separable_conv2d` -* `tf.contrib.layers.separable_convolution2d` -* `tf.nn.softmax` -* `tf.stack` -* `tf.contrib.layers.unit_norm` -* `tf.contrib.layers.embed_sequence` - -Aliases for fully_connected which set a default activation function are -available: `relu`, `relu6` and `linear`. - -`stack` operation is also available. It builds a stack of layers by applying -a layer repeatedly. - -## Regularizers - -Regularization can help prevent overfitting. These have the signature -`fn(weights)`. The loss is typically added to -`tf.GraphKeys.REGULARIZATION_LOSSES`. 
- -* `tf.contrib.layers.apply_regularization` -* `tf.contrib.layers.l1_regularizer` -* `tf.contrib.layers.l2_regularizer` -* `tf.contrib.layers.sum_regularizer` - -## Initializers - -Initializers are used to initialize variables with sensible values given their -size, data type, and purpose. - -* `tf.contrib.layers.xavier_initializer` -* `tf.contrib.layers.xavier_initializer_conv2d` -* `tf.contrib.layers.variance_scaling_initializer` - -## Optimization - -Optimize weights given a loss. - -* `tf.contrib.layers.optimize_loss` - -## Summaries - -Helper functions to summarize specific variables or ops. - -* `tf.contrib.layers.summarize_activation` -* `tf.contrib.layers.summarize_tensor` -* `tf.contrib.layers.summarize_tensors` -* `tf.contrib.layers.summarize_collection` - -The layers module defines convenience functions `summarize_variables`, -`summarize_weights` and `summarize_biases`, which set the `collection` argument -of `summarize_collection` to `VARIABLES`, `WEIGHTS` and `BIASES`, respectively. - -* `tf.contrib.layers.summarize_activations` - -## Feature columns - -Feature columns provide a mechanism to map data to a model. 
- -* `tf.contrib.layers.bucketized_column` -* `tf.contrib.layers.check_feature_columns` -* `tf.contrib.layers.create_feature_spec_for_parsing` -* `tf.contrib.layers.crossed_column` -* `tf.contrib.layers.embedding_column` -* `tf.contrib.layers.scattered_embedding_column` -* `tf.contrib.layers.input_from_feature_columns` -* `tf.contrib.layers.joint_weighted_sum_from_feature_columns` -* `tf.contrib.layers.make_place_holder_tensors_for_base_features` -* `tf.contrib.layers.multi_class_target` -* `tf.contrib.layers.one_hot_column` -* `tf.contrib.layers.parse_feature_columns_from_examples` -* `tf.contrib.layers.parse_feature_columns_from_sequence_examples` -* `tf.contrib.layers.real_valued_column` -* `tf.contrib.layers.shared_embedding_columns` -* `tf.contrib.layers.sparse_column_with_hash_bucket` -* `tf.contrib.layers.sparse_column_with_integerized_feature` -* `tf.contrib.layers.sparse_column_with_keys` -* `tf.contrib.layers.sparse_column_with_vocabulary_file` -* `tf.contrib.layers.weighted_sparse_column` -* `tf.contrib.layers.weighted_sum_from_feature_columns` -* `tf.contrib.layers.infer_real_valued_columns` -* `tf.contrib.layers.sequence_input_from_feature_columns` diff --git a/tensorflow/docs_src/api_guides/python/contrib.learn.md b/tensorflow/docs_src/api_guides/python/contrib.learn.md deleted file mode 100644 index 635849ead5..0000000000 --- a/tensorflow/docs_src/api_guides/python/contrib.learn.md +++ /dev/null @@ -1,63 +0,0 @@ -# Learn (contrib) -[TOC] - -High level API for learning with TensorFlow. - -## Estimators - -Train and evaluate TensorFlow models. 
- -* `tf.contrib.learn.BaseEstimator` -* `tf.contrib.learn.Estimator` -* `tf.contrib.learn.Trainable` -* `tf.contrib.learn.Evaluable` -* `tf.contrib.learn.KMeansClustering` -* `tf.contrib.learn.ModeKeys` -* `tf.contrib.learn.ModelFnOps` -* `tf.contrib.learn.MetricSpec` -* `tf.contrib.learn.PredictionKey` -* `tf.contrib.learn.DNNClassifier` -* `tf.contrib.learn.DNNRegressor` -* `tf.contrib.learn.DNNLinearCombinedRegressor` -* `tf.contrib.learn.DNNLinearCombinedClassifier` -* `tf.contrib.learn.LinearClassifier` -* `tf.contrib.learn.LinearRegressor` -* `tf.contrib.learn.LogisticRegressor` - -## Distributed training utilities - -* `tf.contrib.learn.Experiment` -* `tf.contrib.learn.ExportStrategy` -* `tf.contrib.learn.TaskType` - -## Graph actions - -Perform various training, evaluation, and inference actions on a graph. - -* `tf.train.NanLossDuringTrainingError` -* `tf.contrib.learn.RunConfig` -* `tf.contrib.learn.evaluate` -* `tf.contrib.learn.infer` -* `tf.contrib.learn.run_feeds` -* `tf.contrib.learn.run_n` -* `tf.contrib.learn.train` - -## Input processing - -Queue and read batched input data. 
- -* `tf.contrib.learn.extract_dask_data` -* `tf.contrib.learn.extract_dask_labels` -* `tf.contrib.learn.extract_pandas_data` -* `tf.contrib.learn.extract_pandas_labels` -* `tf.contrib.learn.extract_pandas_matrix` -* `tf.contrib.learn.infer_real_valued_columns_from_input` -* `tf.contrib.learn.infer_real_valued_columns_from_input_fn` -* `tf.contrib.learn.read_batch_examples` -* `tf.contrib.learn.read_batch_features` -* `tf.contrib.learn.read_batch_record_features` - -## Export utilities - -* `tf.contrib.learn.build_parsing_serving_input_fn` -* `tf.contrib.learn.ProblemType` diff --git a/tensorflow/docs_src/api_guides/python/contrib.linalg.md b/tensorflow/docs_src/api_guides/python/contrib.linalg.md deleted file mode 100644 index 3055449dc2..0000000000 --- a/tensorflow/docs_src/api_guides/python/contrib.linalg.md +++ /dev/null @@ -1,30 +0,0 @@ -# Linear Algebra (contrib) -[TOC] - -Linear algebra libraries for TensorFlow. - -## `LinearOperator` - -Subclasses of `LinearOperator` provide access to common methods on a -(batch) matrix, without the need to materialize the matrix. This allows: - -* Matrix free computations -* Different operators to take advantage of special structure, while providing a - consistent API to users.
- -### Base class - -* `tf.contrib.linalg.LinearOperator` - -### Individual operators - -* `tf.contrib.linalg.LinearOperatorDiag` -* `tf.contrib.linalg.LinearOperatorIdentity` -* `tf.contrib.linalg.LinearOperatorScaledIdentity` -* `tf.contrib.linalg.LinearOperatorFullMatrix` -* `tf.contrib.linalg.LinearOperatorLowerTriangular` -* `tf.contrib.linalg.LinearOperatorLowRankUpdate` - -### Transformations and Combinations of operators - -* `tf.contrib.linalg.LinearOperatorComposition` diff --git a/tensorflow/docs_src/api_guides/python/contrib.losses.md b/tensorflow/docs_src/api_guides/python/contrib.losses.md deleted file mode 100644 index 8787454af6..0000000000 --- a/tensorflow/docs_src/api_guides/python/contrib.losses.md +++ /dev/null @@ -1,125 +0,0 @@ -# Losses (contrib) - -## Deprecated - -This module is deprecated. Instructions for updating: Use `tf.losses` instead. - -## Loss operations for use in neural networks. - -Note: By default, all the losses are collected into the `GraphKeys.LOSSES` -collection. - -All of the loss functions take a pair of predictions and ground truth labels, -from which the loss is computed. It is assumed that the shape of both these -tensors is of the form [batch_size, d1, ... dN] where `batch_size` is the number -of samples in the batch and `d1` ... `dN` are the remaining dimensions. - -It is common, when training with multiple loss functions, to adjust the relative -strengths of individual losses. This is performed by rescaling the losses via -a `weight` parameter passed to the loss functions. For example, if we were -training with both log_loss and mean_squared_error, and we wished that the -log_loss penalty be twice as severe as the mean_squared_error, we would -implement this as: - -```python - # Explicitly set the weight. 
- tf.contrib.losses.log_loss(predictions, labels, weight=2.0) - - # Uses default weight of 1.0 - tf.contrib.losses.mean_squared_error(predictions, labels) - - # All the losses are collected into the `GraphKeys.LOSSES` collection. - losses = tf.get_collection(tf.GraphKeys.LOSSES) -``` - -While specifying a scalar weight rescales the loss over the entire batch, -we sometimes want to rescale the loss per batch sample. For example, if we have -certain examples that matter more to us to get correctly, we might want to have -a higher loss than other samples whose mistakes matter less. In this case, we -can provide a weight vector of length `batch_size` which results in the loss -for each sample in the batch being scaled by the corresponding weight element. -For example, consider the case of a classification problem where we want to -maximize our accuracy but we are especially interested in obtaining high accuracy -for a specific class: - -```python - inputs, labels = LoadData(batch_size=3) - logits = MyModelPredictions(inputs) - - # Ensures that the loss for examples whose ground truth class is `3` is 5x - # higher than the loss for all other examples. - weight = tf.multiply(4, tf.cast(tf.equal(labels, 3), tf.float32)) + 1 - - onehot_labels = tf.one_hot(labels, num_classes=5) - tf.contrib.losses.softmax_cross_entropy(logits, onehot_labels, weight=weight) -``` - -Finally, in certain cases, we may want to specify a different loss for every -single measurable value. For example, if we are performing per-pixel depth -prediction, or per-pixel denoising, a single batch sample has P values where P -is the number of pixels in the image. For many losses, the number of measurable -values matches the number of elements in the predictions and labels tensors. -For others, such as softmax_cross_entropy and cosine_distance, the -loss function reduces the dimensions of the inputs to produce a tensor of -losses for each measurable value.
For example, softmax_cross_entropy takes as -input predictions and labels of dimension [batch_size, num_classes] but the -number of measurable values is [batch_size]. Consequently, when passing a weight -tensor to specify a different loss for every measurable value, the dimension of -the tensor will depend on the loss being used. - -For a concrete example, consider the case of per-pixel depth prediction where -certain ground truth depth values are missing (due to sensor noise in the -capture process). In this case, we want to assign zero weight to losses for -these predictions. - -```python - # 'depths' that are missing have a value of 0: - images, depths = LoadData(...) - predictions = MyModelPredictions(images) - - weight = tf.cast(tf.greater(depths, 0), tf.float32) - loss = tf.contrib.losses.mean_squared_error(predictions, depths, weight) -``` - -Note that when using weights for the losses, the final average is computed -by rescaling the losses by the weights and then dividing by the total number of -non-zero samples. For an arbitrary set of weights, this may not necessarily -produce a weighted average. Instead, it simply and transparently rescales the -per-element losses before averaging over the number of observations. For example -if the losses computed by the loss function is an array [4, 1, 2, 3] and the -weights are an array [1, 0.5, 3, 9], then the average loss is: - -```python - (4*1 + 1*0.5 + 2*3 + 3*9) / 4 -``` - -However, with a single loss function and an arbitrary set of weights, one can -still easily create a loss function such that the resulting loss is a -weighted average over the individual prediction errors: - - -```python - images, labels = LoadData(...) 
- predictions = MyModelPredictions(images) - - weight = MyComplicatedWeightingFunction(labels) - weight = tf.div(weight, tf.size(weight)) - loss = tf.contrib.losses.mean_squared_error(predictions, depths, weight) -``` - -* `tf.contrib.losses.absolute_difference` -* `tf.contrib.losses.add_loss` -* `tf.contrib.losses.hinge_loss` -* `tf.contrib.losses.compute_weighted_loss` -* `tf.contrib.losses.cosine_distance` -* `tf.contrib.losses.get_losses` -* `tf.contrib.losses.get_regularization_losses` -* `tf.contrib.losses.get_total_loss` -* `tf.contrib.losses.log_loss` -* `tf.contrib.losses.mean_pairwise_squared_error` -* `tf.contrib.losses.mean_squared_error` -* `tf.contrib.losses.sigmoid_cross_entropy` -* `tf.contrib.losses.softmax_cross_entropy` -* `tf.contrib.losses.sparse_softmax_cross_entropy` - - diff --git a/tensorflow/docs_src/api_guides/python/contrib.metrics.md b/tensorflow/docs_src/api_guides/python/contrib.metrics.md deleted file mode 100644 index de6346ca80..0000000000 --- a/tensorflow/docs_src/api_guides/python/contrib.metrics.md +++ /dev/null @@ -1,133 +0,0 @@ -# Metrics (contrib) -[TOC] - -##Ops for evaluation metrics and summary statistics. - -### API - -This module provides functions for computing streaming metrics: metrics computed -on dynamically valued `Tensors`. Each metric declaration returns a -"value_tensor", an idempotent operation that returns the current value of the -metric, and an "update_op", an operation that accumulates the information -from the current value of the `Tensors` being measured as well as returns the -value of the "value_tensor". - -To use any of these metrics, one need only declare the metric, call `update_op` -repeatedly to accumulate data over the desired number of `Tensor` values (often -each one is a single batch) and finally evaluate the value_tensor. For example, -to use the `streaming_mean`: - -```python -value = ... 
-mean_value, update_op = tf.contrib.metrics.streaming_mean(values) -sess.run(tf.local_variables_initializer()) - -for i in range(number_of_batches): - print('Mean after batch %d: %f' % (i, update_op.eval())) -print('Final Mean: %f' % mean_value.eval()) -``` - -Each metric function adds nodes to the graph that hold the state necessary to -compute the value of the metric as well as a set of operations that actually -perform the computation. Every metric evaluation is composed of three steps: - -* Initialization: initializing the metric state. -* Aggregation: updating the values of the metric state. -* Finalization: computing the final metric value. - -In the above example, calling streaming_mean creates a pair of state variables -that will contain (1) the running sum and (2) the count of the number of samples -in the sum. Because the streaming metrics use local variables, -the Initialization stage is performed by running the op returned -by `tf.local_variables_initializer()`. It sets the sum and count variables to -zero. - -Next, Aggregation is performed by examining the current state of `values` -and incrementing the state variables appropriately. This step is executed by -running the `update_op` returned by the metric. - -Finally, finalization is performed by evaluating the "value_tensor". - -In practice, we commonly want to evaluate across many batches and multiple -metrics. To do so, we need only run the metric computation operations multiple -times: - -```python -labels = ... -predictions = ...
-accuracy, update_op_acc = tf.contrib.metrics.streaming_accuracy( - labels, predictions) -error, update_op_error = tf.contrib.metrics.streaming_mean_absolute_error( - labels, predictions) - -sess.run(tf.local_variables_initializer()) -for batch in range(num_batches): - sess.run([update_op_acc, update_op_error]) - -accuracy, error = sess.run([accuracy, error]) -``` - -Note that when evaluating the same metric multiple times on different inputs, -one must specify the scope of each metric to avoid accumulating the results -together: - -```python -labels = ... -predictions0 = ... -predictions1 = ... - -accuracy0 = tf.contrib.metrics.accuracy(labels, predictions0, name='preds0') -accuracy1 = tf.contrib.metrics.accuracy(labels, predictions1, name='preds1') -``` - -Certain metrics, such as streaming_mean or streaming_accuracy, can be weighted -via a `weights` argument. The `weights` tensor must be the same size as the -labels and predictions tensors and results in a weighted average of the metric. 
- -## Metric `Ops` - -* `tf.contrib.metrics.streaming_accuracy` -* `tf.contrib.metrics.streaming_mean` -* `tf.contrib.metrics.streaming_recall` -* `tf.contrib.metrics.streaming_recall_at_thresholds` -* `tf.contrib.metrics.streaming_precision` -* `tf.contrib.metrics.streaming_precision_at_thresholds` -* `tf.contrib.metrics.streaming_auc` -* `tf.contrib.metrics.streaming_recall_at_k` -* `tf.contrib.metrics.streaming_mean_absolute_error` -* `tf.contrib.metrics.streaming_mean_iou` -* `tf.contrib.metrics.streaming_mean_relative_error` -* `tf.contrib.metrics.streaming_mean_squared_error` -* `tf.contrib.metrics.streaming_mean_tensor` -* `tf.contrib.metrics.streaming_root_mean_squared_error` -* `tf.contrib.metrics.streaming_covariance` -* `tf.contrib.metrics.streaming_pearson_correlation` -* `tf.contrib.metrics.streaming_mean_cosine_distance` -* `tf.contrib.metrics.streaming_percentage_less` -* `tf.contrib.metrics.streaming_sensitivity_at_specificity` -* `tf.contrib.metrics.streaming_sparse_average_precision_at_k` -* `tf.contrib.metrics.streaming_sparse_precision_at_k` -* `tf.contrib.metrics.streaming_sparse_precision_at_top_k` -* `tf.contrib.metrics.streaming_sparse_recall_at_k` -* `tf.contrib.metrics.streaming_specificity_at_sensitivity` -* `tf.contrib.metrics.streaming_concat` -* `tf.contrib.metrics.streaming_false_negatives` -* `tf.contrib.metrics.streaming_false_negatives_at_thresholds` -* `tf.contrib.metrics.streaming_false_positives` -* `tf.contrib.metrics.streaming_false_positives_at_thresholds` -* `tf.contrib.metrics.streaming_true_negatives` -* `tf.contrib.metrics.streaming_true_negatives_at_thresholds` -* `tf.contrib.metrics.streaming_true_positives` -* `tf.contrib.metrics.streaming_true_positives_at_thresholds` -* `tf.contrib.metrics.auc_using_histogram` -* `tf.contrib.metrics.accuracy` -* `tf.contrib.metrics.aggregate_metrics` -* `tf.contrib.metrics.aggregate_metric_map` -* `tf.contrib.metrics.confusion_matrix` - -## Set `Ops` - -* 
`tf.contrib.metrics.set_difference` -* `tf.contrib.metrics.set_intersection` -* `tf.contrib.metrics.set_size` -* `tf.contrib.metrics.set_union` diff --git a/tensorflow/docs_src/api_guides/python/contrib.rnn.md b/tensorflow/docs_src/api_guides/python/contrib.rnn.md deleted file mode 100644 index d265ab6925..0000000000 --- a/tensorflow/docs_src/api_guides/python/contrib.rnn.md +++ /dev/null @@ -1,61 +0,0 @@ -# RNN and Cells (contrib) -[TOC] - -Module for constructing RNN Cells and additional RNN operations. - -## Base interface for all RNN Cells - -* `tf.contrib.rnn.RNNCell` - -## Core RNN Cells for use with TensorFlow's core RNN methods - -* `tf.contrib.rnn.BasicRNNCell` -* `tf.contrib.rnn.BasicLSTMCell` -* `tf.contrib.rnn.GRUCell` -* `tf.contrib.rnn.LSTMCell` -* `tf.contrib.rnn.LayerNormBasicLSTMCell` - -## Classes storing split `RNNCell` state - -* `tf.contrib.rnn.LSTMStateTuple` - -## Core RNN Cell wrappers (RNNCells that wrap other RNNCells) - -* `tf.contrib.rnn.MultiRNNCell` -* `tf.contrib.rnn.LSTMBlockWrapper` -* `tf.contrib.rnn.DropoutWrapper` -* `tf.contrib.rnn.EmbeddingWrapper` -* `tf.contrib.rnn.InputProjectionWrapper` -* `tf.contrib.rnn.OutputProjectionWrapper` -* `tf.contrib.rnn.DeviceWrapper` -* `tf.contrib.rnn.ResidualWrapper` - -### Block RNNCells -* `tf.contrib.rnn.LSTMBlockCell` -* `tf.contrib.rnn.GRUBlockCell` - -### Fused RNNCells -* `tf.contrib.rnn.FusedRNNCell` -* `tf.contrib.rnn.FusedRNNCellAdaptor` -* `tf.contrib.rnn.TimeReversedFusedRNN` -* `tf.contrib.rnn.LSTMBlockFusedCell` - -### LSTM-like cells -* `tf.contrib.rnn.CoupledInputForgetGateLSTMCell` -* `tf.contrib.rnn.TimeFreqLSTMCell` -* `tf.contrib.rnn.GridLSTMCell` - -### RNNCell wrappers -* `tf.contrib.rnn.AttentionCellWrapper` -* `tf.contrib.rnn.CompiledWrapper` - - -## Recurrent Neural Networks - -TensorFlow provides a number of methods for constructing Recurrent Neural -Networks. 
- -* `tf.contrib.rnn.static_rnn` -* `tf.contrib.rnn.static_state_saving_rnn` -* `tf.contrib.rnn.static_bidirectional_rnn` -* `tf.contrib.rnn.stack_bidirectional_dynamic_rnn` diff --git a/tensorflow/docs_src/api_guides/python/contrib.seq2seq.md b/tensorflow/docs_src/api_guides/python/contrib.seq2seq.md deleted file mode 100644 index 54f2fafc71..0000000000 --- a/tensorflow/docs_src/api_guides/python/contrib.seq2seq.md +++ /dev/null @@ -1,138 +0,0 @@ -# Seq2seq Library (contrib) -[TOC] - -Module for constructing seq2seq models and dynamic decoding. Builds on top of -libraries in `tf.contrib.rnn`. - -This library is composed of two primary components: - -* New attention wrappers for `tf.contrib.rnn.RNNCell` objects. -* A new object-oriented dynamic decoding framework. - -## Attention - -Attention wrappers are `RNNCell` objects that wrap other `RNNCell` objects and -implement attention. The form of attention is determined by a subclass of -`tf.contrib.seq2seq.AttentionMechanism`. These subclasses describe the form -of attention (e.g. additive vs. multiplicative) to use when creating the -wrapper. An instance of an `AttentionMechanism` is constructed with a -`memory` tensor, from which lookup keys and values tensors are created. - -### Attention Mechanisms - -The two basic attention mechanisms are: - -* `tf.contrib.seq2seq.BahdanauAttention` (additive attention, - [ref.](https://arxiv.org/abs/1409.0473)) -* `tf.contrib.seq2seq.LuongAttention` (multiplicative attention, - [ref.](https://arxiv.org/abs/1508.04025)) - -The `memory` tensor passed the attention mechanism's constructor is expected to -be shaped `[batch_size, memory_max_time, memory_depth]`; and often an additional -`memory_sequence_length` vector is accepted. If provided, the `memory` -tensors' rows are masked with zeros past their true sequence lengths. - -Attention mechanisms also have a concept of depth, usually determined as a -construction parameter `num_units`. 
For some kinds of attention (like -`BahdanauAttention`), both queries and memory are projected to tensors of depth -`num_units`. For other kinds (like `LuongAttention`), `num_units` should match -the depth of the queries; and the `memory` tensor will be projected to this -depth. - -### Attention Wrappers - -The basic attention wrapper is `tf.contrib.seq2seq.AttentionWrapper`. -This wrapper accepts an `RNNCell` instance, an instance of `AttentionMechanism`, -and an attention depth parameter (`attention_size`); as well as several -optional arguments that allow one to customize intermediate calculations. - -At each time step, the basic calculation performed by this wrapper is: - -```python -cell_inputs = concat([inputs, prev_state.attention], -1) -cell_output, next_cell_state = cell(cell_inputs, prev_state.cell_state) -score = attention_mechanism(cell_output) -alignments = softmax(score) -context = matmul(alignments, attention_mechanism.values) -attention = tf.layers.Dense(attention_size)(concat([cell_output, context], 1)) -next_state = AttentionWrapperState( - cell_state=next_cell_state, - attention=attention) -output = attention -return output, next_state -``` - -In practice, a number of the intermediate calculations are configurable. -For example, the initial concatenation of `inputs` and `prev_state.attention` -can be replaced with another mixing function. The function `softmax` can -be replaced with alternative options when calculating `alignments` from the -`score`. Finally, the outputs returned by the wrapper can be configured to -be the value `cell_output` instead of `attention`. - -The benefit of using an `AttentionWrapper` is that it plays nicely with -other wrappers and the dynamic decoder described below.
For example, one can -write: - -```python -cell = tf.contrib.rnn.DeviceWrapper(LSTMCell(512), "/device:GPU:0") -attention_mechanism = tf.contrib.seq2seq.LuongAttention(512, encoder_outputs) -attn_cell = tf.contrib.seq2seq.AttentionWrapper( - cell, attention_mechanism, attention_size=256) -attn_cell = tf.contrib.rnn.DeviceWrapper(attn_cell, "/device:GPU:1") -top_cell = tf.contrib.rnn.DeviceWrapper(LSTMCell(512), "/device:GPU:1") -multi_cell = MultiRNNCell([attn_cell, top_cell]) -``` - -The `multi_rnn` cell will perform the bottom layer calculations on GPU 0; -attention calculations will be performed on GPU 1 and immediately passed -up to the top layer which is also calculated on GPU 1. The attention is -also passed forward in time to the next time step and copied to GPU 0 for the -next time step of `cell`. (*Note*: This is just an example of use, -not a suggested device partitioning strategy.) - -## Dynamic Decoding - -Example usage: - -``` python -cell = # instance of RNNCell - -if mode == "train": - helper = tf.contrib.seq2seq.TrainingHelper( - input=input_vectors, - sequence_length=input_lengths) -elif mode == "infer": - helper = tf.contrib.seq2seq.GreedyEmbeddingHelper( - embedding=embedding, - start_tokens=tf.tile([GO_SYMBOL], [batch_size]), - end_token=END_SYMBOL) - -decoder = tf.contrib.seq2seq.BasicDecoder( - cell=cell, - helper=helper, - initial_state=cell.zero_state(batch_size, tf.float32)) -outputs, _ = tf.contrib.seq2seq.dynamic_decode( - decoder=decoder, - output_time_major=False, - impute_finished=True, - maximum_iterations=20) -``` - -### Decoder base class and functions - -* `tf.contrib.seq2seq.Decoder` -* `tf.contrib.seq2seq.dynamic_decode` - -### Basic Decoder - -* `tf.contrib.seq2seq.BasicDecoderOutput` -* `tf.contrib.seq2seq.BasicDecoder` - -### Decoder Helpers - -* `tf.contrib.seq2seq.Helper` -* `tf.contrib.seq2seq.CustomHelper` -* `tf.contrib.seq2seq.GreedyEmbeddingHelper` -* `tf.contrib.seq2seq.ScheduledEmbeddingTrainingHelper` -* 
`tf.contrib.seq2seq.ScheduledOutputTrainingHelper` -* `tf.contrib.seq2seq.TrainingHelper` diff --git a/tensorflow/docs_src/api_guides/python/contrib.signal.md b/tensorflow/docs_src/api_guides/python/contrib.signal.md deleted file mode 100644 index 66df561084..0000000000 --- a/tensorflow/docs_src/api_guides/python/contrib.signal.md +++ /dev/null @@ -1,172 +0,0 @@ -# Signal Processing (contrib) -[TOC] - -`tf.contrib.signal` is a module for signal processing primitives. All -operations have GPU support and are differentiable. This module is especially -helpful for building TensorFlow models that process or generate audio, though -the techniques are useful in many domains. - -## Framing variable length sequences - -When dealing with variable length signals (e.g. audio) it is common to "frame" -them into multiple fixed length windows. These windows can overlap if the 'step' -of the frame is less than the frame length. `tf.contrib.signal.frame` does -exactly this. For example: - -```python -# A batch of float32 time-domain signals in the range [-1, 1] with shape -# [batch_size, signal_length]. Both batch_size and signal_length may be unknown. -signals = tf.placeholder(tf.float32, [None, None]) - -# Compute a [batch_size, ?, 128] tensor of fixed length, overlapping windows -# where each window overlaps the previous by 75% (frame_length - frame_step -# samples of overlap). -frames = tf.contrib.signal.frame(signals, frame_length=128, frame_step=32) -``` - -The `axis` parameter to `tf.contrib.signal.frame` allows you to frame tensors -with inner structure (e.g. a spectrogram): - -```python -# `magnitude_spectrograms` is a [batch_size, ?, 129] tensor of spectrograms. We -# would like to produce overlapping fixed-size spectrogram patches; for example, -# for use in a situation where a fixed size input is needed. 
-magnitude_spectrograms = tf.abs(tf.contrib.signal.stft( - signals, frame_length=256, frame_step=64, fft_length=256)) - -# `spectrogram_patches` is a [batch_size, ?, 64, 129] tensor containing a -# variable number of [64, 129] spectrogram patches per batch item. -spectrogram_patches = tf.contrib.signal.frame( - magnitude_spectrograms, frame_length=64, frame_step=16, axis=1) -``` - -## Reconstructing framed sequences and applying a tapering window - -`tf.contrib.signal.overlap_and_add` can be used to reconstruct a signal from a -framed representation. For example, the following code reconstructs the signal -produced in the preceding example: - -```python -# Reconstructs `signals` from `frames` produced in the above example. However, -# the magnitude of `reconstructed_signals` will be greater than `signals`. -reconstructed_signals = tf.contrib.signal.overlap_and_add(frames, frame_step=32) -``` - -Note that because `frame_step` is 25% of `frame_length` in the above example, -the resulting reconstruction will have a greater magnitude than the original -`signals`. To compensate for this, we can use a tapering window function. If the -window function satisfies the Constant Overlap-Add (COLA) property for the given -frame step, then it will recover the original `signals`. - -`tf.contrib.signal.hamming_window` and `tf.contrib.signal.hann_window` both -satisfy the COLA property for a 75% overlap. - -```python -frame_length = 128 -frame_step = 32 -windowed_frames = frames * tf.contrib.signal.hann_window(frame_length) -reconstructed_signals = tf.contrib.signal.overlap_and_add( - windowed_frames, frame_step) -``` - -## Computing spectrograms - -A spectrogram is a time-frequency decomposition of a signal that indicates its -frequency content over time. 
The most common approach to computing spectrograms -is to take the magnitude of the [Short-time Fourier Transform][stft] (STFT), -which `tf.contrib.signal.stft` can compute as follows: - -```python -# A batch of float32 time-domain signals in the range [-1, 1] with shape -# [batch_size, signal_length]. Both batch_size and signal_length may be unknown. -signals = tf.placeholder(tf.float32, [None, None]) - -# `stfts` is a complex64 Tensor representing the Short-time Fourier Transform of -# each signal in `signals`. Its shape is [batch_size, ?, fft_unique_bins] -# where fft_unique_bins = fft_length // 2 + 1 = 513. -stfts = tf.contrib.signal.stft(signals, frame_length=1024, frame_step=512, - fft_length=1024) - -# A power spectrogram is the squared magnitude of the complex-valued STFT. -# A float32 Tensor of shape [batch_size, ?, 513]. -power_spectrograms = tf.real(stfts * tf.conj(stfts)) - -# An energy spectrogram is the magnitude of the complex-valued STFT. -# A float32 Tensor of shape [batch_size, ?, 513]. -magnitude_spectrograms = tf.abs(stfts) -``` - -You may use a power spectrogram or a magnitude spectrogram; each has its -advantages. Note that if you apply logarithmic compression, the power -spectrogram and magnitude spectrogram will differ by a factor of 2. - -## Logarithmic compression - -It is common practice to apply a compressive nonlinearity such as a logarithm or -power-law compression to spectrograms. This helps to balance the importance of -detail in low and high energy regions of the spectrum, which more closely -matches human auditory sensitivity. - -When compressing with a logarithm, it's a good idea to use a stabilizing offset -to avoid high dynamic ranges caused by the singularity at zero. 
- -```python -log_offset = 1e-6 -log_magnitude_spectrograms = tf.log(magnitude_spectrograms + log_offset) -``` - -## Computing log-mel spectrograms - -When working with spectral representations of audio, the [mel scale][mel] is a -common reweighting of the frequency dimension, which results in a -lower-dimensional and more perceptually-relevant representation of the audio. - -`tf.contrib.signal.linear_to_mel_weight_matrix` produces a matrix you can use -to convert a spectrogram to the mel scale. - -```python -# Warp the linear-scale, magnitude spectrograms into the mel-scale. -num_spectrogram_bins = magnitude_spectrograms.shape[-1].value -lower_edge_hertz, upper_edge_hertz, num_mel_bins = 80.0, 7600.0, 64 -linear_to_mel_weight_matrix = tf.contrib.signal.linear_to_mel_weight_matrix( - num_mel_bins, num_spectrogram_bins, sample_rate, lower_edge_hertz, - upper_edge_hertz) -mel_spectrograms = tf.tensordot( - magnitude_spectrograms, linear_to_mel_weight_matrix, 1) -# Note: Shape inference for `tf.tensordot` does not currently handle this case. -mel_spectrograms.set_shape(magnitude_spectrograms.shape[:-1].concatenate( - linear_to_mel_weight_matrix.shape[-1:])) -``` - -If desired, compress the mel spectrogram magnitudes. For example, you may use -logarithmic compression (as discussed in the previous section). - -Order matters! Compressing the spectrogram magnitudes after -reweighting the frequencies is different from reweighting the compressed -spectrogram magnitudes. According to the perceptual justification of the mel -scale, conversion from linear scale entails summing intensity or energy among -adjacent bands, i.e. it should be applied before logarithmic compression. Taking -the weighted sum of log-compressed values amounts to multiplying the -pre-logarithm values, which rarely, if ever, makes sense. 
- -```python -log_offset = 1e-6 -log_mel_spectrograms = tf.log(mel_spectrograms + log_offset) -``` - -## Computing Mel-Frequency Cepstral Coefficients (MFCCs) - -Call `tf.contrib.signal.mfccs_from_log_mel_spectrograms` to compute -[MFCCs][mfcc] from log-magnitude, mel-scale spectrograms (as computed in the -preceding example): - -```python -num_mfccs = 13 -# Keep the first `num_mfccs` MFCCs. -mfccs = tf.contrib.signal.mfccs_from_log_mel_spectrograms( - log_mel_spectrograms)[..., :num_mfccs] -``` - -[stft]: https://en.wikipedia.org/wiki/Short-time_Fourier_transform -[mel]: https://en.wikipedia.org/wiki/Mel_scale -[mfcc]: https://en.wikipedia.org/wiki/Mel-frequency_cepstrum diff --git a/tensorflow/docs_src/api_guides/python/contrib.staging.md b/tensorflow/docs_src/api_guides/python/contrib.staging.md deleted file mode 100644 index de143a7bd3..0000000000 --- a/tensorflow/docs_src/api_guides/python/contrib.staging.md +++ /dev/null @@ -1,6 +0,0 @@ -# Staging (contrib) -[TOC] - -This library contains utilities for adding pipelining to a model. - -* `tf.contrib.staging.StagingArea` diff --git a/tensorflow/docs_src/api_guides/python/contrib.training.md b/tensorflow/docs_src/api_guides/python/contrib.training.md deleted file mode 100644 index 068efdc829..0000000000 --- a/tensorflow/docs_src/api_guides/python/contrib.training.md +++ /dev/null @@ -1,50 +0,0 @@ -# Training (contrib) -[TOC] - -Training and input utilities. - -## Splitting sequence inputs into minibatches with state saving - -Use `tf.contrib.training.SequenceQueueingStateSaver` or -its wrapper `tf.contrib.training.batch_sequences_with_states` if -you have input data with a dynamic primary time / frame count axis which -you'd like to convert into fixed size segments during minibatching, and would -like to store state in the forward direction across segments of an example. 
- -* `tf.contrib.training.batch_sequences_with_states` -* `tf.contrib.training.NextQueuedSequenceBatch` -* `tf.contrib.training.SequenceQueueingStateSaver` - - -## Online data resampling - -To resample data with replacement on a per-example basis, use -`tf.contrib.training.rejection_sample` or -`tf.contrib.training.resample_at_rate`. For `rejection_sample`, provide -a boolean Tensor describing whether to accept or reject. Resulting batch sizes -are always the same. For `resample_at_rate`, provide the desired rate for each -example. Resulting batch sizes may vary. If you wish to specify relative -rates, rather than absolute ones, use `tf.contrib.training.weighted_resample` -(which also returns the actual resampling rate used for each output example). - -Use `tf.contrib.training.stratified_sample` to resample without replacement -from the data to achieve a desired mix of class proportions that the Tensorflow -graph sees. For instance, if you have a binary classification dataset that is -99.9% class 1, a common approach is to resample from the data so that the data -is more balanced. - -* `tf.contrib.training.rejection_sample` -* `tf.contrib.training.resample_at_rate` -* `tf.contrib.training.stratified_sample` -* `tf.contrib.training.weighted_resample` - -## Bucketing - -Use `tf.contrib.training.bucket` or -`tf.contrib.training.bucket_by_sequence_length` to stratify -minibatches into groups ("buckets"). Use `bucket_by_sequence_length` -with the argument `dynamic_pad=True` to receive minibatches of similarly -sized sequences for efficient training via `dynamic_rnn`. 
- -* `tf.contrib.training.bucket` -* `tf.contrib.training.bucket_by_sequence_length` diff --git a/tensorflow/docs_src/api_guides/python/contrib.util.md b/tensorflow/docs_src/api_guides/python/contrib.util.md deleted file mode 100644 index e5fd97e9f2..0000000000 --- a/tensorflow/docs_src/api_guides/python/contrib.util.md +++ /dev/null @@ -1,12 +0,0 @@ -# Utilities (contrib) -[TOC] - -Utilities for dealing with Tensors. - -## Miscellaneous Utility Functions - -* `tf.contrib.util.constant_value` -* `tf.contrib.util.make_tensor_proto` -* `tf.contrib.util.make_ndarray` -* `tf.contrib.util.ops_used_by_graph_def` -* `tf.contrib.util.stripped_op_list_for_graph` diff --git a/tensorflow/docs_src/api_guides/python/control_flow_ops.md b/tensorflow/docs_src/api_guides/python/control_flow_ops.md deleted file mode 100644 index 42c86d9978..0000000000 --- a/tensorflow/docs_src/api_guides/python/control_flow_ops.md +++ /dev/null @@ -1,57 +0,0 @@ -# Control Flow - -Note: Functions taking `Tensor` arguments can also take anything accepted by -`tf.convert_to_tensor`. - -[TOC] - -## Control Flow Operations - -TensorFlow provides several operations and classes that you can use to control -the execution of operations and add conditional dependencies to your graph. - -* `tf.identity` -* `tf.tuple` -* `tf.group` -* `tf.no_op` -* `tf.count_up_to` -* `tf.cond` -* `tf.case` -* `tf.while_loop` - -## Logical Operators - -TensorFlow provides several operations that you can use to add logical operators -to your graph. - -* `tf.logical_and` -* `tf.logical_not` -* `tf.logical_or` -* `tf.logical_xor` - -## Comparison Operators - -TensorFlow provides several operations that you can use to add comparison -operators to your graph. - -* `tf.equal` -* `tf.not_equal` -* `tf.less` -* `tf.less_equal` -* `tf.greater` -* `tf.greater_equal` -* `tf.where` - -## Debugging Operations - -TensorFlow provides several operations that you can use to validate values and -debug your graph. 
- -* `tf.is_finite` -* `tf.is_inf` -* `tf.is_nan` -* `tf.verify_tensor_all_finite` -* `tf.check_numerics` -* `tf.add_check_numerics_ops` -* `tf.Assert` -* `tf.Print` diff --git a/tensorflow/docs_src/api_guides/python/framework.md b/tensorflow/docs_src/api_guides/python/framework.md deleted file mode 100644 index 40a6c0783a..0000000000 --- a/tensorflow/docs_src/api_guides/python/framework.md +++ /dev/null @@ -1,51 +0,0 @@ -# Building Graphs -[TOC] - -Classes and functions for building TensorFlow graphs. - -## Core graph data structures - -* `tf.Graph` -* `tf.Operation` -* `tf.Tensor` - -## Tensor types - -* `tf.DType` -* `tf.as_dtype` - -## Utility functions - -* `tf.device` -* `tf.container` -* `tf.name_scope` -* `tf.control_dependencies` -* `tf.convert_to_tensor` -* `tf.convert_to_tensor_or_indexed_slices` -* `tf.convert_to_tensor_or_sparse_tensor` -* `tf.get_default_graph` -* `tf.reset_default_graph` -* `tf.import_graph_def` -* `tf.load_file_system_library` -* `tf.load_op_library` - -## Graph collections - -* `tf.add_to_collection` -* `tf.get_collection` -* `tf.get_collection_ref` -* `tf.GraphKeys` - -## Defining new operations - -* `tf.RegisterGradient` -* `tf.NotDifferentiable` -* `tf.NoGradient` -* `tf.TensorShape` -* `tf.Dimension` -* `tf.op_scope` -* `tf.get_seed` - -## For libraries building on TensorFlow - -* `tf.register_tensor_conversion_function` diff --git a/tensorflow/docs_src/api_guides/python/functional_ops.md b/tensorflow/docs_src/api_guides/python/functional_ops.md deleted file mode 100644 index 0a9fe02ad5..0000000000 --- a/tensorflow/docs_src/api_guides/python/functional_ops.md +++ /dev/null @@ -1,18 +0,0 @@ -# Higher Order Functions - -Note: Functions taking `Tensor` arguments can also take anything accepted by -`tf.convert_to_tensor`. - -[TOC] - -Functional operations. - -## Higher Order Operators - -TensorFlow provides several higher order operators to simplify the common -map-reduce programming patterns. 
- -* `tf.map_fn` -* `tf.foldl` -* `tf.foldr` -* `tf.scan` diff --git a/tensorflow/docs_src/api_guides/python/image.md b/tensorflow/docs_src/api_guides/python/image.md deleted file mode 100644 index c51b92db05..0000000000 --- a/tensorflow/docs_src/api_guides/python/image.md +++ /dev/null @@ -1,144 +0,0 @@ -# Images - -Note: Functions taking `Tensor` arguments can also take anything accepted by -`tf.convert_to_tensor`. - -[TOC] - -## Encoding and Decoding - -TensorFlow provides Ops to decode and encode JPEG and PNG formats. Encoded -images are represented by scalar string Tensors, decoded images by 3-D uint8 -tensors of shape `[height, width, channels]`. (PNG also supports uint16.) - -The encode and decode Ops apply to one image at a time. Their input and output -are all of variable size. If you need fixed size images, pass the output of -the decode Ops to one of the cropping and resizing Ops. - -Note: The PNG encode and decode Ops support RGBA, but the conversions Ops -presently only support RGB, HSV, and GrayScale. Presently, the alpha channel has -to be stripped from the image and re-attached using slicing ops. - -* `tf.image.decode_bmp` -* `tf.image.decode_gif` -* `tf.image.decode_jpeg` -* `tf.image.encode_jpeg` -* `tf.image.decode_png` -* `tf.image.encode_png` -* `tf.image.decode_image` - -## Resizing - -The resizing Ops accept input images as tensors of several types. They always -output resized images as float32 tensors. - -The convenience function `tf.image.resize_images` supports both 4-D -and 3-D tensors as input and output. 4-D tensors are for batches of images, -3-D tensors for individual images. - -Other resizing Ops only support 4-D batches of images as input: -`tf.image.resize_area`, `tf.image.resize_bicubic`, -`tf.image.resize_bilinear`, -`tf.image.resize_nearest_neighbor`. - -Example: - -```python -# Decode a JPG image and resize it to 299 by 299 using default method. -image = tf.image.decode_jpeg(...) 
-resized_image = tf.image.resize_images(image, [299, 299]) -``` - -* `tf.image.resize_images` -* `tf.image.resize_area` -* `tf.image.resize_bicubic` -* `tf.image.resize_bilinear` -* `tf.image.resize_nearest_neighbor` - -## Cropping - -* `tf.image.resize_image_with_crop_or_pad` -* `tf.image.central_crop` -* `tf.image.pad_to_bounding_box` -* `tf.image.crop_to_bounding_box` -* `tf.image.extract_glimpse` -* `tf.image.crop_and_resize` - -## Flipping, Rotating and Transposing - -* `tf.image.flip_up_down` -* `tf.image.random_flip_up_down` -* `tf.image.flip_left_right` -* `tf.image.random_flip_left_right` -* `tf.image.transpose_image` -* `tf.image.rot90` - -## Converting Between Colorspaces - -Image ops work either on individual images or on batches of images, depending on -the shape of their input Tensor. - -If 3-D, the shape is `[height, width, channels]`, and the Tensor represents one -image. If 4-D, the shape is `[batch_size, height, width, channels]`, and the -Tensor represents `batch_size` images. - -Currently, `channels` can usefully be 1, 2, 3, or 4. Single-channel images are -grayscale, images with 3 channels are encoded as either RGB or HSV. Images -with 2 or 4 channels include an alpha channel, which has to be stripped from the -image before passing the image to most image processing functions (and can be -re-attached later). - -Internally, images are either stored in as one `float32` per channel per pixel -(implicitly, values are assumed to lie in `[0,1)`) or one `uint8` per channel -per pixel (values are assumed to lie in `[0,255]`). - -TensorFlow can convert between images in RGB or HSV. The conversion functions -work only on float images, so you need to convert images in other formats using -`tf.image.convert_image_dtype`. - -Example: - -```python -# Decode an image and convert it to HSV. 
-rgb_image = tf.image.decode_png(..., channels=3) -rgb_image_float = tf.image.convert_image_dtype(rgb_image, tf.float32) -hsv_image = tf.image.rgb_to_hsv(rgb_image_float) -``` - -* `tf.image.rgb_to_grayscale` -* `tf.image.grayscale_to_rgb` -* `tf.image.hsv_to_rgb` -* `tf.image.rgb_to_hsv` -* `tf.image.convert_image_dtype` - -## Image Adjustments - -TensorFlow provides functions to adjust images in various ways: brightness, -contrast, hue, and saturation. Each adjustment can be done with predefined -parameters or with random parameters picked from predefined intervals. Random -adjustments are often useful to expand a training set and reduce overfitting. - -If several adjustments are chained it is advisable to minimize the number of -redundant conversions by first converting the images to the most natural data -type and representation (RGB or HSV). - -* `tf.image.adjust_brightness` -* `tf.image.random_brightness` -* `tf.image.adjust_contrast` -* `tf.image.random_contrast` -* `tf.image.adjust_hue` -* `tf.image.random_hue` -* `tf.image.adjust_gamma` -* `tf.image.adjust_saturation` -* `tf.image.random_saturation` -* `tf.image.per_image_standardization` - -## Working with Bounding Boxes - -* `tf.image.draw_bounding_boxes` -* `tf.image.non_max_suppression` -* `tf.image.sample_distorted_bounding_box` - -## Denoising - -* `tf.image.total_variation` diff --git a/tensorflow/docs_src/api_guides/python/index.md b/tensorflow/docs_src/api_guides/python/index.md deleted file mode 100644 index a791a1432a..0000000000 --- a/tensorflow/docs_src/api_guides/python/index.md +++ /dev/null @@ -1,52 +0,0 @@ -# Python API Guides - -* [Asserts and boolean checks](check_ops.md) -* [Building Graphs](framework.md) -* [Constants, Sequences, and Random Values](constant_op.md) -* [Control Flow](control_flow_ops.md) -* [Data IO (Python functions)](python_io.md) -* [Exporting and Importing a MetaGraph](meta_graph.md) -* [Higher Order Functions](functional_ops.md) -* [Histograms](histogram_ops.md) -*
[Images](image.md) -* [Inputs and Readers](io_ops.md) -* [Math](math_ops.md) -* [Neural Network](nn.md) -* [Reading data](reading_data.md) -* [Running Graphs](client.md) -* [Sparse Tensors](sparse_ops.md) -* [Spectral Functions](spectral_ops.md) -* [Strings](string_ops.md) -* [Summary Operations](summary.md) -* [TensorFlow Debugger](tfdbg.md) -* [Tensor Handle Operations](session_ops.md) -* [Tensor Transformations](array_ops.md) -* [Testing](test.md) -* [Training](train.md) -* [Variables](state_ops.md) -* [Wraps python functions](script_ops.md) -* [BayesFlow Entropy (contrib)](contrib.bayesflow.entropy.md) -* [BayesFlow Monte Carlo (contrib)](contrib.bayesflow.monte_carlo.md) -* [BayesFlow Stochastic Graph (contrib)](contrib.bayesflow.stochastic_graph.md) -* [BayesFlow Stochastic Tensors (contrib)](contrib.bayesflow.stochastic_tensor.md) -* [BayesFlow Variational Inference (contrib)](contrib.bayesflow.variational_inference.md) -* [Copying Graph Elements (contrib)](contrib.copy_graph.md) -* [CRF (contrib)](contrib.crf.md) -* [FFmpeg (contrib)](contrib.ffmpeg.md) -* [Framework (contrib)](contrib.framework.md) -* [Graph Editor (contrib)](contrib.graph_editor.md) -* [Integrate (contrib)](contrib.integrate.md) -* [Layers (contrib)](contrib.layers.md) -* [Learn (contrib)](contrib.learn.md) -* [Linear Algebra (contrib)](contrib.linalg.md) -* [Losses (contrib)](contrib.losses.md) -* [Metrics (contrib)](contrib.metrics.md) -* [Optimization (contrib)](contrib.opt.md) -* [Random variable transformations (contrib)](contrib.distributions.bijectors.md) -* [RNN and Cells (contrib)](contrib.rnn.md) -* [Seq2seq Library (contrib)](contrib.seq2seq.md) -* [Signal Processing (contrib)](contrib.signal.md) -* [Staging (contrib)](contrib.staging.md) -* [Statistical Distributions (contrib)](contrib.distributions.md) -* [Training (contrib)](contrib.training.md) -* [Utilities (contrib)](contrib.util.md) diff --git a/tensorflow/docs_src/api_guides/python/input_dataset.md 
b/tensorflow/docs_src/api_guides/python/input_dataset.md deleted file mode 100644 index 911a76c2df..0000000000 --- a/tensorflow/docs_src/api_guides/python/input_dataset.md +++ /dev/null @@ -1,85 +0,0 @@ -# Dataset Input Pipeline -[TOC] - -`tf.data.Dataset` allows you to build complex input pipelines. See the -[Importing Data](../../guide/datasets.md) for an in-depth explanation of how to use this API. - -## Reader classes - -Classes that create a dataset from input files. - -* `tf.data.FixedLengthRecordDataset` -* `tf.data.TextLineDataset` -* `tf.data.TFRecordDataset` - -## Creating new datasets - -Static methods in `Dataset` that create new datasets. - -* `tf.data.Dataset.from_generator` -* `tf.data.Dataset.from_tensor_slices` -* `tf.data.Dataset.from_tensors` -* `tf.data.Dataset.list_files` -* `tf.data.Dataset.range` -* `tf.data.Dataset.zip` - -## Transformations on existing datasets - -These functions transform an existing dataset, and return a new dataset. Calls -can be chained together, as shown in the example below: - -``` -train_data = train_data.batch(100).shuffle().repeat() -``` - -* `tf.data.Dataset.apply` -* `tf.data.Dataset.batch` -* `tf.data.Dataset.cache` -* `tf.data.Dataset.concatenate` -* `tf.data.Dataset.filter` -* `tf.data.Dataset.flat_map` -* `tf.data.Dataset.interleave` -* `tf.data.Dataset.map` -* `tf.data.Dataset.padded_batch` -* `tf.data.Dataset.prefetch` -* `tf.data.Dataset.repeat` -* `tf.data.Dataset.shard` -* `tf.data.Dataset.shuffle` -* `tf.data.Dataset.skip` -* `tf.data.Dataset.take` - -### Custom transformation functions - -Custom transformation functions can be applied to a `Dataset` using `tf.data.Dataset.apply`. 
Below are custom transformation functions from `tf.contrib.data`: - -* `tf.contrib.data.batch_and_drop_remainder` -* `tf.contrib.data.dense_to_sparse_batch` -* `tf.contrib.data.enumerate_dataset` -* `tf.contrib.data.group_by_window` -* `tf.contrib.data.ignore_errors` -* `tf.contrib.data.map_and_batch` -* `tf.contrib.data.padded_batch_and_drop_remainder` -* `tf.contrib.data.parallel_interleave` -* `tf.contrib.data.rejection_resample` -* `tf.contrib.data.scan` -* `tf.contrib.data.shuffle_and_repeat` -* `tf.contrib.data.unbatch` - -## Iterating over datasets - -These functions make a `tf.data.Iterator` from a `Dataset`. - -* `tf.data.Dataset.make_initializable_iterator` -* `tf.data.Dataset.make_one_shot_iterator` - -The `Iterator` class also contains static methods that create a `tf.data.Iterator` that can be used with multiple `Dataset` objects. - -* `tf.data.Iterator.from_structure` -* `tf.data.Iterator.from_string_handle` - -## Extra functions from `tf.contrib.data` - -* `tf.contrib.data.get_single_element` -* `tf.contrib.data.make_saveable_from_iterator` -* `tf.contrib.data.read_batch_features` - diff --git a/tensorflow/docs_src/api_guides/python/io_ops.md b/tensorflow/docs_src/api_guides/python/io_ops.md deleted file mode 100644 index d7ce6fdfde..0000000000 --- a/tensorflow/docs_src/api_guides/python/io_ops.md +++ /dev/null @@ -1,130 +0,0 @@ -# Inputs and Readers - -Note: Functions taking `Tensor` arguments can also take anything accepted by -`tf.convert_to_tensor`. - -[TOC] - -## Placeholders - -TensorFlow provides a placeholder operation that must be fed with data -on execution. For more info, see the section on [Feeding data](../../api_guides/python/reading_data.md#Feeding). - -* `tf.placeholder` -* `tf.placeholder_with_default` - -For feeding `SparseTensor`s which are composite type, -there is a convenience function: - -* `tf.sparse_placeholder` - -## Readers - -TensorFlow provides a set of Reader classes for reading data formats. 
-For more information on inputs and readers, see [Reading data](../../api_guides/python/reading_data.md). - -* `tf.ReaderBase` -* `tf.TextLineReader` -* `tf.WholeFileReader` -* `tf.IdentityReader` -* `tf.TFRecordReader` -* `tf.FixedLengthRecordReader` - -## Converting - -TensorFlow provides several operations that you can use to convert various data -formats into tensors. - -* `tf.decode_csv` -* `tf.decode_raw` - -- - - - -### Example protocol buffer - -TensorFlow's [recommended format for training examples](../../api_guides/python/reading_data.md#standard_tensorflow_format) -is serialized `Example` protocol buffers, [described -here](https://www.tensorflow.org/code/tensorflow/core/example/example.proto). -They contain `Features`, [described -here](https://www.tensorflow.org/code/tensorflow/core/example/feature.proto). - -* `tf.VarLenFeature` -* `tf.FixedLenFeature` -* `tf.FixedLenSequenceFeature` -* `tf.SparseFeature` -* `tf.parse_example` -* `tf.parse_single_example` -* `tf.parse_tensor` -* `tf.decode_json_example` - -## Queues - -TensorFlow provides several implementations of 'Queues', which are -structures within the TensorFlow computation graph to stage pipelines -of tensors together. The following describe the basic Queue interface -and some implementations. To see an example use, see [Threading and Queues](../../api_guides/python/threading_and_queues.md). - -* `tf.QueueBase` -* `tf.FIFOQueue` -* `tf.PaddingFIFOQueue` -* `tf.RandomShuffleQueue` -* `tf.PriorityQueue` - -## Conditional Accumulators - -* `tf.ConditionalAccumulatorBase` -* `tf.ConditionalAccumulator` -* `tf.SparseConditionalAccumulator` - -## Dealing with the filesystem - -* `tf.matching_files` -* `tf.read_file` -* `tf.write_file` - -## Input pipeline - -TensorFlow functions for setting up an input-prefetching pipeline. -Please see the [reading data how-to](../../api_guides/python/reading_data.md) -for context. 
- -### Beginning of an input pipeline - -The "producer" functions add a queue to the graph and a corresponding -`QueueRunner` for running the subgraph that fills that queue. - -* `tf.train.match_filenames_once` -* `tf.train.limit_epochs` -* `tf.train.input_producer` -* `tf.train.range_input_producer` -* `tf.train.slice_input_producer` -* `tf.train.string_input_producer` - -### Batching at the end of an input pipeline - -These functions add a queue to the graph to assemble a batch of -examples, with possible shuffling. They also add a `QueueRunner` for -running the subgraph that fills that queue. - -Use `tf.train.batch` or `tf.train.batch_join` for batching -examples that have already been well shuffled. Use -`tf.train.shuffle_batch` or -`tf.train.shuffle_batch_join` for examples that would -benefit from additional shuffling. - -Use `tf.train.batch` or `tf.train.shuffle_batch` if you want a -single thread producing examples to batch, or if you have a -single subgraph producing examples but you want to run it in *N* threads -(where you increase *N* until it can keep the queue full). Use -`tf.train.batch_join` or `tf.train.shuffle_batch_join` -if you have *N* different subgraphs producing examples to batch and you -want them run by *N* threads. Use `maybe_*` to enqueue conditionally. - -* `tf.train.batch` -* `tf.train.maybe_batch` -* `tf.train.batch_join` -* `tf.train.maybe_batch_join` -* `tf.train.shuffle_batch` -* `tf.train.maybe_shuffle_batch` -* `tf.train.shuffle_batch_join` -* `tf.train.maybe_shuffle_batch_join` diff --git a/tensorflow/docs_src/api_guides/python/math_ops.md b/tensorflow/docs_src/api_guides/python/math_ops.md deleted file mode 100644 index 6ec18f48ef..0000000000 --- a/tensorflow/docs_src/api_guides/python/math_ops.md +++ /dev/null @@ -1,200 +0,0 @@ -# Math - -Note: Functions taking `Tensor` arguments can also take anything accepted by -`tf.convert_to_tensor`. 
- -[TOC] - -Note: Elementwise binary operations in TensorFlow follow [numpy-style -broadcasting](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html). - -## Arithmetic Operators - -TensorFlow provides several operations that you can use to add basic arithmetic -operators to your graph. - -* `tf.add` -* `tf.subtract` -* `tf.multiply` -* `tf.scalar_mul` -* `tf.div` -* `tf.divide` -* `tf.truediv` -* `tf.floordiv` -* `tf.realdiv` -* `tf.truncatediv` -* `tf.floor_div` -* `tf.div_no_nan` -* `tf.truncatemod` -* `tf.floormod` -* `tf.mod` -* `tf.cross` - -## Basic Math Functions - -TensorFlow provides several operations that you can use to add basic -mathematical functions to your graph. - -* `tf.add_n` -* `tf.abs` -* `tf.negative` -* `tf.sign` -* `tf.reciprocal` -* `tf.square` -* `tf.round` -* `tf.sqrt` -* `tf.rsqrt` -* `tf.pow` -* `tf.exp` -* `tf.expm1` -* `tf.log` -* `tf.log1p` -* `tf.ceil` -* `tf.floor` -* `tf.maximum` -* `tf.minimum` -* `tf.cos` -* `tf.sin` -* `tf.lbeta` -* `tf.tan` -* `tf.acos` -* `tf.asin` -* `tf.atan` -* `tf.cosh` -* `tf.sinh` -* `tf.asinh` -* `tf.acosh` -* `tf.atanh` -* `tf.lgamma` -* `tf.digamma` -* `tf.erf` -* `tf.erfc` -* `tf.squared_difference` -* `tf.igamma` -* `tf.igammac` -* `tf.zeta` -* `tf.polygamma` -* `tf.betainc` -* `tf.rint` - -## Matrix Math Functions - -TensorFlow provides several operations that you can use to add linear algebra -functions on matrices to your graph. 
- -* `tf.diag` -* `tf.diag_part` -* `tf.trace` -* `tf.transpose` -* `tf.eye` -* `tf.matrix_diag` -* `tf.matrix_diag_part` -* `tf.matrix_band_part` -* `tf.matrix_set_diag` -* `tf.matrix_transpose` -* `tf.matmul` -* `tf.norm` -* `tf.matrix_determinant` -* `tf.matrix_inverse` -* `tf.cholesky` -* `tf.cholesky_solve` -* `tf.matrix_solve` -* `tf.matrix_triangular_solve` -* `tf.matrix_solve_ls` -* `tf.qr` -* `tf.self_adjoint_eig` -* `tf.self_adjoint_eigvals` -* `tf.svd` - - -## Tensor Math Function - -TensorFlow provides operations that you can use to add tensor functions to your -graph. - -* `tf.tensordot` - - -## Complex Number Functions - -TensorFlow provides several operations that you can use to add complex number -functions to your graph. - -* `tf.complex` -* `tf.conj` -* `tf.imag` -* `tf.angle` -* `tf.real` - - -## Reduction - -TensorFlow provides several operations that you can use to perform -common math computations that reduce various dimensions of a tensor. - -* `tf.reduce_sum` -* `tf.reduce_prod` -* `tf.reduce_min` -* `tf.reduce_max` -* `tf.reduce_mean` -* `tf.reduce_all` -* `tf.reduce_any` -* `tf.reduce_logsumexp` -* `tf.count_nonzero` -* `tf.accumulate_n` -* `tf.einsum` - -## Scan - -TensorFlow provides several operations that you can use to perform scans -(running totals) across one axis of a tensor. - -* `tf.cumsum` -* `tf.cumprod` - -## Segmentation - -TensorFlow provides several operations that you can use to perform common -math computations on tensor segments. -Here a segmentation is a partitioning of a tensor along -the first dimension, i.e. it defines a mapping from the first dimension onto -`segment_ids`. 
The `segment_ids` tensor should be the size of -the first dimension, `d0`, with consecutive IDs in the range `0` to `k`, -where `k<d0`. -In particular, a segmentation of a matrix tensor is a mapping of rows to -segments. - -For example: - -```python -c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]]) -tf.segment_sum(c, tf.constant([0, 0, 1])) - ==> [[0 0 0 0] - [5 6 7 8]] -``` - -* `tf.segment_sum` -* `tf.segment_prod` -* `tf.segment_min` -* `tf.segment_max` -* `tf.segment_mean` -* `tf.unsorted_segment_sum` -* `tf.sparse_segment_sum` -* `tf.sparse_segment_mean` -* `tf.sparse_segment_sqrt_n` - - -## Sequence Comparison and Indexing - -TensorFlow provides several operations that you can use to add sequence -comparison and index extraction to your graph. You can use these operations to -determine sequence differences and determine the indexes of specific values in -a tensor. - -* `tf.argmin` -* `tf.argmax` -* `tf.setdiff1d` -* `tf.where` -* `tf.unique` -* `tf.edit_distance` -* `tf.invert_permutation` diff --git a/tensorflow/docs_src/api_guides/python/meta_graph.md b/tensorflow/docs_src/api_guides/python/meta_graph.md deleted file mode 100644 index 5e8a8b4d0f..0000000000 --- a/tensorflow/docs_src/api_guides/python/meta_graph.md +++ /dev/null @@ -1,277 +0,0 @@ -# Exporting and Importing a MetaGraph - -A [`MetaGraph`](https://www.tensorflow.org/code/tensorflow/core/protobuf/meta_graph.proto) contains both a TensorFlow GraphDef -as well as associated metadata necessary for running computation in a -graph when crossing a process boundary. It can also be used for long -term storage of graphs. The MetaGraph contains the information required -to continue training, perform evaluation, or run inference on a previously trained graph. - -The APIs for exporting and importing the complete model are in -the `tf.train.Saver` class: -`tf.train.export_meta_graph` -and -`tf.train.import_meta_graph`. - -## What's in a MetaGraph - -The information contained in a MetaGraph is expressed as a -[`MetaGraphDef`](https://www.tensorflow.org/code/tensorflow/core/protobuf/meta_graph.proto) -protocol buffer.
It contains the following fields: - -* [`MetaInfoDef`](https://www.tensorflow.org/code/tensorflow/core/protobuf/meta_graph.proto) for meta information, such as version and other user information. -* [`GraphDef`](https://www.tensorflow.org/code/tensorflow/core/framework/graph.proto) for describing the graph. -* [`SaverDef`](https://www.tensorflow.org/code/tensorflow/core/protobuf/saver.proto) for the saver. -* [`CollectionDef`](https://www.tensorflow.org/code/tensorflow/core/protobuf/meta_graph.proto) -map that further describes additional components of the model such as -[`Variables`](../../api_guides/python/state_ops.md), -`tf.train.QueueRunner`, etc. - -In order for a Python object to be serialized -to and from `MetaGraphDef`, the Python class must implement `to_proto()` and -`from_proto()` methods, and register them with the system using -`register_proto_function`. For example: - - ```Python - def to_proto(self, export_scope=None): - - """Converts a `Variable` to a `VariableDef` protocol buffer. - - Args: - export_scope: Optional `string`. Name scope to remove. - - Returns: - A `VariableDef` protocol buffer, or `None` if the `Variable` is not - in the specified name scope. 
- """ - if (export_scope is None or - self._variable.name.startswith(export_scope)): - var_def = variable_pb2.VariableDef() - var_def.variable_name = ops.strip_name_scope( - self._variable.name, export_scope) - var_def.initializer_name = ops.strip_name_scope( - self.initializer.name, export_scope) - var_def.snapshot_name = ops.strip_name_scope( - self._snapshot.name, export_scope) - if self._save_slice_info: - var_def.save_slice_info_def.MergeFrom(self._save_slice_info.to_proto( - export_scope=export_scope)) - return var_def - else: - return None - - @staticmethod - def from_proto(variable_def, import_scope=None): - """Returns a `Variable` object created from `variable_def`.""" - return Variable(variable_def=variable_def, import_scope=import_scope) - - ops.register_proto_function(ops.GraphKeys.GLOBAL_VARIABLES, - proto_type=variable_pb2.VariableDef, - to_proto=Variable.to_proto, - from_proto=Variable.from_proto) - ``` - -## Exporting a Complete Model to MetaGraph - -The API for exporting a running model as a MetaGraph is `export_meta_graph()`. - - ```Python - def export_meta_graph(filename=None, collection_list=None, as_text=False): - """Writes `MetaGraphDef` to save_path/filename. - - Args: - filename: Optional meta_graph filename including the path. - collection_list: List of string keys to collect. - as_text: If `True`, writes the meta_graph as an ASCII proto. - - Returns: - A `MetaGraphDef` proto. - """ - ``` - - A `collection` can contain any Python objects that users would like to - be able to uniquely identify and easily retrieve. These objects can be - special operations in the graph, such as `train_op`, or hyper parameters, - such as "learning rate". Users can specify the list of collections - they would like to export. If no `collection_list` is specified, - all collections in the model will be exported. - - The API returns a serialized protocol buffer. If `filename` is - specified, the protocol buffer will also be written to a file. 
- - Here are some of the typical usage models: - - * Export the default running graph: - - ```Python - # Build the model - ... - with tf.Session() as sess: - # Use the model - ... - # Export the model to /tmp/my-model.meta. - meta_graph_def = tf.train.export_meta_graph(filename='/tmp/my-model.meta') - ``` - - * Export the default running graph and only a subset of the collections. - - ```Python - meta_graph_def = tf.train.export_meta_graph( - filename='/tmp/my-model.meta', - collection_list=["input_tensor", "output_tensor"]) - ``` - - -The MetaGraph is also automatically exported via the `save()` API in -`tf.train.Saver`. - - -## Import a MetaGraph - -The API for importing a MetaGraph file into a graph is `import_meta_graph()`. - -Here are some of the typical usage models: - -* Import and continue training without building the model from scratch. - - ```Python - ... - # Create a saver. - saver = tf.train.Saver(...variables...) - # Remember the training_op we want to run by adding it to a collection. - tf.add_to_collection('train_op', train_op) - sess = tf.Session() - for step in xrange(1000000): - sess.run(train_op) - if step % 1000 == 0: - # Saves checkpoint, which by default also exports a meta_graph - # named 'my-model-global_step.meta'. - saver.save(sess, 'my-model', global_step=step) - ``` - - Later we can continue training from this saved `meta_graph` without building - the model from scratch. - - ```Python - with tf.Session() as sess: - new_saver = tf.train.import_meta_graph('my-save-dir/my-model-10000.meta') - new_saver.restore(sess, 'my-save-dir/my-model-10000') - # tf.get_collection() returns a list. In this example we only want the - # first one. - train_op = tf.get_collection('train_op')[0] - for step in xrange(1000000): - sess.run(train_op) - ``` - -* Import and extend the graph. - - For example, we can first build an inference graph, export it as a meta graph: - - ```Python - # Creates an inference graph. 
- # Hidden 1 - images = tf.constant(1.2, tf.float32, shape=[100, 28]) - with tf.name_scope("hidden1"): - weights = tf.Variable( - tf.truncated_normal([28, 128], - stddev=1.0 / math.sqrt(float(28))), - name="weights") - biases = tf.Variable(tf.zeros([128]), - name="biases") - hidden1 = tf.nn.relu(tf.matmul(images, weights) + biases) - # Hidden 2 - with tf.name_scope("hidden2"): - weights = tf.Variable( - tf.truncated_normal([128, 32], - stddev=1.0 / math.sqrt(float(128))), - name="weights") - biases = tf.Variable(tf.zeros([32]), - name="biases") - hidden2 = tf.nn.relu(tf.matmul(hidden1, weights) + biases) - # Linear - with tf.name_scope("softmax_linear"): - weights = tf.Variable( - tf.truncated_normal([32, 10], - stddev=1.0 / math.sqrt(float(32))), - name="weights") - biases = tf.Variable(tf.zeros([10]), - name="biases") - logits = tf.matmul(hidden2, weights) + biases - tf.add_to_collection("logits", logits) - - init_all_op = tf.global_variables_initializer() - - with tf.Session() as sess: - # Initializes all the variables. - sess.run(init_all_op) - # Runs to logit. - sess.run(logits) - # Creates a saver. - saver0 = tf.train.Saver() - saver0.save(sess, 'my-save-dir/my-model-10000') - # Generates MetaGraphDef. - saver0.export_meta_graph('my-save-dir/my-model-10000.meta') - ``` - - Then later import it and extend it to a training graph. - - ```Python - with tf.Session() as sess: - new_saver = tf.train.import_meta_graph('my-save-dir/my-model-10000.meta') - new_saver.restore(sess, 'my-save-dir/my-model-10000') - # Adds loss and train. - labels = tf.constant(0, tf.int32, shape=[100], name="labels") - batch_size = tf.size(labels) - logits = tf.get_collection("logits")[0] - loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, - logits=logits) - - tf.summary.scalar('loss', loss) - # Creates the gradient descent optimizer with the given learning rate. - optimizer = tf.train.GradientDescentOptimizer(0.01) - - # Runs train_op.
- train_op = optimizer.minimize(loss) - sess.run(train_op) - ``` - -* Import a graph with preset devices. - - Sometimes an exported meta graph is from a training environment that the - importer doesn't have. For example, the model might have been trained - on GPUs, or in a distributed environment with replicas. When importing - such models, it's useful to be able to clear the device settings in - the graph so that we can run it on locally available devices. This can - be achieved by calling `import_meta_graph` with the `clear_devices` - option set to `True`. - - ```Python - with tf.Session() as sess: - new_saver = tf.train.import_meta_graph('my-save-dir/my-model-10000.meta', - clear_devices=True) - new_saver.restore(sess, 'my-save-dir/my-model-10000') - ... - ``` - -* Import within the default graph. - - Sometimes you might want to run `export_meta_graph` and `import_meta_graph` - in codelab using the default graph. In that case, you need to reset - the default graph by calling `tf.reset_default_graph()` first before - running import. - - ```Python - meta_graph_def = tf.train.export_meta_graph() - ... - tf.reset_default_graph() - ... - tf.train.import_meta_graph(meta_graph_def) - ... - ``` - -* Retrieve Hyper Parameters - - ```Python - filename = ".".join([tf.train.latest_checkpoint(train_dir), "meta"]) - tf.train.import_meta_graph(filename) - hparams = tf.get_collection("hparams") - ``` diff --git a/tensorflow/docs_src/api_guides/python/nn.md b/tensorflow/docs_src/api_guides/python/nn.md deleted file mode 100644 index 40dda3941d..0000000000 --- a/tensorflow/docs_src/api_guides/python/nn.md +++ /dev/null @@ -1,418 +0,0 @@ -# Neural Network - -Note: Functions taking `Tensor` arguments can also take anything accepted by -`tf.convert_to_tensor`. - -[TOC] - -## Activation Functions - -The activation ops provide different types of nonlinearities for use in neural -networks. 
These include smooth nonlinearities (`sigmoid`, `tanh`, `elu`, `selu`, -`softplus`, and `softsign`), continuous but not everywhere differentiable -functions (`relu`, `relu6`, `crelu` and `relu_x`), and random regularization -(`dropout`). - -All activation ops apply componentwise, and produce a tensor of the same -shape as the input tensor. - -* `tf.nn.relu` -* `tf.nn.relu6` -* `tf.nn.crelu` -* `tf.nn.elu` -* `tf.nn.selu` -* `tf.nn.softplus` -* `tf.nn.softsign` -* `tf.nn.dropout` -* `tf.nn.bias_add` -* `tf.sigmoid` -* `tf.tanh` - -## Convolution - -The convolution ops sweep a 2-D filter over a batch of images, applying the -filter to each window of each image of the appropriate size. The different -ops trade off between generic vs. specific filters: - -* `conv2d`: Arbitrary filters that can mix channels together. -* `depthwise_conv2d`: Filters that operate on each channel independently. -* `separable_conv2d`: A depthwise spatial filter followed by a pointwise filter. - -Note that although these ops are called "convolution", they are strictly -speaking "cross-correlation" since the filter is combined with an input window -without reversing the filter. For details, see [the properties of -cross-correlation](https://en.wikipedia.org/wiki/Cross-correlation#Properties). - -The filter is applied to image patches of the same size as the filter and -strided according to the `strides` argument. `strides = [1, 1, 1, 1]` applies -the filter to a patch at every offset, `strides = [1, 2, 2, 1]` applies the -filter to every other image patch in each dimension, etc. - -Ignoring channels for the moment, assume that the 4-D `input` has shape -`[batch, in_height, in_width, ...]` and the 4-D `filter` has shape -`[filter_height, filter_width, ...]`. The spatial semantics of the -convolution ops depend on the padding scheme chosen: `'SAME'` or `'VALID'`. -Note that the padding values are always zero. - -First, consider the `'SAME'` padding scheme. 
A detailed explanation of the -reasoning behind it is given in -[these notes](#Notes_on_SAME_Convolution_Padding). Here, we summarize the -mechanics of this padding scheme. When using `'SAME'`, the output height and -width are computed as: - - out_height = ceil(float(in_height) / float(strides[1])) - out_width = ceil(float(in_width) / float(strides[2])) - -The total padding applied along the height and width is computed as: - - if (in_height % strides[1] == 0): - pad_along_height = max(filter_height - strides[1], 0) - else: - pad_along_height = max(filter_height - (in_height % strides[1]), 0) - if (in_width % strides[2] == 0): - pad_along_width = max(filter_width - strides[2], 0) - else: - pad_along_width = max(filter_width - (in_width % strides[2]), 0) - -Finally, the padding on the top, bottom, left and right are: - - pad_top = pad_along_height // 2 - pad_bottom = pad_along_height - pad_top - pad_left = pad_along_width // 2 - pad_right = pad_along_width - pad_left - -Note that the division by 2 means that there might be cases when the padding on -both sides (top vs bottom, right vs left) are off by one. In this case, the -bottom and right sides always get the one additional padded pixel. For example, -when `pad_along_height` is 5, we pad 2 pixels at the top and 3 pixels at the -bottom. Note that this is different from existing libraries such as cuDNN and -Caffe, which explicitly specify the number of padded pixels and always pad the -same number of pixels on both sides. - -For the `'VALID'` scheme, the output height and width are computed as: - - out_height = ceil(float(in_height - filter_height + 1) / float(strides[1])) - out_width = ceil(float(in_width - filter_width + 1) / float(strides[2])) - -and no padding is used. - -Given the output size and the padding, the output can be computed as - -$$ output[b, i, j, :] = - sum_{d_i, d_j} input[b, strides[1] * i + d_i - pad_{top},\ - strides[2] * j + d_j - pad_{left}, ...] 
* - filter[d_i, d_j,\ ...]$$ - -where any value outside the original input image region is considered zero ( -i.e. we pad zero values around the border of the image). - -Since `input` is 4-D, each `input[b, i, j, :]` is a vector. For `conv2d`, these -vectors are multiplied by the `filter[di, dj, :, :]` matrices to produce new -vectors. For `depthwise_conv2d`, each scalar component `input[b, i, j, k]` -is multiplied by a vector `filter[di, dj, k]`, and all the vectors are -concatenated. - -* `tf.nn.convolution` -* `tf.nn.conv2d` -* `tf.nn.depthwise_conv2d` -* `tf.nn.depthwise_conv2d_native` -* `tf.nn.separable_conv2d` -* `tf.nn.atrous_conv2d` -* `tf.nn.atrous_conv2d_transpose` -* `tf.nn.conv2d_transpose` -* `tf.nn.conv1d` -* `tf.nn.conv3d` -* `tf.nn.conv3d_transpose` -* `tf.nn.conv2d_backprop_filter` -* `tf.nn.conv2d_backprop_input` -* `tf.nn.conv3d_backprop_filter_v2` -* `tf.nn.depthwise_conv2d_native_backprop_filter` -* `tf.nn.depthwise_conv2d_native_backprop_input` - -## Pooling - -The pooling ops sweep a rectangular window over the input tensor, computing a -reduction operation for each window (average, max, or max with argmax). Each -pooling op uses rectangular windows of size `ksize` separated by offset -`strides`. For example, if `strides` is all ones every window is used, if -`strides` is all twos every other window is used in each dimension, etc. - -In detail, the output is - - output[i] = reduce(value[strides * i:strides * i + ksize]) - -where the indices also take into consideration the padding values. Please refer -to the `Convolution` section for details about the padding calculation. - -* `tf.nn.avg_pool` -* `tf.nn.max_pool` -* `tf.nn.max_pool_with_argmax` -* `tf.nn.avg_pool3d` -* `tf.nn.max_pool3d` -* `tf.nn.fractional_avg_pool` -* `tf.nn.fractional_max_pool` -* `tf.nn.pool` - -## Morphological filtering - -Morphological operators are non-linear filters used in image processing.
- -[Greyscale morphological dilation -](https://en.wikipedia.org/wiki/Dilation_(morphology)) -is the max-sum counterpart of standard sum-product convolution: - -$$ output[b, y, x, c] = - max_{dy, dx} input[b, - strides[1] * y + rates[1] * dy, - strides[2] * x + rates[2] * dx, - c] + - filter[dy, dx, c]$$ - -The `filter` is usually called structuring function. Max-pooling is a special -case of greyscale morphological dilation when the filter assumes all-zero -values (a.k.a. flat structuring function). - -[Greyscale morphological erosion -](https://en.wikipedia.org/wiki/Erosion_(morphology)) -is the min-sum counterpart of standard sum-product convolution: - -$$ output[b, y, x, c] = - min_{dy, dx} input[b, - strides[1] * y - rates[1] * dy, - strides[2] * x - rates[2] * dx, - c] - - filter[dy, dx, c]$$ - -Dilation and erosion are dual to each other. The dilation of the input signal -`f` by the structuring signal `g` is equal to the negation of the erosion of -`-f` by the reflected `g`, and vice versa. - -Striding and padding is carried out in exactly the same way as in standard -convolution. Please refer to the `Convolution` section for details. - -* `tf.nn.dilation2d` -* `tf.nn.erosion2d` -* `tf.nn.with_space_to_batch` - -## Normalization - -Normalization is useful to prevent neurons from saturating when inputs may -have varying scale, and to aid generalization. - -* `tf.nn.l2_normalize` -* `tf.nn.local_response_normalization` -* `tf.nn.sufficient_statistics` -* `tf.nn.normalize_moments` -* `tf.nn.moments` -* `tf.nn.weighted_moments` -* `tf.nn.fused_batch_norm` -* `tf.nn.batch_normalization` -* `tf.nn.batch_norm_with_global_normalization` - -## Losses - -The loss ops measure error between two tensors, or between a tensor and zero. -These can be used for measuring accuracy of a network in a regression task -or for regularization purposes (weight decay). 
- -* `tf.nn.l2_loss` -* `tf.nn.log_poisson_loss` - -## Classification - -TensorFlow provides several operations that help you perform classification. - -* `tf.nn.sigmoid_cross_entropy_with_logits` -* `tf.nn.softmax` -* `tf.nn.log_softmax` -* `tf.nn.softmax_cross_entropy_with_logits` -* `tf.nn.softmax_cross_entropy_with_logits_v2` - identical to the base - version, except it allows gradient propagation into the labels. -* `tf.nn.sparse_softmax_cross_entropy_with_logits` -* `tf.nn.weighted_cross_entropy_with_logits` - -## Embeddings - -TensorFlow provides library support for looking up values in embedding -tensors. - -* `tf.nn.embedding_lookup` -* `tf.nn.embedding_lookup_sparse` - -## Recurrent Neural Networks - -TensorFlow provides a number of methods for constructing Recurrent -Neural Networks. Most accept an `RNNCell`-subclassed object -(see the documentation for `tf.contrib.rnn`). - -* `tf.nn.dynamic_rnn` -* `tf.nn.bidirectional_dynamic_rnn` -* `tf.nn.raw_rnn` - -## Connectionist Temporal Classification (CTC) - -* `tf.nn.ctc_loss` -* `tf.nn.ctc_greedy_decoder` -* `tf.nn.ctc_beam_search_decoder` - -## Evaluation - -The evaluation ops are useful for measuring the performance of a network. -They are typically used at evaluation time. - -* `tf.nn.top_k` -* `tf.nn.in_top_k` - -## Candidate Sampling - -Do you want to train a multiclass or multilabel model with thousands -or millions of output classes (for example, a language model with a -large vocabulary)? Training with a full Softmax is slow in this case, -since all of the classes are evaluated for every training example. -Candidate Sampling training algorithms can speed up your step times by -only considering a small randomly-chosen subset of contrastive classes -(called candidates) for each batch of training examples. 
- -See our -[Candidate Sampling Algorithms -Reference](https://www.tensorflow.org/extras/candidate_sampling.pdf) - -### Sampled Loss Functions - -TensorFlow provides the following sampled loss functions for faster training. - -* `tf.nn.nce_loss` -* `tf.nn.sampled_softmax_loss` - -### Candidate Samplers - -TensorFlow provides the following samplers for randomly sampling candidate -classes when using one of the sampled loss functions above. - -* `tf.nn.uniform_candidate_sampler` -* `tf.nn.log_uniform_candidate_sampler` -* `tf.nn.learned_unigram_candidate_sampler` -* `tf.nn.fixed_unigram_candidate_sampler` - -### Miscellaneous candidate sampling utilities - -* `tf.nn.compute_accidental_hits` - -### Quantization ops - -* `tf.nn.quantized_conv2d` -* `tf.nn.quantized_relu_x` -* `tf.nn.quantized_max_pool` -* `tf.nn.quantized_avg_pool` - -## Notes on SAME Convolution Padding - -In these notes, we provide more background on the use of the `'SAME'` padding -scheme for convolution operations. - -Tensorflow uses the smallest possible padding to achieve the desired output -size. To understand what is done, consider the \\(1\\)-dimensional case. Denote -\\(n_i\\) and \\(n_o\\) the input and output sizes, respectively, and denote the -kernel size \\(k\\) and stride \\(s\\). As discussed in the -[Convolution section](#Convolution), for `'SAME'`, -\\(n_o = \left \lceil{\frac{n_i}{s}}\right \rceil\\). - -To achieve a desired output size \\(n_o\\), we need to pad the input such that the -output size after a `'VALID'` convolution is \\(n_o\\). In other words, we need to -have padding \\(p_i\\) such that: - -\begin{equation} -\left \lceil{\frac{n_i + p_i - k + 1}{s}}\right \rceil = n_o -\label{eq:tf_pad_1} -\end{equation} - -What is the smallest \\(p_i\\) that we could possibly use? 
In general, \\(\left -\lceil{\frac{x}{a}}\right \rceil = b\\) (with \\(a > 0\\)) means that \\(b-1 < -\frac{x}{a} \leq b\\), and the smallest integer \\(x\\) we can choose to satisfy -this is \\(x = a\cdot (b-1) + 1\\). The same applies to our problem; we need -\\(p_i\\) such that: - -\begin{equation} -n_i + p_i - k + 1 = s\cdot (n_o - 1) + 1 -\label{eq:tf_pad_2} -\end{equation} - -which leads to: - -\begin{equation} -p_i = s\cdot (n_o - 1) + k - n_i -\label{eq:tf_pad_3} -\end{equation} - -Note that this might lead to negative \\(p_i\\), since in some cases we might -already have more input samples than we actually need. Thus, - -\begin{equation} -p_i = max(s\cdot (n_o - 1) + k - n_i, 0) -\label{eq:tf_pad_4} -\end{equation} - -Remember that, for `'SAME'` padding, -\\(n_o = \left \lceil{\frac{n_i}{s}}\right \rceil\\), as mentioned above. -We need to analyze in detail two cases: - -- \\(n_i \text{ mod } s = 0\\) - -In this simple case, \\(n_o = \frac{n_i}{s}\\), and the expression for \\(p_i\\) -becomes: - -\begin{equation} -p_i = max(k - s, 0) -\label{eq:tf_pad_5} -\end{equation} - -- \\(n_i \text{ mod } s \neq 0\\) - -This case is more involved to parse. 
First, we write: - -\begin{equation} -n_i = s\cdot\left \lceil{\frac{n_i}{s}}\right \rceil -- s \left(\left \lceil{\frac{n_i}{s}}\right \rceil - - \left \lfloor{\frac{n_i}{s}}\right \rfloor\right) -+ (n_i \text{ mod } s) -\label{eq:tf_pad_6} -\end{equation} - -For the case where \\((n_i \text{ mod } s) \neq 0\\), we have \\(\left -\lceil{\frac{n_i}{s}}\right \rceil -\left \lfloor{\frac{n_i}{s}}\right \rfloor = -1\\), leading to: - -\begin{equation} -n_i = s\cdot\left \lceil{\frac{n_i}{s}}\right \rceil -- s -+ (n_i \text{ mod } s) -\label{eq:tf_pad_7} -\end{equation} - -We can use this expression to substitute \\(n_o = \left -\lceil{\frac{n_i}{s}}\right \rceil\\) and get: - -$$\begin{align} -p_i &= max\left(s\cdot \left(\frac{n_i + s - (n_i \text{ mod } s)}{s} - - 1\right) + k - n_i, 0\right) \nonumber\\ -&= max(n_i + s - (n_i \text{ mod } s) - s + k - n_i,0) \nonumber \\ -&= max(k - (n_i \text{ mod } s),0) -\label{eq:tf_pad_8} -\end{align}$$ - -### Final expression - -Putting all together, the total padding used by tensorflow's convolution with -`'SAME'` mode is: - -$$\begin{align} -p_i = - \begin{cases} - max(k - s, 0), & \text{if $(n_i \text{ mod } s) = 0$} \\ - max(k - (n_i \text{ mod } s),0), & \text{if $(n_i \text{ mod } s) \neq 0$} - \end{cases} - \label{eq:tf_pad_9} -\end{align}$$ - -This expression is exactly equal to the ones presented for `pad_along_height` -and `pad_along_width` in the [Convolution section](#Convolution). diff --git a/tensorflow/docs_src/api_guides/python/python_io.md b/tensorflow/docs_src/api_guides/python/python_io.md deleted file mode 100644 index e7e82a8701..0000000000 --- a/tensorflow/docs_src/api_guides/python/python_io.md +++ /dev/null @@ -1,29 +0,0 @@ -# Data IO (Python functions) -[TOC] - -A TFRecords file represents a sequence of (binary) strings. The format is not -random access, so it is suitable for streaming large amounts of data but not -suitable if fast sharding or other non-sequential access is desired. 
- -* `tf.python_io.TFRecordWriter` -* `tf.python_io.tf_record_iterator` -* `tf.python_io.TFRecordCompressionType` -* `tf.python_io.TFRecordOptions` - -- - - - -## TFRecords Format Details - -A TFRecords file contains a sequence of strings with CRC32C (32-bit CRC using -the Castagnoli polynomial) hashes. Each record has the format - - uint64 length - uint32 masked_crc32_of_length - byte data[length] - uint32 masked_crc32_of_data - -and the records are concatenated together to produce the file. CRCs are -[described here](https://en.wikipedia.org/wiki/Cyclic_redundancy_check), and -the mask of a CRC is - - masked_crc = ((crc >> 15) | (crc << 17)) + 0xa282ead8ul diff --git a/tensorflow/docs_src/api_guides/python/reading_data.md b/tensorflow/docs_src/api_guides/python/reading_data.md deleted file mode 100644 index 9f555ee85d..0000000000 --- a/tensorflow/docs_src/api_guides/python/reading_data.md +++ /dev/null @@ -1,522 +0,0 @@ -# Reading data - -Note: The preferred way to feed data into a tensorflow program is using the -[`tf.data` API](../../guide/datasets.md). - -There are four methods of getting data into a TensorFlow program: - -* `tf.data` API: Easily construct a complex input pipeline. (preferred method) -* Feeding: Python code provides the data when running each step. -* `QueueRunner`: a queue-based input pipeline reads the data from files - at the beginning of a TensorFlow graph. -* Preloaded data: a constant or variable in the TensorFlow graph holds - all the data (for small data sets). - -[TOC] - -## `tf.data` API - -See the [Importing Data](../../guide/datasets.md) for an in-depth explanation of `tf.data.Dataset`. -The `tf.data` API enables you to extract and preprocess data -from different input/file formats, and apply transformations such as batching, -shuffling, and mapping functions over the dataset. This is an improved version -of the old input methods---feeding and `QueueRunner`---which are described -below for historical purposes. 
- -## Feeding - -Warning: "Feeding" is the least efficient way to feed data into a TensorFlow -program and should only be used for small experiments and debugging. - -TensorFlow's feed mechanism lets you inject data into any Tensor in a -computation graph. A Python computation can thus feed data directly into the -graph. - -Supply feed data through the `feed_dict` argument to a run() or eval() call -that initiates computation. - -```python -with tf.Session(): - input = tf.placeholder(tf.float32) - classifier = ... - print(classifier.eval(feed_dict={input: my_python_preprocessing_fn()})) -``` - -While you can replace any Tensor with feed data, including variables and -constants, the best practice is to use a -`tf.placeholder` node. A -`placeholder` exists solely to serve as the target of feeds. It is not -initialized and contains no data. A placeholder generates an error if -it is executed without a feed, so you won't forget to feed it. - -An example using `placeholder` and feeding to train on MNIST data can be found -in -[`tensorflow/examples/tutorials/mnist/fully_connected_feed.py`](https://www.tensorflow.org/code/tensorflow/examples/tutorials/mnist/fully_connected_feed.py). - -## `QueueRunner` - -Warning: This section discusses implementing input pipelines using the -queue-based APIs which can be cleanly replaced by the [`tf.data` -API](../../guide/datasets.md). - -A typical queue-based pipeline for reading records from files has the following stages: - -1. The list of filenames -2. *Optional* filename shuffling -3. *Optional* epoch limit -4. Filename queue -5. A Reader for the file format -6. A decoder for a record read by the reader -7. *Optional* preprocessing -8. Example queue - -### Filenames, shuffling, and epoch limits - -For the list of filenames, use either a constant string Tensor (like -`["file0", "file1"]` or `[("file%d" % i) for i in range(2)]`) or the -`tf.train.match_filenames_once` function. 
- -Pass the list of filenames to the `tf.train.string_input_producer` function. -`string_input_producer` creates a FIFO queue for holding the filenames until -the reader needs them. - -`string_input_producer` has options for shuffling and setting a maximum number -of epochs. A queue runner adds the whole list of filenames to the queue once -for each epoch, shuffling the filenames within an epoch if `shuffle=True`. -This procedure provides a uniform sampling of files, so that examples are not -under- or over- sampled relative to each other. - -The queue runner works in a thread separate from the reader that pulls -filenames from the queue, so the shuffling and enqueuing process does not -block the reader. - -### File formats - -Select the reader that matches your input file format and pass the filename -queue to the reader's read method. The read method outputs a key identifying -the file and record (useful for debugging if you have some weird records), and -a scalar string value. Use one (or more) of the decoder and conversion ops to -decode this string into the tensors that make up an example. - -#### CSV files - -To read text files in [comma-separated value (CSV) -format](https://tools.ietf.org/html/rfc4180), use a -`tf.TextLineReader` with the -`tf.decode_csv` operation. For example: - -```python -filename_queue = tf.train.string_input_producer(["file0.csv", "file1.csv"]) - -reader = tf.TextLineReader() -key, value = reader.read(filename_queue) - -# Default values, in case of empty columns. Also specifies the type of the -# decoded result. -record_defaults = [[1], [1], [1], [1], [1]] -col1, col2, col3, col4, col5 = tf.decode_csv( - value, record_defaults=record_defaults) -features = tf.stack([col1, col2, col3, col4]) - -with tf.Session() as sess: - # Start populating the filename queue. 
- coord = tf.train.Coordinator() - threads = tf.train.start_queue_runners(coord=coord) - - for i in range(1200): - # Retrieve a single instance: - example, label = sess.run([features, col5]) - - coord.request_stop() - coord.join(threads) -``` - -Each execution of `read` reads a single line from the file. The -`decode_csv` op then parses the result into a list of tensors. The -`record_defaults` argument determines the type of the resulting tensors and -sets the default value to use if a value is missing in the input string. - -You must call `tf.train.start_queue_runners` to populate the queue before -you call `run` or `eval` to execute the `read`. Otherwise `read` will -block while it waits for filenames from the queue. - -#### Fixed length records - -To read binary files in which each record is a fixed number of bytes, use -`tf.FixedLengthRecordReader` -with the `tf.decode_raw` operation. -The `decode_raw` op converts from a string to a uint8 tensor. - -For example, [the CIFAR-10 dataset](http://www.cs.toronto.edu/~kriz/cifar.html) -uses a file format where each record is represented using a fixed number of -bytes: 1 byte for the label followed by 3072 bytes of image data. Once you have -a uint8 tensor, standard operations can slice out each piece and reformat as -needed. For CIFAR-10, you can see how to do the reading and decoding in -[`tensorflow_models/tutorials/image/cifar10/cifar10_input.py`](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/cifar10_input.py) -and described in -[this tutorial](../../tutorials/images/deep_cnn.md#prepare-the-data). - -#### Standard TensorFlow format - -Another approach is to convert whatever data you have into a supported format. -This approach makes it easier to mix and match data sets and network -architectures. 
The recommended format for TensorFlow is a -[TFRecords file](../../api_guides/python/python_io.md#tfrecords_format_details) -containing -[`tf.train.Example` protocol buffers](https://www.tensorflow.org/code/tensorflow/core/example/example.proto) -(which contain -[`Features`](https://www.tensorflow.org/code/tensorflow/core/example/feature.proto) -as a field). You write a little program that gets your data, stuffs it in an -`Example` protocol buffer, serializes the protocol buffer to a string, and then -writes the string to a TFRecords file using the -`tf.python_io.TFRecordWriter`. -For example, -[`tensorflow/examples/how_tos/reading_data/convert_to_records.py`](https://www.tensorflow.org/code/tensorflow/examples/how_tos/reading_data/convert_to_records.py) -converts MNIST data to this format. - -The recommended way to read a TFRecord file is with a `tf.data.TFRecordDataset`, [as in this example](https://www.tensorflow.org/code/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py): - -``` python - dataset = tf.data.TFRecordDataset(filename) - dataset = dataset.repeat(num_epochs) - - # map takes a python function and applies it to every sample - dataset = dataset.map(decode) -``` - -To accomplish the same task with a queue based input pipeline requires the following code -(using the same `decode` function from the above example): - -``` python - filename_queue = tf.train.string_input_producer([filename], num_epochs=num_epochs) - reader = tf.TFRecordReader() - _, serialized_example = reader.read(filename_queue) - image,label = decode(serialized_example) -``` - -### Preprocessing - -You can then do any preprocessing of these examples you want. This would be any -processing that doesn't depend on trainable parameters. Examples include -normalization of your data, picking a random slice, adding noise or distortions, -etc. 
See -[`tensorflow_models/tutorials/image/cifar10/cifar10_input.py`](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/cifar10_input.py) -for an example. - -### Batching - -At the end of the pipeline we use another queue to batch together examples for -training, evaluation, or inference. For this we use a queue that randomizes the -order of examples, using the -`tf.train.shuffle_batch`. - -Example: - -``` -def read_my_file_format(filename_queue): - reader = tf.SomeReader() - key, record_string = reader.read(filename_queue) - example, label = tf.some_decoder(record_string) - processed_example = some_processing(example) - return processed_example, label - -def input_pipeline(filenames, batch_size, num_epochs=None): - filename_queue = tf.train.string_input_producer( - filenames, num_epochs=num_epochs, shuffle=True) - example, label = read_my_file_format(filename_queue) - # min_after_dequeue defines how big a buffer we will randomly sample - # from -- bigger means better shuffling but slower start up and more - # memory used. - # capacity must be larger than min_after_dequeue and the amount larger - # determines the maximum we will prefetch. Recommendation: - # min_after_dequeue + (num_threads + a small safety margin) * batch_size - min_after_dequeue = 10000 - capacity = min_after_dequeue + 3 * batch_size - example_batch, label_batch = tf.train.shuffle_batch( - [example, label], batch_size=batch_size, capacity=capacity, - min_after_dequeue=min_after_dequeue) - return example_batch, label_batch -``` - -If you need more parallelism or shuffling of examples between files, use -multiple reader instances using the -`tf.train.shuffle_batch_join`. 
-For example: - -``` -def read_my_file_format(filename_queue): - # Same as above - -def input_pipeline(filenames, batch_size, read_threads, num_epochs=None): - filename_queue = tf.train.string_input_producer( - filenames, num_epochs=num_epochs, shuffle=True) - example_list = [read_my_file_format(filename_queue) - for _ in range(read_threads)] - min_after_dequeue = 10000 - capacity = min_after_dequeue + 3 * batch_size - example_batch, label_batch = tf.train.shuffle_batch_join( - example_list, batch_size=batch_size, capacity=capacity, - min_after_dequeue=min_after_dequeue) - return example_batch, label_batch -``` - -You still only use a single filename queue that is shared by all the readers. -That way we ensure that the different readers use different files from the same -epoch until all the files from the epoch have been started. (It is also usually -sufficient to have a single thread filling the filename queue.) - -An alternative is to use a single reader via the -`tf.train.shuffle_batch` -with `num_threads` bigger than 1. This will make it read from a single file at -the same time (but faster than with 1 thread), instead of N files at once. -This can be important: - -* If you have more reading threads than input files, to avoid the risk that - you will have two threads reading the same example from the same file near - each other. -* Or if reading N files in parallel causes too many disk seeks. - -How many threads do you need? the `tf.train.shuffle_batch*` functions add a -summary to the graph that indicates how full the example queue is. If you have -enough reading threads, that summary will stay above zero. You can -[view your summaries as training progresses using TensorBoard](../../guide/summaries_and_tensorboard.md). - -### Creating threads to prefetch using `QueueRunner` objects - -The short version: many of the `tf.train` functions listed above add -`tf.train.QueueRunner` objects to your -graph. 
These require that you call -`tf.train.start_queue_runners` -before running any training or inference steps, or it will hang forever. This -will start threads that run the input pipeline, filling the example queue so -that the dequeue to get the examples will succeed. This is best combined with a -`tf.train.Coordinator` to cleanly -shut down these threads when there are errors. If you set a limit on the number -of epochs, that will use an epoch counter that will need to be initialized. The -recommended code pattern combining these is: - -```python -# Create the graph, etc. -init_op = tf.global_variables_initializer() - -# Create a session for running operations in the Graph. -sess = tf.Session() - -# Initialize the variables (like the epoch counter). -sess.run(init_op) - -# Start input enqueue threads. -coord = tf.train.Coordinator() -threads = tf.train.start_queue_runners(sess=sess, coord=coord) - -try: - while not coord.should_stop(): - # Run training steps or whatever - sess.run(train_op) - -except tf.errors.OutOfRangeError: - print('Done training -- epoch limit reached') -finally: - # When done, ask the threads to stop. - coord.request_stop() - -# Wait for threads to finish. -coord.join(threads) -sess.close() -``` - -#### Aside: What is happening here? - -First we create the graph. It will have a few pipeline stages that are -connected by queues. The first stage will generate filenames to read and enqueue -them in the filename queue. The second stage consumes filenames (using a -`Reader`), produces examples, and enqueues them in an example queue. Depending -on how you have set things up, you may actually have a few independent copies of -the second stage, so that you can read from multiple files in parallel. At the -end of these stages is an enqueue operation, which enqueues into a queue that -the next stage dequeues from. We want to start threads running these enqueuing -operations, so that our training loop can dequeue examples from the example -queue. - -
- -
- -The helpers in `tf.train` that create these queues and enqueuing operations add -a `tf.train.QueueRunner` to the -graph using the -`tf.train.add_queue_runner` -function. Each `QueueRunner` is responsible for one stage, and holds the list of -enqueue operations that need to be run in threads. Once the graph is -constructed, the -`tf.train.start_queue_runners` -function asks each QueueRunner in the graph to start its threads running the -enqueuing operations. - -If all goes well, you can now run your training steps and the queues will be -filled by the background threads. If you have set an epoch limit, at some point -an attempt to dequeue examples will get an -`tf.errors.OutOfRangeError`. This -is the TensorFlow equivalent of "end of file" (EOF) -- this means the epoch -limit has been reached and no more examples are available. - -The last ingredient is the -`tf.train.Coordinator`. This is responsible -for letting all the threads know if anything has signaled a shut down. Most -commonly this would be because an exception was raised, for example one of the -threads got an error when running some operation (or an ordinary Python -exception). - -For more about threading, queues, QueueRunners, and Coordinators -[see here](../../api_guides/python/threading_and_queues.md). - -#### Aside: How clean shut-down when limiting epochs works - -Imagine you have a model that has set a limit on the number of epochs to train -on. That means that the thread generating filenames will only run that many -times before generating an `OutOfRange` error. The QueueRunner will catch that -error, close the filename queue, and exit the thread. Closing the queue does two -things: - -* Any future attempt to enqueue in the filename queue will generate an error. - At this point there shouldn't be any threads trying to do that, but this - is helpful when queues are closed due to other errors. 
-* Any current or future dequeue will either succeed (if there are enough - elements left) or fail (with an `OutOfRange` error) immediately. They won't - block waiting for more elements to be enqueued, since by the previous point - that can't happen. - -The point is that when the filename queue is closed, there will likely still be -many filenames in that queue, so the next stage of the pipeline (with the reader -and other preprocessing) may continue running for some time. Once the filename -queue is exhausted, though, the next attempt to dequeue a filename (e.g. from a -reader that has finished with the file it was working on) will trigger an -`OutOfRange` error. In this case, though, you might have multiple threads -associated with a single QueueRunner. If this isn't the last thread in the -QueueRunner, the `OutOfRange` error just causes the one thread to exit. This -allows the other threads, which are still finishing up their last file, to -proceed until they finish as well. (Assuming you are using a -`tf.train.Coordinator`, -other types of errors will cause all the threads to stop.) Once all the reader -threads hit the `OutOfRange` error, only then does the next queue, the example -queue, get closed. - -Again, the example queue will have some elements queued, so training will -continue until those are exhausted. If the example queue is a -`tf.RandomShuffleQueue`, say -because you are using `shuffle_batch` or `shuffle_batch_join`, it normally will -avoid ever having fewer than its `min_after_dequeue` attr elements buffered. -However, once the queue is closed that restriction will be lifted and the queue -will eventually empty. At that point the actual training threads, when they -try and dequeue from the example queue, will start getting `OutOfRange` errors and -exiting. Once all the training threads are done, -`tf.train.Coordinator.join` -will return and you can exit cleanly.
- -### Filtering records or producing multiple examples per record - -Instead of examples with shapes `[x, y, z]`, you will produce a batch of -examples with shape `[batch, x, y, z]`. The batch size can be 0 if you want to -filter this record out (maybe it is in a hold-out set?), or bigger than 1 if you -are producing multiple examples per record. Then simply set `enqueue_many=True` -when calling one of the batching functions (such as `shuffle_batch` or -`shuffle_batch_join`). - -### Sparse input data - -SparseTensors don't play well with queues. If you use SparseTensors you have -to decode the string records using -`tf.parse_example` **after** -batching (instead of using `tf.parse_single_example` before batching). - -## Preloaded data - -This is only used for small data sets that can be loaded entirely in memory. -There are two approaches: - -* Store the data in a constant. -* Store the data in a variable, that you initialize (or assign to) and then - never change. - -Using a constant is a bit simpler, but uses more memory (since the constant is -stored inline in the graph data structure, which may be duplicated a few times). - -```python -training_data = ... -training_labels = ... -with tf.Session(): - input_data = tf.constant(training_data) - input_labels = tf.constant(training_labels) - ... -``` - -To instead use a variable, you need to also initialize it after the graph has been built. - -```python -training_data = ... -training_labels = ... -with tf.Session() as sess: - data_initializer = tf.placeholder(dtype=training_data.dtype, - shape=training_data.shape) - label_initializer = tf.placeholder(dtype=training_labels.dtype, - shape=training_labels.shape) - input_data = tf.Variable(data_initializer, trainable=False, collections=[]) - input_labels = tf.Variable(label_initializer, trainable=False, collections=[]) - ... 
- sess.run(input_data.initializer, - feed_dict={data_initializer: training_data}) - sess.run(input_labels.initializer, - feed_dict={label_initializer: training_labels}) -``` - -Setting `trainable=False` keeps the variable out of the -`GraphKeys.TRAINABLE_VARIABLES` collection in the graph, so we won't try and -update it when training. Setting `collections=[]` keeps the variable out of the -`GraphKeys.GLOBAL_VARIABLES` collection used for saving and restoring checkpoints. - -Either way, -`tf.train.slice_input_producer` -can be used to produce a slice at a time. This shuffles the examples across an -entire epoch, so further shuffling when batching is undesirable. So instead of -using the `shuffle_batch` functions, we use the plain -`tf.train.batch` function. To use -multiple preprocessing threads, set the `num_threads` parameter to a number -bigger than 1. - -An MNIST example that preloads the data using constants can be found in -[`tensorflow/examples/how_tos/reading_data/fully_connected_preloaded.py`](https://www.tensorflow.org/code/tensorflow/examples/how_tos/reading_data/fully_connected_preloaded.py), and one that preloads the data using variables can be found in -[`tensorflow/examples/how_tos/reading_data/fully_connected_preloaded_var.py`](https://www.tensorflow.org/code/tensorflow/examples/how_tos/reading_data/fully_connected_preloaded_var.py), -You can compare these with the `fully_connected_feed` and -`fully_connected_reader` versions above. - -## Multiple input pipelines - -Commonly you will want to train on one dataset and evaluate (or "eval") on -another. One way to do this is to actually have two separate graphs and -sessions, maybe in separate processes: - -* The training process reads training input data and periodically writes - checkpoint files with all the trained variables. -* The evaluation process restores the checkpoint files into an inference - model that reads validation input data. 
- -This is what is done `tf.estimator` and manually in -[the example CIFAR-10 model](../../tutorials/images/deep_cnn.md#save-and-restore-checkpoints). -This has a couple of benefits: - -* The eval is performed on a single snapshot of the trained variables. -* You can perform the eval even after training has completed and exited. - -You can have the train and eval in the same graph in the same process, and share -their trained variables or layers. See [the shared variables tutorial](../../guide/variables.md). - -To support the single-graph approach -[`tf.data`](../../guide/datasets.md) also supplies -[advanced iterator types](../../guide/datasets.md#creating_an_iterator) that -that allow the user to change the input pipeline without rebuilding the graph or -session. - -Note: Regardless of the implementation, many -operations (like `tf.layers.batch_normalization`, and `tf.layers.dropout`) -need to know if they are in training or evaluation mode, and you must be -careful to set this appropriately if you change the data source. diff --git a/tensorflow/docs_src/api_guides/python/regression_examples.md b/tensorflow/docs_src/api_guides/python/regression_examples.md deleted file mode 100644 index d67f38f57a..0000000000 --- a/tensorflow/docs_src/api_guides/python/regression_examples.md +++ /dev/null @@ -1,232 +0,0 @@ -# Regression Examples - -This unit provides the following short examples demonstrating how -to implement regression in Estimators: - - - - - - - - - - - - - - - - - - - - - - - - -
Example Demonstrates How To...
linear_regression.pyUse the `tf.estimator.LinearRegressor` Estimator to train a - regression model on numeric data.
linear_regression_categorical.pyUse the `tf.estimator.LinearRegressor` Estimator to train a - regression model on categorical data.
dnn_regression.pyUse the `tf.estimator.DNNRegressor` Estimator to train a - regression model on discrete data with a deep neural network.
custom_regression.pyUse `tf.estimator.Estimator` to train a customized dnn - regression model.
- -The preceding examples rely on the following data set utility: - - - - - - - - - - -
Utility Description
imports85.pyThis program provides utility functions that load the - imports85 data set into formats that other TensorFlow - programs (for example, linear_regression.py and - dnn_regression.py) can use.
- - - - - - - - -## Running the examples - -You must [install TensorFlow](../../install/index.md) prior to running these examples. -Depending on the way you've installed TensorFlow, you might also -need to activate your TensorFlow environment. Then, do the following: - -1. Clone the TensorFlow repository from github. -2. `cd` to the top of the downloaded tree. -3. Check out the branch for you current tensorflow version: `git checkout rX.X` -4. `cd tensorflow/examples/get_started/regression`. - -You can now run any of the example TensorFlow programs in the -`tensorflow/examples/get_started/regression` directory as you -would run any Python program: - -```bsh -python linear_regressor.py -``` - -During training, all three programs output the following information: - -* The name of the checkpoint directory, which is important for TensorBoard. -* The training loss after every 100 iterations, which helps you - determine whether the model is converging. - -For example, here's some possible output for the `linear_regressor.py` -program: - -``` None -INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmpAObiz9/model.ckpt. -INFO:tensorflow:loss = 161.308, step = 1 -INFO:tensorflow:global_step/sec: 1557.24 -INFO:tensorflow:loss = 15.7937, step = 101 (0.065 sec) -INFO:tensorflow:global_step/sec: 1529.17 -INFO:tensorflow:loss = 12.1988, step = 201 (0.065 sec) -INFO:tensorflow:global_step/sec: 1663.86 -... -INFO:tensorflow:loss = 6.99378, step = 901 (0.058 sec) -INFO:tensorflow:Saving checkpoints for 1000 into /tmp/tmpAObiz9/model.ckpt. -INFO:tensorflow:Loss for final step: 5.12413. -``` - - - -## linear_regressor.py - -`linear_regressor.py` trains a model that predicts the price of a car from -two numerical features. - - - - - - - - - - - - - - - - - - - - -
EstimatorLinearRegressor, which is a pre-made Estimator for linear - regression.
FeaturesNumerical: curb-weight and highway-mpg.
LabelNumerical: price. -
AlgorithmLinear regression.
- -After training the model, the program concludes by outputting predicted -car prices for two car models. - - - - -## linear_regression_categorical.py - -This program illustrates ways to represent categorical features. It -also demonstrates how to train a linear model based on a mix of -categorical and numerical features. - - - - - - - - - - - - - - - - - - - - - -
EstimatorLinearRegressor, which is a pre-made Estimator for linear - regression.
FeaturesCategorical: body-style and make.
- Numerical: curb-weight and highway-mpg.
LabelNumerical: price.
AlgorithmLinear regression.
- - - -## dnn_regression.py - -Like `linear_regression_categorical.py`, the `dnn_regression.py` example -trains a model that predicts the price of a car from two features. -Unlike `linear_regression_categorical.py`, the `dnn_regression.py` example uses -a deep neural network to train the model. Both examples rely on the same -features; `dnn_regression.py` demonstrates how to treat categorical features -in a deep neural network. - - - - - - - - - - - - - - - - - - - - - -
EstimatorDNNRegressor, which is a pre-made Estimator for - regression that relies on a deep neural network. The - `hidden_units` parameter defines the topology of the network.
FeaturesCategorical: body-style and make.
- Numerical: curb-weight and highway-mpg.
LabelNumerical: price.
AlgorithmRegression through a deep neural network.
- -After printing loss values, the program outputs the Mean Square Error -on a test set. - - - -## custom_regression.py - -The `custom_regression.py` example also trains a model that predicts the price -of a car based on mixed real-valued and categorical input features, described by -feature_columns. Unlike `linear_regression_categorical.py`, and -`dnn_regression.py` this example does not use a pre-made estimator, but defines -a custom model using the base `tf.estimator.Estimator` class. The -custom model is quite similar to the model defined by `dnn_regression.py`. - -The custom model is defined by the `model_fn` argument to the constructor. The -customization is made more reusable through `params` dictionary, which is later -passed through to the `model_fn` when the `model_fn` is called. - -The `model_fn` returns an -`tf.estimator.EstimatorSpec` which is a simple structure -indicating to the `Estimator` which operations should be run to accomplish -various tasks. diff --git a/tensorflow/docs_src/api_guides/python/session_ops.md b/tensorflow/docs_src/api_guides/python/session_ops.md deleted file mode 100644 index 5f41bcf209..0000000000 --- a/tensorflow/docs_src/api_guides/python/session_ops.md +++ /dev/null @@ -1,15 +0,0 @@ -# Tensor Handle Operations - -Note: Functions taking `Tensor` arguments can also take anything accepted by -`tf.convert_to_tensor`. - -[TOC] - -## Tensor Handle Operations - -TensorFlow provides several operators that allows the user to keep tensors -"in-place" across run calls. - -* `tf.get_session_handle` -* `tf.get_session_tensor` -* `tf.delete_session_tensor` diff --git a/tensorflow/docs_src/api_guides/python/sparse_ops.md b/tensorflow/docs_src/api_guides/python/sparse_ops.md deleted file mode 100644 index b360055ed0..0000000000 --- a/tensorflow/docs_src/api_guides/python/sparse_ops.md +++ /dev/null @@ -1,45 +0,0 @@ -# Sparse Tensors - -Note: Functions taking `Tensor` arguments can also take anything accepted by -`tf.convert_to_tensor`. 
- -[TOC] - -## Sparse Tensor Representation - -TensorFlow supports a `SparseTensor` representation for data that is sparse -in multiple dimensions. Contrast this representation with `IndexedSlices`, -which is efficient for representing tensors that are sparse in their first -dimension, and dense along all other dimensions. - -* `tf.SparseTensor` -* `tf.SparseTensorValue` - -## Conversion - -* `tf.sparse_to_dense` -* `tf.sparse_tensor_to_dense` -* `tf.sparse_to_indicator` -* `tf.sparse_merge` - -## Manipulation - -* `tf.sparse_concat` -* `tf.sparse_reorder` -* `tf.sparse_reshape` -* `tf.sparse_split` -* `tf.sparse_retain` -* `tf.sparse_reset_shape` -* `tf.sparse_fill_empty_rows` -* `tf.sparse_transpose` - -## Reduction -* `tf.sparse_reduce_sum` -* `tf.sparse_reduce_sum_sparse` - -## Math Operations -* `tf.sparse_add` -* `tf.sparse_softmax` -* `tf.sparse_tensor_dense_matmul` -* `tf.sparse_maximum` -* `tf.sparse_minimum` diff --git a/tensorflow/docs_src/api_guides/python/spectral_ops.md b/tensorflow/docs_src/api_guides/python/spectral_ops.md deleted file mode 100644 index f6d109a3a0..0000000000 --- a/tensorflow/docs_src/api_guides/python/spectral_ops.md +++ /dev/null @@ -1,26 +0,0 @@ -# Spectral Functions - -[TOC] - -The `tf.spectral` module supports several spectral decomposition operations -that you can use to transform Tensors of real and complex signals. 
- -## Discrete Fourier Transforms - -* `tf.spectral.fft` -* `tf.spectral.ifft` -* `tf.spectral.fft2d` -* `tf.spectral.ifft2d` -* `tf.spectral.fft3d` -* `tf.spectral.ifft3d` -* `tf.spectral.rfft` -* `tf.spectral.irfft` -* `tf.spectral.rfft2d` -* `tf.spectral.irfft2d` -* `tf.spectral.rfft3d` -* `tf.spectral.irfft3d` - -## Discrete Cosine Transforms - -* `tf.spectral.dct` -* `tf.spectral.idct` diff --git a/tensorflow/docs_src/api_guides/python/state_ops.md b/tensorflow/docs_src/api_guides/python/state_ops.md deleted file mode 100644 index fc55ea1481..0000000000 --- a/tensorflow/docs_src/api_guides/python/state_ops.md +++ /dev/null @@ -1,110 +0,0 @@ -# Variables - -Note: Functions taking `Tensor` arguments can also take anything accepted by -`tf.convert_to_tensor`. - -[TOC] - -## Variables - -* `tf.Variable` - -## Variable helper functions - -TensorFlow provides a set of functions to help manage the set of variables -collected in the graph. - -* `tf.global_variables` -* `tf.local_variables` -* `tf.model_variables` -* `tf.trainable_variables` -* `tf.moving_average_variables` -* `tf.global_variables_initializer` -* `tf.local_variables_initializer` -* `tf.variables_initializer` -* `tf.is_variable_initialized` -* `tf.report_uninitialized_variables` -* `tf.assert_variables_initialized` -* `tf.assign` -* `tf.assign_add` -* `tf.assign_sub` - -## Saving and Restoring Variables - -* `tf.train.Saver` -* `tf.train.latest_checkpoint` -* `tf.train.get_checkpoint_state` -* `tf.train.update_checkpoint_state` - -## Sharing Variables - -TensorFlow provides several classes and operations that you can use to -create variables contingent on certain conditions. 
- -* `tf.get_variable` -* `tf.get_local_variable` -* `tf.VariableScope` -* `tf.variable_scope` -* `tf.variable_op_scope` -* `tf.get_variable_scope` -* `tf.make_template` -* `tf.no_regularizer` -* `tf.constant_initializer` -* `tf.random_normal_initializer` -* `tf.truncated_normal_initializer` -* `tf.random_uniform_initializer` -* `tf.uniform_unit_scaling_initializer` -* `tf.zeros_initializer` -* `tf.ones_initializer` -* `tf.orthogonal_initializer` - -## Variable Partitioners for Sharding - -* `tf.fixed_size_partitioner` -* `tf.variable_axis_size_partitioner` -* `tf.min_max_variable_partitioner` - -## Sparse Variable Updates - -The sparse update ops modify a subset of the entries in a dense `Variable`, -either overwriting the entries or adding / subtracting a delta. These are -useful for training embedding models and similar lookup-based networks, since -only a small subset of embedding vectors change in any given step. - -Since a sparse update of a large tensor may be generated automatically during -gradient computation (as in the gradient of -`tf.gather`), -an `tf.IndexedSlices` class is provided that encapsulates a set -of sparse indices and values. `IndexedSlices` objects are detected and handled -automatically by the optimizers in most cases. - -* `tf.scatter_update` -* `tf.scatter_add` -* `tf.scatter_sub` -* `tf.scatter_mul` -* `tf.scatter_div` -* `tf.scatter_min` -* `tf.scatter_max` -* `tf.scatter_nd_update` -* `tf.scatter_nd_add` -* `tf.scatter_nd_sub` -* `tf.sparse_mask` -* `tf.IndexedSlices` - -### Read-only Lookup Tables - -* `tf.initialize_all_tables` -* `tf.tables_initializer` - - -## Exporting and Importing Meta Graphs - -* `tf.train.export_meta_graph` -* `tf.train.import_meta_graph` - -# Deprecated functions (removed after 2017-03-02). Please don't use them. 
- -* `tf.all_variables` -* `tf.initialize_all_variables` -* `tf.initialize_local_variables` -* `tf.initialize_variables` diff --git a/tensorflow/docs_src/api_guides/python/string_ops.md b/tensorflow/docs_src/api_guides/python/string_ops.md deleted file mode 100644 index 24a3aad642..0000000000 --- a/tensorflow/docs_src/api_guides/python/string_ops.md +++ /dev/null @@ -1,39 +0,0 @@ -# Strings - -Note: Functions taking `Tensor` arguments can also take anything accepted by -`tf.convert_to_tensor`. - -[TOC] - -## Hashing - -String hashing ops take a string input tensor and map each element to an -integer. - -* `tf.string_to_hash_bucket_fast` -* `tf.string_to_hash_bucket_strong` -* `tf.string_to_hash_bucket` - -## Joining - -String joining ops concatenate elements of input string tensors to produce a new -string tensor. - -* `tf.reduce_join` -* `tf.string_join` - -## Splitting - -* `tf.string_split` -* `tf.substr` - -## Conversion - -* `tf.as_string` -* `tf.string_to_number` - -* `tf.decode_raw` -* `tf.decode_csv` - -* `tf.encode_base64` -* `tf.decode_base64` diff --git a/tensorflow/docs_src/api_guides/python/summary.md b/tensorflow/docs_src/api_guides/python/summary.md deleted file mode 100644 index fc45e7b4c3..0000000000 --- a/tensorflow/docs_src/api_guides/python/summary.md +++ /dev/null @@ -1,23 +0,0 @@ -# Summary Operations -[TOC] - -Summaries provide a way to export condensed information about a model, which is -then accessible in tools such as [TensorBoard](../../guide/summaries_and_tensorboard.md). 
- -## Generation of Summaries - -### Class for writing Summaries -* `tf.summary.FileWriter` -* `tf.summary.FileWriterCache` - -### Summary Ops -* `tf.summary.tensor_summary` -* `tf.summary.scalar` -* `tf.summary.histogram` -* `tf.summary.audio` -* `tf.summary.image` -* `tf.summary.merge` -* `tf.summary.merge_all` - -## Utilities -* `tf.summary.get_summary_description` diff --git a/tensorflow/docs_src/api_guides/python/test.md b/tensorflow/docs_src/api_guides/python/test.md deleted file mode 100644 index b6e0a332b9..0000000000 --- a/tensorflow/docs_src/api_guides/python/test.md +++ /dev/null @@ -1,47 +0,0 @@ -# Testing -[TOC] - -## Unit tests - -TensorFlow provides a convenience class inheriting from `unittest.TestCase` -which adds methods relevant to TensorFlow tests. Here is an example: - -```python - import tensorflow as tf - - - class SquareTest(tf.test.TestCase): - - def testSquare(self): - with self.test_session(): - x = tf.square([2, 3]) - self.assertAllEqual(x.eval(), [4, 9]) - - - if __name__ == '__main__': - tf.test.main() -``` - -`tf.test.TestCase` inherits from `unittest.TestCase` but adds a few additional -methods. See `tf.test.TestCase` for details. - -* `tf.test.main` -* `tf.test.TestCase` -* `tf.test.test_src_dir_path` - -## Utilities - -Note: `tf.test.mock` is an alias to the python `mock` or `unittest.mock` -depending on the python version. - -* `tf.test.assert_equal_graph_def` -* `tf.test.get_temp_dir` -* `tf.test.is_built_with_cuda` -* `tf.test.is_gpu_available` -* `tf.test.gpu_device_name` - -## Gradient checking - -`tf.test.compute_gradient` and `tf.test.compute_gradient_error` perform -numerical differentiation of graphs for comparison against registered analytic -gradients. 
diff --git a/tensorflow/docs_src/api_guides/python/tfdbg.md b/tensorflow/docs_src/api_guides/python/tfdbg.md deleted file mode 100644 index 9778cdc0b0..0000000000 --- a/tensorflow/docs_src/api_guides/python/tfdbg.md +++ /dev/null @@ -1,50 +0,0 @@ -# TensorFlow Debugger -[TOC] - -Public Python API of TensorFlow Debugger (tfdbg). - -## Functions for adding debug watches - -These functions help you modify `RunOptions` to specify which `Tensor`s are to -be watched when the TensorFlow graph is executed at runtime. - -* `tfdbg.add_debug_tensor_watch` -* `tfdbg.watch_graph` -* `tfdbg.watch_graph_with_blacklists` - - -## Classes for debug-dump data and directories - -These classes allow you to load and inspect tensor values dumped from -TensorFlow graphs during runtime. - -* `tfdbg.DebugTensorDatum` -* `tfdbg.DebugDumpDir` - - -## Functions for loading debug-dump data - -* `tfdbg.load_tensor_from_event_file` - - -## Tensor-value predicates - -Built-in tensor-filter predicates to support conditional breakpoint between -runs. See `DebugDumpDir.find()` for more details. - -* `tfdbg.has_inf_or_nan` - - -## Session wrapper class and `SessionRunHook` implementations - -These classes allow you to - -* wrap around TensorFlow `Session` objects to debug plain TensorFlow models - (see `DumpingDebugWrapperSession` and `LocalCLIDebugWrapperSession`), or -* generate `SessionRunHook` objects to debug `tf.contrib.learn` models (see - `DumpingDebugHook` and `LocalCLIDebugHook`).
- -* `tfdbg.DumpingDebugHook` -* `tfdbg.DumpingDebugWrapperSession` -* `tfdbg.LocalCLIDebugHook` -* `tfdbg.LocalCLIDebugWrapperSession` diff --git a/tensorflow/docs_src/api_guides/python/threading_and_queues.md b/tensorflow/docs_src/api_guides/python/threading_and_queues.md deleted file mode 100644 index e00f17f955..0000000000 --- a/tensorflow/docs_src/api_guides/python/threading_and_queues.md +++ /dev/null @@ -1,270 +0,0 @@ -# Threading and Queues - -Note: In versions of TensorFlow before 1.2, we recommended using multi-threaded, -queue-based input pipelines for performance. Beginning with TensorFlow 1.4, -however, we recommend using the `tf.data` module instead. (See -[Datasets](../../guide/datasets.md) for details. In TensorFlow 1.2 and 1.3, the module was -called `tf.contrib.data`.) The `tf.data` module offers an easier-to-use -interface for constructing efficient input pipelines. Furthermore, we've stopped -developing the old multi-threaded, queue-based input pipelines. We've retained -the documentation in this file to help developers who are still maintaining -older code. - -Multithreaded queues are a powerful and widely used mechanism supporting -asynchronous computation. - -Following the [dataflow programming model](graphs.md), TensorFlow's queues are -implemented using nodes in the computation graph. A queue is a stateful node, -like a variable: other nodes can modify its content. In particular, nodes can -enqueue new items into the queue, or dequeue existing items from the -queue. TensorFlow's queues provide a way to coordinate multiple steps of a -computation: a queue will **block** any step that attempts to dequeue from it -when it is empty, or enqueue to it when it is full. When that condition no -longer holds, the queue will unblock the step and allow execution to proceed. - -TensorFlow implements several classes of queue. The principal difference between -these classes is the order in which items are removed from the queue. 
To get a feel -for queues, let's consider a simple example. We will create a "first in, first -out" queue (`tf.FIFOQueue`) and fill it with zeros. Then we'll construct a -graph that takes an item off the queue, adds one to that item, and puts it back -on the end of the queue. Slowly, the numbers on the queue increase. - -
- -
- -`Enqueue`, `EnqueueMany`, and `Dequeue` are special nodes. They take a pointer -to the queue instead of a normal value, allowing them to mutate its state. We -recommend that you think of these operations as being like methods of the queue -in an object-oriented sense. In fact, in the Python API, these operations are -created by calling methods on a queue object (e.g. `q.enqueue(...)`). - -Note: Queue methods (such as `q.enqueue(...)`) *must* run on the same device -as the queue. Incompatible device placement directives will be ignored when -creating these operations. - -Now that you have a bit of a feel for queues, let's dive into the details... - -## Queue usage overview - -Queues, such as `tf.FIFOQueue` -and `tf.RandomShuffleQueue`, -are important TensorFlow objects that aid in computing tensors asynchronously -in a graph. - -For example, a typical queue-based input pipeline uses a `RandomShuffleQueue` to -prepare inputs for training a model as follows: - -* Multiple threads prepare training examples and enqueue them. -* A training thread executes a training op that dequeues mini-batches from the - queue - -We recommend using the `tf.data.Dataset.shuffle` -and `tf.data.Dataset.batch` methods of a -`tf.data.Dataset` to accomplish this. However, if you'd prefer -to use a queue-based version instead, you can find a full implementation in the -`tf.train.shuffle_batch` function. - -For demonstration purposes a simplified implementation is given below. - -This function takes a source tensor, a capacity, and a batch size as arguments -and returns a tensor that dequeues a shuffled batch when executed. - -``` python -def simple_shuffle_batch(source, capacity, batch_size=10): - # Create a random shuffle queue. - queue = tf.RandomShuffleQueue(capacity=capacity, - min_after_dequeue=int(0.9*capacity), - shapes=source.shape, dtypes=source.dtype) - - # Create an op to enqueue one item. 
- enqueue = queue.enqueue(source) - - # Create a queue runner that, when started, will launch 4 threads applying - # that enqueue op. - num_threads = 4 - qr = tf.train.QueueRunner(queue, [enqueue] * num_threads) - - # Register the queue runner so it can be found and started by - # `tf.train.start_queue_runners` later (the threads are not launched yet). - tf.train.add_queue_runner(qr) - - # Create an op to dequeue a batch - return queue.dequeue_many(batch_size) -``` - -Once started by `tf.train.start_queue_runners`, or indirectly through -`tf.train.MonitoredSession`, the `QueueRunner` will launch the -threads in the background to fill the queue. Meanwhile the main thread will -execute the `dequeue_many` op to pull data from it. Note how these ops do not -depend on each other, except indirectly through the internal state of the queue. - -The simplest possible use of this function might be something like this: - -``` python -# create a dataset that counts from 0 to 99 -input = tf.constant(list(range(100))) -input = tf.data.Dataset.from_tensor_slices(input) -input = input.make_one_shot_iterator().get_next() - -# Create a slightly shuffled batch from the sorted elements -get_batch = simple_shuffle_batch(input, capacity=20) - -# `MonitoredSession` will start and manage the `QueueRunner` threads. -with tf.train.MonitoredSession() as sess: - # Since the `QueueRunners` have been started, data is available in the - # queue, so the `sess.run(get_batch)` call will not hang. - while not sess.should_stop(): - print(sess.run(get_batch)) -``` - -``` -[ 8 10 7 5 4 13 15 14 25 0] -[23 29 28 31 33 18 19 11 34 27] -[12 21 37 39 35 22 44 36 20 46] -... -``` - -For most use cases, the automatic thread startup and management provided -by `tf.train.MonitoredSession` is sufficient. In the rare case that it is not, -TensorFlow provides tools for manually managing your threads and queues. 
- -## Manual Thread Management - -As we have seen, the TensorFlow `Session` object is multithreaded and -thread-safe, so multiple threads can -easily use the same session and run ops in parallel. However, it is not always -easy to implement a Python program that drives threads as required. All -threads must be able to stop together, exceptions must be caught and -reported, and queues must be properly closed when stopping. - -TensorFlow provides two classes to help: -`tf.train.Coordinator` and -`tf.train.QueueRunner`. These two classes -are designed to be used together. The `Coordinator` class helps multiple threads -stop together and report exceptions to a program that waits for them to stop. -The `QueueRunner` class is used to create a number of threads cooperating to -enqueue tensors in the same queue. - -### Coordinator - -The `tf.train.Coordinator` class manages background threads in a TensorFlow -program and helps multiple threads stop together. - -Its key methods are: - -* `tf.train.Coordinator.should_stop`: returns `True` if the threads should stop. -* `tf.train.Coordinator.request_stop`: requests that threads should stop. -* `tf.train.Coordinator.join`: waits until the specified threads have stopped. - -You first create a `Coordinator` object, and then create a number of threads -that use the coordinator. The threads typically run loops that stop when -`should_stop()` returns `True`. - -Any thread can decide that the computation should stop. It only has to call -`request_stop()` and the other threads will stop as `should_stop()` will then -return `True`. - -```python -# Using Python's threading library. -import threading - -# Thread body: loop until the coordinator indicates a stop was requested. -# If some condition becomes true, ask the coordinator to stop. -def MyLoop(coord): - while not coord.should_stop(): - ...do something... - if ...some condition...: - coord.request_stop() - -# Main thread: create a coordinator. 
-coord = tf.train.Coordinator() - -# Create 10 threads that run 'MyLoop()' -threads = [threading.Thread(target=MyLoop, args=(coord,)) for i in xrange(10)] - -# Start the threads and wait for all of them to stop. -for t in threads: - t.start() -coord.join(threads) -``` - -Obviously, the coordinator can manage threads doing very different things. -They don't have to be all the same as in the example above. The coordinator -also has support to capture and report exceptions. See the `tf.train.Coordinator` documentation for more details. - -### QueueRunner - -The `tf.train.QueueRunner` class creates a number of threads that repeatedly -run an enqueue op. These threads can use a coordinator to stop together. In -addition, a queue runner will run a *closer operation* that closes the queue if -an exception is reported to the coordinator. - -You can use a queue runner to implement the architecture described above. - -First build a graph that uses a TensorFlow queue (e.g. a `tf.RandomShuffleQueue`) for input examples. Add ops that -process examples and enqueue them in the queue. Add training ops that start by -dequeueing from the queue. - -```python -example = ...ops to create one example... -# Create a queue, and an op that enqueues examples one at a time in the queue. -queue = tf.RandomShuffleQueue(...) -enqueue_op = queue.enqueue(example) -# Create a training graph that starts by dequeueing a batch of examples. -inputs = queue.dequeue_many(batch_size) -train_op = ...use 'inputs' to build the training part of the graph... -``` - -In the Python training program, create a `QueueRunner` that will run a few -threads to process and enqueue examples. Create a `Coordinator` and ask the -queue runner to start its threads with the coordinator. Write a training loop -that also uses the coordinator. - -```python -# Create a queue runner that will run 4 threads in parallel to enqueue -# examples. -qr = tf.train.QueueRunner(queue, [enqueue_op] * 4) - -# Launch the graph. 
-sess = tf.Session() -# Create a coordinator, launch the queue runner threads. -coord = tf.train.Coordinator() -enqueue_threads = qr.create_threads(sess, coord=coord, start=True) -# Run the training loop, controlling termination with the coordinator. -for step in xrange(1000000): - if coord.should_stop(): - break - sess.run(train_op) -# When done, ask the threads to stop. -coord.request_stop() -# And wait for them to actually do it. -coord.join(enqueue_threads) -``` - -### Handling exceptions - -Threads started by queue runners do more than just run the enqueue ops. They -also catch and handle exceptions generated by queues, including the -`tf.errors.OutOfRangeError` exception, which is used to report that a queue was -closed. - -A training program that uses a coordinator must similarly catch and report -exceptions in its main loop. - -Here is an improved version of the training loop above. - -```python -try: - for step in xrange(1000000): - if coord.should_stop(): - break - sess.run(train_op) -except Exception, e: - # Report exceptions to the coordinator. - coord.request_stop(e) -finally: - # Terminate as usual. It is safe to call `coord.request_stop()` twice. - coord.request_stop() - coord.join(threads) -``` diff --git a/tensorflow/docs_src/api_guides/python/train.md b/tensorflow/docs_src/api_guides/python/train.md deleted file mode 100644 index 4b4c6a4fe3..0000000000 --- a/tensorflow/docs_src/api_guides/python/train.md +++ /dev/null @@ -1,139 +0,0 @@ -# Training -[TOC] - -`tf.train` provides a set of classes and functions that help train models. - -## Optimizers - -The Optimizer base class provides methods to compute gradients for a loss and -apply gradients to variables. A collection of subclasses implement classic -optimization algorithms such as GradientDescent and Adagrad. - -You never instantiate the Optimizer class itself, but instead instantiate one -of the subclasses. 
- -* `tf.train.Optimizer` -* `tf.train.GradientDescentOptimizer` -* `tf.train.AdadeltaOptimizer` -* `tf.train.AdagradOptimizer` -* `tf.train.AdagradDAOptimizer` -* `tf.train.MomentumOptimizer` -* `tf.train.AdamOptimizer` -* `tf.train.FtrlOptimizer` -* `tf.train.ProximalGradientDescentOptimizer` -* `tf.train.ProximalAdagradOptimizer` -* `tf.train.RMSPropOptimizer` - -See `tf.contrib.opt` for more optimizers. - -## Gradient Computation - -TensorFlow provides functions to compute the derivatives for a given -TensorFlow computation graph, adding operations to the graph. The -optimizer classes automatically compute derivatives on your graph, but -creators of new Optimizers or expert users can call the lower-level -functions below. - -* `tf.gradients` -* `tf.AggregationMethod` -* `tf.stop_gradient` -* `tf.hessians` - - -## Gradient Clipping - -TensorFlow provides several operations that you can use to add clipping -functions to your graph. You can use these functions to perform general data -clipping, but they're particularly useful for handling exploding or vanishing -gradients. - -* `tf.clip_by_value` -* `tf.clip_by_norm` -* `tf.clip_by_average_norm` -* `tf.clip_by_global_norm` -* `tf.global_norm` - -## Decaying the learning rate - -* `tf.train.exponential_decay` -* `tf.train.inverse_time_decay` -* `tf.train.natural_exp_decay` -* `tf.train.piecewise_constant` -* `tf.train.polynomial_decay` -* `tf.train.cosine_decay` -* `tf.train.linear_cosine_decay` -* `tf.train.noisy_linear_cosine_decay` - -## Moving Averages - -Some training algorithms, such as GradientDescent and Momentum often benefit -from maintaining a moving average of variables during optimization. Using the -moving averages for evaluations often improve results significantly. - -* `tf.train.ExponentialMovingAverage` - -## Coordinator and QueueRunner - -See [Threading and Queues](../../api_guides/python/threading_and_queues.md) -for how to use threads and queues. 
For documentation on the Queue API, -see [Queues](../../api_guides/python/io_ops.md#queues). - - -* `tf.train.Coordinator` -* `tf.train.QueueRunner` -* `tf.train.LooperThread` -* `tf.train.add_queue_runner` -* `tf.train.start_queue_runners` - -## Distributed execution - -See [Distributed TensorFlow](../../deploy/distributed.md) for -more information about how to configure a distributed TensorFlow program. - -* `tf.train.Server` -* `tf.train.Supervisor` -* `tf.train.SessionManager` -* `tf.train.ClusterSpec` -* `tf.train.replica_device_setter` -* `tf.train.MonitoredTrainingSession` -* `tf.train.MonitoredSession` -* `tf.train.SingularMonitoredSession` -* `tf.train.Scaffold` -* `tf.train.SessionCreator` -* `tf.train.ChiefSessionCreator` -* `tf.train.WorkerSessionCreator` - -## Reading Summaries from Event Files - -See [Summaries and TensorBoard](../../guide/summaries_and_tensorboard.md) for an -overview of summaries, event files, and visualization in TensorBoard. - -* `tf.train.summary_iterator` - -## Training Hooks - -Hooks are tools that run in the process of training/evaluation of the model. 
- -* `tf.train.SessionRunHook` -* `tf.train.SessionRunArgs` -* `tf.train.SessionRunContext` -* `tf.train.SessionRunValues` -* `tf.train.LoggingTensorHook` -* `tf.train.StopAtStepHook` -* `tf.train.CheckpointSaverHook` -* `tf.train.NewCheckpointReader` -* `tf.train.StepCounterHook` -* `tf.train.NanLossDuringTrainingError` -* `tf.train.NanTensorHook` -* `tf.train.SummarySaverHook` -* `tf.train.GlobalStepWaiterHook` -* `tf.train.FinalOpsHook` -* `tf.train.FeedFnHook` - -## Training Utilities - -* `tf.train.global_step` -* `tf.train.basic_train_loop` -* `tf.train.get_global_step` -* `tf.train.assert_global_step` -* `tf.train.write_graph` diff --git a/tensorflow/docs_src/community/benchmarks.md b/tensorflow/docs_src/community/benchmarks.md deleted file mode 100644 index 153ef4a015..0000000000 --- a/tensorflow/docs_src/community/benchmarks.md +++ /dev/null @@ -1,108 +0,0 @@ -# Defining and Running Benchmarks - -This guide contains instructions for defining and running a TensorFlow benchmark. These benchmarks store output in [TestResults](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/util/test_log.proto) format. If these benchmarks are added to the TensorFlow github repo, we will run them daily with our continuous build and display a graph on our dashboard: https://benchmarks-dot-tensorflow-testing.appspot.com/. - -[TOC] - - -## Defining a Benchmark - -Defining a TensorFlow benchmark requires extending the `tf.test.Benchmark` -class and calling the `self.report_benchmark` method. Below, you'll find an example of benchmark code: - -```python -import time - -import tensorflow as tf - - -# Define a class that extends from tf.test.Benchmark. -class SampleBenchmark(tf.test.Benchmark): - - # Note: benchmark method name must start with `benchmark`. 
- def benchmarkSum(self): - with tf.Session() as sess: - x = tf.constant(10) - y = tf.constant(5) - result = tf.add(x, y) - - iters = 100 - start_time = time.time() - for _ in range(iters): - sess.run(result) - total_wall_time = time.time() - start_time - - # Call report_benchmark to report a metric value. - self.report_benchmark( - name="sum_wall_time", - # This value should always be per iteration. - wall_time=total_wall_time/iters, - iters=iters) - -if __name__ == "__main__": - tf.test.main() -``` -See the full example for [SampleBenchmark](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/benchmark/). - - -Key points to note in the example above: - -* Benchmark class extends from `tf.test.Benchmark`. -* Each benchmark method should start with `benchmark` prefix. -* Benchmark method calls `report_benchmark` to report the metric value. - - -## Running with Python - -Use the `--benchmarks` flag to run the benchmark with Python. A [BenchmarkEntries](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/util/test_log.proto) proto will be printed. - -``` -python sample_benchmark.py --benchmarks=SampleBenchmark -``` - -Setting the flag as `--benchmarks=.` or `--benchmarks=all` works as well. - -(Please ensure that Tensorflow is installed to successfully import the package in the line `import tensorflow as tf`. For installation instructions, see [Installing TensorFlow](https://www.tensorflow.org/install/). This step is not necessary when running with Bazel.) - - -## Adding a `bazel` Target - -We have a special target called `tf_py_logged_benchmark` for benchmarks defined under the TensorFlow github repo. `tf_py_logged_benchmark` should wrap around a regular `py_test` target. Running a `tf_py_logged_benchmark` would print a [TestResults](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/util/test_log.proto) proto. Defining a `tf_py_logged_benchmark` also lets us run it with TensorFlow continuous build. 
- -First, define a regular `py_test` target. See example below: - -```build -py_test( - name = "sample_benchmark", - srcs = ["sample_benchmark.py"], - srcs_version = "PY2AND3", - deps = [ - "//tensorflow:tensorflow_py", - ], -) -``` - -You can run benchmarks in a `py_test` target by passing the `--benchmarks` flag. The benchmark should just print out a [BenchmarkEntries](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/util/test_log.proto) proto. - -```shell -bazel test :sample_benchmark --test_arg=--benchmarks=all -``` - - -Now, add the `tf_py_logged_benchmark` target (if available). This target would -pass in `--benchmarks=all` to the wrapped `py_test` target and provide a way to store output for our TensorFlow continuous build. The target `tf_py_logged_benchmark` should be available in TensorFlow repository. - -```build -load("//tensorflow/tools/test:performance.bzl", "tf_py_logged_benchmark") - -tf_py_logged_benchmark( - name = "sample_logged_benchmark", - target = "//tensorflow/examples/benchmark:sample_benchmark", -) -``` - -Use the following command to run the benchmark target: - -```shell -bazel test :sample_logged_benchmark -``` diff --git a/tensorflow/docs_src/community/contributing.md b/tensorflow/docs_src/community/contributing.md deleted file mode 100644 index ece4a7c70b..0000000000 --- a/tensorflow/docs_src/community/contributing.md +++ /dev/null @@ -1,49 +0,0 @@ -# Contributing to TensorFlow - -TensorFlow is an open-source project, and we welcome your participation -and contribution. This page describes how to get involved. - -## Repositories - -The code for TensorFlow is hosted in the [TensorFlow GitHub -organization](https://github.com/tensorflow). 
Multiple projects are located -inside the organization, including: - -* [TensorFlow](https://github.com/tensorflow/tensorflow) -* [Models](https://github.com/tensorflow/models) -* [TensorBoard](https://github.com/tensorflow/tensorboard) -* [TensorFlow.js](https://github.com/tensorflow/tfjs) -* [TensorFlow Serving](https://github.com/tensorflow/serving) -* [TensorFlow Documentation](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/docs_src) - -## Contributor checklist - -* Before contributing to TensorFlow source code, please review the [contribution -guidelines](https://github.com/tensorflow/tensorflow/blob/master/CONTRIBUTING.md). - -* Join the -[developers@tensorflow.org](https://groups.google.com/a/tensorflow.org/d/forum/developers) -mailing list, to coordinate and discuss with others contributing to TensorFlow. - -* For coding style conventions, read the [TensorFlow Style Guide](../community/style_guide.md). - -* Finally, review [Writing TensorFlow Documentation](../community/documentation.md), which - explains documentation conventions. - -You may also wish to review our guide to [defining and running benchmarks](../community/benchmarks.md). - -## Special Interest Groups - -To enable focused collaboration on particular areas of TensorFlow, we host -Special Interest Groups (SIGs). SIGs do their work in public: if you want to -join and contribute, review the work of the group, and get in touch with the -relevant SIG leader. Membership policies vary on a per-SIG basis. - -* **SIG Build** focuses on issues surrounding building, packaging, and - distribution of TensorFlow. [Mailing list](https://groups.google.com/a/tensorflow.org/d/forum/build). - -* **SIG TensorBoard** furthers the development and direction of TensorBoard and its plugins. - [Mailing list](https://groups.google.com/a/tensorflow.org/d/forum/sig-tensorboard). - -* **SIG Rust** collaborates on the development of TensorFlow's Rust bindings. 
- [Mailing list](https://groups.google.com/a/tensorflow.org/d/forum/rust). diff --git a/tensorflow/docs_src/community/documentation.md b/tensorflow/docs_src/community/documentation.md deleted file mode 100644 index 8639656d07..0000000000 --- a/tensorflow/docs_src/community/documentation.md +++ /dev/null @@ -1,673 +0,0 @@ -# Writing TensorFlow Documentation - -We welcome contributions to the TensorFlow documentation from the community. -This document explains how you can contribute to that documentation. In -particular, this document explains the following: - -* Where the documentation is located. -* How to make conformant edits. -* How to build and test your documentation changes before you submit them. - -You can view TensorFlow documentation on https://www.tensorflow.org, and you -can view and edit the raw files on -[GitHub](https://www.tensorflow.org/code/tensorflow/docs_src/). -We're publishing our docs on GitHub so everybody can contribute. Whatever gets -checked in to `tensorflow/docs_src` will be published soon after on -https://www.tensorflow.org. - -Republishing TensorFlow documentation in different forms is absolutely allowed, -but we are unlikely to accept other documentation formats (or the tooling to -generate them) into our repository. If you do choose to republish our -documentation in another form, please be sure to include: - -* The version of the API this represents (for example, r1.0, master, etc.) -* The commit or version from which the documentation was generated -* Where to get the latest documentation (that is, https://www.tensorflow.org) -* The Apache 2.0 license. - -## A note on versions - -tensorflow.org, at root, shows documentation for the latest stable binary. This -is the documentation you should be reading if you are using `pip` to install -TensorFlow. 
- -However, most developers will contribute documentation into the master GitHub -branch, which is published, occasionally, -at [tensorflow.org/versions/master](https://www.tensorflow.org/versions/master). - -If you want documentation changes to appear at root, you will need to also -contribute that change to the current stable binary branch (and/or -[cherrypick](https://stackoverflow.com/questions/9339429/what-does-cherry-picking-a-commit-with-git-mean)). - -## Reference vs. non-reference documentation - -The following reference documentation is automatically generated from comments -in the code: - -- C++ API reference docs -- Java API reference docs -- Python API reference docs - -To modify the reference documentation, you edit the appropriate code comments. - -Non-reference documentation (for example, the TensorFlow installation guides) is -authored by humans. This documentation is located in the -[`tensorflow/docs_src`](https://www.tensorflow.org/code/tensorflow/docs_src/) -directory. Each subdirectory of `docs_src` contains a set of related TensorFlow -documentation. For example, the TensorFlow installation guides are all in the -`docs_src/install` directory. - -The C++ documentation is generated from XML files generated via doxygen; -however, those tools are not available in open source at this time. - -## Markdown - -Editable TensorFlow documentation is written in Markdown. With a few exceptions, -TensorFlow uses -the [standard Markdown rules](https://daringfireball.net/projects/markdown/). - -This section explains the primary differences between standard Markdown rules -and the Markdown rules that editable TensorFlow documentation uses. - -### Math in Markdown - -You may use MathJax within TensorFlow when editing Markdown files, but note the -following: - -- MathJax renders properly on [tensorflow.org](https://www.tensorflow.org) -- MathJax does not render properly on [github](https://github.com/tensorflow/tensorflow). 
- -When writing MathJax, you can use $$ and `\\(` and `\\)` to -surround your math. $$ guards will cause line breaks, so -within text, use `\\(` `\\)` instead. - -### Links in Markdown - -Links fall into a few categories: - -- Links to a different part of the same file -- Links to a URL outside of tensorflow.org -- Links from a Markdown file (or code comments) to another file within tensorflow.org - -For the first two link categories, you may use standard Markdown links, but put -the link entirely on one line, rather than splitting it across lines. For -example: - -- `[text](link) # Good link` -- `[text]\n(link) # Bad link` -- `[text](\nlink) # Bad link` - -For the final link category (links to another file within tensorflow.org), -please use a special link parameterization mechanism. This mechanism enables -authors to move and reorganize files without breaking links. - -The parameterization scheme is as follows. Use: - - -- @{tf.symbol} to make a link to the reference page for a - Python symbol. Note that class members don't get their own page, but the - syntax still works, since @{tf.MyClass.method} links to the - proper part of the tf.MyClass page. - -- @{tensorflow::symbol} to make a link to the reference page - for a C++ symbol. - -- @{$doc_page} to make a link to another (not an API reference) - doc page. To link to - - - `red/green/blue/index.md` use @{$blue} or - @{$green/blue}, - - - `foo/bar/baz.md` use @{$baz} or - @{$bar/baz}. - - The shorter one is preferred, so we can move pages around without breaking - these references. The main exception is that the Python API guides should - probably be referred to using @{$python/} to - avoid ambiguity. - -- @{$doc_page#anchor-tag$link-text} to link to an anchor in - that doc and use different link text (by default, the link text is the title - of the target page). - - To override the link text only, omit the `#anchor-tag`. 
- -To link to source code, use a link starting with: -`https://www.tensorflow.org/code/`, followed by -the file name starting at the github root. For instance, a link to the file you -are currently reading should be written as -`https://www.tensorflow.org/code/tensorflow/docs_src/community/documentation.md`. - -This URL naming scheme ensures -that [tensorflow.org](https://www.tensorflow.org/) can forward the link to the -branch of the code corresponding to the version of the documentation you're -viewing. Do not include url parameters in the source code URL. - -## Generating docs and previewing links - -Before building the documentation, you must first set up your environment by -doing the following: - -1. If bazel is not installed on your machine, install it now. If you are on - Linux, install bazel by issuing the following command: - - $ sudo apt-get install bazel # Linux - - If you are on Mac OS, find bazel installation instructions on - [this page](https://bazel.build/versions/master/docs/install.html#mac-os-x). - -2. Change directory to the top-level `tensorflow` directory of the TensorFlow - source code. - -3. Run the `configure` script and answer its prompts appropriately for your - system. - - $ ./configure - -Then, change to the `tensorflow` directory which contains `docs_src` (`cd -tensorflow`). Run the following command to compile TensorFlow and generate the -documentation in the `/tmp/tfdocs` dir: - - bazel run tools/docs:generate -- \ - --src_dir="$(pwd)/docs_src/" \ - --output_dir=/tmp/tfdocs/ - -Note: You must set `src_dir` and `output_dir` to absolute file paths. - -## Generating Python API documentation - -Ops, classes, and utility functions are defined in Python modules, such as -`image_ops.py`. Python modules contain a module docstring. 
For example: - -```python -"""Image processing and decoding ops.""" -``` - -The documentation generator places this module docstring at the beginning of the -Markdown file generated for the module, in this -case, [tf.image](https://www.tensorflow.org/api_docs/python/tf/image). - -It used to be a requirement to list every member of a module inside the module -file at the beginning, putting a `@@` before each member. The `@@member_name` -syntax is deprecated and no longer generates any docs. But depending on how a -module is [sealed](#sealing_modules) it may still be necessary to mark the -elements of the module’s contents as public. The called-out op, function, or -class does not have to be defined in the same file. The next few sections of -this document discuss sealing and how to add elements to the public -documentation. - -The new documentation system automatically documents public symbols, except for -the following: - -- Private symbols whose names start with an underscore. -- Symbols originally defined in `object` or protobuf’s `Message`. -- Some class members, such as `__base__`, `__class__`, which are dynamically - created but generally have no useful documentation. - -Only top level modules (currently just `tf` and `tfdbg`) need to be manually -added to the generate script. - -### Sealing modules - -Because the doc generator walks all visible symbols, and descends into anything -it finds, it will document any accidentally exposed symbols. If a module only -exposes symbols that are meant to be part of the public API, we call it -**sealed**. Because of Python’s loose import and visibility conventions, naively -written Python code will inadvertently expose a lot of modules which are -implementation details. Improperly sealed modules may expose other unsealed -modules, which will typically lead the doc generator to fail. 
**This failure is -the intended behavior.** It ensures that our API is well defined, and allows us -to change implementation details (including which modules are imported where) -without fear of accidentally breaking users. - -If a module is accidentally imported, it typically breaks the doc generator -(`generate_test`). This is a clear sign you need to seal your modules. However, -even if the doc generator succeeds, unwanted symbols may show up in the -docs. Check the generated docs to make sure that all symbols that are documented -are expected. If there are symbols that shouldn’t be there, you have the -following options for dealing with them: - -- Private symbols and imports -- The `remove_undocumented` filter -- A traversal blacklist. - -We'll discuss these options in detail below. - -#### Private symbols and imports - -The easiest way to conform to the API sealing expectations is to make non-public -symbols private (by prepending an underscore _). The doc generator respects -private symbols. This also applies to modules. If the only problem is that there -is a small number of imported modules that show up in the docs (or break the -generator), you can simply rename them on import, e.g.: `import sys as _sys`. - -Because Python considers all files to be modules, this applies to files as -well. If you have a directory containing the following two files/modules: - - module/__init__.py - module/private_impl.py - -Then, after `module` is imported, it will be possible to access -`module.private_impl`. Renaming `private_impl.py` to `_private_impl.py` solves -the problem. If renaming modules is awkward, read on. - -#### Use the `remove_undocumented` filter - -Another way to seal a module is to split your implementation from the API. To do -so, consider using `remove_undocumented`, which takes a list of allowed symbols, -and deletes everything else from the module. 
For example, the following snippet -demonstrates how to put `remove_undocumented` in the `__init__.py` file for a -module: - -__init__.py: - - # Use * imports only if __all__ defined in some_file - from tensorflow.some_module.some_file import * - - # Otherwise import symbols directly - from tensorflow.some_module.some_other_file import some_symbol - - from tensorflow.python.util.all_util import remove_undocumented - - _allowed_symbols = [‘some_symbol’, ‘some_other_symbol’] - - remove_undocumented(__name__, allowed_exception_list=_allowed_symbols) - -The `@@member_name` syntax is deprecated, but it still exists in some places in -the documentation as an indicator to `remove_undocumented` that those symbols -are public. All `@@`s will eventually be removed. If you see them, however, -please do not randomly delete them as they are still in use by some of our -systems. - -#### Traversal blacklist - -If all else fails, you may add entries to the traversal blacklist in -`generate_lib.py.` **Almost all entries in this list are an abuse of its -purpose; avoid adding to it if you can!** - -The traversal blacklist maps qualified module names (without the leading `tf.`) -to local names that are not to be descended into. For instance, the following -entry will exclude `some_module` from traversal. - - { ... - ‘contrib.my_module’: [‘some_module’] - ... - } - -That means that the doc generator will show that `some_module` exists, but it -will not enumerate its content. - -This blacklist was originally intended to make sure that system modules (mock, -flags, ...) included for platform abstraction can be documented without -documenting their interior. Its use beyond this purpose is a shortcut that may -be acceptable for contrib, but not for core tensorflow. - -## Op documentation style guide - -Long, descriptive module-level documentation for modules should go in the API -Guides in `docs_src/api_guides/python`. 
- -For classes and ops, ideally, you should provide the following information, in -order of presentation: - -* A short sentence that describes what the op does. -* A short description of what happens when you pass arguments to the op. -* An example showing how the op works (pseudocode is best). -* Requirements, caveats, important notes (if there are any). -* Descriptions of inputs, outputs, and Attrs or other parameters of the op - constructor. - -Each of these is described in more -detail [below](#description-of-the-docstring-sections). - -Write your text in Markdown format. A basic syntax reference -is [here](https://daringfireball.net/projects/markdown/). You are allowed to -use [MathJax](https://www.mathjax.org) notation for equations (see above for -restrictions). - -### Writing about code - -Put backticks around these things when they're used in text: - -* Argument names (for example, `input`, `x`, `tensor`) -* Returned tensor names (for example, `output`, `idx`, `out`) -* Data types (for example, `int32`, `float`, `uint8`) -* Other op names referenced in text (for example, `list_diff()`, `shuffle()`) -* Class names (for example, `Tensor` when you actually mean a `Tensor` object; - don't capitalize or use backticks if you're just explaining what an op does to - a tensor, or a graph, or an operation in general) -* File names (for example, `image_ops.py`, or - `/path-to-your-data/xml/example-name`) -* Math expressions or conditions (for example, `-1-input.dims() <= dim <= - input.dims()`) - -Put three backticks around sample code and pseudocode examples. And use `==>` -instead of a single equal sign when you want to show what an op returns. 
For -example: - - ``` - # 'input' is a tensor of shape [2, 3, 5] - (tf.expand_dims(input, 0)) ==> [1, 2, 3, 5] - ``` - -If you're providing a Python code sample, add the python style label to ensure -proper syntax highlighting: - - ```python - # some Python code - ``` - -Two notes about backticks for code samples in Markdown: - -1. You can use backticks for pretty printing languages other than Python, if - necessary. A full list of languages is available - [here](https://github.com/google/code-prettify#how-do-i-specify-the-language-of-my-code). -2. Markdown also allows you to indent four spaces to specify a code sample. - However, do NOT indent four spaces and use backticks simultaneously. Use one - or the other. - -### Tensor dimensions - -When you're talking about a tensor in general, don't capitalize the word tensor. -When you're talking about the specific object that's provided to an op as an -argument or returned by an op, then you should capitalize the word Tensor and -add backticks around it because you're talking about a `Tensor` object. - -Don't use the word `Tensors` to describe multiple Tensor objects unless you -really are talking about a `Tensors` object. Better to say "a list of `Tensor` -objects." - -Use the term "dimension" to refer to the size of a tensor. If you need to be -specific about the size, use these conventions: - -- Refer to a scalar as a "0-D tensor" -- Refer to a vector as a "1-D tensor" -- Refer to a matrix as a "2-D tensor" -- Refer to tensors with 3 or more dimensions as 3-D tensors or n-D tensors. Use - the word "rank" only if it makes sense, but try to use "dimension" instead. - Never use the word "order" to describe the size of a tensor. - -Use the word "shape" to detail the dimensions of a tensor, and show the shape in -square brackets with backticks. For example: - - If `input` is a 3-D tensor with shape `[3, 4, 3]`, this operation - returns a 3-D tensor with shape `[6, 8, 6]`. 
- -### Ops defined in C++ - -All Ops defined in C++ (and accessible from other languages) must be documented -with a `REGISTER_OP` declaration. The docstring in the C++ file is processed to -automatically add some information for the input types, output types, and Attr -types and default values. - -For example: - -```c++ -REGISTER_OP("PngDecode") - .Input("contents: string") - .Attr("channels: int = 0") - .Output("image: uint8") - .Doc(R"doc( -Decodes the contents of a PNG file into a uint8 tensor. - -contents: PNG file contents. -channels: Number of color channels, or 0 to autodetect based on the input. - Must be 0 for autodetect, 1 for grayscale, 3 for RGB, or 4 for RGBA. - If the input has a different number of channels, it will be transformed - accordingly. -image:= A 3-D uint8 tensor of shape `[height, width, channels]`. - If `channels` is 0, the last dimension is determined - from the png contents. -)doc"); -``` - -Results in this piece of Markdown: - - ### tf.image.png_decode(contents, channels=None, name=None) {#png_decode} - - Decodes the contents of a PNG file into a uint8 tensor. - - #### Args: - - * **contents**: A string Tensor. PNG file contents. - * **channels**: An optional int. Defaults to 0. - Number of color channels, or 0 to autodetect based on the input. - Must be 0 for autodetect, 1 for grayscale, 3 for RGB, or 4 for RGBA. If the - input has a different number of channels, it will be transformed accordingly. - * **name**: A name for the operation (optional). - - #### Returns: - A 3-D uint8 tensor of shape `[height, width, channels]`. If `channels` is - 0, the last dimension is determined from the png contents. - -Much of the argument description is added automatically. In particular, the doc -generator automatically adds the name and type of all inputs, attrs, and -outputs. In the above example, `contents: A string Tensor.` was added -automatically. You should write your additional text to flow naturally after -that description. 
- -For inputs and output, you can prefix your additional text with an equal sign to -prevent the automatically added name and type. In the above example, the -description for the output named `image` starts with `=` to prevent the addition -of `A uint8 Tensor.` before our text `A 3-D uint8 Tensor...`. You cannot prevent -the addition of the name, type, and default value of attrs this way, so write -your text carefully. - -### Ops defined in Python - -If your op is defined in a `python/ops/*.py` file, then you need to provide text -for all of the arguments and output (returned) tensors. The doc generator does -not auto-generate any text for ops that are defined in Python, so what you write -is what you get. - -You should conform to the usual Python docstring conventions, except that you -should use Markdown in the docstring. - -Here's a simple example: - - def foo(x, y, name="bar"): - """Computes foo. - - Given two 1-D tensors `x` and `y`, this operation computes the foo. - - Example: - - ``` - # x is [1, 1] - # y is [2, 2] - tf.foo(x, y) ==> [3, 3] - ``` - Args: - x: A `Tensor` of type `int32`. - y: A `Tensor` of type `int32`. - name: A name for the operation (optional). - - Returns: - A `Tensor` of type `int32` that is the foo of `x` and `y`. - - Raises: - ValueError: If `x` or `y` are not of type `int32`. - """ - -## Description of the docstring sections - -This section details each of the elements in docstrings. - -### Short sentence describing what the op does - -Examples: - -``` -Concatenates tensors. -``` - -``` -Flips an image horizontally from left to right. -``` - -``` -Computes the Levenshtein distance between two sequences. -``` - -``` -Saves a list of tensors to a file. -``` - -``` -Extracts a slice from a tensor. 
-``` - -### Short description of what happens when you pass arguments to the op - -Examples: - - Given a tensor input of numerical type, this operation returns a tensor of - the same type and size with values reversed along dimension `seq_dim`. A - vector `seq_lengths` determines which elements are reversed for each index - within dimension 0 (usually the batch dimension). - - - This operation returns a tensor of type `dtype` and dimensions `shape`, with - all elements set to zero. - -### Example demonstrating the op - -Good code samples are short and easy to understand, typically containing a brief -snippet of code to clarify what the example is demonstrating. When an op -manipulates the shape of a Tensor it is often useful to include an example of -the before and after, as well. - -The `squeeze()` op has a nice pseudocode example: - - # 't' is a tensor of shape [1, 2, 1, 3, 1, 1] - shape(squeeze(t)) ==> [2, 3] - -The `tile()` op provides a good example in descriptive text: - - For example, tiling `[a, b, c, d]` by `[2]` produces `[a b c d a b c d]`. - -It is often helpful to show code samples in Python. Never put them in the C++ -Ops file, and avoid putting them in the Python Ops doc. We recommend, if -possible, putting code samples in the -[API guides](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/docs_src/api_guides). -Otherwise, add them to the module or class docstring where the Ops constructors -are called out. - -Here's an example from the module docstring in `api_guides/python/math_ops.md`: - - ## Segmentation - - TensorFlow provides several operations that you can use to perform common - math computations on tensor segments. - ... - In particular, a segmentation of a matrix tensor is a mapping of rows to - segments. 
- - For example: - - ```python - c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]]) - tf.segment_sum(c, tf.constant([0, 0, 1])) - ==> [[0 0 0 0] - [5 6 7 8]] - ``` - -### Requirements, caveats, important notes - -Examples: - -``` -This operation requires that: `-1-input.dims() <= dim <= input.dims()` -``` - -``` -Note: This tensor will produce an error if evaluated. Its value must -be fed using the `feed_dict` optional argument to `Session.run()`, -`Tensor.eval()`, or `Operation.run()`. -``` - -### Descriptions of arguments and output (returned) tensors. - -Keep the descriptions brief and to the point. You should not have to explain how -the operation works in the argument sections. - -Mention if the Op has strong constraints on the dimensions of the input or -output tensors. Remember that for C++ Ops, the type of the tensor is -automatically added as either as "A ..type.. Tensor" or "A Tensor with type in -{...list of types...}". In such cases, if the Op has a constraint on the -dimensions either add text such as "Must be 4-D" or start the description with -`=` (to prevent the tensor type to be added) and write something like "A 4-D -float tensor". - -For example, here are two ways to document an image argument of a C++ op (note -the "=" sign): - -``` -image: Must be 4-D. The image to resize. -``` - -``` -image:= A 4-D `float` tensor. The image to resize. -``` - -In the documentation, these will be rendered to markdown as - -``` -image: A `float` Tensor. Must be 4-D. The image to resize. -``` - -``` -image: A 4-D `float` Tensor. The image to resize. -``` - -### Optional arguments descriptions ("attrs") - -The doc generator always describes the type for each attr and their default -value, if any. You cannot override that with an equal sign because the -description is very different in the C++ and Python generated docs. - -Phrase any additional attr description so that it flows well after the type -and default value. 
The type and defaults are displayed first, and additional -descriptions follow afterwards. Therefore, complete sentences are best. - -Here's an example from `image_ops.cc`: - - REGISTER_OP("DecodePng") - .Input("contents: string") - .Attr("channels: int = 0") - .Attr("dtype: {uint8, uint16} = DT_UINT8") - .Output("image: dtype") - .SetShapeFn(DecodeImageShapeFn) - .Doc(R"doc( - Decode a PNG-encoded image to a uint8 or uint16 tensor. - - The attr `channels` indicates the desired number of color channels for the - decoded image. - - Accepted values are: - - * 0: Use the number of channels in the PNG-encoded image. - * 1: output a grayscale image. - * 3: output an RGB image. - * 4: output an RGBA image. - - If needed, the PNG-encoded image is transformed to match the requested - number of color channels. - - contents: 0-D. The PNG-encoded image. - channels: Number of color channels for the decoded image. - image: 3-D with shape `[height, width, channels]`. - )doc"); - -This generates the following Args section in -`api_docs/python/tf/image/decode_png.md`: - - #### Args: - - * **`contents`**: A `Tensor` of type `string`. 0-D. The PNG-encoded - image. - * **`channels`**: An optional `int`. Defaults to `0`. Number of color - channels for the decoded image. - * **`dtype`**: An optional `tf.DType` from: `tf.uint8, - tf.uint16`. Defaults to `tf.uint 8`. - * **`name`**: A name for the operation (optional). diff --git a/tensorflow/docs_src/community/groups.md b/tensorflow/docs_src/community/groups.md deleted file mode 100644 index 0b07d413da..0000000000 --- a/tensorflow/docs_src/community/groups.md +++ /dev/null @@ -1,38 +0,0 @@ -# User Groups - -TensorFlow has communities around the world. 
[Submit your community!](https://docs.google.com/forms/d/e/1FAIpQLSc_RQIUYtVgLLihzATaO_WUXkEyBDE_OoRoOXYDPmBEvHuEBA/viewform) - -## Asia - -* [TensorFlow China community](https://www.tensorflowers.cn) -* [TensorFlow Korea (TF-KR) User Group](https://www.facebook.com/groups/TensorFlowKR/) -* [TensorFlow User Group Tokyo](https://tfug-tokyo.connpass.com/) -* [Soleil Data Dojo](https://soleildatadojo.connpass.com/) -* [TensorFlow User Group Utsunomiya](https://tfug-utsunomiya.connpass.com/) -* [TensorFlow Philippines Community](https://www.facebook.com/groups/TensorFlowPH/) -* [TensorFlow and Deep Learning Singapore](https://www.meetup.com/TensorFlow-and-Deep-Learning-Singapore/) -* [TensorFlow India](https://www.facebook.com/tensorflowindia) - - -## Europe - -* [TensorFlow Barcelona](https://www.meetup.com/Barcelona-Machine-Learning-Meetup/) -* [TensorFlow Madrid](https://www.meetup.com/TensorFlow-Madrid/) -* [Tensorflow Belgium](https://www.meetup.com/TensorFlow-Belgium) -* [TensorFlow x Rome Meetup](https://www.meetup.com/it-IT/TensorFlow-x-Rome-Meetup) -* [TensorFlow London](https://www.meetup.com/TensorFlow-London/) -* [TensorFlow Edinburgh](https://www.meetup.com/tensorflow-edinburgh/) - - -## America - -* [TensorFlow Buenos Aires](https://www.meetup.com/TensorFlow-Buenos-Aires/) - - -## Oceania -* [Melbourne TensorFlow Meetup](https://www.meetup.com/Melbourne-TensorFlow-Meetup) - - -## Africa - -* [TensorFlow Tunis Meetup](https://www.meetup.com/fr-FR/TensorFlow-Tunis-Meetup/) diff --git a/tensorflow/docs_src/community/index.md b/tensorflow/docs_src/community/index.md deleted file mode 100644 index 1a30be32a5..0000000000 --- a/tensorflow/docs_src/community/index.md +++ /dev/null @@ -1,85 +0,0 @@ -# Community - -Welcome to the TensorFlow community! This page explains where to get help, and -different ways to be part of the community. 
We are committed to fostering an -open and welcoming environment, and request that you review our [code of -conduct](https://github.com/tensorflow/tensorflow/blob/master/CODE_OF_CONDUCT.md). - -## Get Help - -### Technical Questions - -To ask or answer technical questions about TensorFlow, use [Stack -Overflow](https://stackoverflow.com/questions/tagged/tensorflow). For example, -ask or search about a particular error message you encountered during -installation. - -### Bugs and Feature Requests - -To report bugs or make feature requests, file an issue on GitHub. Please choose -the appropriate repository for the project. Major repositories include: - - * [TensorFlow](https://github.com/tensorflow/tensorflow/issues) - * [TensorBoard](https://github.com/tensorflow/tensorboard/issues) - * [TensorFlow models](https://github.com/tensorflow/models/issues) - -### Security - -Before using TensorFlow, please take a look at our [security model](https://github.com/tensorflow/tensorflow/blob/master/SECURITY.md#tensorflow-models-are-programs), -[list of recent security advisories and announcements](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/security/index.md), -and [ways you can report security issues](https://github.com/tensorflow/tensorflow/blob/master/SECURITY.md#reporting-vulnerabilities) -to the TensorFlow team at the [Using TensorFlow Securely](https://github.com/tensorflow/tensorflow/blob/master/SECURITY.md) page on GitHub. - -## Stay Informed - -### Announcements Mailing List - -All major releases and important announcements are sent to -[announce@tensorflow.org](https://groups.google.com/a/tensorflow.org/forum/#!forum/announce). -We recommend that you join this list if you depend on TensorFlow in any way. - -### Development Roadmap - -The [Roadmap](../community/roadmap.md) summarizes plans for upcoming additions to TensorFlow. 
- -### Social Media - -For news and updates from around the universe of TensorFlow projects, follow -[@tensorflow](https://twitter.com/tensorflow) on Twitter. - -### Blog - -We post regularly to the [TensorFlow Blog](http://blog.tensorflow.org/), -with content from the TensorFlow team and the best articles from the community. - -### YouTube - -Our [YouTube Channel](http://youtube.com/tensorflow/) focuses on machine learning -and AI with TensorFlow. On it we have a number of new shows, including: - -- TensorFlow Meets: meet with community contributors to learn and share what they're doing -- Ask TensorFlow: the team answers the best questions tagged #AskTensorFlow from social media -- Coding TensorFlow: short bites with tips for success with TensorFlow - -## Community Support - -### Mailing Lists - -For general discussion about TensorFlow development and direction, please join -the [TensorFlow discuss mailing -list](https://groups.google.com/a/tensorflow.org/d/forum/discuss). - -A number of other mailing lists exist, focused on different project areas, which -can be found at [TensorFlow Mailing Lists](../community/lists.md). - -### User Groups - -To meet with like-minded people local to you, check out the many -[TensorFlow user groups](../community/groups.md) around the world. - - -## Contributing To TensorFlow - -We welcome contributions and collaboration on TensorFlow. For more information, -please read [Contributing to TensorFlow](contributing.md). 
- diff --git a/tensorflow/docs_src/community/leftnav_files b/tensorflow/docs_src/community/leftnav_files deleted file mode 100644 index 0bd1f14de9..0000000000 --- a/tensorflow/docs_src/community/leftnav_files +++ /dev/null @@ -1,8 +0,0 @@ -index.md -roadmap.md -contributing.md -lists.md -groups.md -documentation.md -style_guide.md -benchmarks.md diff --git a/tensorflow/docs_src/community/lists.md b/tensorflow/docs_src/community/lists.md deleted file mode 100644 index bc2f573c29..0000000000 --- a/tensorflow/docs_src/community/lists.md +++ /dev/null @@ -1,53 +0,0 @@ -# Mailing Lists - -As a community, we do much of our collaboration on public mailing lists. -Please note that if you're looking for help using TensorFlow, [Stack -Overflow](https://stackoverflow.com/questions/tagged/tensorflow) and -[GitHub issues](https://github.com/tensorflow/tensorflow/issues) -are the best initial places to look. For more information, -see [how to get help](/community/#get_help). - -## General TensorFlow lists - -* [announce](https://groups.google.com/a/tensorflow.org/d/forum/announce) - Low-volume announcements of new releases. -* [discuss](https://groups.google.com/a/tensorflow.org/d/forum/discuss) - General community discussion around TensorFlow. -* [developers](https://groups.google.com/a/tensorflow.org/d/forum/developers) - Discussion for developers contributing to TensorFlow. - -## Project-specific lists - -These projects inside the TensorFlow GitHub organization have lists dedicated to their communities: - -* [hub](https://groups.google.com/a/tensorflow.org/d/forum/hub) - - Discussion and collaboration around [TensorFlow Hub](https://github.com/tensorflow/hub). -* [magenta-discuss](https://groups.google.com/a/tensorflow.org/d/forum/magenta-discuss) - - General discussion about [Magenta](https://magenta.tensorflow.org/) - development and directions. -* [swift](https://groups.google.com/a/tensorflow.org/d/forum/swift) - - Community and collaboration around Swift for TensorFlow. 
-* [tensor2tensor](https://groups.google.com/d/forum/tensor2tensor) - Discussion - and peer support for Tensor2Tensor. -* [tfjs-announce](https://groups.google.com/a/tensorflow.org/d/forum/tfjs-announce) - - Announcements of new TensorFlow.js releases. -* [tfjs](https://groups.google.com/a/tensorflow.org/d/forum/tfjs) - Discussion - and peer support for TensorFlow.js. -* [tflite](https://groups.google.com/a/tensorflow.org/d/forum/tflite) - Discussion and - peer support for TensorFlow Lite. -* [tfprobability](https://groups.google.com/a/tensorflow.org/d/forum/tfprobability) - Discussion and - peer support for TensorFlow Probability. -* [tpu-users](https://groups.google.com/a/tensorflow.org/d/forum/tpu-users) - Community discussion - and support for TPU users. - -## Special Interest Groups - -TensorFlow's [Special Interest -Groups](/community/contributing#special_interest_groups) (SIGs) support -community collaboration on particular project focuses. Members of these groups -work together to build and support TensorFlow related projects. While their -archives are public, different SIGs have their own membership policies. - -* [build](https://groups.google.com/a/tensorflow.org/d/forum/build) - - Supporting SIG Build, for build, distribution and packaging of TensorFlow. -* [sig-tensorboard](https://groups.google.com/a/tensorflow.org/d/forum/sig-tensorboard) - - Supporting SIG TensorBoard, for plugin development and other contribution. -* [rust](https://groups.google.com/a/tensorflow.org/d/forum/rust) - - Supporting SIG Rust, for the Rust language bindings. diff --git a/tensorflow/docs_src/community/roadmap.md b/tensorflow/docs_src/community/roadmap.md deleted file mode 100644 index d11b6ed467..0000000000 --- a/tensorflow/docs_src/community/roadmap.md +++ /dev/null @@ -1,123 +0,0 @@ -# Roadmap -**Last updated: Apr 27, 2018** - -TensorFlow is a rapidly moving, community supported project. 
This document is intended -to provide guidance about priorities and focus areas of the core set of TensorFlow -developers and about functionality that can be expected in the upcoming releases of -TensorFlow. Many of these areas are driven by community use cases, and we welcome -further -[contributions](https://github.com/tensorflow/tensorflow/blob/master/CONTRIBUTING.md) -to TensorFlow. - -The features below do not have concrete release dates. However, the majority can be -expected in the next one to two releases. - -### APIs -#### High Level APIs: -* Easy multi-GPU and TPU utilization with Estimators -* Easy-to-use high-level pre-made estimators for Gradient Boosted Trees, Time Series, and other models - -#### Eager Execution: -* Efficient utilization of multiple GPUs -* Distributed training support (multi-machine) -* Performance improvements -* Simpler export to a GraphDef/SavedModel - -#### Keras API: -* Better integration with tf.data (ability to call `model.fit` with data tensors) -* Full support for Eager Execution (both Eager support for the regular Keras API, and ability -to create Keras models Eager- style via Model subclassing) -* Better distribution/multi-GPU support and TPU support (including a smoother model-to-estimator workflow) - -#### Official Models: -* A set of -[models](https://github.com/tensorflow/models/tree/master/official) -across image recognition, speech, object detection, and - translation that demonstrate best practices and serve as a starting point for - high-performance model development. - -#### Contrib: -* Deprecate parts of tf.contrib where preferred implementations exist outside of tf.contrib. -* As much as possible, move large projects inside tf.contrib to separate repositories. -* The tf.contrib module will eventually be discontinued in its current form, experimental development will in future happen in other repositories. 
- - -#### Probabilistic Reasoning and Statistical Analysis: -* Rich set of tools for probabilistic and statistical analysis in tf.distributions - and tf.probability. These include new samplers, layers, optimizers, losses, and structured models -* Statistical tools for hypothesis testing, convergence diagnostics, and sample statistics -* Edward 2.0: High-level API for probabilistic programming - -### Platforms -#### TensorFlow Lite: -* Increase coverage of supported ops in TensorFlow Lite -* Easier conversion of a trained TensorFlow graph for use on TensorFlow Lite -* Support for GPU acceleration in TensorFlow Lite (iOS and Android) -* Support for hardware accelerators via Android NeuralNets API -* Improve CPU performance by quantization and other network optimizations (eg. pruning, distillation) -* Increase support for devices beyond Android and iOS (eg. RPi, Cortex-M) - -#### TensorFlow.js: -* Continue to expand support for importing TensorFlow SavedModels and Keras models into browser with unified APIs supporting retraining in browser -* Improve inference and training performance in both browser and Node.js environments -* Widen the collection of pre-built models in [tfjs-models](https://github.com/tensorflow/tfjs-models), - including but not limited to audio- and speech-oriented models -* Release tfjs-data API for efficient data input pipelines -* Integration with [TF-Hub](https://www.tensorflow.org/hub/) - -#### TensorFlow with Swift: -* Establish open source project including documentation, open design, and code availability. -* Continue implementing and refining implementation and design through 2018. -* Aim for implementation to be solid enough for general use later in 2018. - -### Performance -#### Distributed TensorFlow: -* Optimize Multi-GPU support for a variety of GPU topologies -* Improve mechanisms for distributing computations on several machines - -#### GPU Optimizations: -* Simplify mixed precision API with initial example model and guide. 
-* Finalize TensorRT API and move to core. -* CUDA 9.2 and NCCL 2.x default in TensorFlow builds. -* Optimizations for DGX-2. -* Remove support for CUDA less than 8.x and cuDNN less than 6.x. - - -#### CPU Optimizations -* Int8 support for SkyLake via MKL -* Dynamic loading of SIMD-optimized kernels -* MKL for Linux and Windows - -### End-to-end ML systems: -#### TensorFlow Hub: -* Expand support for module-types in TF Hub with TF Eager integration, Keras layers integration, and TensorFlow.js integration -* Accept variable-sized image input -* Improve multi-GPU estimator support -* Document and improve TPU integration - -#### TensorFlow Extended: -* Open source more of the TensorFlow Extended platform to facilitate adoption of TensorFlow in production settings. -* Release TFX libraries for Data Validation - -### Documentation and Resources: -* Update documentation, tutorials and Getting Started guides on all features and APIs -* Update [Youtube Tensorflow channel](https://youtube.com/tensorflow) weekly with new content: -Coding TensorFlow - where we teach folks coding with tensorflow -TensorFlow Meets - where we highlight community contributions -Ask TensorFlow - where we answer community questions -Guest and Showcase videos -* Update [Official TensorFlow blog](https://blog.tensorflow.org) with regular articles from Google team and the Community - - -### Community and Partner Engagement -#### Special Interest Groups: -* Mobilize the community to work together in focused domains -* [tf-distribute](https://groups.google.com/a/tensorflow.org/forum/#!forum/tf-distribute): build and packaging of TensorFlow -* SIG TensorBoard, SIG Rust, and more to be identified and launched - -#### Community: -* Incorporate public feedback on significant design decisions via a Request-for-Comment (RFC) process -* Formalize process for external contributions to land in TensorFlow and associated projects -* Grow global TensorFlow communities and user groups -* Collaborate with partners to 
co-develop and publish research papers -* Process to enable external contributions to tutorials, documentation, and blogs showcasing best practice use-cases of TensorFlow and high-impact applications diff --git a/tensorflow/docs_src/community/style_guide.md b/tensorflow/docs_src/community/style_guide.md deleted file mode 100644 index c78da20edd..0000000000 --- a/tensorflow/docs_src/community/style_guide.md +++ /dev/null @@ -1,136 +0,0 @@ -# TensorFlow Style Guide - -This page contains style decisions that both developers and users of TensorFlow -should follow to increase the readability of their code, reduce the number of -errors, and promote consistency. - -[TOC] - -## Python style - -Generally follow -[PEP8 Python style guide](https://www.python.org/dev/peps/pep-0008/), -except for using 2 spaces. - - -## Python 2 and 3 compatible - -* All code needs to be compatible with Python 2 and 3. - -* Next lines should be present in all Python files: - -``` -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -``` - -* Use `six` to write compatible code (for example `six.moves.range`). - - -## Bazel BUILD rules - -TensorFlow uses Bazel build system and enforces next requirements: - -* Every BUILD file should contain next header: - -``` -# Description: -# <...> - -package( - default_visibility = ["//visibility:private"], -) - -licenses(["notice"]) # Apache 2.0 - -exports_files(["LICENSE"]) -``` - - - -* For all Python BUILD targets (libraries and tests) add next line: - -``` -srcs_version = "PY2AND3", -``` - - -## Tensor - -* Operations that deal with batches may assume that the first dimension of a Tensor is the batch dimension. - -* In most models the *last dimension* is the number of channels. - -* Dimensions excluding the first and last usually make up the "space" dimensions: Sequence-length or Image-size. 
- -## Python operations - -A *Python operation* is a function that, given input tensors and parameters, -creates a part of the graph and returns output tensors. - -* The first arguments should be tensors, followed by basic python parameters. - The last argument is `name` with a default value of `None`. - If operation needs to save some `Tensor`s to Graph collections, - put the arguments with names of the collections right before `name` argument. - -* Tensor arguments should be either a single tensor or an iterable of tensors. - E.g. a "Tensor or list of Tensors" is too broad. See `assert_proper_iterable`. - -* Operations that take tensors as arguments should call `convert_to_tensor` - to convert non-tensor inputs into tensors if they are using C++ operations. - Note that the arguments are still described as a `Tensor` object - of a specific dtype in the documentation. - -* Each Python operation should have a `name_scope` like below. Pass as - arguments `name`, a default name of the op, and a list of the input tensors. - -* Operations should contain an extensive Python comment with Args and Returns - declarations that explain both the type and meaning of each value. Possible - shapes, dtypes, or ranks should be specified in the description. - [See documentation details](../community/documentation.md) - -* For increased usability include an example of usage with inputs / outputs - of the op in Example section. - -Example: - - def my_op(tensor_in, other_tensor_in, my_param, other_param=0.5, - output_collections=(), name=None): - """My operation that adds two tensors with given coefficients. - - Args: - tensor_in: `Tensor`, input tensor. - other_tensor_in: `Tensor`, same shape as `tensor_in`, other input tensor. - my_param: `float`, coefficient for `tensor_in`. - other_param: `float`, coefficient for `other_tensor_in`. - output_collections: `tuple` of `string`s, name of the collection to - collect result of this op. - name: `string`, name of the operation. 
- - Returns: - `Tensor` of same shape as `tensor_in`, sum of input values with coefficients. - - Example: - >>> my_op([1., 2.], [3., 4.], my_param=0.5, other_param=0.6, - output_collections=['MY_OPS'], name='add_t1t2') - [2.3, 3.4] - """ - with tf.name_scope(name, "my_op", [tensor_in, other_tensor_in]): - tensor_in = tf.convert_to_tensor(tensor_in) - other_tensor_in = tf.convert_to_tensor(other_tensor_in) - result = my_param * tensor_in + other_param * other_tensor_in - tf.add_to_collection(output_collections, result) - return result - -Usage: - - output = my_op(t1, t2, my_param=0.5, other_param=0.6, - output_collections=['MY_OPS'], name='add_t1t2') - - -## Layers - -Use `tf.keras.layers`, not `tf.layers`. - -See `tf.keras.layers` and [the Keras guide](../guide/keras.md#custom_layers) for details on how to sub-class layers. diff --git a/tensorflow/docs_src/deploy/deploy_to_js.md b/tensorflow/docs_src/deploy/deploy_to_js.md deleted file mode 100644 index d7ce3ea90b..0000000000 --- a/tensorflow/docs_src/deploy/deploy_to_js.md +++ /dev/null @@ -1,4 +0,0 @@ -# Deploy to JavaScript - -You can find details about deploying JavaScript TensorFlow programs -in the separate [js.tensorflow.org site](https://js.tensorflow.org). diff --git a/tensorflow/docs_src/deploy/distributed.md b/tensorflow/docs_src/deploy/distributed.md deleted file mode 100644 index 2fba36cfa7..0000000000 --- a/tensorflow/docs_src/deploy/distributed.md +++ /dev/null @@ -1,354 +0,0 @@ -# Distributed TensorFlow - -This document shows how to create a cluster of TensorFlow servers, and how to -distribute a computation graph across that cluster. We assume that you are -familiar with the [basic concepts](../guide/low_level_intro.md) of -writing low level TensorFlow programs. - -## Hello distributed TensorFlow! - -To see a simple TensorFlow cluster in action, execute the following: - -```shell -# Start a TensorFlow server as a single-process "cluster". 
-$ python ->>> import tensorflow as tf ->>> c = tf.constant("Hello, distributed TensorFlow!") ->>> server = tf.train.Server.create_local_server() ->>> sess = tf.Session(server.target) # Create a session on the server. ->>> sess.run(c) -'Hello, distributed TensorFlow!' -``` - -The -`tf.train.Server.create_local_server` -method creates a single-process cluster, with an in-process server. - -## Create a cluster - -
- -
- -A TensorFlow "cluster" is a set of "tasks" that participate in the distributed -execution of a TensorFlow graph. Each task is associated with a TensorFlow -"server", which contains a "master" that can be used to create sessions, and a -"worker" that executes operations in the graph. A cluster can also be divided -into one or more "jobs", where each job contains one or more tasks. - -To create a cluster, you start one TensorFlow server per task in the cluster. -Each task typically runs on a different machine, but you can run multiple tasks -on the same machine (e.g. to control different GPU devices). In each task, do -the following: - -1. **Create a `tf.train.ClusterSpec`** that describes all of the tasks - in the cluster. This should be the same for each task. - -2. **Create a `tf.train.Server`**, passing the `tf.train.ClusterSpec` to - the constructor, and identifying the local task with a job name - and task index. - - -### Create a `tf.train.ClusterSpec` to describe the cluster - -The cluster specification dictionary maps job names to lists of network -addresses. Pass this dictionary to -the `tf.train.ClusterSpec` -constructor. For example: - - - - - - - - - - -
 `tf.train.ClusterSpec` construction | Available tasks
-tf.train.ClusterSpec({"local": ["localhost:2222", "localhost:2223"]})
-
/job:local/task:0
/job:local/task:1
-tf.train.ClusterSpec({
-    "worker": [
-        "worker0.example.com:2222",
-        "worker1.example.com:2222",
-        "worker2.example.com:2222"
-    ],
-    "ps": [
-        "ps0.example.com:2222",
-        "ps1.example.com:2222"
-    ]})
-
/job:worker/task:0
/job:worker/task:1
/job:worker/task:2
/job:ps/task:0
/job:ps/task:1
- -### Create a `tf.train.Server` instance in each task - -A `tf.train.Server` object contains a -set of local devices, a set of connections to other tasks in its -`tf.train.ClusterSpec`, and a -`tf.Session` that can use these -to perform a distributed computation. Each server is a member of a specific -named job and has a task index within that job. A server can communicate with -any other server in the cluster. - -For example, to launch a cluster with two servers running on `localhost:2222` -and `localhost:2223`, run the following snippets in two different processes on -the local machine: - -```python -# In task 0: -cluster = tf.train.ClusterSpec({"local": ["localhost:2222", "localhost:2223"]}) -server = tf.train.Server(cluster, job_name="local", task_index=0) -``` -```python -# In task 1: -cluster = tf.train.ClusterSpec({"local": ["localhost:2222", "localhost:2223"]}) -server = tf.train.Server(cluster, job_name="local", task_index=1) -``` - -**Note:** Manually specifying these cluster specifications can be tedious, -especially for large clusters. We are working on tools for launching tasks -programmatically, e.g. using a cluster manager like -[Kubernetes](http://kubernetes.io). If there are particular cluster managers for -which you'd like to see support, please raise a -[GitHub issue](https://github.com/tensorflow/tensorflow/issues). - -## Specifying distributed devices in your model - -To place operations on a particular process, you can use the same -`tf.device` -function that is used to specify whether ops run on the CPU or GPU. For example: - -```python -with tf.device("/job:ps/task:0"): - weights_1 = tf.Variable(...) - biases_1 = tf.Variable(...) - -with tf.device("/job:ps/task:1"): - weights_2 = tf.Variable(...) - biases_2 = tf.Variable(...) - -with tf.device("/job:worker/task:7"): - input, labels = ... - layer_1 = tf.nn.relu(tf.matmul(input, weights_1) + biases_1) - logits = tf.nn.relu(tf.matmul(layer_1, weights_2) + biases_2) - # ... - train_op = ... 
- -with tf.Session("grpc://worker7.example.com:2222") as sess: - for _ in range(10000): - sess.run(train_op) -``` - -In the above example, the variables are created on two tasks in the `ps` job, -and the compute-intensive part of the model is created in the `worker` -job. TensorFlow will insert the appropriate data transfers between the jobs -(from `ps` to `worker` for the forward pass, and from `worker` to `ps` for -applying gradients). - -## Replicated training - -A common training configuration, called "data parallelism," involves multiple -tasks in a `worker` job training the same model on different mini-batches of -data, updating shared parameters hosted in one or more tasks in a `ps` -job. All tasks typically run on different machines. There are many ways to -specify this structure in TensorFlow, and we are building libraries that will -simplify the work of specifying a replicated model. Possible approaches include: - -* **In-graph replication.** In this approach, the client builds a single - `tf.Graph` that contains one set of parameters (in `tf.Variable` nodes pinned - to `/job:ps`); and multiple copies of the compute-intensive part of the model, - each pinned to a different task in `/job:worker`. - -* **Between-graph replication.** In this approach, there is a separate client - for each `/job:worker` task, typically in the same process as the worker - task. Each client builds a similar graph containing the parameters (pinned to - `/job:ps` as before using - `tf.train.replica_device_setter` - to map them deterministically to the same tasks); and a single copy of the - compute-intensive part of the model, pinned to the local task in - `/job:worker`. - -* **Asynchronous training.** In this approach, each replica of the graph has an - independent training loop that executes without coordination. It is compatible - with both forms of replication above. 
- -* **Synchronous training.** In this approach, all of the replicas read the same - values for the current parameters, compute gradients in parallel, and then - apply them together. It is compatible with in-graph replication (e.g. using - gradient averaging as in the - [CIFAR-10 multi-GPU trainer](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/cifar10_multi_gpu_train.py)), - and between-graph replication (e.g. using the - `tf.train.SyncReplicasOptimizer`). - -### Putting it all together: example trainer program - -The following code shows the skeleton of a distributed trainer program, -implementing **between-graph replication** and **asynchronous training**. It -includes the code for the parameter server and worker tasks. - -```python -import argparse -import sys - -import tensorflow as tf - -FLAGS = None - - -def main(_): - ps_hosts = FLAGS.ps_hosts.split(",") - worker_hosts = FLAGS.worker_hosts.split(",") - - # Create a cluster from the parameter server and worker hosts. - cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts}) - - # Create and start a server for the local task. - server = tf.train.Server(cluster, - job_name=FLAGS.job_name, - task_index=FLAGS.task_index) - - if FLAGS.job_name == "ps": - server.join() - elif FLAGS.job_name == "worker": - - # Assigns ops to the local worker by default. - with tf.device(tf.train.replica_device_setter( - worker_device="/job:worker/task:%d" % FLAGS.task_index, - cluster=cluster)): - - # Build model... - loss = ... - global_step = tf.contrib.framework.get_or_create_global_step() - - train_op = tf.train.AdagradOptimizer(0.01).minimize( - loss, global_step=global_step) - - # The StopAtStepHook handles stopping after running given steps. - hooks=[tf.train.StopAtStepHook(last_step=1000000)] - - # The MonitoredTrainingSession takes care of session initialization, - # restoring from a checkpoint, saving to a checkpoint, and closing when done - # or an error occurs. 
- with tf.train.MonitoredTrainingSession(master=server.target, - is_chief=(FLAGS.task_index == 0), - checkpoint_dir="/tmp/train_logs", - hooks=hooks) as mon_sess: - while not mon_sess.should_stop(): - # Run a training step asynchronously. - # See `tf.train.SyncReplicasOptimizer` for additional details on how to - # perform *synchronous* training. - # mon_sess.run handles AbortedError in case of preempted PS. - mon_sess.run(train_op) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.register("type", "bool", lambda v: v.lower() == "true") - # Flags for defining the tf.train.ClusterSpec - parser.add_argument( - "--ps_hosts", - type=str, - default="", - help="Comma-separated list of hostname:port pairs" - ) - parser.add_argument( - "--worker_hosts", - type=str, - default="", - help="Comma-separated list of hostname:port pairs" - ) - parser.add_argument( - "--job_name", - type=str, - default="", - help="One of 'ps', 'worker'" - ) - # Flags for defining the tf.train.Server - parser.add_argument( - "--task_index", - type=int, - default=0, - help="Index of task within the job" - ) - FLAGS, unparsed = parser.parse_known_args() - tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) -``` - -To start the trainer with two parameter servers and two workers, use the -following command line (assuming the script is called `trainer.py`): - -```shell -# On ps0.example.com: -$ python trainer.py \ - --ps_hosts=ps0.example.com:2222,ps1.example.com:2222 \ - --worker_hosts=worker0.example.com:2222,worker1.example.com:2222 \ - --job_name=ps --task_index=0 -# On ps1.example.com: -$ python trainer.py \ - --ps_hosts=ps0.example.com:2222,ps1.example.com:2222 \ - --worker_hosts=worker0.example.com:2222,worker1.example.com:2222 \ - --job_name=ps --task_index=1 -# On worker0.example.com: -$ python trainer.py \ - --ps_hosts=ps0.example.com:2222,ps1.example.com:2222 \ - --worker_hosts=worker0.example.com:2222,worker1.example.com:2222 \ - --job_name=worker --task_index=0 
-# On worker1.example.com: -$ python trainer.py \ - --ps_hosts=ps0.example.com:2222,ps1.example.com:2222 \ - --worker_hosts=worker0.example.com:2222,worker1.example.com:2222 \ - --job_name=worker --task_index=1 -``` - -## Glossary - -**Client** - -A client is typically a program that builds a TensorFlow graph and constructs a -`tensorflow::Session` to interact with a cluster. Clients are typically written -in Python or C++. A single client process can directly interact with multiple -TensorFlow servers (see "Replicated training" above), and a single server can -serve multiple clients. - -**Cluster** - -A TensorFlow cluster comprises one or more "jobs", each divided into lists of -one or more "tasks". A cluster is typically dedicated to a particular high-level -objective, such as training a neural network, using many machines in parallel. A -cluster is defined by -a `tf.train.ClusterSpec` object. - -**Job** - -A job comprises a list of "tasks", which typically serve a common purpose. -For example, a job named `ps` (for "parameter server") typically hosts nodes -that store and update variables; while a job named `worker` typically hosts -stateless nodes that perform compute-intensive tasks. The tasks in a job -typically run on different machines. The set of job roles is flexible: -for example, a `worker` may maintain some state. - -**Master service** - -An RPC service that provides remote access to a set of distributed devices, -and acts as a session target. The master service implements the -`tensorflow::Session` interface, and is responsible for coordinating work across -one or more "worker services". All TensorFlow servers implement the master -service. - -**Task** - -A task corresponds to a specific TensorFlow server, and typically corresponds -to a single process. A task belongs to a particular "job" and is identified by -its index within that job's list of tasks. 
- -**TensorFlow server** A process running -a `tf.train.Server` instance, which is -a member of a cluster, and exports a "master service" and "worker service". - -**Worker service** - -An RPC service that executes parts of a TensorFlow graph using its local devices. -A worker service implements [worker_service.proto](https://www.tensorflow.org/code/tensorflow/core/protobuf/worker_service.proto). -All TensorFlow servers implement the worker service. diff --git a/tensorflow/docs_src/deploy/hadoop.md b/tensorflow/docs_src/deploy/hadoop.md deleted file mode 100644 index b0d416df2e..0000000000 --- a/tensorflow/docs_src/deploy/hadoop.md +++ /dev/null @@ -1,65 +0,0 @@ -# How to run TensorFlow on Hadoop - -This document describes how to run TensorFlow on Hadoop. It will be expanded to -describe running on various cluster managers, but only describes running on HDFS -at the moment. - -## HDFS - -We assume that you are familiar with [reading data](../api_guides/python/reading_data.md). - -To use HDFS with TensorFlow, change the file paths you use to read and write -data to an HDFS path. For example: - -```python -filename_queue = tf.train.string_input_producer([ - "hdfs://namenode:8020/path/to/file1.csv", - "hdfs://namenode:8020/path/to/file2.csv", -]) -``` - -If you want to use the namenode specified in your HDFS configuration files, then -change the file prefix to `hdfs://default/`. - -When launching your TensorFlow program, the following environment variables must -be set: - -* **JAVA_HOME**: The location of your Java installation. -* **HADOOP_HDFS_HOME**: The location of your HDFS installation. You can also - set this environment variable by running: - - ```shell - source ${HADOOP_HOME}/libexec/hadoop-config.sh - ``` - -* **LD_LIBRARY_PATH**: To include the path to libjvm.so, and optionally the path - to libhdfs.so if your Hadoop distribution does not install libhdfs.so in - `$HADOOP_HDFS_HOME/lib/native`. 
On Linux: - - ```shell - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${JAVA_HOME}/jre/lib/amd64/server - ``` - -* **CLASSPATH**: The Hadoop jars must be added prior to running your - TensorFlow program. The CLASSPATH set by - `${HADOOP_HOME}/libexec/hadoop-config.sh` is insufficient. Globs must be - expanded as described in the libhdfs documentation: - - ```shell - CLASSPATH=$(${HADOOP_HDFS_HOME}/bin/hadoop classpath --glob) python your_script.py - ``` - For older version of Hadoop/libhdfs (older than 2.6.0), you have to expand the - classpath wildcard manually. For more details, see - [HADOOP-10903](https://issues.apache.org/jira/browse/HADOOP-10903). - -If the Hadoop cluster is in secure mode, the following environment variable must -be set: - -* **KRB5CCNAME**: The path of Kerberos ticket cache file. For example: - - ```shell - export KRB5CCNAME=/tmp/krb5cc_10002 - ``` - -If you are running [Distributed TensorFlow](../deploy/distributed.md), then all -workers must have the environment variables set and Hadoop installed. diff --git a/tensorflow/docs_src/deploy/index.md b/tensorflow/docs_src/deploy/index.md deleted file mode 100644 index 08b28de639..0000000000 --- a/tensorflow/docs_src/deploy/index.md +++ /dev/null @@ -1,21 +0,0 @@ -# Deploy - -This section focuses on deploying real-world models. It contains -the following documents: - - * [Distributed TensorFlow](../deploy/distributed.md), which explains how to create - a cluster of TensorFlow servers. - * [How to run TensorFlow on Hadoop](../deploy/hadoop.md), which has a highly - self-explanatory title. - * [How to run TensorFlow with the S3 filesystem](../deploy/s3.md), which explains how - to run TensorFlow with the S3 file system. - * The entire document set for [TensorFlow serving](/serving), an open-source, - flexible, high-performance serving system for machine-learned models - designed for production environments. TensorFlow Serving provides - out-of-the-box integration with TensorFlow models. 
- [Source code for TensorFlow Serving](https://github.com/tensorflow/serving) - is available on GitHub. - -[TensorFlow Extended (TFX)](/tfx) is an end-to-end machine learning platform for -TensorFlow. Implemented at Google, we've open sourced some TFX libraries with the -rest of the system to come. diff --git a/tensorflow/docs_src/deploy/leftnav_files b/tensorflow/docs_src/deploy/leftnav_files deleted file mode 100644 index 93f5bd1ed2..0000000000 --- a/tensorflow/docs_src/deploy/leftnav_files +++ /dev/null @@ -1,5 +0,0 @@ -index.md -distributed.md -hadoop.md -s3.md -deploy_to_js.md diff --git a/tensorflow/docs_src/deploy/s3.md b/tensorflow/docs_src/deploy/s3.md deleted file mode 100644 index b4a759d687..0000000000 --- a/tensorflow/docs_src/deploy/s3.md +++ /dev/null @@ -1,93 +0,0 @@ -# How to run TensorFlow on S3 - -TensorFlow supports reading and writing data to S3. S3 is an object storage API which is nearly ubiquitous, and can help in situations where data must be accessed by multiple actors, such as in distributed training. - -This document guides you through the required setup, and provides examples on usage. - -## Configuration - -When reading or writing data on S3 with your TensorFlow program, the behavior -can be controlled by various environmental variables: - -* **AWS_REGION**: By default, regional endpoint is used for S3, with region - controlled by `AWS_REGION`. If `AWS_REGION` is not specified, then - `us-east-1` is used. -* **S3_ENDPOINT**: The endpoint could be overridden explicitly with - `S3_ENDPOINT` specified. -* **S3_USE_HTTPS**: HTTPS is used to access S3 by default, unless - `S3_USE_HTTPS=0`. -* **S3_VERIFY_SSL**: If HTTPS is used, SSL verification could be disabled - with `S3_VERIFY_SSL=0`.
- -To read or write objects in a bucket that is not publicly accessible, -AWS credentials must be provided through one of the following methods: - -* Set credentials in the AWS credentials profile file on the local system, - located at: `~/.aws/credentials` on Linux, macOS, or Unix, or - `C:\Users\USERNAME\.aws\credentials` on Windows. -* Set the `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment - variables. -* If TensorFlow is deployed on an EC2 instance, specify an IAM role and then - give the EC2 instance access to that role. - -## Example Setup - -Using the above information, we can configure Tensorflow to communicate to an S3 endpoint by setting the following environment variables: - -```bash -AWS_ACCESS_KEY_ID=XXXXX # Credentials only needed if connecting to a private endpoint -AWS_SECRET_ACCESS_KEY=XXXXX -AWS_REGION=us-east-1 # Region for the S3 bucket, this is not always needed. Default is us-east-1. -S3_ENDPOINT=s3.us-east-1.amazonaws.com # The S3 API Endpoint to connect to. This is specified in a HOST:PORT format. -S3_USE_HTTPS=1 # Whether or not to use HTTPS. Disable with 0. -S3_VERIFY_SSL=1 # If HTTPS is used, controls if SSL should be enabled. Disable with 0. -``` - -## Usage - -Once setup is completed, Tensorflow can interact with S3 in a variety of ways. Anywhere there is a Tensorflow IO function, an S3 URL can be used. - -### Smoke Test - -To test your setup, stat a file: - -```python -from tensorflow.python.lib.io import file_io -print file_io.stat('s3://bucketname/path/') -``` - -You should see output similar to this: - -```console - > -``` - -### Reading Data - -When [reading data](../api_guides/python/reading_data.md), change the file paths you use to read and write -data to an S3 path. 
For example: - -```python -filenames = ["s3://bucketname/path/to/file1.tfrecord", - "s3://bucketname/path/to/file2.tfrecord"] -dataset = tf.data.TFRecordDataset(filenames) -``` - -### Tensorflow Tools - -Many Tensorflow tools, such as Tensorboard or model serving, can also take S3 URLS as arguments: - -```bash -tensorboard --logdir s3://bucketname/path/to/model/ -tensorflow_model_server --port=9000 --model_name=model --model_base_path=s3://bucketname/path/to/model/export/ -``` - -This enables an end to end workflow using S3 for all data needs. - -## S3 Endpoint Implementations - -S3 was invented by Amazon, but the S3 API has spread in popularity and has several implementations. The following implementations have passed basic compatibility tests: - -* [Amazon S3](https://aws.amazon.com/s3/) -* [Google Storage](https://cloud.google.com/storage/docs/interoperability) -* [Minio](https://www.minio.io/kubernetes.html) diff --git a/tensorflow/docs_src/extend/add_filesys.md b/tensorflow/docs_src/extend/add_filesys.md deleted file mode 100644 index 5f8ac64d25..0000000000 --- a/tensorflow/docs_src/extend/add_filesys.md +++ /dev/null @@ -1,260 +0,0 @@ -# Adding a Custom Filesystem Plugin - -## Background - -The TensorFlow framework is often used in multi-process and -multi-machine environments, such as Google data centers, Google Cloud -Machine Learning, Amazon Web Services (AWS), and on-site distributed clusters. -In order to both share and save certain types of state produced by TensorFlow, -the framework assumes the existence of a reliable, shared filesystem. This -shared filesystem has numerous uses, for example: - -* Checkpoints of state are often saved to a distributed filesystem for - reliability and fault-tolerance. -* Training processes communicate with TensorBoard by writing event files - to a directory, which TensorBoard watches. A shared filesystem allows this - communication to work even when TensorBoard runs in a different process or - machine. 
- -There are many different implementations of shared or distributed filesystems in -the real world, so TensorFlow provides an ability for users to implement a -custom FileSystem plugin that can be registered with the TensorFlow runtime. -When the TensorFlow runtime attempts to write to a file through the `FileSystem` -interface, it uses a portion of the pathname to dynamically select the -implementation that should be used for filesystem operations. Thus, adding -support for your custom filesystem requires implementing a `FileSystem` -interface, building a shared object containing that implementation, and loading -that object at runtime in whichever process needs to write to that filesystem. - -Note that TensorFlow already includes many filesystem implementations, such as: - -* A standard POSIX filesystem - - Note: NFS filesystems often mount as a POSIX interface, and so standard - TensorFlow can work on top of NFS-mounted remote filesystems. - -* HDFS - the Hadoop File System -* GCS - Google Cloud Storage filesystem -* S3 - Amazon Simple Storage Service filesystem -* A "memory-mapped-file" filesystem - -The rest of this guide describes how to implement a custom filesystem. - -## Implementing a custom filesystem plugin - -To implement a custom filesystem plugin, you must do the following: - -* Implement subclasses of `RandomAccessFile`, `WritableFile`, - `AppendableFile`, and `ReadOnlyMemoryRegion`. -* Implement the `FileSystem` interface as a subclass. -* Register the `FileSystem` implementation with an appropriate prefix pattern. -* Load the filesystem plugin in a process that wants to write to that - filesystem. - -### The FileSystem interface - -The `FileSystem` interface is an abstract C++ interface defined in -[file_system.h](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/platform/file_system.h). -An implementation of the `FileSystem` interface should implement all the -relevant methods defined by the interface.
Implementing the interface requires -defining operations such as creating `RandomAccessFile`, `WritableFile`, and -implementing standard filesystem operations such as `FileExists`, `IsDirectory`, -`GetMatchingPaths`, `DeleteFile`, and so on. An implementation of these -interfaces will often involve translating the function's input arguments to -delegate to an already-existing library function implementing the equivalent -functionality in your custom filesystem. - -For example, the `PosixFileSystem` implementation implements `DeleteFile` using -the POSIX `unlink()` function; `CreateDir` simply calls `mkdir()`; `GetFileSize` -involves calling `stat()` on the file and then returns the filesize as reported -by the return of the stat object. Similarly, for the `HDFSFileSystem` -implementation, these calls simply delegate to the `libHDFS` implementation of -similar functionality, such as `hdfsDelete` for -[DeleteFile](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/platform/hadoop/hadoop_file_system.cc#L386). - -We suggest looking through these code examples to get an idea of how different -filesystem implementations call their existing libraries. 
Examples include: - -* [POSIX - plugin](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/platform/posix/posix_file_system.h) -* [HDFS - plugin](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/platform/hadoop/hadoop_file_system.h) -* [GCS - plugin](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/platform/cloud/gcs_file_system.h) -* [S3 - plugin](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/platform/s3/s3_file_system.h) - -#### The File interfaces - -Beyond operations that allow you to query and manipulate files and directories -in a filesystem, the `FileSystem` interface requires you to implement factories -that return implementations of abstract objects such as the -[RandomAccessFile](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/platform/file_system.h#L223), -the `WritableFile`, so that TensorFlow code can read and write to files in that -`FileSystem` implementation. - -To implement a `RandomAccessFile`, you must implement a single interface called -`Read()`, in which the implementation must provide a way to read from an offset -within a named file. - -For example, below is the implementation of RandomAccessFile for the POSIX -filesystem, which uses the `pread()` random-access POSIX function to implement -read. Notice that the particular implementation must know how to retry or -propagate errors from the underlying filesystem.
- -```C++ - class PosixRandomAccessFile : public RandomAccessFile { - public: - PosixRandomAccessFile(const string& fname, int fd) - : filename_(fname), fd_(fd) {} - ~PosixRandomAccessFile() override { close(fd_); } - - Status Read(uint64 offset, size_t n, StringPiece* result, - char* scratch) const override { - Status s; - char* dst = scratch; - while (n > 0 && s.ok()) { - ssize_t r = pread(fd_, dst, n, static_cast(offset)); - if (r > 0) { - dst += r; - n -= r; - offset += r; - } else if (r == 0) { - s = Status(error::OUT_OF_RANGE, "Read less bytes than requested"); - } else if (errno == EINTR || errno == EAGAIN) { - // Retry - } else { - s = IOError(filename_, errno); - } - } - *result = StringPiece(scratch, dst - scratch); - return s; - } - - private: - string filename_; - int fd_; - }; -``` - -To implement the WritableFile sequential-writing abstraction, one must implement -a few interfaces, such as `Append()`, `Flush()`, `Sync()`, and `Close()`. - -For example, below is the implementation of WritableFile for the POSIX -filesystem, which takes a `FILE` object in its constructor and uses standard -posix functions on that object to implement the interface. 
- -```C++ - class PosixWritableFile : public WritableFile { - public: - PosixWritableFile(const string& fname, FILE* f) - : filename_(fname), file_(f) {} - - ~PosixWritableFile() override { - if (file_ != NULL) { - fclose(file_); - } - } - - Status Append(const StringPiece& data) override { - size_t r = fwrite(data.data(), 1, data.size(), file_); - if (r != data.size()) { - return IOError(filename_, errno); - } - return Status::OK(); - } - - Status Close() override { - Status result; - if (fclose(file_) != 0) { - result = IOError(filename_, errno); - } - file_ = NULL; - return result; - } - - Status Flush() override { - if (fflush(file_) != 0) { - return IOError(filename_, errno); - } - return Status::OK(); - } - - Status Sync() override { - Status s; - if (fflush(file_) != 0) { - s = IOError(filename_, errno); - } - return s; - } - - private: - string filename_; - FILE* file_; - }; - -``` - -For more details, please see the documentations of those interfaces, and look at -example implementations for inspiration. - -### Registering and loading the filesystem - -Once you have implemented the `FileSystem` implementation for your custom -filesystem, you need to register it under a "scheme" so that paths prefixed with -that scheme are directed to your implementation. To do this, you call -`REGISTER_FILE_SYSTEM`:: - -``` - REGISTER_FILE_SYSTEM("foobar", FooBarFileSystem); -``` - -When TensorFlow tries to operate on a file whose path starts with `foobar://`, -it will use the `FooBarFileSystem` implementation. - -```C++ - string filename = "foobar://path/to/file.txt"; - std::unique_ptr file; - - // Calls FooBarFileSystem::NewWritableFile to return - // a WritableFile class, which happens to be the FooBarFileSystem's - // WritableFile implementation. - TF_RETURN_IF_ERROR(env->NewWritableFile(filename, &file)); -``` - -Next, you must build a shared object containing this implementation. 
An example -of doing so using bazel's `cc_binary` rule can be found -[here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/BUILD#L244), -but you may use any build system to do so. See the section on [building the op library](../extend/adding_an_op.md#build_the_op_library) for similar -instructions. - -The result of building this target is a `.so` shared object file. - -Lastly, you must dynamically load this implementation in the process. In Python, -you can call the `tf.load_file_system_library(file_system_library)` function, -passing the path to the shared object. Calling this in your client program loads -the shared object in the process, thus registering your implementation as -available for any file operations going through the `FileSystem` interface. You -can see -[file_system_test.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/framework/file_system_test.py) -for an example. - -## What goes through this interface? - -Almost all core C++ file operations within TensorFlow use the `FileSystem` -interface, such as the `CheckpointWriter`, the `EventsWriter`, and many other -utilities. This means implementing a `FileSystem` implementation allows most of -your TensorFlow programs to write to your shared filesystem. - -In Python, the `gfile` and `file_io` classes bind underneath to the `FileSystem` -implementation via SWIG, which means that once you have loaded this filesystem -library, you can do: - -``` -with gfile.Open("foobar://path/to/file.txt") as w: - - w.write("hi") -``` - -When you do this, a file containing "hi" will appear in the "/path/to/file.txt" -of your shared filesystem.
diff --git a/tensorflow/docs_src/extend/adding_an_op.md b/tensorflow/docs_src/extend/adding_an_op.md deleted file mode 100644 index cc25ab9b45..0000000000 --- a/tensorflow/docs_src/extend/adding_an_op.md +++ /dev/null @@ -1,1460 +0,0 @@ -# Adding a New Op - -Note: By default [www.tensorflow.org](https://www.tensorflow.org) shows docs for the -most recent stable version. The instructions in this doc require building from -source. You will probably want to build from the `master` version of tensorflow. -You should, as a result, be sure you are following the -[`master` version of this doc](https://www.tensorflow.org/versions/master/extend/adding_an_op), -in case there have been any changes. - -If you'd like to create an op that isn't covered by the existing TensorFlow -library, we recommend that you first try writing the op in Python as -a composition of existing Python ops or functions. If that isn't possible, you -can create a custom C++ op. There are several reasons why you might want to -create a custom C++ op: - -* It's not easy or possible to express your operation as a composition of - existing ops. -* It's not efficient to express your operation as a composition of existing - primitives. -* You want to hand-fuse a composition of primitives that a future compiler - would find difficult fusing. - -For example, imagine you want to implement something like "median pooling", -similar to the "MaxPool" operator, but computing medians over sliding windows -instead of maximum values. Doing this using a composition of operations may be -possible (e.g., using ExtractImagePatches and TopK), but may not be as -performance- or memory-efficient as a native operation where you can do -something more clever in a single, fused operation. As always, it is typically -first worth trying to express what you want using operator composition, only -choosing to add a new operation if that proves to be difficult or inefficient. - -To incorporate your custom op you'll need to: - -1. 
Register the new op in a C++ file. Op registration defines an interface - (specification) for the op's functionality, which is independent of the - op's implementation. For example, op registration defines the op's name and - the op's inputs and outputs. It also defines the shape function - that is used for tensor shape inference. -2. Implement the op in C++. The implementation of an op is known - as a kernel, and it is the concrete implementation of the specification you - registered in Step 1. There can be multiple kernels for different input / - output types or architectures (for example, CPUs, GPUs). -3. Create a Python wrapper (optional). This wrapper is the public API that's - used to create the op in Python. A default wrapper is generated from the - op registration, which can be used directly or added to. -4. Write a function to compute gradients for the op (optional). -5. Test the op. We usually do this in Python for convenience, but you can also - test the op in C++. If you define gradients, you can verify them with the - Python `tf.test.compute_gradient_error`. - See - [`relu_op_test.py`](https://www.tensorflow.org/code/tensorflow/python/kernel_tests/relu_op_test.py) as - an example that tests the forward functions of Relu-like operators and - their gradients. - -PREREQUISITES: - -* Some familiarity with C++. -* Must have installed the - [TensorFlow binary](../install/index.md), or must have - [downloaded TensorFlow source](../install/install_sources.md), - and be able to build it. - -[TOC] - -## Define the op's interface - -You define the interface of an op by registering it with the TensorFlow system. -In the registration, you specify the name of your op, its inputs (types and -names) and outputs (types and names), as well as docstrings and -any [attrs](#attrs) the op might require. - -To see how this works, suppose you'd like to create an op that takes a tensor of -`int32`s and outputs a copy of the tensor, with all but the first element set to -zero. 
To do this, create a file named `zero_out.cc`. Then add a call to the -`REGISTER_OP` macro that defines the interface for your op: - -```c++ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" - -using namespace tensorflow; - -REGISTER_OP("ZeroOut") - .Input("to_zero: int32") - .Output("zeroed: int32") - .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - c->set_output(0, c->input(0)); - return Status::OK(); - }); -``` - -This `ZeroOut` op takes one tensor `to_zero` of 32-bit integers as input, and -outputs a tensor `zeroed` of 32-bit integers. The op also uses a shape function -to ensure that the output tensor is the same shape as the input tensor. For -example, if the input is a tensor of shape [10, 20], then this shape function -specifies that the output shape is also [10, 20]. - - -> A note on naming: The op name must be in CamelCase and it must be unique -> among all other ops that are registered in the binary. - -## Implement the kernel for the op - -After you define the interface, provide one or more implementations of the op. -To create one of these kernels, create a class that extends `OpKernel` and -overrides the `Compute` method. The `Compute` method provides one `context` -argument of type `OpKernelContext*`, from which you can access useful things -like the input and output tensors. - -Add your kernel to the file you created above. 
The kernel might look something -like this: - -```c++ -#include "tensorflow/core/framework/op_kernel.h" - -using namespace tensorflow; - -class ZeroOutOp : public OpKernel { - public: - explicit ZeroOutOp(OpKernelConstruction* context) : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - // Grab the input tensor - const Tensor& input_tensor = context->input(0); - auto input = input_tensor.flat(); - - // Create an output tensor - Tensor* output_tensor = NULL; - OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor.shape(), - &output_tensor)); - auto output_flat = output_tensor->flat(); - - // Set all but the first element of the output tensor to 0. - const int N = input.size(); - for (int i = 1; i < N; i++) { - output_flat(i) = 0; - } - - // Preserve the first input value if possible. - if (N > 0) output_flat(0) = input(0); - } -}; -``` - -After implementing your kernel, you register it with the TensorFlow system. In -the registration, you specify different constraints under which this kernel -will run. For example, you might have one kernel made for CPUs, and a separate -one for GPUs. - -To do this for the `ZeroOut` op, add the following to `zero_out.cc`: - -```c++ -REGISTER_KERNEL_BUILDER(Name("ZeroOut").Device(DEVICE_CPU), ZeroOutOp); -``` - -> Important: Instances of your OpKernel may be accessed concurrently. -> Your `Compute` method must be thread-safe. Guard any access to class -> members with a mutex. Or better yet, don't share state via class members! -> Consider using a [`ResourceMgr`](https://www.tensorflow.org/code/tensorflow/core/framework/resource_mgr.h) -> to keep track of op state. - -### Multi-threaded CPU kernels - -To write a multi-threaded CPU kernel, the Shard function in -[`work_sharder.h`](https://www.tensorflow.org/code/tensorflow/core/util/work_sharder.h) -can be used. 
This function shards a computation function across the -threads configured to be used for intra-op threading (see -intra_op_parallelism_threads in -[`config.proto`](https://www.tensorflow.org/code/tensorflow/core/protobuf/config.proto)). - -### GPU kernels - -A GPU kernel is implemented in two parts: the OpKernel and the CUDA kernel and -its launch code. - -Sometimes the OpKernel implementation is common between a CPU and GPU kernel, -such as around inspecting inputs and allocating outputs. In that case, a -suggested implementation is to: - -1. Define the OpKernel templated on the Device and the primitive type of the - tensor. -2. To do the actual computation of the output, the Compute function calls a - templated functor struct. -3. The specialization of that functor for the CPUDevice is defined in the same - file, but the specialization for the GPUDevice is defined in a .cu.cc file, - since it will be compiled with the CUDA compiler. - -Here is an example implementation. - -```c++ -// kernel_example.h -#ifndef KERNEL_EXAMPLE_H_ -#define KERNEL_EXAMPLE_H_ - -template -struct ExampleFunctor { - void operator()(const Device& d, int size, const T* in, T* out); -}; - -#if GOOGLE_CUDA -// Partially specialize functor for GpuDevice. -template -struct ExampleFunctor { - void operator()(const Eigen::GpuDevice& d, int size, const T* in, T* out); -}; -#endif - -#endif KERNEL_EXAMPLE_H_ -``` - -```c++ -// kernel_example.cc -#include "example.h" -#include "tensorflow/core/framework/op_kernel.h" - -using namespace tensorflow; - -using CPUDevice = Eigen::ThreadPoolDevice; -using GPUDevice = Eigen::GpuDevice; - -// CPU specialization of actual computation. -template -struct ExampleFunctor { - void operator()(const CPUDevice& d, int size, const T* in, T* out) { - for (int i = 0; i < size; ++i) { - out[i] = 2 * in[i]; - } - } -}; - -// OpKernel definition. -// template parameter is the datatype of the tensors. 
-template -class ExampleOp : public OpKernel { - public: - explicit ExampleOp(OpKernelConstruction* context) : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - // Grab the input tensor - const Tensor& input_tensor = context->input(0); - - // Create an output tensor - Tensor* output_tensor = NULL; - OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor.shape(), - &output_tensor)); - - // Do the computation. - OP_REQUIRES(context, input_tensor.NumElements() <= tensorflow::kint32max, - errors::InvalidArgument("Too many elements in tensor")); - ExampleFunctor()( - context->eigen_device(), - static_cast(input_tensor.NumElements()), - input_tensor.flat().data(), - output_tensor->flat().data()); - } -}; - -// Register the CPU kernels. -#define REGISTER_CPU(T) \ - REGISTER_KERNEL_BUILDER( \ - Name("Example").Device(DEVICE_CPU).TypeConstraint("T"), \ - ExampleOp); -REGISTER_CPU(float); -REGISTER_CPU(int32); - -// Register the GPU kernels. -#ifdef GOOGLE_CUDA -#define REGISTER_GPU(T) \ - /* Declare explicit instantiations in kernel_example.cu.cc. */ \ - extern template ExampleFunctor; \ - REGISTER_KERNEL_BUILDER( \ - Name("Example").Device(DEVICE_GPU).TypeConstraint("T"), \ - ExampleOp); -REGISTER_GPU(float); -REGISTER_GPU(int32); -#endif // GOOGLE_CUDA -``` - -```c++ -// kernel_example.cu.cc -#ifdef GOOGLE_CUDA -#define EIGEN_USE_GPU -#include "example.h" -#include "tensorflow/core/util/cuda_kernel_helper.h" - -using namespace tensorflow; - -using GPUDevice = Eigen::GpuDevice; - -// Define the CUDA kernel. -template -__global__ void ExampleCudaKernel(const int size, const T* in, T* out) { - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; - i += blockDim.x * gridDim.x) { - out[i] = 2 * ldg(in + i); - } -} - -// Define the GPU implementation that launches the CUDA kernel. -template -void ExampleFunctor::operator()( - const GPUDevice& d, int size, const T* in, T* out) { - // Launch the cuda kernel. 
- // - // See core/util/cuda_kernel_helper.h for example of computing - // block count and thread_per_block count. - int block_count = 1024; - int thread_per_block = 20; - ExampleCudaKernel - <<>>(size, in, out); -} - -// Explicitly instantiate functors for the types of OpKernels registered. -template struct ExampleFunctor; -template struct ExampleFunctor; - -#endif // GOOGLE_CUDA -``` - -## Build the op library -### Compile the op using your system compiler (TensorFlow binary installation) - -You should be able to compile `zero_out.cc` with a `C++` compiler such as `g++` -or `clang` available on your system. The binary PIP package installs the header -files and the library that you need to compile your op in locations that are -system specific. However, the TensorFlow python library provides the -`get_include` function to get the header directory, and the `get_lib` directory -has a shared object to link against. -Here are the outputs of these functions on an Ubuntu machine. - -```bash -$ python ->>> import tensorflow as tf ->>> tf.sysconfig.get_include() -'/usr/local/lib/python2.7/site-packages/tensorflow/include' ->>> tf.sysconfig.get_lib() -'/usr/local/lib/python2.7/site-packages/tensorflow' -``` - -Assuming you have `g++` installed, here is the sequence of commands you can use -to compile your op into a dynamic library. - -```bash -TF_CFLAGS=( $(python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_compile_flags()))') ) -TF_LFLAGS=( $(python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_link_flags()))') ) -g++ -std=c++11 -shared zero_out.cc -o zero_out.so -fPIC ${TF_CFLAGS[@]} ${TF_LFLAGS[@]} -O2 -``` - -On Mac OS X, the additional flag "-undefined dynamic_lookup" is required when -building the `.so` file. - -> Note on `gcc` version `>=5`: gcc uses the new C++ -> [ABI](https://gcc.gnu.org/gcc-5/changes.html#libstdcxx) since version `5`. 
The binary pip -> packages available on the TensorFlow website are built with `gcc4` that uses -> the older ABI. If you compile your op library with `gcc>=5`, add -> `-D_GLIBCXX_USE_CXX11_ABI=0` to the command line to make the library -> compatible with the older abi. -> Furthermore if you are using TensorFlow package created from source remember to add `--cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0"` -> as bazel command to compile the Python package. - -### Compile the op using bazel (TensorFlow source installation) - -If you have TensorFlow sources installed, you can make use of TensorFlow's build -system to compile your op. Place a BUILD file with following Bazel build rule in -the [`tensorflow/core/user_ops`][user_ops] directory. - -```python -load("//tensorflow:tensorflow.bzl", "tf_custom_op_library") - -tf_custom_op_library( - name = "zero_out.so", - srcs = ["zero_out.cc"], -) -``` - -Run the following command to build `zero_out.so`. - -```bash -$ bazel build --config opt //tensorflow/core/user_ops:zero_out.so -``` - -> Note: Although you can create a shared library (a `.so` file) with the -> standard `cc_library` rule, we strongly recommend that you use the -> `tf_custom_op_library` macro. It adds some required dependencies, and -> performs checks to ensure that the shared library is compatible with -> TensorFlow's plugin loading mechanism. - -## Use the op in Python - -TensorFlow Python API provides the -`tf.load_op_library` function to -load the dynamic library and register the op with the TensorFlow -framework. `load_op_library` returns a Python module that contains the Python -wrappers for the op and the kernel. 
Thus, once you have built the op, you can -do the following to run it from Python: - -```python -import tensorflow as tf -zero_out_module = tf.load_op_library('./zero_out.so') -with tf.Session(''): - zero_out_module.zero_out([[1, 2], [3, 4]]).eval() - -# Prints -array([[1, 0], [0, 0]], dtype=int32) -``` - -Keep in mind, the generated function will be given a snake\_case name (to comply -with [PEP8](https://www.python.org/dev/peps/pep-0008/)). So, if your op is -named `ZeroOut` in the C++ files, the Python function will be called `zero_out`. - -To make the op available as a regular function `import`-able from a Python -module, it may be useful to have the `load_op_library` call in a Python source -file as follows: - -```python -import tensorflow as tf - -zero_out_module = tf.load_op_library('./zero_out.so') -zero_out = zero_out_module.zero_out -``` - -## Verify that the op works - -A good way to verify that you've successfully implemented your op is to write a -test for it. Create the file -`zero_out_op_test.py` with the contents: - -```python -import tensorflow as tf - -class ZeroOutTest(tf.test.TestCase): - def testZeroOut(self): - zero_out_module = tf.load_op_library('./zero_out.so') - with self.test_session(): - result = zero_out_module.zero_out([5, 4, 3, 2, 1]) - self.assertAllEqual(result.eval(), [5, 0, 0, 0, 0]) - -if __name__ == "__main__": - tf.test.main() -``` - -Then run your test (assuming you have tensorflow installed): - -```sh -$ python zero_out_op_test.py -``` - -## Building advanced features into your op - -Now that you know how to build a basic (and somewhat restricted) op and -implementation, we'll look at some of the more complicated things you will -typically need to build into your op. 
This includes: - -* [Conditional checks and validation](#conditional-checks-and-validation) -* [Op registration](#op-registration) - * [Attrs](#attrs) - * [Attr types](#attr-types) - * [Polymorphism](#polymorphism) - * [Inputs and outputs](#inputs-and-outputs) - * [Backwards compatibility](#backwards-compatibility) -* [GPU support](#gpu-support) - * [Compiling the kernel for the GPU device](#compiling-the-kernel-for-the-gpu-device) -* [Implement the gradient in Python](#implement-the-gradient-in-python) -* [Shape functions in C++](#shape-functions-in-c) - -### Conditional checks and validation - -The example above assumed that the op applied to a tensor of any shape. What -if it only applied to vectors? That means adding a check to the above OpKernel -implementation. - -```c++ - void Compute(OpKernelContext* context) override { - // Grab the input tensor - const Tensor& input_tensor = context->input(0); - - OP_REQUIRES(context, TensorShapeUtils::IsVector(input_tensor.shape()), - errors::InvalidArgument("ZeroOut expects a 1-D vector.")); - // ... - } -``` - -This asserts that the input is a vector, and returns having set the -`InvalidArgument` status if it isn't. The -[`OP_REQUIRES` macro][validation-macros] takes three arguments: - -* The `context`, which can either be an `OpKernelContext` or - `OpKernelConstruction` pointer (see - [`tensorflow/core/framework/op_kernel.h`](https://www.tensorflow.org/code/tensorflow/core/framework/op_kernel.h)), - for its `SetStatus()` method. -* The condition. For example, there are functions for validating the shape - of a tensor in - [`tensorflow/core/framework/tensor_shape.h`](https://www.tensorflow.org/code/tensorflow/core/framework/tensor_shape.h) -* The error itself, which is represented by a `Status` object, see - [`tensorflow/core/lib/core/status.h`](https://www.tensorflow.org/code/tensorflow/core/lib/core/status.h). A - `Status` has both a type (frequently `InvalidArgument`, but see the list of - types) and a message. 
Functions for constructing an error may be found in - [`tensorflow/core/lib/core/errors.h`][validation-macros]. - -Alternatively, if you want to test whether a `Status` object returned from some -function is an error, and if so return it, use -[`OP_REQUIRES_OK`][validation-macros]. Both of these macros return from the -function on error. - -### Op registration - -#### Attrs - -Ops can have attrs, whose values are set when the op is added to a graph. These -are used to configure the op, and their values can be accessed both within the -kernel implementation and in the types of inputs and outputs in the op -registration. Prefer using an input instead of an attr when possible, since -inputs are more flexible. This is because attrs are constants and must be -defined at graph construction time. In contrast, inputs are Tensors whose -values can be dynamic; that is, inputs can change every step, be set using a -feed, etc. Attrs are used for things that can't be done with inputs: any -configuration that affects the signature (number or type of inputs or outputs) -or that can't change from step-to-step. - -You define an attr when you register the op, by specifying its name and type -using the `Attr` method, which expects a spec of the form: - -``` -<name>: <attr-type-expr> -``` - -where `<name>` begins with a letter and can be composed of alphanumeric -characters and underscores, and `<attr-type-expr>` is a type expression of the -form [described below](#attr_types). - -For example, if you'd like the `ZeroOut` op to preserve a user-specified index, -instead of only the 0th element, you can register the op like so: -```c++ -REGISTER_OP("ZeroOut") - .Attr("preserve_index: int") - .Input("to_zero: int32") - .Output("zeroed: int32"); -``` - -(Note that the set of [attribute types](#attr_types) is different from the -`tf.DType` used for inputs and outputs.) 
- -Your kernel can then access this attr in its constructor via the `context` -parameter: -```c++ -class ZeroOutOp : public OpKernel { - public: - explicit ZeroOutOp(OpKernelConstruction* context) : OpKernel(context) { - // Get the index of the value to preserve - OP_REQUIRES_OK(context, - context->GetAttr("preserve_index", &preserve_index_)); - // Check that preserve_index is positive - OP_REQUIRES(context, preserve_index_ >= 0, - errors::InvalidArgument("Need preserve_index >= 0, got ", - preserve_index_)); - } - void Compute(OpKernelContext* context) override { - // ... - } - private: - int preserve_index_; -}; -``` - -which can then be used in the `Compute` method: -```c++ - void Compute(OpKernelContext* context) override { - // ... - - // We're using saved attr to validate potentially dynamic input - // So we check that preserve_index is in range - OP_REQUIRES(context, preserve_index_ < input.dimension(0), - errors::InvalidArgument("preserve_index out of range")); - - // Set all the elements of the output tensor to 0 - const int N = input.size(); - for (int i = 0; i < N; i++) { - output\_flat(i) = 0; - } - - // Preserve the requested input value - output_flat(preserve_index_) = input(preserve_index_); - } -``` - -#### Attr types - -The following types are supported in an attr: - -* `string`: Any sequence of bytes (not required to be UTF8). -* `int`: A signed integer. -* `float`: A floating point number. -* `bool`: True or false. -* `type`: One of the (non-ref) values of [`DataType`][DataTypeString]. -* `shape`: A [`TensorShapeProto`][TensorShapeProto]. -* `tensor`: A [`TensorProto`][TensorProto]. -* `list(<type>)`: A list of `<type>`, where `<type>` is one of the above types. - Note that `list(list(<type>))` is invalid. - -See also: [`op_def_builder.cc:FinalizeAttr`][FinalizeAttr] for a definitive list. - -##### Default values & constraints - -Attrs may have default values, and some types of attrs can have constraints. 
To -define an attr with constraints, you can use the following `<attr-type-expr>`s: - -* `{'<string1>', '<string2>'}`: The value must be a string that has either the - value `<string1>` or `<string2>`. The name of the type, `string`, is implied - when you use this syntax. This emulates an enum: - - ```c++ - REGISTER_OP("EnumExample") - .Attr("e: {'apple', 'orange'}"); - ``` - -* `{<type1>, <type2>}`: The value is of type `type`, and must be one of - `<type1>` or `<type2>`, where `<type1>` and `<type2>` are supported - `tf.DType`. You don't specify - that the type of the attr is `type`. This is implied when you have a list of - types in `{...}`. For example, in this case the attr `t` is a type that must - be an `int32`, a `float`, or a `bool`: - - ```c++ - REGISTER_OP("RestrictedTypeExample") - .Attr("t: {int32, float, bool}"); - ``` - -* There are shortcuts for common type constraints: - * `numbertype`: Type `type` restricted to the numeric (non-string and - non-bool) types. - * `realnumbertype`: Like `numbertype` without complex types. - * `quantizedtype`: Like `numbertype` but just the quantized number types. - - The specific lists of types allowed by these are defined by the functions - (like `NumberTypes()`) in - [`tensorflow/core/framework/types.h`](https://www.tensorflow.org/code/tensorflow/core/framework/types.h). - In this example the attr `t` must be one of the numeric types: - - ```c++ - REGISTER_OP("NumberType") - .Attr("t: numbertype"); - ``` - - For this op: - - ```python - tf.number_type(t=tf.int32) # Valid - tf.number_type(t=tf.bool) # Invalid - ``` - - Lists can be combined with other lists and single types. 
The following - op allows attr `t` to be any of the numeric types, or the bool type: - - ```c++ - REGISTER_OP("NumberOrBooleanType") - .Attr("t: {numbertype, bool}"); - ``` - - For this op: - - ```python - tf.number_or_boolean_type(t=tf.int32) # Valid - tf.number_or_boolean_type(t=tf.bool) # Valid - tf.number_or_boolean_type(t=tf.string) # Invalid - ``` - -* `int >= <n>`: The value must be an int whose value is greater than or equal to - `<n>`, where `<n>` is a natural number. - - For example, the following op registration specifies that the attr `a` must - have a value that is at least `2`: - - ```c++ - REGISTER_OP("MinIntExample") - .Attr("a: int >= 2"); - ``` - -* `list(<type>) >= <n>`: A list of type `<type>` whose length is greater than - or equal to `<n>`. - - For example, the following op registration specifies that the attr `a` is a - list of types (either `int32` or `float`), and that there must be at least 3 - of them: - - ```c++ - REGISTER_OP("TypeListExample") - .Attr("a: list({int32, float}) >= 3"); - ``` - -To set a default value for an attr (making it optional in the generated code), -add `= <default>` to the end, as in: - -```c++ -REGISTER_OP("AttrDefaultExample") - .Attr("i: int = 0"); -``` - -The supported syntax of the default value is what would be used in the proto -representation of the resulting GraphDef definition. - -Here are examples for how to specify a default for all types: - -```c++ -REGISTER_OP("AttrDefaultExampleForAllTypes") - .Attr("s: string = 'foo'") - .Attr("i: int = 0") - .Attr("f: float = 1.0") - .Attr("b: bool = true") - .Attr("ty: type = DT_INT32") - .Attr("sh: shape = { dim { size: 1 } dim { size: 2 } }") - .Attr("te: tensor = { dtype: DT_INT32 int_val: 5 }") - .Attr("l_empty: list(int) = []") - .Attr("l_int: list(int) = [2, 3, 5, 7]"); -``` - -Note in particular that the values of type `type` -use `tf.DType`. 
- -#### Polymorphism - -##### Type Polymorphism - -For ops that can take different types as input or produce different output -types, you can specify [an attr](#attrs) in -[an input or output type](#inputs-and-outputs) in the op registration. Typically -you would then register an `OpKernel` for each supported type. - -For instance, if you'd like the `ZeroOut` op to work on `float`s -in addition to `int32`s, your op registration might look like: -```c++ -REGISTER_OP("ZeroOut") - .Attr("T: {float, int32}") - .Input("to_zero: T") - .Output("zeroed: T"); -``` - -Your op registration now specifies that the input's type must be `float`, or -`int32`, and that its output will be the same type, since both have type `T`. - -> A note on naming: Inputs, outputs, and attrs generally should be -> given snake\_case names. The one exception is attrs that are used as the type -> of an input or in the type of an input. Those attrs can be inferred when the -> op is added to the graph and so don't appear in the op's function. For -> example, this last definition of ZeroOut will generate a Python function that -> looks like: -> -> ```python -> def zero_out(to_zero, name=None): -> """... -> Args: -> to_zero: A `Tensor`. Must be one of the following types: -> `float32`, `int32`. -> name: A name for the operation (optional). -> -> Returns: -> A `Tensor`. Has the same type as `to_zero`. -> """ -> ``` -> -> If `to_zero` is passed an `int32` tensor, then `T` is automatically set to -> `int32` (well, actually `DT_INT32`). Those inferred attrs are given -> Capitalized or CamelCase names. -> -> Compare this with an op that has a type attr that determines the output -> type: -> -> ```c++ -> REGISTER_OP("StringToNumber") -> .Input("string_tensor: string") -> .Output("output: out_type") -> .Attr("out_type: {float, int32} = DT_FLOAT"); -> .Doc(R"doc( -> Converts each string in the input Tensor to the specified numeric type. 
-> )doc"); -> ``` -> -> In this case, the user has to specify the output type, as in the generated -> Python: -> -> ```python -> def string_to_number(string_tensor, out_type=None, name=None): -> """Converts each string in the input Tensor to the specified numeric type. -> -> Args: -> string_tensor: A `Tensor` of type `string`. -> out_type: An optional `tf.DType` from: `tf.float32, tf.int32`. -> Defaults to `tf.float32`. -> name: A name for the operation (optional). -> -> Returns: -> A `Tensor` of type `out_type`. -> """ -> ``` - -```c++ -#include "tensorflow/core/framework/op_kernel.h" - -class ZeroOutInt32Op : public OpKernel { - // as before -}; - -class ZeroOutFloatOp : public OpKernel { - public: - explicit ZeroOutFloatOp(OpKernelConstruction* context) - : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - // Grab the input tensor - const Tensor& input_tensor = context->input(0); - auto input = input_tensor.flat(); - - // Create an output tensor - Tensor* output = NULL; - OP_REQUIRES_OK(context, - context->allocate_output(0, input_tensor.shape(), &output)); - auto output_flat = output->template flat(); - - // Set all the elements of the output tensor to 0 - const int N = input.size(); - for (int i = 0; i < N; i++) { - output_flat(i) = 0; - } - - // Preserve the first input value - if (N > 0) output_flat(0) = input(0); - } -}; - -// Note that TypeConstraint("T") means that attr "T" (defined -// in the op registration above) must be "int32" to use this template -// instantiation. 
-REGISTER_KERNEL_BUILDER( - Name("ZeroOut") - .Device(DEVICE_CPU) - .TypeConstraint("T"), - ZeroOutOpInt32); -REGISTER_KERNEL_BUILDER( - Name("ZeroOut") - .Device(DEVICE_CPU) - .TypeConstraint("T"), - ZeroOutFloatOp); -``` - -> To preserve [backwards compatibility](#backwards-compatibility), you should -> specify a [default value](#default-values-constraints) when adding an attr to -> an existing op: -> -> ```c++ -> REGISTER_OP("ZeroOut") -> .Attr("T: {float, int32} = DT_INT32") -> .Input("to_zero: T") -> .Output("zeroed: T") -> ``` - -Let's say you wanted to add more types, say `double`: -```c++ -REGISTER_OP("ZeroOut") - .Attr("T: {float, double, int32}") - .Input("to_zero: T") - .Output("zeroed: T"); -``` - -Instead of writing another `OpKernel` with redundant code as above, often you -will be able to use a C++ template instead. You will still have one kernel -registration (`REGISTER_KERNEL_BUILDER` call) per overload. -```c++ -template -class ZeroOutOp : public OpKernel { - public: - explicit ZeroOutOp(OpKernelConstruction* context) : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - // Grab the input tensor - const Tensor& input_tensor = context->input(0); - auto input = input_tensor.flat(); - - // Create an output tensor - Tensor* output = NULL; - OP_REQUIRES_OK(context, - context->allocate_output(0, input_tensor.shape(), &output)); - auto output_flat = output->template flat(); - - // Set all the elements of the output tensor to 0 - const int N = input.size(); - for (int i = 0; i < N; i++) { - output_flat(i) = 0; - } - - // Preserve the first input value - if (N > 0) output_flat(0) = input(0); - } -}; - -// Note that TypeConstraint("T") means that attr "T" (defined -// in the op registration above) must be "int32" to use this template -// instantiation. 
-REGISTER_KERNEL_BUILDER( - Name("ZeroOut") - .Device(DEVICE_CPU) - .TypeConstraint("T"), - ZeroOutOp); -REGISTER_KERNEL_BUILDER( - Name("ZeroOut") - .Device(DEVICE_CPU) - .TypeConstraint("T"), - ZeroOutOp); -REGISTER_KERNEL_BUILDER( - Name("ZeroOut") - .Device(DEVICE_CPU) - .TypeConstraint("T"), - ZeroOutOp); -``` - -If you have more than a couple overloads, you can put the registration in a -macro. - -```c++ -#include "tensorflow/core/framework/op_kernel.h" - -#define REGISTER_KERNEL(type) \ - REGISTER_KERNEL_BUILDER( \ - Name("ZeroOut").Device(DEVICE_CPU).TypeConstraint("T"), \ - ZeroOutOp) - -REGISTER_KERNEL(int32); -REGISTER_KERNEL(float); -REGISTER_KERNEL(double); - -#undef REGISTER_KERNEL -``` - -Depending on the list of types you are registering the kernel for, you may be -able to use a macro provided by -[`tensorflow/core/framework/register_types.h`][register_types]: - -```c++ -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/register_types.h" - -REGISTER_OP("ZeroOut") - .Attr("T: realnumbertype") - .Input("to_zero: T") - .Output("zeroed: T"); - -template -class ZeroOutOp : public OpKernel { ... }; - -#define REGISTER_KERNEL(type) \ - REGISTER_KERNEL_BUILDER( \ - Name("ZeroOut").Device(DEVICE_CPU).TypeConstraint("T"), \ - ZeroOutOp) - -TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNEL); - -#undef REGISTER_KERNEL -``` - -##### List Inputs and Outputs - -In addition to being able to accept or produce different types, ops can consume -or produce a variable number of tensors. - -In the next example, the attr `T` holds a *list* of types, and is used as the -type of both the input `in` and the output `out`. The input and output are -lists of tensors of that type (and the number and types of tensors in the output -are the same as the input, since both have type `T`). 
- -```c++ -REGISTER_OP("PolymorphicListExample") - .Attr("T: list(type)") - .Input("in: T") - .Output("out: T"); -``` - -You can also place restrictions on what types can be specified in the list. In -this next case, the input is a list of `float` and `double` tensors. The op -accepts, for example, input types `(float, double, float)` and in that case the -output type would also be `(float, double, float)`. - -```c++ -REGISTER_OP("ListTypeRestrictionExample") - .Attr("T: list({float, double})") - .Input("in: T") - .Output("out: T"); -``` - -If you want all the tensors in a list to be of the same type, you might do -something like: - -```c++ -REGISTER_OP("IntListInputExample") - .Attr("N: int") - .Input("in: N * int32") - .Output("out: int32"); -``` - -This accepts a list of `int32` tensors, and uses an `int` attr `N` to -specify the length of the list. - -This can be made [type polymorphic](#type-polymorphism) as well. In the next -example, the input is a list of tensors (with length `"N"`) of the same (but -unspecified) type (`"T"`), and the output is a single tensor of matching type: - -```c++ -REGISTER_OP("SameListInputExample") - .Attr("N: int") - .Attr("T: type") - .Input("in: N * T") - .Output("out: T"); -``` - -By default, tensor lists have a minimum length of 1. You can change that default -using -[a `">="` constraint on the corresponding attr](#default-values-constraints). 
-In this next example, the input is a list of at least 2 `int32` tensors: - -```c++ -REGISTER_OP("MinLengthIntListExample") - .Attr("N: int >= 2") - .Input("in: N * int32") - .Output("out: int32"); -``` - -The same syntax works with `"list(type)"` attrs: - -```c++ -REGISTER_OP("MinimumLengthPolymorphicListExample") - .Attr("T: list(type) >= 3") - .Input("in: T") - .Output("out: T"); -``` - -#### Inputs and Outputs - -To summarize the above, an op registration can have multiple inputs and outputs: - -```c++ -REGISTER_OP("MultipleInsAndOuts") - .Input("y: int32") - .Input("z: float") - .Output("a: string") - .Output("b: int32"); -``` - -Each input or output spec is of the form: - -``` -: -``` - -where `` begins with a letter and can be composed of alphanumeric -characters and underscores. `` is one of the following type -expressions: - -* ``, where `` is a supported input type (e.g. `float`, `int32`, - `string`). This specifies a single tensor of the given type. - - See - `tf.DType`. - - ```c++ - REGISTER_OP("BuiltInTypesExample") - .Input("integers: int32") - .Input("complex_numbers: complex64"); - ``` - -* ``, where `` is the name of an [Attr](#attrs) with type - `type` or `list(type)` (with a possible type restriction). This syntax allows - for [polymorphic ops](#polymorphism). - - ```c++ - REGISTER_OP("PolymorphicSingleInput") - .Attr("T: type") - .Input("in: T"); - - REGISTER_OP("RestrictedPolymorphicSingleInput") - .Attr("T: {int32, int64}") - .Input("in: T"); - ``` - - Referencing an attr of type `list(type)` allows you to accept a sequence of - tensors. - - ```c++ - REGISTER_OP("ArbitraryTensorSequenceExample") - .Attr("T: list(type)") - .Input("in: T") - .Output("out: T"); - - REGISTER_OP("RestrictedTensorSequenceExample") - .Attr("T: list({int32, int64})") - .Input("in: T") - .Output("out: T"); - ``` - - Note that the number and types of tensors in the output `out` is the same as - in the input `in`, since both are of type `T`. 
- -* For a sequence of tensors with the same type: ` * `, where - `` is the name of an [Attr](#attrs) with type `int`. The `` can - either be a `tf.DType`, - or the name of an attr with type `type`. As an example of the first, this - op accepts a list of `int32` tensors: - - ```c++ - REGISTER_OP("Int32SequenceExample") - .Attr("NumTensors: int") - .Input("in: NumTensors * int32") - ``` - - Whereas this op accepts a list of tensors of any type, as long as they are all - the same: - - ```c++ - REGISTER_OP("SameTypeSequenceExample") - .Attr("NumTensors: int") - .Attr("T: type") - .Input("in: NumTensors * T") - ``` - -* For a reference to a tensor: `Ref()`, where `` is one of the - previous types. - -> A note on naming: Any attr used in the type of an input will be inferred. By -> convention those inferred attrs use capital names (like `T` or `N`). -> Otherwise inputs, outputs, and attrs have names like function parameters -> (e.g. `num_outputs`). For more details, see the -> [earlier note on naming](#naming). - -For more details, see -[`tensorflow/core/framework/op_def_builder.h`][op_def_builder]. - -#### Backwards compatibility - -Let's assume you have written a nice, custom op and shared it with others, so -you have happy customers using your operation. However, you'd like to make -changes to the op in some way. - -In general, changes to existing, checked-in specifications must be -backwards-compatible: changing the specification of an op must not break prior -serialized `GraphDef` protocol buffers constructed from older specifications. -The details of `GraphDef` compatibility are -[described here](../guide/version_compat.md#compatibility_of_graphs_and_checkpoints). - -There are several ways to preserve backwards-compatibility. - -1. Any new attrs added to an operation must have default values defined, and - with that default value the op must have the original behavior. 
To change an - operation from not polymorphic to polymorphic, you *must* give a default - value to the new type attr to preserve the original signature by default. For - example, if your operation was: - - REGISTER_OP("MyGeneralUnaryOp") - .Input("in: float") - .Output("out: float"); - - you can make it polymorphic in a backwards-compatible way using: - - REGISTER_OP("MyGeneralUnaryOp") - .Input("in: T") - .Output("out: T") - .Attr("T: numerictype = DT_FLOAT"); - -2. You can safely make a constraint on an attr less restrictive. For example, - you can change from `{int32, int64}` to `{int32, int64, float}` or `type`. - Or you may change from `{"apple", "orange"}` to `{"apple", "banana", - "orange"}` or `string`. - -3. You can change single inputs / outputs into list inputs / outputs, as long as - the default for the list type matches the old signature. - -4. You can add a new list input / output, if it defaults to empty. - -5. Namespace any new ops you create, by prefixing the op names with something - unique to your project. This avoids having your op colliding with any ops - that might be included in future versions of TensorFlow. - -6. Plan ahead! Try to anticipate future uses for the op. Some signature changes - can't be done in a compatible way (for example, making a list of the same - type into a list of varying types). - -The full list of safe and unsafe changes can be found in -[`tensorflow/core/framework/op_compatibility_test.cc`](https://www.tensorflow.org/code/tensorflow/core/framework/op_compatibility_test.cc). -If you cannot make your change to an operation backwards compatible, then create -a new operation with a new name with the new semantics. - -Also note that while these changes can maintain `GraphDef` compatibility, the -generated Python code may change in a way that isn't compatible with old -callers. 
The Python API may be kept compatible by careful changes in a -hand-written Python wrapper, by keeping the old signature except possibly adding -new optional arguments to the end. Generally incompatible changes may only be -made when TensorFlow's changes major versions, and must conform to the -[`GraphDef` version semantics](../guide/version_compat.md#compatibility_of_graphs_and_checkpoints). - -### GPU Support - -You can implement different OpKernels and register one for CPU and another for -GPU, just like you can [register kernels for different types](#polymorphism). -There are several examples of kernels with GPU support in -[`tensorflow/core/kernels/`](https://www.tensorflow.org/code/tensorflow/core/kernels/). -Notice some kernels have a CPU version in a `.cc` file, a GPU version in a file -ending in `_gpu.cu.cc`, and some code shared in common in a `.h` file. - -For example, the `tf.pad` has -everything but the GPU kernel in [`tensorflow/core/kernels/pad_op.cc`][pad_op]. -The GPU kernel is in -[`tensorflow/core/kernels/pad_op_gpu.cu.cc`](https://www.tensorflow.org/code/tensorflow/core/kernels/pad_op_gpu.cu.cc), -and the shared code is a templated class defined in -[`tensorflow/core/kernels/pad_op.h`](https://www.tensorflow.org/code/tensorflow/core/kernels/pad_op.h). -We organize the code this way for two reasons: it allows you to share common -code among the CPU and GPU implementations, and it puts the GPU implementation -into a separate file so that it can be compiled only by the GPU compiler. - -One thing to note, even when the GPU kernel version of `pad` is used, it still -needs its `"paddings"` input in CPU memory. 
To mark that inputs or outputs are -kept on the CPU, add a `HostMemory()` call to the kernel registration, e.g.: - -```c++ -#define REGISTER_GPU_KERNEL(T) \ - REGISTER_KERNEL_BUILDER(Name("Pad") \ - .Device(DEVICE_GPU) \ - .TypeConstraint("T") \ - .HostMemory("paddings"), \ - PadOp) -``` - -#### Compiling the kernel for the GPU device - -Look at -[cuda_op_kernel.cu.cc](https://www.tensorflow.org/code/tensorflow/examples/adding_an_op/cuda_op_kernel.cu.cc) -for an example that uses a CUDA kernel to implement an op. The -`tf_custom_op_library` accepts a `gpu_srcs` argument in which the list of source -files containing the CUDA kernels (`*.cu.cc` files) can be specified. For use -with a binary installation of TensorFlow, the CUDA kernels have to be compiled -with NVIDIA's `nvcc` compiler. Here is the sequence of commands you can use to -compile the -[cuda_op_kernel.cu.cc](https://www.tensorflow.org/code/tensorflow/examples/adding_an_op/cuda_op_kernel.cu.cc) -and -[cuda_op_kernel.cc](https://www.tensorflow.org/code/tensorflow/examples/adding_an_op/cuda_op_kernel.cc) -into a single dynamically loadable library: - -```bash -nvcc -std=c++11 -c -o cuda_op_kernel.cu.o cuda_op_kernel.cu.cc \ - ${TF_CFLAGS[@]} -D GOOGLE_CUDA=1 -x cu -Xcompiler -fPIC - -g++ -std=c++11 -shared -o cuda_op_kernel.so cuda_op_kernel.cc \ - cuda_op_kernel.cu.o ${TF_CFLAGS[@]} -fPIC -lcudart ${TF_LFLAGS[@]} -``` - -`cuda_op_kernel.so` produced above can be loaded as usual in Python, using the -`tf.load_op_library` function. - -Note that if your CUDA libraries are not installed in `/usr/local/lib64`, -you'll need to specify the path explicitly in the second (g++) command above. -For example, add `-L /usr/local/cuda-8.0/lib64/` if your CUDA is installed in -`/usr/local/cuda-8.0`. - -> Note in some linux settings, additional options to `nvcc` compiling step are needed. Add `-D_MWAITXINTRIN_H_INCLUDED` to the `nvcc` command line to avoid errors from `mwaitxintrin.h`. 
- -### Implement the gradient in Python - -Given a graph of ops, TensorFlow uses automatic differentiation -(backpropagation) to add new ops representing gradients with respect to the -existing ops (see -[Gradient Computation](../api_guides/python/train.md#gradient_computation)). -To make automatic differentiation work for new ops, you must register a gradient -function which computes gradients with respect to the ops' inputs given -gradients with respect to the ops' outputs. - -Mathematically, if an op computes \\(y = f(x)\\) the registered gradient op -converts gradients \\(\partial L/ \partial y\\) of loss \\(L\\) with respect to -\\(y\\) into gradients \\(\partial L/ \partial x\\) with respect to \\(x\\) via -the chain rule: - -$$\frac{\partial L}{\partial x} - = \frac{\partial L}{\partial y} \frac{\partial y}{\partial x} - = \frac{\partial L}{\partial y} \frac{\partial f}{\partial x}.$$ - -In the case of `ZeroOut`, only one entry in the input affects the output, so the -gradient with respect to the input is a sparse "one hot" tensor. This is -expressed as follows: - -```python -from tensorflow.python.framework import ops -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import sparse_ops - -@ops.RegisterGradient("ZeroOut") -def _zero_out_grad(op, grad): - """The gradients for `zero_out`. - - Args: - op: The `zero_out` `Operation` that we are differentiating, which we can use - to find the inputs and outputs of the original op. - grad: Gradient with respect to the output of the `zero_out` op. - - Returns: - Gradients with respect to the input of `zero_out`. 
- """ - to_zero = op.inputs[0] - shape = array_ops.shape(to_zero) - index = array_ops.zeros_like(shape) - first_grad = array_ops.reshape(grad, [-1])[0] - to_zero_grad = sparse_ops.sparse_to_dense([index], shape, first_grad, 0) - return [to_zero_grad] # List of one Tensor, since we have one input -``` - -Details about registering gradient functions with -`tf.RegisterGradient`: - -* For an op with one output, the gradient function will take an - `tf.Operation` `op` and a - `tf.Tensor` `grad` and build new ops - out of the tensors - [`op.inputs[i]`](../../api_docs/python/framework.md#Operation.inputs), - [`op.outputs[i]`](../../api_docs/python/framework.md#Operation.outputs), and `grad`. Information - about any attrs can be found via - `tf.Operation.get_attr`. - -* If the op has multiple outputs, the gradient function will take `op` and - `grads`, where `grads` is a list of gradients with respect to each output. - The result of the gradient function must be a list of `Tensor` objects - representing the gradients with respect to each input. - -* If there is no well-defined gradient for some input, such as for integer - inputs used as indices, the corresponding returned gradient should be - `None`. For example, for an op taking a floating point tensor `x` and an - integer index `i`, the gradient function would `return [x_grad, None]`. - -* If there is no meaningful gradient for the op at all, you often will not have - to register any gradient, and as long as the op's gradient is never needed, - you will be fine. In some cases, an op has no well-defined gradient but can - be involved in the computation of the gradient. Here you can use - `ops.NotDifferentiable` to automatically propagate zeros backwards. - -Note that at the time the gradient function is called, only the data flow graph -of ops is available, not the tensor data itself. Thus, all computation must be -performed using other tensorflow ops, to be run at graph execution time. 
- -### Shape functions in C++ - -The TensorFlow API has a feature called "shape inference" that provides -information about the shapes of tensors without having to execute the -graph. Shape inference is supported by "shape functions" that are registered for -each op type in the C++ `REGISTER_OP` declaration, and perform two roles: -asserting that the shapes of the inputs are compatible during graph -construction, and specifying the shapes for the outputs. - -Shape functions are defined as operations on the -`shape_inference::InferenceContext` class. For example, in the shape function -for ZeroOut: - -```c++ - .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - c->set_output(0, c->input(0)); - return Status::OK(); - }); -``` - -`c->set_output(0, c->input(0));` declares that the first output's shape should -be set to the first input's shape. If the output is selected by its index as in the above example, the second parameter of `set_output` should be a `ShapeHandle` object. You can create an empty `ShapeHandle` object by its default constructor. The `ShapeHandle` object for an input with index `idx` can be obtained by `c->input(idx)`. - -There are a number of common shape functions -that apply to many ops, such as `shape_inference::UnchangedShape` which can be -found in [common_shape_fns.h](https://www.tensorflow.org/code/tensorflow/core/framework/common_shape_fns.h) and used as follows: - -```c++ -REGISTER_OP("ZeroOut") - .Input("to_zero: int32") - .Output("zeroed: int32") - .SetShapeFn(::tensorflow::shape_inference::UnchangedShape); -``` - -A shape function can also constrain the shape of an input. 
For the version of -[`ZeroOut` with a vector shape constraint](#validation), the shape function -would be as follows: - -```c++ - .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - ::tensorflow::shape_inference::ShapeHandle input; - TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &input)); - c->set_output(0, input); - return Status::OK(); - }); -``` - -The `WithRank` call validates that the input shape `c->input(0)` has -a shape with exactly one dimension (or if the input shape is unknown, -the output shape will be a vector with one unknown dimension). - -If your op is [polymorphic with multiple inputs](#polymorphism), you can use -members of `InferenceContext` to determine the number of shapes to check, and -`Merge` to validate that the shapes are all compatible (alternatively, access -attributes that indicate the lengths, with `InferenceContext::GetAttr`, which -provides access to the attributes of the op). - -```c++ - .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - ::tensorflow::shape_inference::ShapeHandle input; - ::tensorflow::shape_inference::ShapeHandle output; - for (size_t i = 0; i < c->num_inputs(); ++i) { - TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 2, &input)); - TF_RETURN_IF_ERROR(c->Merge(output, input, &output)); - } - c->set_output(0, output); - return Status::OK(); - }); -``` - -Since shape inference is an optional feature, and the shapes of tensors may vary -dynamically, shape functions must be robust to incomplete shape information for -any of the inputs. The `Merge` method in [`InferenceContext`](https://www.tensorflow.org/code/tensorflow/core/framework/shape_inference.h) -allows the caller to assert that two shapes are the same, even if either -or both of them do not have complete information. Shape functions are defined -for all of the core TensorFlow ops and provide many different usage examples. 
- -The `InferenceContext` class has a number of functions that can be used to -define shape function manipulations. For example, you can validate that a -particular dimension has a very specific value using `InferenceContext::Dim` and -`InferenceContext::WithValue`; you can specify that an output dimension is the -sum / product of two input dimensions using `InferenceContext::Add` and -`InferenceContext::Multiply`. See the `InferenceContext` class for -all of the various shape manipulations you can specify. The following example sets -shape of the first output to (n, 3), where first input has shape (n, ...) - -```c++ -.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - c->set_output(0, c->Matrix(c->Dim(c->input(0), 0), 3)); - return Status::OK(); -}); -``` - -If you have a complicated shape function, you should consider adding a test for -validating that various input shape combinations produce the expected output -shape combinations. You can see examples of how to write these tests in some -our -[core ops tests](https://www.tensorflow.org/code/tensorflow/core/ops/array_ops_test.cc). -(The syntax of `INFER_OK` and `INFER_ERROR` are a little cryptic, but try to be -compact in representing input and output shape specifications in tests. For -now, see the surrounding comments in those tests to get a sense of the shape -string specification). 
- - -[core-array_ops]:https://www.tensorflow.org/code/tensorflow/core/ops/array_ops.cc -[python-user_ops]:https://www.tensorflow.org/code/tensorflow/python/user_ops/user_ops.py -[tf-kernels]:https://www.tensorflow.org/code/tensorflow/core/kernels/ -[user_ops]:https://www.tensorflow.org/code/tensorflow/core/user_ops/ -[pad_op]:https://www.tensorflow.org/code/tensorflow/core/kernels/pad_op.cc -[standard_ops-py]:https://www.tensorflow.org/code/tensorflow/python/ops/standard_ops.py -[standard_ops-cc]:https://www.tensorflow.org/code/tensorflow/cc/ops/standard_ops.h -[python-BUILD]:https://www.tensorflow.org/code/tensorflow/python/BUILD -[validation-macros]:https://www.tensorflow.org/code/tensorflow/core/lib/core/errors.h -[op_def_builder]:https://www.tensorflow.org/code/tensorflow/core/framework/op_def_builder.h -[register_types]:https://www.tensorflow.org/code/tensorflow/core/framework/register_types.h -[FinalizeAttr]:https://www.tensorflow.org/code/tensorflow/core/framework/op_def_builder.cc -[DataTypeString]:https://www.tensorflow.org/code/tensorflow/core/framework/types.cc -[python-BUILD]:https://www.tensorflow.org/code/tensorflow/python/BUILD -[types-proto]:https://www.tensorflow.org/code/tensorflow/core/framework/types.proto -[TensorShapeProto]:https://www.tensorflow.org/code/tensorflow/core/framework/tensor_shape.proto -[TensorProto]:https://www.tensorflow.org/code/tensorflow/core/framework/tensor.proto diff --git a/tensorflow/docs_src/extend/architecture.md b/tensorflow/docs_src/extend/architecture.md deleted file mode 100644 index eb33336bee..0000000000 --- a/tensorflow/docs_src/extend/architecture.md +++ /dev/null @@ -1,217 +0,0 @@ -# TensorFlow Architecture - -We designed TensorFlow for large-scale distributed training and inference, but -it is also flexible enough to support experimentation with new machine -learning models and system-level optimizations. 
- -This document describes the system architecture that makes this -combination of scale and flexibility possible. It assumes that you have basic familiarity -with TensorFlow programming concepts such as the computation graph, operations, -and sessions. See [this document](../guide/low_level_intro.md) for an introduction to -these topics. Some familiarity with [distributed TensorFlow](../deploy/distributed.md) -will also be helpful. - -This document is for developers who want to extend TensorFlow in some way not -supported by current APIs, hardware engineers who want to optimize for -TensorFlow, implementers of machine learning systems working on scaling and -distribution, or anyone who wants to look under Tensorflow's hood. By the end of this document -you should understand the TensorFlow architecture well enough to read -and modify the core TensorFlow code. - -## Overview - -The TensorFlow runtime is a cross-platform library. Figure 1 illustrates its -general architecture. A C API separates user level code in different languages -from the core runtime. - -![TensorFlow Layers](https://www.tensorflow.org/images/layers.png){: width="300"} - -**Figure 1** - - -This document focuses on the following layers: - -* **Client**: - * Defines the computation as a dataflow graph. - * Initiates graph execution using a [**session**]( - https://www.tensorflow.org/code/tensorflow/python/client/session.py). -* **Distributed Master** - * Prunes a specific subgraph from the graph, as defined by the arguments - to Session.run(). - * Partitions the subgraph into multiple pieces that run in different - processes and devices. - * Distributes the graph pieces to worker services. - * Initiates graph piece execution by worker services. -* **Worker Services** (one for each task) - * Schedule the execution of graph operations using kernel implementations - appropriate to the available hardware (CPUs, GPUs, etc). - * Send and receive operation results to and from other worker services. 
-* **Kernel Implementations** - * Perform the computation for individual graph operations. - -Figure 2 illustrates the interaction of these components. "/job:worker/task:0" and -"/job:ps/task:0" are both tasks with worker services. "PS" stands for "parameter -server": a task responsible for storing and updating the model's parameters. -Other tasks send updates to these parameters as they work on optimizing the -parameters. This particular division of labor between tasks is not required, but - is common for distributed training. - -![TensorFlow Architecture Diagram](https://www.tensorflow.org/images/diag1.svg){: width="500"} - -**Figure 2** - -Note that the Distributed Master and Worker Service only exist in -distributed TensorFlow. The single-process version of TensorFlow includes a -special Session implementation that does everything the distributed master does -but only communicates with devices in the local process. - -The following sections describe the core TensorFlow layers in greater detail and -step through the processing of an example graph. - -## Client - -Users write the client TensorFlow program that builds the computation graph. -This program can either directly compose individual operations or use a -convenience library like the Estimators API to compose neural network layers and -other higher-level abstractions. TensorFlow supports multiple client -languages, and we have prioritized Python and C++, because our internal users -are most familiar with these languages. As features become more established, -we typically port them to C++, so that users can access an optimized -implementation from all client languages. Most of the training libraries are -still Python-only, but C++ does have support for efficient inference. - -The client creates a session, which sends the graph definition to the -distributed master as a `tf.GraphDef` -protocol buffer. 
When the client evaluates a node or nodes in the -graph, the evaluation triggers a call to the distributed master to initiate -computation. - -In Figure 3, the client has built a graph that applies weights (w) to a -feature vector (x), adds a bias term (b) and saves the result in a variable -(s). - -![TensorFlow Architecture Diagram: Client](https://www.tensorflow.org/images/graph_client.svg){: width="700"} - -**Figure 3** - -### Code - -* `tf.Session` - -## Distributed master - -The distributed master: - -* prunes the graph to obtain the subgraph required to evaluate the nodes - requested by the client, -* partitions the graph to obtain graph pieces for - each participating device, and -* caches these pieces so that they may be re-used in subsequent steps. - -Since the master sees the overall computation for -a step, it applies standard optimizations such as common subexpression -elimination and constant folding. It then coordinates execution of the -optimized subgraphs across a set of tasks. - -![TensorFlow Architecture Diagram: Master](https://www.tensorflow.org/images/graph_master_cln.svg){: width="700"} - -**Figure 4** - - -Figure 5 shows a possible partition of our example graph. The distributed -master has grouped the model parameters in order to place them together on the -parameter server. - -![Partitioned Graph](https://www.tensorflow.org/images/graph_split1.svg){: width="700"} - -**Figure 5** - - -Where graph edges are cut by the partition, the distributed master inserts -send and receive nodes to pass information between the distributed tasks -(Figure 6). - -![Partitioned Graph](https://www.tensorflow.org/images/graph_split2.svg){: width="700"} - -**Figure 6** - - -The distributed master then ships the graph pieces to the distributed tasks. 
- -![Partitioned Graph](https://www.tensorflow.org/images/graph_workers_cln.svg){: width="700"} - -**Figure 7** - -### Code - -* [MasterService API definition](https://www.tensorflow.org/code/tensorflow/core/protobuf/master_service.proto) -* [Master interface](https://www.tensorflow.org/code/tensorflow/core/distributed_runtime/master_interface.h) - -## Worker Service - -The worker service in each task: - -* handles requests from the master, -* schedules the execution of the kernels for the operations that comprise a - local subgraph, and -* mediates direct communication between tasks. - -We optimize the worker service for running large graphs with low overhead. Our -current implementation can execute tens of thousands of subgraphs per second, -which enables a large number of replicas to make rapid, fine-grained training -steps. The worker service dispatches kernels to local devices and runs kernels -in parallel when possible, for example by using multiple CPU cores or GPU -streams. - -We specialize Send and Recv operations for each pair of source and destination -device types: - -* Transfers between local CPU and GPU devices use the - `cudaMemcpyAsync()` API to overlap computation and data transfer. -* Transfers between two local GPUs use peer-to-peer DMA, to avoid an expensive - copy via the host CPU. - -For transfers between tasks, TensorFlow uses multiple protocols, including: - -* gRPC over TCP. -* RDMA over Converged Ethernet. - -We also have preliminary support for NVIDIA's NCCL library for multi-GPU -communication (see [`tf.contrib.nccl`]( -https://www.tensorflow.org/code/tensorflow/contrib/nccl/python/ops/nccl_ops.py)). 
- -![Partitioned Graph](https://www.tensorflow.org/images/graph_send_recv.svg){: width="700"} - -**Figure 8** - -### Code - -* [WorkerService API definition](https://www.tensorflow.org/code/tensorflow/core/protobuf/worker_service.proto) -* [Worker interface](https://www.tensorflow.org/code/tensorflow/core/distributed_runtime/worker_interface.h) -* [Remote rendezvous (for Send and Recv implementations)](https://www.tensorflow.org/code/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.h) - -## Kernel Implementations - -The runtime contains over 200 standard operations including mathematical, array -manipulation, control flow, and state management operations. Each of these -operations can have kernel implementations optimized for a variety of devices. -Many of the operation kernels are implemented using Eigen::Tensor, which uses -C++ templates to generate efficient parallel code for multicore CPUs and GPUs; -however, we liberally use libraries like cuDNN where a more efficient kernel -implementation is possible. We have also implemented -[quantization](../performance/quantization.md), which enables -faster inference in environments such as mobile devices and high-throughput -datacenter applications, and use the -[gemmlowp](https://github.com/google/gemmlowp) low-precision matrix library to -accelerate quantized computation. - -If it is difficult or inefficient to represent a subcomputation as a composition -of operations, users can register additional kernels that provide an efficient -implementation written in C++. For example, we recommend registering your own -fused kernels for some performance critical operations, such as the ReLU and -Sigmoid activation functions and their corresponding gradients. The [XLA Compiler](../performance/xla/index.md) has an -experimental implementation of automatic kernel fusion. 
- -### Code - -* [`OpKernel` interface](https://www.tensorflow.org/code/tensorflow/core/framework/op_kernel.h) diff --git a/tensorflow/docs_src/extend/index.md b/tensorflow/docs_src/extend/index.md deleted file mode 100644 index bbf4a8139b..0000000000 --- a/tensorflow/docs_src/extend/index.md +++ /dev/null @@ -1,34 +0,0 @@ -# Extend - -This section explains how developers can add functionality to TensorFlow's -capabilities. Begin by reading the following architectural overview: - - * [TensorFlow Architecture](../extend/architecture.md) - -The following guides explain how to extend particular aspects of -TensorFlow: - - * [Adding a New Op](../extend/adding_an_op.md), which explains how to create your own - operations. - * [Adding a Custom Filesystem Plugin](../extend/add_filesys.md), which explains how to - add support for your own shared or distributed filesystem. - * [Custom Data Readers](../extend/new_data_formats.md), which details how to add support - for your own file and record formats. - -Python is currently the only language supported by TensorFlow's API stability -promises. However, TensorFlow also provides functionality in C++, Go, Java and -[JavaScript](https://js.tensorflow.org) (including -[Node.js](https://github.com/tensorflow/tfjs-node)), -plus community support for [Haskell](https://github.com/tensorflow/haskell) and -[Rust](https://github.com/tensorflow/rust). 
If you'd like to create or -develop TensorFlow features in a language other than these languages, read the -following guide: - - * [TensorFlow in Other Languages](../extend/language_bindings.md) - -To create tools compatible with TensorFlow's model format, read the following -guide: - - * [A Tool Developer's Guide to TensorFlow Model Files](../extend/tool_developers/index.md) - - diff --git a/tensorflow/docs_src/extend/language_bindings.md b/tensorflow/docs_src/extend/language_bindings.md deleted file mode 100644 index 4727eabdc1..0000000000 --- a/tensorflow/docs_src/extend/language_bindings.md +++ /dev/null @@ -1,231 +0,0 @@ -# TensorFlow in other languages - -## Background - -This document is intended as a guide for those interested in the creation or -development of TensorFlow functionality in other programming languages. It -describes the features of TensorFlow and recommended steps for making the same -available in other programming languages. - -Python was the first client language supported by TensorFlow and currently -supports the most features. More and more of that functionality is being moved -into the core of TensorFlow (implemented in C++) and exposed via a [C API]. -Client languages should use the language's [foreign function interface -(FFI)](https://en.wikipedia.org/wiki/Foreign_function_interface) to call into -this [C API] to provide TensorFlow functionality. - -## Overview - -Providing TensorFlow functionality in a programming language can be broken down -into broad categories: - -- *Run a predefined graph*: Given a `GraphDef` (or - `MetaGraphDef`) protocol message, be able to create a session, run queries, - and get tensor results. This is sufficient for a mobile app or server that - wants to run inference on a pre-trained model. -- *Graph construction*: At least one function per defined - TensorFlow op that adds an operation to the graph. 
Ideally these functions - would be automatically generated so they stay in sync as the op definitions - are modified. -- *Gradients (AKA automatic differentiation)*: Given a graph and a list of - input and output operations, add operations to the graph that compute the - partial derivatives (gradients) of the inputs with respect to the outputs. - Allows for customization of the gradient function for a particular operation - in the graph. -- *Functions*: Define a subgraph that may be called in multiple places in the - main `GraphDef`. Defines a `FunctionDef` in the `FunctionDefLibrary` - included in a `GraphDef`. -- *Control Flow*: Construct "If" and "While" with user-specified subgraphs. - Ideally these work with gradients (see above). -- *Neural Network library*: A number of components that together support the - creation of neural network models and training them (possibly in a - distributed setting). While it would be convenient to have this available in - other languages, there are currently no plans to support this in languages - other than Python. These libraries are typically wrappers over the features - described above. - -At a minimum, a language binding should support running a predefined graph, but -most should also support graph construction. The TensorFlow Python API provides -all these features. - -## Current Status - -New language support should be built on top of the [C API]. However, as you can -see in the table below, not all functionality is available in C yet. Providing -more functionality in the [C API] is an ongoing project. 
- -Feature | Python | C -:--------------------------------------------- | :---------------------------------------------------------- | :-- -Run a predefined Graph | `tf.import_graph_def`, `tf.Session` | `TF_GraphImportGraphDef`, `TF_NewSession` -Graph construction with generated op functions | Yes | Yes (The C API supports client languages that do this) -Gradients | `tf.gradients` | -Functions | `tf.python.framework.function.Defun` | -Control Flow | `tf.cond`, `tf.while_loop` | -Neural Network library | `tf.train`, `tf.nn`, `tf.contrib.layers`, `tf.contrib.slim` | - -## Recommended Approach - -### Run a predefined graph - -A language binding is expected to define the following classes: - -- `Graph`: A graph representing a TensorFlow computation. Consists of - operations (represented in the client language by `Operation`s) and - corresponds to a `TF_Graph` in the C API. Mainly used as an argument when - creating new `Operation` objects and when starting a `Session`. Also - supports iterating through the operations in the graph - (`TF_GraphNextOperation`), looking up operations by name - (`TF_GraphOperationByName`), and converting to and from a `GraphDef` - protocol message (`TF_GraphToGraphDef` and `TF_GraphImportGraphDef` in the C - API). -- `Operation`: Represents a computation node in the graph. Corresponds to a - `TF_Operation` in the C API. -- `Output`: Represents one of the outputs of an operation in the graph. Has a - `DataType` (and eventually a shape). May be passed as an input argument to a - function for adding operations to a graph, or to a `Session`'s `Run()` - method to fetch that output as a tensor. Corresponds to a `TF_Output` in the - C API. -- `Session`: Represents a client to a particular instance of the TensorFlow - runtime. Its main job is to be constructed with a `Graph` and some options - and then field calls to `Run()` the graph. Corresponds to a `TF_Session` in - the C API. 
-- `Tensor`: Represents an N-dimensional (rectangular) array with elements all - the same `DataType`. Gets data into and out of a `Session`'s `Run()` call. - Corresponds to a `TF_Tensor` in the C API. -- `DataType`: An enumerant with all the possible tensor types supported by - TensorFlow. Corresponds to `TF_DataType` in the C API and often referred to - as `dtype` in the Python API. - -### Graph construction - -TensorFlow has many ops, and the list is not static, so we recommend generating -the functions for adding ops to a graph instead of writing them individually -by hand (though writing a few by hand is a good way to figure out what the -generator should generate). The information needed to generate a function is -contained in an `OpDef` protocol message. - -There are a few ways to get a list of the `OpDef`s for the registered ops: - -- `TF_GetAllOpList` in the C API retrieves all registered `OpDef` protocol - messages. This can be used to write the generator in the client language. - This requires that the client language have protocol buffer support in order - to interpret the `OpDef` messages. -- The C++ function `OpRegistry::Global()->GetRegisteredOps()` returns the same - list of all registered `OpDef`s (defined in - [`tensorflow/core/framework/op.h`](https://www.tensorflow.org/code/tensorflow/core/framework/op.h)). This can be used to write the generator - in C++ (particularly useful for languages that do not have protocol buffer - support). -- The ASCII-serialized version of that list is periodically checked in to - [`tensorflow/core/ops/ops.pbtxt`](https://www.tensorflow.org/code/tensorflow/core/ops/ops.pbtxt) by an automated process. - -The `OpDef` specifies the following: - -- Name of the op in CamelCase. For generated functions follow the conventions - of the language. For example, if the language uses snake_case, use that - instead of CamelCase for the op's function name. -- A list of inputs and outputs.
The types for these may be polymorphic by - referencing attributes, as described in the inputs and outputs section of - [Adding an op](../extend/adding_an_op.md). -- A list of attributes, along with their default values (if any). Note that - some of these will be inferred (if they are determined by an input), some - will be optional (if they have a default), and some will be required (no - default). -- Documentation for the op in general and the inputs, outputs, and - non-inferred attributes. -- Some other fields that are used by the runtime and can be ignored by the - code generators. - -An `OpDef` can be converted into the text of a function that adds that op to the -graph using the `TF_OperationDescription` C API (wrapped in the language's FFI): - -- Start with `TF_NewOperation()` to create the `TF_OperationDescription*`. -- Call `TF_AddInput()` or `TF_AddInputList()` once per input (depending on - whether the input has a list type). -- Call `TF_SetAttr*()` functions to set non-inferred attributes. May skip - attributes with defaults if you don't want to override the default value. -- Set optional fields if necessary: - - `TF_SetDevice()`: force the operation onto a specific device. - - `TF_AddControlInput()`: add requirements that another operation finish - before this operation starts running - - `TF_SetAttrString("_kernel")` to set the kernel label (rarely used) - - `TF_ColocateWith()` to colocate one op with another -- Call `TF_FinishOperation()` when done. This adds the operation to the graph, - after which it can't be modified. - -The existing examples run the code generator as part of the build process (using -a Bazel genrule). Alternatively, the code generator can be run by an automated -cron process, possibly checking in the result. 
This creates a risk of divergence -between the generated code and the `OpDef`s checked into the repository, but is -useful for languages where code is expected to be generated ahead of time like -`go get` for Go and `cargo ops` for Rust. At the other end of the spectrum, for -some languages the code could be generated dynamically from -[`tensorflow/core/ops/ops.pbtxt`](https://www.tensorflow.org/code/tensorflow/core/ops/ops.pbtxt). - -#### Handling Constants - -Calling code will be much more concise if users can provide constants to input -arguments. The generated code should convert those constants to operations that -are added to the graph and used as input to the op being instantiated. - -#### Optional parameters - -If the language allows for optional parameters to a function (like keyword -arguments with defaults in Python), use them for optional attributes, operation -names, devices, control inputs etc. In some languages, these optional parameters -can be set using dynamic scopes (like "with" blocks in Python). Without these -features, the library may resort to the "builder pattern", as is done in the C++ -version of the TensorFlow API. - -#### Name scopes - -It is a good idea to have support for naming graph operations using some sort of -scoping hierarchy, especially considering the fact that TensorBoard relies on it -to display large graphs in a reasonable way. The existing Python and C++ APIs -take different approaches: In Python, the "directory" part of the name -(everything up to the last "/") comes from `with` blocks. In effect, there is a -thread-local stack with the scopes defining the name hierarchy. The last -component of the name is either supplied explicitly by the user (using the -optional `name` keyword argument) or defaults to the name of the type of the op -being added. In C++ the "directory" part of the name is stored in an explicit -`Scope` object. The `NewSubScope()` method appends to that part of the name and -returns a new `Scope`. 
The last component of the name is set using the -`WithOpName()` method, and like Python defaults to the name of the type of op -being added. `Scope` objects are explicitly passed around to specify the name of -the context. - -#### Wrappers - -It may make sense to keep the generated functions private for some ops so that -wrapper functions that do a little bit of additional work can be used instead. -This also gives an escape hatch for supporting features outside the scope of -generated code. - -One use of a wrapper is for supporting `SparseTensor` input and output. A -`SparseTensor` is a tuple of 3 dense tensors: indices, values, and shape. values -is a vector size [n], shape is a vector size [rank], and indices is a matrix -size [n, rank]. There are some sparse ops that use this triple to represent a -single sparse tensor. - -Another reason to use wrappers is for ops that hold state. There are a few such -ops (e.g. a variable) that have several companion ops for operating on that -state. The Python API has classes for these ops where the constructor creates -the op, and methods on that class add operations to the graph that operate on -the state. - -#### Other Considerations - -- It is good to have a list of keywords used to rename op functions and - arguments that collide with language keywords (or other symbols that will - cause trouble, like the names of library functions or variables referenced - in the generated code). -- The function for adding a `Const` operation to a graph typically is a - wrapper since the generated function will typically have redundant - `DataType` inputs. - -### Gradients, functions and control flow - -At this time, support for gradients, functions and control flow operations ("if" -and "while") is not available in languages other than Python. This will be -updated when the [C API] provides necessary support. 
- -[C API]: https://www.tensorflow.org/code/tensorflow/c/c_api.h diff --git a/tensorflow/docs_src/extend/leftnav_files b/tensorflow/docs_src/extend/leftnav_files deleted file mode 100644 index 12315b711b..0000000000 --- a/tensorflow/docs_src/extend/leftnav_files +++ /dev/null @@ -1,7 +0,0 @@ -index.md -architecture.md -adding_an_op.md -add_filesys.md -new_data_formats.md -language_bindings.md -tool_developers/index.md diff --git a/tensorflow/docs_src/extend/new_data_formats.md b/tensorflow/docs_src/extend/new_data_formats.md deleted file mode 100644 index 7ca50c9c76..0000000000 --- a/tensorflow/docs_src/extend/new_data_formats.md +++ /dev/null @@ -1,305 +0,0 @@ -# Reading custom file and record formats - -PREREQUISITES: - -* Some familiarity with C++. -* Must have - [downloaded TensorFlow source](../install/install_sources.md), and be - able to build it. - -We divide the task of supporting a file format into two pieces: - -* File formats: We use a reader `tf.data.Dataset` to read raw *records* (which - are typically represented by scalar string tensors, but can have more - structure) from a file. -* Record formats: We use decoder or parsing ops to turn a string record - into tensors usable by TensorFlow. - -For example, to re-implement the `tf.contrib.data.make_csv_dataset` function, we -could use `tf.data.TextLineDataset` to extract the records, and then -use `tf.data.Dataset.map` and `tf.decode_csv` to parse the CSV records from -each line of text in the dataset. - -[TOC] - -## Writing a `Dataset` for a file format - -A `tf.data.Dataset` represents a sequence of *elements*, which can be the -individual records in a file.
There are several examples of "reader" datasets -that are already built into TensorFlow: - -* `tf.data.TFRecordDataset` - ([source in `kernels/data/reader_dataset_ops.cc`](https://www.tensorflow.org/code/tensorflow/core/kernels/data/reader_dataset_ops.cc)) -* `tf.data.FixedLengthRecordDataset` - ([source in `kernels/data/reader_dataset_ops.cc`](https://www.tensorflow.org/code/tensorflow/core/kernels/data/reader_dataset_ops.cc)) -* `tf.data.TextLineDataset` - ([source in `kernels/data/reader_dataset_ops.cc`](https://www.tensorflow.org/code/tensorflow/core/kernels/data/reader_dataset_ops.cc)) - -Each of these implementations comprises three related classes: - -* A `tensorflow::DatasetOpKernel` subclass (e.g. `TextLineDatasetOp`), which - tells TensorFlow how to construct a dataset object from the inputs to and - attrs of an op, in its `MakeDataset()` method. - -* A `tensorflow::GraphDatasetBase` subclass (e.g. `TextLineDatasetOp::Dataset`), - which represents the *immutable* definition of the dataset itself, and tells - TensorFlow how to construct an iterator object over that dataset, in its - `MakeIteratorInternal()` method. - -* A `tensorflow::DatasetIterator` subclass (e.g. - `TextLineDatasetOp::Dataset::Iterator`), which represents the *mutable* state - of an iterator over a particular dataset, and tells TensorFlow how to get the - next element from the iterator, in its `GetNextInternal()` method. - -The most important method is the `GetNextInternal()` method, since it defines -how to actually read records from the file and represent them as one or more -`Tensor` objects. - -To create a new reader dataset called (for example) `MyReaderDataset`, you will -need to: - -1. In C++, define subclasses of `tensorflow::DatasetOpKernel`, - `tensorflow::GraphDatasetBase`, and `tensorflow::DatasetIterator` - that implement the reading logic. -2. In C++, register a new reader op and kernel with the name - `"MyReaderDataset"`. -3. 
In Python, define a subclass of `tf.data.Dataset` called `MyReaderDataset`. - -You can put all the C++ code in a single file, such as -`my_reader_dataset_op.cc`. It will help if you are -familiar with [the adding an op how-to](../extend/adding_an_op.md). The following skeleton -can be used as a starting point for your implementation: - -```c++ -#include "tensorflow/core/framework/common_shape_fns.h" -#include "tensorflow/core/framework/dataset.h" -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" - -namespace myproject { -namespace { - -using ::tensorflow::DT_STRING; -using ::tensorflow::PartialTensorShape; -using ::tensorflow::Status; - -class MyReaderDatasetOp : public tensorflow::DatasetOpKernel { - public: - - MyReaderDatasetOp(tensorflow::OpKernelConstruction* ctx) - : DatasetOpKernel(ctx) { - // Parse and validate any attrs that define the dataset using - // `ctx->GetAttr()`, and store them in member variables. - } - - void MakeDataset(tensorflow::OpKernelContext* ctx, - tensorflow::DatasetBase** output) override { - // Parse and validate any input tensors that define the dataset using - // `ctx->input()` or the utility function - // `ParseScalarArgument(ctx, &arg)`. - - // Create the dataset object, passing any (already-validated) arguments from - // attrs or input tensors. - *output = new Dataset(ctx); - } - - private: - class Dataset : public tensorflow::GraphDatasetBase { - public: - Dataset(tensorflow::OpKernelContext* ctx) : GraphDatasetBase(ctx) {} - - std::unique_ptr MakeIteratorInternal( - const string& prefix) const override { - return std::unique_ptr(new Iterator( - {this, tensorflow::strings::StrCat(prefix, "::MyReader")})); - } - - // Record structure: Each record is represented by a scalar string tensor. - // - // Dataset elements can have a fixed number of components of different - // types and shapes; replace the following two methods to customize this - // aspect of the dataset.
- const tensorflow::DataTypeVector& output_dtypes() const override { - static auto* const dtypes = new tensorflow::DataTypeVector({DT_STRING}); - return *dtypes; - } - const std::vector& output_shapes() const override { - static std::vector* shapes = - new std::vector({{}}); - return *shapes; - } - - string DebugString() const override { return "MyReaderDatasetOp::Dataset"; } - - protected: - // Optional: Implementation of `GraphDef` serialization for this dataset. - // - // Implement this method if you want to be able to save and restore - // instances of this dataset (and any iterators over it). - Status AsGraphDefInternal(DatasetGraphDefBuilder* b, - tensorflow::Node** output) const override { - // Construct nodes to represent any of the input tensors from this - // object's member variables using `b->AddScalar()` and `b->AddVector()`. - std::vector input_tensors; - TF_RETURN_IF_ERROR(b->AddDataset(this, input_tensors, output)); - return Status::OK(); - } - - private: - class Iterator : public tensorflow::DatasetIterator { - public: - explicit Iterator(const Params& params) - : DatasetIterator(params), i_(0) {} - - // Implementation of the reading logic. - // - // The example implementation in this file yields the string "MyReader!" - // ten times. In general there are three cases: - // - // 1. If an element is successfully read, store it as one or more tensors - // in `*out_tensors`, set `*end_of_sequence = false` and return - // `Status::OK()`. - // 2. If the end of input is reached, set `*end_of_sequence = true` and - // return `Status::OK()`. - // 3. If an error occurs, return an error status using one of the helper - // functions from "tensorflow/core/lib/core/errors.h". - Status GetNextInternal(tensorflow::IteratorContext* ctx, - std::vector* out_tensors, - bool* end_of_sequence) override { - // NOTE: `GetNextInternal()` may be called concurrently, so it is - // recommended that you protect the iterator state with a mutex. 
- tensorflow::mutex_lock l(mu_); - if (i_ < 10) { - // Create a scalar string tensor and add it to the output. - tensorflow::Tensor record_tensor(ctx->allocator({}), DT_STRING, {}); - record_tensor.scalar()() = "MyReader!"; - out_tensors->emplace_back(std::move(record_tensor)); - ++i_; - *end_of_sequence = false; - } else { - *end_of_sequence = true; - } - return Status::OK(); - } - - protected: - // Optional: Implementation of iterator state serialization for this - // iterator. - // - // Implement these two methods if you want to be able to save and restore - // instances of this iterator. - Status SaveInternal(tensorflow::IteratorStateWriter* writer) override { - tensorflow::mutex_lock l(mu_); - TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("i"), i_)); - return Status::OK(); - } - Status RestoreInternal(tensorflow::IteratorContext* ctx, - tensorflow::IteratorStateReader* reader) override { - tensorflow::mutex_lock l(mu_); - TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("i"), &i_)); - return Status::OK(); - } - - private: - tensorflow::mutex mu_; - int64 i_ GUARDED_BY(mu_); - }; - }; -}; - -// Register the op definition for MyReaderDataset. -// -// Dataset ops always have a single output, of type `variant`, which represents -// the constructed `Dataset` object. -// -// Add any attrs and input tensors that define the dataset here. -REGISTER_OP("MyReaderDataset") - .Output("handle: variant") - .SetIsStateful() - .SetShapeFn(tensorflow::shape_inference::ScalarShape); - -// Register the kernel implementation for MyReaderDataset. -REGISTER_KERNEL_BUILDER(Name("MyReaderDataset").Device(tensorflow::DEVICE_CPU), - MyReaderDatasetOp); - -} // namespace -} // namespace myproject -``` - -The last step is to build the C++ code and add a Python wrapper. The easiest way -to do this is by [compiling a dynamic -library](../extend/adding_an_op.md#build_the_op_library) (e.g. 
called `"my_reader_dataset_op.so"`), and adding a Python class -that subclasses `tf.data.Dataset` to wrap it. An example Python program is -given here: - -```python -import tensorflow as tf - -# Assumes the file is in the current working directory. -my_reader_dataset_module = tf.load_op_library("./my_reader_dataset_op.so") - -class MyReaderDataset(tf.data.Dataset): - - def __init__(self): - super(MyReaderDataset, self).__init__() - # Create any input attrs or tensors as members of this class. - - def _as_variant_tensor(self): - # Actually construct the graph node for the dataset op. - # - # This method will be invoked when you create an iterator on this dataset - # or a dataset derived from it. - return my_reader_dataset_module.my_reader_dataset() - - # The following properties define the structure of each element: a scalar - # `tf.string` tensor. Change these properties to match the `output_dtypes()` - # and `output_shapes()` methods of `MyReaderDataset::Dataset` if you modify - # the structure of each element. - @property - def output_types(self): - return tf.string - - @property - def output_shapes(self): - return tf.TensorShape([]) - - @property - def output_classes(self): - return tf.Tensor - -if __name__ == "__main__": - # Create a MyReaderDataset and print its elements. - with tf.Session() as sess: - iterator = MyReaderDataset().make_one_shot_iterator() - next_element = iterator.get_next() - try: - while True: - print(sess.run(next_element)) # Prints "MyReader!" ten times. - except tf.errors.OutOfRangeError: - pass -``` - -You can see some examples of `Dataset` wrapper classes in -[`tensorflow/python/data/ops/dataset_ops.py`](https://www.tensorflow.org/code/tensorflow/python/data/ops/dataset_ops.py). - -## Writing an Op for a record format - -Generally this is an ordinary op that takes a scalar string record as input, and -so follow [the instructions to add an Op](../extend/adding_an_op.md). 
-You may optionally take a scalar string key as input, and include that in error -messages reporting improperly formatted data. That way users can more easily -track down where the bad data came from. - -Examples of Ops useful for decoding records: - -* `tf.parse_single_example` (and `tf.parse_example`) -* `tf.decode_csv` -* `tf.decode_raw` - -Note that it can be useful to use multiple Ops to decode a particular record -format. For example, you may have an image saved as a string in -[a `tf.train.Example` protocol buffer](https://www.tensorflow.org/code/tensorflow/core/example/example.proto). -Depending on the format of that image, you might take the corresponding output -from a `tf.parse_single_example` op and call `tf.image.decode_jpeg`, -`tf.image.decode_png`, or `tf.decode_raw`. It is common to take the output -of `tf.decode_raw` and use `tf.slice` and `tf.reshape` to extract pieces. diff --git a/tensorflow/docs_src/extend/tool_developers/index.md b/tensorflow/docs_src/extend/tool_developers/index.md deleted file mode 100644 index f02cd23be8..0000000000 --- a/tensorflow/docs_src/extend/tool_developers/index.md +++ /dev/null @@ -1,186 +0,0 @@ -# A Tool Developer's Guide to TensorFlow Model Files - -Most users shouldn't need to care about the internal details of how TensorFlow -stores data on disk, but you might if you're a tool developer. For example, you -may want to analyze models, or convert back and forth between TensorFlow and -other formats. This guide tries to explain some of the details of how you can -work with the main files that hold model data, to make it easier to develop -those kind of tools. - -[TOC] - -## Protocol Buffers - -All of TensorFlow's file formats are based on -[Protocol Buffers](https://developers.google.com/protocol-buffers/?hl=en), so to -start it's worth getting familiar with how they work. 
The summary is that you -define data structures in text files, and the protobuf tools generate classes in -C, Python, and other languages that can load, save, and access the data in a -friendly way. We often refer to Protocol Buffers as protobufs, and I'll use -that convention in this guide. - -## GraphDef - -The foundation of computation in TensorFlow is the `Graph` object. This holds a -network of nodes, each representing one operation, connected to each other as -inputs and outputs. After you've created a `Graph` object, you can save it out -by calling `as_graph_def()`, which returns a `GraphDef` object. - -The GraphDef class is an object created by the ProtoBuf library from the -definition in -[tensorflow/core/framework/graph.proto](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/graph.proto). The protobuf tools parse -this text file, and generate the code to load, store, and manipulate graph -definitions. If you see a standalone TensorFlow file representing a model, it's -likely to contain a serialized version of one of these `GraphDef` objects -saved out by the protobuf code. - -This generated code is used to save and load the GraphDef files from disk. The code that actually loads the model looks like this: - -```python -graph_def = graph_pb2.GraphDef() -``` - -This line creates an empty `GraphDef` object, the class that's been created -from the textual definition in graph.proto. This is the object we're going to -populate with the data from our file. - -```python -with open(FLAGS.graph, "rb") as f: -``` - -Here we get a file handle for the path we've passed in to the script - -```python - if FLAGS.input_binary: - graph_def.ParseFromString(f.read()) - else: - text_format.Merge(f.read(), graph_def) -``` - -## Text or Binary? - -There are actually two different formats that a ProtoBuf can be saved in. 
-TextFormat is a human-readable form, which makes it nice for debugging and -editing, but can get large when there's numerical data like weights stored in -it. You can see a small example of that in -[graph_run_run2.pbtxt](https://github.com/tensorflow/tensorboard/blob/master/tensorboard/demo/data/graph_run_run2.pbtxt). - -Binary format files are a lot smaller than their text equivalents, even though -they're not as readable for us. In this script, we ask the user to supply a -flag indicating whether the input file is binary or text, so we know the right -function to call. You can find an example of a large binary file inside the -[inception_v3 archive](https://storage.googleapis.com/download.tensorflow.org/models/inception_v3_2016_08_28_frozen.pb.tar.gz), -as `inception_v3_2016_08_28_frozen.pb`. - -The API itself can be a bit confusing - the binary call is actually -`ParseFromString()`, whereas you use a utility function from the `text_format` -module to load textual files. - -## Nodes - -Once you've loaded a file into the `graph_def` variable, you can now access the -data inside it. For most practical purposes, the important section is the list -of nodes stored in the node member. Here's the code that loops through those: - -```python -for node in graph_def.node: -``` - -Each node is a `NodeDef` object, defined in -[tensorflow/core/framework/node_def.proto](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/node_def.proto). These -are the fundamental building blocks of TensorFlow graphs, with each one defining -a single operation along with its input connections. Here are the members of a -`NodeDef`, and what they mean. - -### `name` - -Every node should have a unique identifier that's not used by any other nodes -in the graph.
If you don't specify one as you're building a graph using the -Python API, one reflecting the name of the operation, such as "MatMul", -concatenated with a monotonically increasing number, such as "5", will be -picked for you. The name is used when defining the connections between nodes, -and when setting inputs and outputs for the whole graph when it's run. - -### `op` - -This defines what operation to run, for example `"Add"`, `"MatMul"`, or -`"Conv2D"`. When a graph is run, this op name is looked up in a registry to -find an implementation. The registry is populated by calls to the -`REGISTER_OP()` macro, like those in -[tensorflow/core/ops/nn_ops.cc](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/ops/nn_ops.cc). - -### `input` - -A list of strings, each one of which is the name of another node, optionally -followed by a colon and an output port number. For example, a node with two -inputs might have a list like `["some_node_name", "another_node_name"]`, which -is equivalent to `["some_node_name:0", "another_node_name:0"]`, and defines the -node's first input as the first output from the node with the name -`"some_node_name"`, and a second input from the first output of -`"another_node_name"`. - -### `device` - -In most cases you can ignore this, since it defines where to run a node in a -distributed environment, or when you want to force the operation onto CPU or -GPU. - -### `attr` - -This is a key/value store holding all the attributes of a node. These are the -permanent properties of nodes, things that don't change at runtime such as the -size of filters for convolutions, or the values of constant ops. Because there -can be so many different types of attribute values, from strings, to ints, to -arrays of tensor values, there's a separate protobuf file defining the data -structure that holds them, in -[tensorflow/core/framework/attr_value.proto](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/attr_value.proto).
- -Each attribute has a unique name string, and the expected attributes are listed -when the operation is defined. If an attribute isn't present in a node, but it -has a default listed in the operation definition, that default is used when the -graph is created. - -You can access all of these members by calling `node.name`, `node.op`, etc. in -Python. The list of nodes stored in the `GraphDef` is a full definition of the -model architecture. - -## Freezing - -One confusing part about this is that the weights usually aren't stored inside -the file format during training. Instead, they're held in separate checkpoint -files, and there are `Variable` ops in the graph that load the latest values -when they're initialized. It's often not very convenient to have separate files -when you're deploying to production, so there's the -[freeze_graph.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/freeze_graph.py) script that takes a graph definition and a set -of checkpoints and freezes them together into a single file. - -What this does is load the `GraphDef`, pull in the values for all the variables -from the latest checkpoint file, and then replace each `Variable` op with a -`Const` that has the numerical data for the weights stored in its attributes. -It then strips away all the extraneous nodes that aren't used for forward -inference, and saves out the resulting `GraphDef` into an output file. - -## Weight Formats - -If you're dealing with TensorFlow models that represent neural networks, one of -the most common problems is extracting and interpreting the weight values. A -common way to store them, for example in graphs created by the freeze_graph -script, is as `Const` ops containing the weights as `Tensors`.
These are -defined in -[tensorflow/core/framework/tensor.proto](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/tensor.proto), and contain information -about the size and type of the data, as well as the values themselves. In -Python, you get a `TensorProto` object from a `NodeDef` representing a `Const` -op by calling something like `some_node_def.attr['value'].tensor`. - -This will give you an object representing the weights data. The data itself -will be stored in one of the lists with the suffix `_val` as indicated by the -type of the object, for example `float_val` for 32-bit float data types. - -The ordering of convolution weight values is often tricky to deal with when -converting between different frameworks. In TensorFlow, the filter weights for -the `Conv2D` operation are stored on the second input, and are expected to be -in the order `[filter_height, filter_width, input_depth, output_depth]`, where -`output_depth` increasing by one means moving to an adjacent value in memory. - -Hopefully this rundown gives you a better idea of what's going on inside -TensorFlow model files, and will help you if you ever need to manipulate them. diff --git a/tensorflow/docs_src/extras/README.txt b/tensorflow/docs_src/extras/README.txt deleted file mode 100644 index 765809a762..0000000000 --- a/tensorflow/docs_src/extras/README.txt +++ /dev/null @@ -1,3 +0,0 @@ -This directory holds extra files we'd like to be able -to link to and serve from within tensorflow.org. -They are excluded from versioning.
\ No newline at end of file diff --git a/tensorflow/docs_src/guide/autograph.md b/tensorflow/docs_src/guide/autograph.md deleted file mode 100644 index 823e1c6d6b..0000000000 --- a/tensorflow/docs_src/guide/autograph.md +++ /dev/null @@ -1,3 +0,0 @@ -# AutoGraph: Easy control flow for graphs - -[Colab notebook](https://colab.research.google.com/github/tensorflow/models/blob/master/samples/core/guide/autograph.ipynb) diff --git a/tensorflow/docs_src/guide/checkpoints.md b/tensorflow/docs_src/guide/checkpoints.md deleted file mode 100644 index 3c92cbbd40..0000000000 --- a/tensorflow/docs_src/guide/checkpoints.md +++ /dev/null @@ -1,238 +0,0 @@ -# Checkpoints - -This document examines how to save and restore TensorFlow models built with -Estimators. TensorFlow provides two model formats: - -* checkpoints, which is a format dependent on the code that created - the model. -* SavedModel, which is a format independent of the code that created - the model. - -This document focuses on checkpoints. For details on `SavedModel`, see the -[Saving and Restoring](../guide/saved_model.md) guide. - - -## Sample code - -This document relies on the same -[Iris classification example](https://github.com/tensorflow/models/blob/master/samples/core/get_started/premade_estimator.py) detailed in [Getting Started with TensorFlow](../guide/premade_estimators.md). -To download and access the example, invoke the following two commands: - -```shell -git clone https://github.com/tensorflow/models/ -cd models/samples/core/get_started -``` - -Most of the code snippets in this document are minor variations -on `premade_estimator.py`. - - -## Saving partially-trained models - -Estimators automatically write the following to disk: - -* **checkpoints**, which are versions of the model created during training. -* **event files**, which contain information that - [TensorBoard](https://developers.google.com/machine-learning/glossary/#TensorBoard) - uses to create visualizations. 
- -To specify the top-level directory in which the Estimator stores its -information, assign a value to the optional `model_dir` argument of *any* -`Estimator`'s constructor. -Taking `DNNClassifier` as an example, -the following code sets the `model_dir` -argument to the `models/iris` directory: - -```python -classifier = tf.estimator.DNNClassifier( - feature_columns=my_feature_columns, - hidden_units=[10, 10], - n_classes=3, - model_dir='models/iris') -``` - -Suppose you call the Estimator's `train` method. For example: - - -```python -classifier.train( - input_fn=lambda:train_input_fn(train_x, train_y, batch_size=100), - steps=200) -``` - -As suggested by the following diagrams, the first call to `train` -adds checkpoints and other files to the `model_dir` directory: - -
- -
-
-The first call to train(). -
- - -To see the objects in the created `model_dir` directory on a -UNIX-based system, just call `ls` as follows: - -```none -$ ls -1 models/iris -checkpoint -events.out.tfevents.timestamp.hostname -graph.pbtxt -model.ckpt-1.data-00000-of-00001 -model.ckpt-1.index -model.ckpt-1.meta -model.ckpt-200.data-00000-of-00001 -model.ckpt-200.index -model.ckpt-200.meta -``` - -The preceding `ls` command shows that the Estimator created checkpoints -at steps 1 (the start of training) and 200 (the end of training). - - -### Default checkpoint directory - -If you don't specify `model_dir` in an Estimator's constructor, the Estimator -writes checkpoint files to a temporary directory chosen by Python's -[tempfile.mkdtemp](https://docs.python.org/3/library/tempfile.html#tempfile.mkdtemp) -function. For example, the following Estimator constructor does *not* specify -the `model_dir` argument: - -```python -classifier = tf.estimator.DNNClassifier( - feature_columns=my_feature_columns, - hidden_units=[10, 10], - n_classes=3) - -print(classifier.model_dir) -``` - -The `tempfile.mkdtemp` function picks a secure, temporary directory -appropriate for your operating system. For example, a typical temporary -directory on macOS might be something like the following: - -```None -/var/folders/0s/5q9kfzfj3gx2knj0vj8p68yc00dhcr/T/tmpYm1Rwa -``` - -### Checkpointing Frequency - -By default, the Estimator saves -[checkpoints](https://developers.google.com/machine-learning/glossary/#checkpoint) -in the `model_dir` according to the following schedule: - -* Writes a checkpoint every 10 minutes (600 seconds). -* Writes a checkpoint when the `train` method starts (first iteration) - and completes (final iteration). -* Retains only the 5 most recent checkpoints in the directory. - -You may alter the default schedule by taking the following steps: - -1. Create a `tf.estimator.RunConfig` object that defines the - desired schedule. -2. 
When instantiating the Estimator, pass that `RunConfig` object to the - Estimator's `config` argument. - -For example, the following code changes the checkpointing schedule to every -20 minutes and retains the 10 most recent checkpoints: - -```python -my_checkpointing_config = tf.estimator.RunConfig( - save_checkpoints_secs = 20*60, # Save checkpoints every 20 minutes. - keep_checkpoint_max = 10, # Retain the 10 most recent checkpoints. -) - -classifier = tf.estimator.DNNClassifier( - feature_columns=my_feature_columns, - hidden_units=[10, 10], - n_classes=3, - model_dir='models/iris', - config=my_checkpointing_config) -``` - -## Restoring your model - -The first time you call an Estimator's `train` method, TensorFlow saves a -checkpoint to the `model_dir`. Each subsequent call to the Estimator's -`train`, `evaluate`, or `predict` method causes the following: - -1. The Estimator builds the model's - [graph](https://developers.google.com/machine-learning/glossary/#graph) - by running the `model_fn()`. (For details on the `model_fn()`, see - [Creating Custom Estimators.](../guide/custom_estimators.md)) -2. The Estimator initializes the weights of the new model from the data - stored in the most recent checkpoint. - -In other words, as the following illustration suggests, once checkpoints -exist, TensorFlow rebuilds the model each time you call `train()`, -`evaluate()`, or `predict()`. - -
- -
-
-Subsequent calls to train(), evaluate(), or predict() -
- - -### Avoiding a bad restoration - -Restoring a model's state from a checkpoint only works if the model -and checkpoint are compatible. For example, suppose you trained a -`DNNClassifier` Estimator containing two hidden layers, -each having 10 nodes: - -```python -classifier = tf.estimator.DNNClassifier( - feature_columns=my_feature_columns, - hidden_units=[10, 10], - n_classes=3, - model_dir='models/iris') - -classifier.train( - input_fn=lambda:train_input_fn(train_x, train_y, batch_size=100), - steps=200) -``` - -After training (and, therefore, after creating checkpoints in `models/iris`), -imagine that you changed the number of neurons in each hidden layer from 10 to -20 and then attempted to retrain the model: - -``` python -classifier2 = tf.estimator.DNNClassifier( - feature_columns=my_feature_columns, - hidden_units=[20, 20], # Change the number of neurons in the model. - n_classes=3, - model_dir='models/iris') - -classifier2.train( - input_fn=lambda:train_input_fn(train_x, train_y, batch_size=100), - steps=200) -``` - -Since the state in the checkpoint is incompatible with the model described -in `classifier2`, retraining fails with the following error: - -```None -... -InvalidArgumentError (see above for traceback): tensor_name = -dnn/hiddenlayer_1/bias/t_0/Adagrad; shape in shape_and_slice spec [10] -does not match the shape stored in checkpoint: [20] -``` - -To run experiments in which you train and compare slightly different -versions of a model, save a copy of the code that created each -`model_dir`, possibly by creating a separate git branch for each version. -This separation will keep your checkpoints recoverable. - -## Summary - -Checkpoints provide an easy automatic mechanism for saving and restoring -models created by Estimators. - -See the [Saving and Restoring](../guide/saved_model.md) guide for details about: - -* Saving and restoring models using low-level TensorFlow APIs. 
-* Exporting and importing models in the SavedModel format, which is a - language-neutral, recoverable, serialization format. diff --git a/tensorflow/docs_src/guide/custom_estimators.md b/tensorflow/docs_src/guide/custom_estimators.md deleted file mode 100644 index 913a35920f..0000000000 --- a/tensorflow/docs_src/guide/custom_estimators.md +++ /dev/null @@ -1,602 +0,0 @@ - -# Creating Custom Estimators - -This document introduces custom Estimators. In particular, this document -demonstrates how to create a custom `tf.estimator.Estimator` that -mimics the behavior of the pre-made Estimator -`tf.estimator.DNNClassifier` in solving the Iris problem. See -the [Pre-Made Estimators chapter](../guide/premade_estimators.md) for details -on the Iris problem. - -To download and access the example code invoke the following two commands: - -```shell -git clone https://github.com/tensorflow/models/ -cd models/samples/core/get_started -``` - -In this document we will be looking at -[`custom_estimator.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/custom_estimator.py). -You can run it with the following command: - -```bsh -python custom_estimator.py -``` - -If you are feeling impatient, feel free to compare and contrast -[`custom_estimator.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/custom_estimator.py) -with -[`premade_estimator.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/premade_estimator.py). -(which is in the same directory). - - - -## Pre-made vs. custom - -As the following figure shows, pre-made Estimators are subclasses of the -`tf.estimator.Estimator` base class, while custom Estimators are an instance -of tf.estimator.Estimator: - -
-Premade estimators are sub-classes of `Estimator`. Custom Estimators are usually (direct) instances of `Estimator` -
-
-Pre-made and custom Estimators are all Estimators. -
- -Pre-made Estimators are fully baked. Sometimes though, you need more control -over an Estimator's behavior. That's where custom Estimators come in. You can -create a custom Estimator to do just about anything. If you want hidden layers -connected in some unusual fashion, write a custom Estimator. If you want to -calculate a unique -[metric](https://developers.google.com/machine-learning/glossary/#metric) -for your model, write a custom Estimator. Basically, if you want an Estimator -optimized for your specific problem, write a custom Estimator. - -A model function (or `model_fn`) implements the ML algorithm. The -only difference between working with pre-made Estimators and custom Estimators -is: - -* With pre-made Estimators, someone already wrote the model function for you. -* With custom Estimators, you must write the model function. - -Your model function could implement a wide range of algorithms, defining all -sorts of hidden layers and metrics. Like input functions, all model functions -must accept a standard group of input parameters and return a standard group of -output values. Just as input functions can leverage the Dataset API, model -functions can leverage the Layers API and the Metrics API. - -Let's see how to solve the Iris problem with a custom Estimator. A quick -reminder--here's the organization of the Iris model that we're trying to mimic: - -
-A diagram of the network architecture: Inputs, 2 hidden layers, and outputs -
-
-Our implementation of Iris contains four features, two hidden layers, -and a logits output layer. -
- -## Write an Input function - -Our custom Estimator implementation uses the same input function as our -[pre-made Estimator implementation](../guide/premade_estimators.md), from -[`iris_data.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/iris_data.py). -Namely: - -```python -def train_input_fn(features, labels, batch_size): - """An input function for training""" - # Convert the inputs to a Dataset. - dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels)) - - # Shuffle, repeat, and batch the examples. - dataset = dataset.shuffle(1000).repeat().batch(batch_size) - - # Return the read end of the pipeline. - return dataset.make_one_shot_iterator().get_next() -``` - -This input function builds an input pipeline that yields batches of -`(features, labels)` pairs, where `features` is a dictionary of features. - -## Create feature columns - -As detailed in the [Premade Estimators](../guide/premade_estimators.md) and -[Feature Columns](../guide/feature_columns.md) chapters, you must define -your model's feature columns to specify how the model should use each feature. -Whether working with pre-made Estimators or custom Estimators, you define -feature columns in the same fashion. - -The following code creates a simple `numeric_column` for each input feature, -indicating that the value of the input feature should be used directly as an -input to the model: - -```python -# Feature columns describe how to use the input. 
-my_feature_columns = [] -for key in train_x.keys(): - my_feature_columns.append(tf.feature_column.numeric_column(key=key)) -``` - -## Write a model function - -The model function we'll use has the following call signature: - -```python -def my_model_fn( - features, # This is batch_features from input_fn - labels, # This is batch_labels from input_fn - mode, # An instance of tf.estimator.ModeKeys - params): # Additional configuration -``` - -The first two arguments are the batches of features and labels returned from -the input function; that is, `features` and `labels` are the handles to the -data your model will use. The `mode` argument indicates whether the caller is -requesting training, predicting, or evaluation. - -The caller may pass `params` to an Estimator's constructor. Any `params` passed -to the constructor are in turn passed on to the `model_fn`. In -[`custom_estimator.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/custom_estimator.py) -the following lines create the estimator and set the params to configure the -model. This configuration step is similar to how we configured the `tf.estimator.DNNClassifier` in -[Premade Estimators](../guide/premade_estimators.md). - -```python -classifier = tf.estimator.Estimator( - model_fn=my_model_fn, - params={ - 'feature_columns': my_feature_columns, - # Two hidden layers of 10 nodes each. - 'hidden_units': [10, 10], - # The model must choose between 3 classes. - 'n_classes': 3, - }) -``` - -To implement a typical model function, you must do the following: - -* [Define the model](#define_the_model). 
-* Specify additional calculations for each of - the [three different modes](#modes): - * [Predict](#predict) - * [Evaluate](#evaluate) - * [Train](#train) - -## Define the model - -The basic deep neural network model must define the following three sections: - -* An [input layer](https://developers.google.com/machine-learning/glossary/#input_layer) -* One or more [hidden layers](https://developers.google.com/machine-learning/glossary/#hidden_layer) -* An [output layer](https://developers.google.com/machine-learning/glossary/#output_layer) - -### Define the input layer - -The first line of the `model_fn` calls `tf.feature_column.input_layer` to -convert the feature dictionary and `feature_columns` into input for your model, -as follows: - -```python - # Use `input_layer` to apply the feature columns. - net = tf.feature_column.input_layer(features, params['feature_columns']) -``` - -The preceding line applies the transformations defined by your feature columns, -creating the model's input layer. - -
-A diagram of the input layer, in this case a 1:1 mapping from raw-inputs to features. -
- - -### Hidden Layers - -If you are creating a deep neural network, you must define one or more hidden -layers. The Layers API provides a rich set of functions to define all types of -hidden layers, including convolutional, pooling, and dropout layers. For Iris, -we're simply going to call `tf.layers.dense` to create hidden layers, with -dimensions defined by `params['hidden_units']`. In a `dense` layer each node -is connected to every node in the preceding layer. Here's the relevant code: - -``` python - # Build the hidden layers, sized according to the 'hidden_units' param. - for units in params['hidden_units']: - net = tf.layers.dense(net, units=units, activation=tf.nn.relu) -``` - -* The `units` parameter defines the number of output neurons in a given layer. -* The `activation` parameter defines the [activation function](https://developers.google.com/machine-learning/glossary/#activation_function) — - [Relu](https://developers.google.com/machine-learning/glossary/#ReLU) in this - case. - -The variable `net` here signifies the current top layer of the network. During -the first iteration, `net` signifies the input layer. On each loop iteration -`tf.layers.dense` creates a new layer, which takes the previous layer's output -as its input, using the variable `net`. - -After creating two hidden layers, our network looks as follows. For -simplicity, the figure does not show all the units in each layer. - -
-The input layer with two hidden layers added. -
- -Note that `tf.layers.dense` provides many additional capabilities, including -the ability to set a multitude of regularization parameters. For the sake of -simplicity, though, we're going to simply accept the default values of the -other parameters. - -### Output Layer - -We'll define the output layer by calling `tf.layers.dense` yet again, this -time without an activation function: - -```python - # Compute logits (1 per class). - logits = tf.layers.dense(net, params['n_classes'], activation=None) -``` - -Here, `net` signifies the final hidden layer. Therefore, the full set of layers -is now connected as follows: - -
-A logit output layer connected to the top hidden layer -
-
-The final hidden layer feeds into the output layer. -
- -When defining an output layer, the `units` parameter specifies the number of -outputs. So, by setting `units` to `params['n_classes']`, the model produces -one output value per class. Each element of the output vector will contain the -score, or "logit", calculated for the associated class of Iris: Setosa, -Versicolor, or Virginica, respectively. - -Later on, these logits will be transformed into probabilities by the -`tf.nn.softmax` function. - -## Implement training, evaluation, and prediction {#modes} - -The final step in creating a model function is to write branching code that -implements prediction, evaluation, and training. - -The model function gets invoked whenever someone calls the Estimator's `train`, -`evaluate`, or `predict` methods. Recall that the signature for the model -function looks like this: - -``` python -def my_model_fn( - features, # This is batch_features from input_fn - labels, # This is batch_labels from input_fn - mode, # An instance of tf.estimator.ModeKeys, see below - params): # Additional configuration -``` - -Focus on that third argument, mode. As the following table shows, when someone -calls `train`, `evaluate`, or `predict`, the Estimator framework invokes your model -function with the mode parameter set as follows: - -| Estimator method | Estimator Mode | -|:---------------------------------|:------------------| -|`tf.estimator.Estimator.train` |`tf.estimator.ModeKeys.TRAIN` | -|`tf.estimator.Estimator.evaluate` |`tf.estimator.ModeKeys.EVAL` | -|`tf.estimator.Estimator.predict`|`tf.estimator.ModeKeys.PREDICT` | - -For example, suppose you instantiate a custom Estimator to generate an object -named `classifier`. Then, you make the following call: - -``` python -classifier = tf.estimator.Estimator(...) -classifier.train(input_fn=lambda: my_input_fn(FILE_TRAIN, True, 500)) -``` -The Estimator framework then calls your model function with mode set to -`ModeKeys.TRAIN`. 
- -Your model function must provide code to handle all three of the mode values. -For each mode value, your code must return an instance of -`tf.estimator.EstimatorSpec`, which contains the information the caller -requires. Let's examine each mode. - -### Predict - -When the Estimator's `predict` method is called, the `model_fn` receives -`mode = ModeKeys.PREDICT`. In this case, the model function must return a -`tf.estimator.EstimatorSpec` containing the prediction. - -The model must have been trained prior to making a prediction. The trained model -is stored on disk in the `model_dir` directory established when you -instantiated the Estimator. - -The code to generate the prediction for this model looks as follows: - -```python -# Compute predictions. -predicted_classes = tf.argmax(logits, 1) -if mode == tf.estimator.ModeKeys.PREDICT: - predictions = { - 'class_ids': predicted_classes[:, tf.newaxis], - 'probabilities': tf.nn.softmax(logits), - 'logits': logits, - } - return tf.estimator.EstimatorSpec(mode, predictions=predictions) -``` -The prediction dictionary contains everything that your model returns when run -in prediction mode. - -
-Additional outputs added to the output layer. -
- -The `predictions` dictionary holds the following three key/value pairs: - -* `class_ids` holds the class id (0, 1, or 2) representing the model's - prediction of the most likely species for this example. -* `probabilities` holds the three probabilities (in this example, 0.02, 0.95, - and 0.03). -* `logits` holds the raw logit values (in this example, -1.3, 2.6, and -0.9) - -We return that dictionary to the caller via the `predictions` parameter of the -`tf.estimator.EstimatorSpec`. The Estimator's -`tf.estimator.Estimator.predict` method will yield these -dictionaries. - -### Calculate the loss - -For both [training](#train) and [evaluation](#evaluate) we need to calculate the -model's loss. This is the -[objective](https://developers.google.com/machine-learning/glossary/#objective) -that will be optimized. - -We can calculate the loss by calling `tf.losses.sparse_softmax_cross_entropy`. -The value returned by this function will be approximately 0 at lowest, -when the probability of the correct class (at index `label`) is near 1.0. -The loss value returned is progressively larger as the probability of the -correct class decreases. - -This function returns the average over the whole batch. - -```python -# Compute loss. -loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits) -``` - -### Evaluate - -When the Estimator's `evaluate` method is called, the `model_fn` receives -`mode = ModeKeys.EVAL`. In this case, the model function must return a -`tf.estimator.EstimatorSpec` containing the model's loss and optionally one -or more metrics. - -Although returning metrics is optional, most custom Estimators do return at -least one metric. TensorFlow provides a Metrics module `tf.metrics` to -calculate common metrics. For brevity's sake, we'll only return accuracy. The -`tf.metrics.accuracy` function compares our predictions against the -true values, that is, against the labels provided by the input function. 
The -`tf.metrics.accuracy` function requires the labels and predictions to have the -same shape. Here's the call to `tf.metrics.accuracy`: - -``` python -# Compute evaluation metrics. -accuracy = tf.metrics.accuracy(labels=labels, - predictions=predicted_classes, - name='acc_op') -``` - -The `tf.estimator.EstimatorSpec` returned for evaluation -typically contains the following information: - -* `loss`, which is the model's loss -* `eval_metric_ops`, which is an optional dictionary of metrics. - -So, we'll create a dictionary containing our sole metric. If we had calculated -other metrics, we would have added them as additional key/value pairs to that -same dictionary. Then, we'll pass that dictionary in the `eval_metric_ops` -argument of `tf.estimator.EstimatorSpec`. Here's the code: - -```python -metrics = {'accuracy': accuracy} -tf.summary.scalar('accuracy', accuracy[1]) - -if mode == tf.estimator.ModeKeys.EVAL: - return tf.estimator.EstimatorSpec( - mode, loss=loss, eval_metric_ops=metrics) -``` - -The `tf.summary.scalar` will make accuracy available to TensorBoard -in both `TRAIN` and `EVAL` modes. (More on this later). - -### Train - -When the Estimator's `train` method is called, the `model_fn` is called -with `mode = ModeKeys.TRAIN`. In this case, the model function must return an -`EstimatorSpec` that contains the loss and a training operation. - -Building the training operation will require an optimizer. We will use -`tf.train.AdagradOptimizer` because we're mimicking the `DNNClassifier`, which -also uses `Adagrad` by default. The `tf.train` package provides many other -optimizers—feel free to experiment with them. - -Here is the code that builds the optimizer: - -``` python -optimizer = tf.train.AdagradOptimizer(learning_rate=0.1) -``` - -Next, we build the training operation using the optimizer's -`tf.train.Optimizer.minimize` method on the loss we calculated -earlier. - -The `minimize` method also takes a `global_step` parameter. 
TensorFlow uses this -parameter to count the number of training steps that have been processed -(to know when to end a training run). Furthermore, the `global_step` is -essential for TensorBoard graphs to work correctly. Simply call -`tf.train.get_global_step` and pass the result to the `global_step` -argument of `minimize`. - -Here's the code to train the model: - -``` python -train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step()) -``` - -The `tf.estimator.EstimatorSpec` returned for training -must have the following fields set: - -* `loss`, which contains the value of the loss function. -* `train_op`, which executes a training step. - -Here's our code to call `EstimatorSpec`: - -```python -return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op) -``` - -The model function is now complete. - -## The custom Estimator - -Instantiate the custom Estimator through the Estimator base class as follows: - -```python - # Build 2 hidden layer DNN with 10, 10 units respectively. - classifier = tf.estimator.Estimator( - model_fn=my_model_fn, - params={ - 'feature_columns': my_feature_columns, - # Two hidden layers of 10 nodes each. - 'hidden_units': [10, 10], - # The model must choose between 3 classes. - 'n_classes': 3, - }) -``` -Here the `params` dictionary serves the same purpose as the key-word -arguments of `DNNClassifier`; that is, the `params` dictionary lets you -configure your Estimator without modifying the code in the `model_fn`. - -The rest of the code to train, evaluate, and generate predictions using our -Estimator is the same as in the -[Premade Estimators](../guide/premade_estimators.md) chapter. For -example, the following line will train the model: - -```python -# Train the Model. -classifier.train( - input_fn=lambda:iris_data.train_input_fn(train_x, train_y, args.batch_size), - steps=args.train_steps) -``` - -## TensorBoard - -You can view training results for your custom Estimator in TensorBoard. 
To see -this reporting, start TensorBoard from your command line as follows: - -```bsh -# Replace PATH with the actual path passed as model_dir -tensorboard --logdir=PATH -``` - -Then, open TensorBoard by browsing to: [http://localhost:6006](http://localhost:6006) - -All the pre-made Estimators automatically log a lot of information to -TensorBoard. With custom Estimators, however, TensorBoard only provides one -default log (a graph of the loss) plus the information you explicitly tell -TensorBoard to log. For the custom Estimator you just created, TensorBoard -generates the following: - -
- -Accuracy, 'scalar' graph from tensorboard - -loss 'scalar' graph from tensorboard - -steps/second 'scalar' graph from tensorboard -
- -
-TensorBoard displays three graphs. -
- - -In brief, here's what the three graphs tell you: - -* global_step/sec: A performance indicator showing how many batches (gradient - updates) we processed per second as the model trains. - -* loss: The loss reported. - -* accuracy: The accuracy is recorded by the following two lines: - - * `eval_metric_ops={'my_accuracy': accuracy}`, during evaluation. - * `tf.summary.scalar('accuracy', accuracy[1])`, during training. - -These tensorboard graphs are one of the main reasons it's important to pass a -`global_step` to your optimizer's `minimize` method. The model can't record -the x-coordinate for these graphs without it. - -Note the following in the `my_accuracy` and `loss` graphs: - -* The orange line represents training. -* The blue dot represents evaluation. - -During training, summaries (the orange line) are recorded periodically as -batches are processed, which is why it becomes a graph spanning x-axis range. - -By contrast, evaluation produces only a single point on the graph for each call -to `evaluate`. This point contains the average over the entire evaluation call. -This has no width on the graph as it is evaluated entirely from the model state -at a particular training step (from a single checkpoint). - -As suggested in the following figure, you may see and also selectively -disable/enable the reporting using the controls on the left side. - -
-Check-boxes allowing the user to select which runs are shown. -
-
-Enable or disable reporting. -
- - -## Summary - -Although pre-made Estimators can be an effective way to quickly create new -models, you will often need the additional flexibility that custom Estimators -provide. Fortunately, pre-made and custom Estimators follow the same -programming model. The only practical difference is that you must write a model -function for custom Estimators; everything else is the same. - -For more details, be sure to check out: - -* The - [official TensorFlow implementation of MNIST](https://github.com/tensorflow/models/tree/master/official/mnist), - which uses a custom estimator. -* The TensorFlow - [official models repository](https://github.com/tensorflow/models/tree/master/official), - which contains more curated examples using custom estimators. -* This [TensorBoard video](https://youtu.be/eBbEDRsCmv4), which introduces - TensorBoard. -* The [Low Level Introduction](../guide/low_level_intro.md), which demonstrates - how to experiment directly with TensorFlow's low level APIs, making debugging - easier. diff --git a/tensorflow/docs_src/guide/datasets.md b/tensorflow/docs_src/guide/datasets.md deleted file mode 100644 index 60de181b21..0000000000 --- a/tensorflow/docs_src/guide/datasets.md +++ /dev/null @@ -1,823 +0,0 @@ -# Importing Data - -The `tf.data` API enables you to build complex input pipelines from -simple, reusable pieces. For example, the pipeline for an image model might -aggregate data from files in a distributed file system, apply random -perturbations to each image, and merge randomly selected images into a batch -for training. The pipeline for a text model might involve extracting symbols -from raw text data, converting them to embedding identifiers with a lookup -table, and batching together sequences of different lengths. The `tf.data` API -makes it easy to deal with large amounts of data, different data formats, and -complicated transformations. 
- -The `tf.data` API introduces two new abstractions to TensorFlow: - -* A `tf.data.Dataset` represents a sequence of elements, in which - each element contains one or more `Tensor` objects. For example, in an image - pipeline, an element might be a single training example, with a pair of - tensors representing the image data and a label. There are two distinct - ways to create a dataset: - - * Creating a **source** (e.g. `Dataset.from_tensor_slices()`) constructs a - dataset from - one or more `tf.Tensor` objects. - - * Applying a **transformation** (e.g. `Dataset.batch()`) constructs a dataset - from one or more `tf.data.Dataset` objects. - -* A `tf.data.Iterator` provides the main way to extract elements from a - dataset. The operation returned by `Iterator.get_next()` yields the next - element of a `Dataset` when executed, and typically acts as the interface - between input pipeline code and your model. The simplest iterator is a - "one-shot iterator", which is associated with a particular `Dataset` and - iterates through it once. For more sophisticated uses, the - `Iterator.initializer` operation enables you to reinitialize and parameterize - an iterator with different datasets, so that you can, for example, iterate - over training and validation data multiple times in the same program. - -## Basic mechanics - -This section of the guide describes the fundamentals of creating different kinds -of `Dataset` and `Iterator` objects, and how to extract data from them. - -To start an input pipeline, you must define a *source*. For example, to -construct a `Dataset` from some tensors in memory, you can use -`tf.data.Dataset.from_tensors()` or -`tf.data.Dataset.from_tensor_slices()`. Alternatively, if your input -data are on disk in the recommended TFRecord format, you can construct a -`tf.data.TFRecordDataset`. - -Once you have a `Dataset` object, you can *transform* it into a new `Dataset` by -chaining method calls on the `tf.data.Dataset` object. 
For example, you -can apply per-element transformations such as `Dataset.map()` (to apply a -function to each element), and multi-element transformations such as -`Dataset.batch()`. See the documentation for `tf.data.Dataset` -for a complete list of transformations. - -The most common way to consume values from a `Dataset` is to make an -**iterator** object that provides access to one element of the dataset at a time -(for example, by calling `Dataset.make_one_shot_iterator()`). A -`tf.data.Iterator` provides two operations: `Iterator.initializer`, -which enables you to (re)initialize the iterator's state; and -`Iterator.get_next()`, which returns `tf.Tensor` objects that correspond to the -symbolic next element. Depending on your use case, you might choose a different -type of iterator, and the options are outlined below. - -### Dataset structure - -A dataset comprises elements that each have the same structure. An element -contains one or more `tf.Tensor` objects, called *components*. Each component -has a `tf.DType` representing the type of elements in the tensor, and a -`tf.TensorShape` representing the (possibly partially specified) static shape of -each element. The `Dataset.output_types` and `Dataset.output_shapes` properties -allow you to inspect the inferred types and shapes of each component of a -dataset element. The *nested structure* of these properties maps to the structure -of an element, which may be a single tensor, a tuple of tensors, or a nested -tuple of tensors. 
For example: - -```python -dataset1 = tf.data.Dataset.from_tensor_slices(tf.random_uniform([4, 10])) -print(dataset1.output_types) # ==> "tf.float32" -print(dataset1.output_shapes) # ==> "(10,)" - -dataset2 = tf.data.Dataset.from_tensor_slices( - (tf.random_uniform([4]), - tf.random_uniform([4, 100], maxval=100, dtype=tf.int32))) -print(dataset2.output_types) # ==> "(tf.float32, tf.int32)" -print(dataset2.output_shapes) # ==> "((), (100,))" - -dataset3 = tf.data.Dataset.zip((dataset1, dataset2)) -print(dataset3.output_types) # ==> (tf.float32, (tf.float32, tf.int32)) -print(dataset3.output_shapes) # ==> "(10, ((), (100,)))" -``` - -It is often convenient to give names to each component of an element, for -example if they represent different features of a training example. In addition -to tuples, you can use `collections.namedtuple` or a dictionary mapping strings -to tensors to represent a single element of a `Dataset`. - -```python -dataset = tf.data.Dataset.from_tensor_slices( - {"a": tf.random_uniform([4]), - "b": tf.random_uniform([4, 100], maxval=100, dtype=tf.int32)}) -print(dataset.output_types) # ==> "{'a': tf.float32, 'b': tf.int32}" -print(dataset.output_shapes) # ==> "{'a': (), 'b': (100,)}" -``` - -The `Dataset` transformations support datasets of any structure. When using the -`Dataset.map()`, `Dataset.flat_map()`, and `Dataset.filter()` transformations, -which apply a function to each element, the element structure determines the -arguments of the function: - -```python -dataset1 = dataset1.map(lambda x: ...) - -dataset2 = dataset2.flat_map(lambda x, y: ...) - -# Note: Argument destructuring is not available in Python 3. -dataset3 = dataset3.filter(lambda x, (y, z): ...) -``` - -### Creating an iterator - -Once you have built a `Dataset` to represent your input data, the next step is to -create an `Iterator` to access elements from that dataset. 
The `tf.data` API -currently supports the following iterators, in increasing level of -sophistication: - -* **one-shot**, -* **initializable**, -* **reinitializable**, and -* **feedable**. - -A **one-shot** iterator is the simplest form of iterator, which only supports -iterating once through a dataset, with no need for explicit initialization. -One-shot iterators handle almost all of the cases that the existing queue-based -input pipelines support, but they do not support parameterization. Using the -example of `Dataset.range()`: - -```python -dataset = tf.data.Dataset.range(100) -iterator = dataset.make_one_shot_iterator() -next_element = iterator.get_next() - -for i in range(100): - value = sess.run(next_element) - assert i == value -``` - -Note: Currently, one-shot iterators are the only type that is easily usable -with an `Estimator`. - -An **initializable** iterator requires you to run an explicit -`iterator.initializer` operation before using it. In exchange for this -inconvenience, it enables you to *parameterize* the definition of the dataset, -using one or more `tf.placeholder()` tensors that can be fed when you -initialize the iterator. Continuing the `Dataset.range()` example: - -```python -max_value = tf.placeholder(tf.int64, shape=[]) -dataset = tf.data.Dataset.range(max_value) -iterator = dataset.make_initializable_iterator() -next_element = iterator.get_next() - -# Initialize an iterator over a dataset with 10 elements. -sess.run(iterator.initializer, feed_dict={max_value: 10}) -for i in range(10): - value = sess.run(next_element) - assert i == value - -# Initialize the same iterator over a dataset with 100 elements. -sess.run(iterator.initializer, feed_dict={max_value: 100}) -for i in range(100): - value = sess.run(next_element) - assert i == value -``` - -A **reinitializable** iterator can be initialized from multiple different -`Dataset` objects. 
For example, you might have a training input pipeline that -uses random perturbations to the input images to improve generalization, and -a validation input pipeline that evaluates predictions on unmodified data. These -pipelines will typically use different `Dataset` objects that have the same -structure (i.e. the same types and compatible shapes for each component). - -```python -# Define training and validation datasets with the same structure. -training_dataset = tf.data.Dataset.range(100).map( - lambda x: x + tf.random_uniform([], -10, 10, tf.int64)) -validation_dataset = tf.data.Dataset.range(50) - -# A reinitializable iterator is defined by its structure. We could use the -# `output_types` and `output_shapes` properties of either `training_dataset` -# or `validation_dataset` here, because they are compatible. -iterator = tf.data.Iterator.from_structure(training_dataset.output_types, - training_dataset.output_shapes) -next_element = iterator.get_next() - -training_init_op = iterator.make_initializer(training_dataset) -validation_init_op = iterator.make_initializer(validation_dataset) - -# Run 20 epochs in which the training dataset is traversed, followed by the -# validation dataset. -for _ in range(20): - # Initialize an iterator over the training dataset. - sess.run(training_init_op) - for _ in range(100): - sess.run(next_element) - - # Initialize an iterator over the validation dataset. - sess.run(validation_init_op) - for _ in range(50): - sess.run(next_element) -``` - -A **feedable** iterator can be used together with `tf.placeholder` to select -what `Iterator` to use in each call to `tf.Session.run`, via the familiar -`feed_dict` mechanism. It offers the same functionality as a reinitializable -iterator, but it does not require you to initialize the iterator from the start -of a dataset when you switch between iterators. 
For example, using the same -training and validation example from above, you can use -`tf.data.Iterator.from_string_handle` to define a feedable iterator -that allows you to switch between the two datasets: - -```python -# Define training and validation datasets with the same structure. -training_dataset = tf.data.Dataset.range(100).map( - lambda x: x + tf.random_uniform([], -10, 10, tf.int64)).repeat() -validation_dataset = tf.data.Dataset.range(50) - -# A feedable iterator is defined by a handle placeholder and its structure. We -# could use the `output_types` and `output_shapes` properties of either -# `training_dataset` or `validation_dataset` here, because they have -# identical structure. -handle = tf.placeholder(tf.string, shape=[]) -iterator = tf.data.Iterator.from_string_handle( - handle, training_dataset.output_types, training_dataset.output_shapes) -next_element = iterator.get_next() - -# You can use feedable iterators with a variety of different kinds of iterator -# (such as one-shot and initializable iterators). -training_iterator = training_dataset.make_one_shot_iterator() -validation_iterator = validation_dataset.make_initializable_iterator() - -# The `Iterator.string_handle()` method returns a tensor that can be evaluated -# and used to feed the `handle` placeholder. -training_handle = sess.run(training_iterator.string_handle()) -validation_handle = sess.run(validation_iterator.string_handle()) - -# Loop forever, alternating between training and validation. -while True: - # Run 200 steps using the training dataset. Note that the training dataset is - # infinite, and we resume from where we left off in the previous `while` loop - # iteration. - for _ in range(200): - sess.run(next_element, feed_dict={handle: training_handle}) - - # Run one pass over the validation dataset. 
- sess.run(validation_iterator.initializer) - for _ in range(50): - sess.run(next_element, feed_dict={handle: validation_handle}) -``` - -### Consuming values from an iterator - -The `Iterator.get_next()` method returns one or more `tf.Tensor` objects that -correspond to the symbolic next element of an iterator. Each time these tensors -are evaluated, they take the value of the next element in the underlying -dataset. (Note that, like other stateful objects in TensorFlow, calling -`Iterator.get_next()` does not immediately advance the iterator. Instead you -must use the returned `tf.Tensor` objects in a TensorFlow expression, and pass -the result of that expression to `tf.Session.run()` to get the next elements and -advance the iterator.) - -If the iterator reaches the end of the dataset, executing -the `Iterator.get_next()` operation will raise a `tf.errors.OutOfRangeError`. -After this point the iterator will be in an unusable state, and you must -initialize it again if you want to use it further. - -```python -dataset = tf.data.Dataset.range(5) -iterator = dataset.make_initializable_iterator() -next_element = iterator.get_next() - -# Typically `result` will be the output of a model, or an optimizer's -# training operation. 
-result = tf.add(next_element, next_element) - -sess.run(iterator.initializer) -print(sess.run(result)) # ==> "0" -print(sess.run(result)) # ==> "2" -print(sess.run(result)) # ==> "4" -print(sess.run(result)) # ==> "6" -print(sess.run(result)) # ==> "8" -try: - sess.run(result) -except tf.errors.OutOfRangeError: - print("End of dataset") # ==> "End of dataset" -``` - -A common pattern is to wrap the "training loop" in a `try`-`except` block: - -```python -sess.run(iterator.initializer) -while True: - try: - sess.run(result) - except tf.errors.OutOfRangeError: - break -``` - -If each element of the dataset has a nested structure, the return value of -`Iterator.get_next()` will be one or more `tf.Tensor` objects in the same -nested structure: - -```python -dataset1 = tf.data.Dataset.from_tensor_slices(tf.random_uniform([4, 10])) -dataset2 = tf.data.Dataset.from_tensor_slices((tf.random_uniform([4]), tf.random_uniform([4, 100]))) -dataset3 = tf.data.Dataset.zip((dataset1, dataset2)) - -iterator = dataset3.make_initializable_iterator() - -sess.run(iterator.initializer) -next1, (next2, next3) = iterator.get_next() -``` - -Note that `next1`, `next2`, and `next3` are tensors produced by the -same op/node (created by `Iterator.get_next()`). Therefore, evaluating *any* of -these tensors will advance the iterator for all components. A typical consumer -of an iterator will include all components in a single expression. - -### Saving iterator state - -The `tf.contrib.data.make_saveable_from_iterator` function creates a -`SaveableObject` from an iterator, which can be used to save and -restore the current state of the iterator (and, effectively, the whole input -pipeline). A saveable object thus created can be added to `tf.train.Saver` -variables list or the `tf.GraphKeys.SAVEABLE_OBJECTS` collection for saving and -restoring in the same manner as a `tf.Variable`. Refer to -[Saving and Restoring](../guide/saved_model.md) for details on how to save and restore -variables. 
- -```python -# Create saveable object from iterator. -saveable = tf.contrib.data.make_saveable_from_iterator(iterator) - -# Save the iterator state by adding it to the saveable objects collection. -tf.add_to_collection(tf.GraphKeys.SAVEABLE_OBJECTS, saveable) -saver = tf.train.Saver() - -with tf.Session() as sess: - - if should_checkpoint: - saver.save(path_to_checkpoint) - -# Restore the iterator state. -with tf.Session() as sess: - saver.restore(sess, path_to_checkpoint) -``` - -## Reading input data - -### Consuming NumPy arrays - -If all of your input data fit in memory, the simplest way to create a `Dataset` -from them is to convert them to `tf.Tensor` objects and use -`Dataset.from_tensor_slices()`. - -```python -# Load the training data into two NumPy arrays, for example using `np.load()`. -with np.load("/var/data/training_data.npy") as data: - features = data["features"] - labels = data["labels"] - -# Assume that each row of `features` corresponds to the same row as `labels`. -assert features.shape[0] == labels.shape[0] - -dataset = tf.data.Dataset.from_tensor_slices((features, labels)) -``` - -Note that the above code snippet will embed the `features` and `labels` arrays -in your TensorFlow graph as `tf.constant()` operations. This works well for a -small dataset, but wastes memory---because the contents of the array will be -copied multiple times---and can run into the 2GB limit for the `tf.GraphDef` -protocol buffer. - -As an alternative, you can define the `Dataset` in terms of `tf.placeholder()` -tensors, and *feed* the NumPy arrays when you initialize an `Iterator` over the -dataset. - -```python -# Load the training data into two NumPy arrays, for example using `np.load()`. -with np.load("/var/data/training_data.npy") as data: - features = data["features"] - labels = data["labels"] - -# Assume that each row of `features` corresponds to the same row as `labels`. 
-assert features.shape[0] == labels.shape[0] - -features_placeholder = tf.placeholder(features.dtype, features.shape) -labels_placeholder = tf.placeholder(labels.dtype, labels.shape) - -dataset = tf.data.Dataset.from_tensor_slices((features_placeholder, labels_placeholder)) -# [Other transformations on `dataset`...] -dataset = ... -iterator = dataset.make_initializable_iterator() - -sess.run(iterator.initializer, feed_dict={features_placeholder: features, - labels_placeholder: labels}) -``` - -### Consuming TFRecord data - -The `tf.data` API supports a variety of file formats so that you can process -large datasets that do not fit in memory. For example, the TFRecord file format -is a simple record-oriented binary format that many TensorFlow applications use -for training data. The `tf.data.TFRecordDataset` class enables you to -stream over the contents of one or more TFRecord files as part of an input -pipeline. - -```python -# Creates a dataset that reads all of the examples from two files. -filenames = ["/var/data/file1.tfrecord", "/var/data/file2.tfrecord"] -dataset = tf.data.TFRecordDataset(filenames) -``` - -The `filenames` argument to the `TFRecordDataset` initializer can either be a -string, a list of strings, or a `tf.Tensor` of strings. Therefore if you have -two sets of files for training and validation purposes, you can use a -`tf.placeholder(tf.string)` to represent the filenames, and initialize an -iterator from the appropriate filenames: - -```python -filenames = tf.placeholder(tf.string, shape=[None]) -dataset = tf.data.TFRecordDataset(filenames) -dataset = dataset.map(...) # Parse the record into tensors. -dataset = dataset.repeat() # Repeat the input indefinitely. -dataset = dataset.batch(32) -iterator = dataset.make_initializable_iterator() - -# You can feed the initializer with the appropriate filenames for the current -# phase of execution, e.g. training vs. validation. - -# Initialize `iterator` with training data. 
-training_filenames = ["/var/data/file1.tfrecord", "/var/data/file2.tfrecord"] -sess.run(iterator.initializer, feed_dict={filenames: training_filenames}) - -# Initialize `iterator` with validation data. -validation_filenames = ["/var/data/validation1.tfrecord", ...] -sess.run(iterator.initializer, feed_dict={filenames: validation_filenames}) -``` - -### Consuming text data - -Many datasets are distributed as one or more text files. The -`tf.data.TextLineDataset` provides an easy way to extract lines from -one or more text files. Given one or more filenames, a `TextLineDataset` will -produce one string-valued element per line of those files. Like a -`TFRecordDataset`, `TextLineDataset` accepts `filenames` as a `tf.Tensor`, so -you can parameterize it by passing a `tf.placeholder(tf.string)`. - -```python -filenames = ["/var/data/file1.txt", "/var/data/file2.txt"] -dataset = tf.data.TextLineDataset(filenames) -``` - -By default, a `TextLineDataset` yields *every* line of each file, which may -not be desirable, for example if the file starts with a header line, or contains -comments. These lines can be removed using the `Dataset.skip()` and -`Dataset.filter()` transformations. To apply these transformations to each -file separately, we use `Dataset.flat_map()` to create a nested `Dataset` for -each file. - -```python -filenames = ["/var/data/file1.txt", "/var/data/file2.txt"] - -dataset = tf.data.Dataset.from_tensor_slices(filenames) - -# Use `Dataset.flat_map()` to transform each file as a separate nested dataset, -# and then concatenate their contents sequentially into a single "flat" dataset. -# * Skip the first line (header row). -# * Filter out lines beginning with "#" (comments). -dataset = dataset.flat_map( - lambda filename: ( - tf.data.TextLineDataset(filename) - .skip(1) - .filter(lambda line: tf.not_equal(tf.substr(line, 0, 1), "#")))) -``` - -### Consuming CSV data - -The CSV file format is a popular format for storing tabular data in plain text. 
-The `tf.contrib.data.CsvDataset` class provides a way to extract records from -one or more CSV files that comply with [RFC 4180](https://tools.ietf.org/html/rfc4180). -Given one or more filenames and a list of defaults, a `CsvDataset` will produce -a tuple of elements whose types correspond to the types of the defaults -provided, per CSV record. Like `TFRecordDataset` and `TextLineDataset`, -`CsvDataset` accepts `filenames` as a `tf.Tensor`, so you can parameterize it -by passing a `tf.placeholder(tf.string)`. - -``` -# Creates a dataset that reads all of the records from two CSV files, each with -# eight float columns -filenames = ["/var/data/file1.csv", "/var/data/file2.csv"] -record_defaults = [tf.float32] * 8 # Eight required float columns -dataset = tf.contrib.data.CsvDataset(filenames, record_defaults) -``` - -If some columns are empty, you can provide defaults instead of types. - -``` -# Creates a dataset that reads all of the records from two CSV files, each with -# eight float columns which may have missing values -record_defaults = [[0.0]] * 8 -dataset = tf.contrib.data.CsvDataset(filenames, record_defaults) -``` - -By default, a `CsvDataset` yields *every* column of *every* line of the file, -which may not be desirable, for example if the file starts with a header line -that should be ignored, or if some columns are not required in the input. -These lines and fields can be removed with the `header` and `select_cols` -arguments respectively. - -``` -# Creates a dataset that reads all of the records from two CSV files with -# headers, extracting float data from columns 2 and 4. -record_defaults = [[0.0]] * 2 # Only provide defaults for the selected columns -dataset = tf.contrib.data.CsvDataset(filenames, record_defaults, header=True, select_cols=[2,4]) -``` - - -## Preprocessing data with `Dataset.map()` - -The `Dataset.map(f)` transformation produces a new dataset by applying a given -function `f` to each element of the input dataset. 
It is based on -the -[`map()` function](https://en.wikipedia.org/wiki/Map_(higher-order_function)) -that is commonly applied to lists (and other structures) in functional -programming languages. The function `f` takes the `tf.Tensor` objects that -represent a single element in the input, and returns the `tf.Tensor` objects -that will represent a single element in the new dataset. Its implementation uses -standard TensorFlow operations to transform one element into another. - -This section covers common examples of how to use `Dataset.map()`. - -### Parsing `tf.Example` protocol buffer messages - -Many input pipelines extract `tf.train.Example` protocol buffer messages from a -TFRecord-format file (written, for example, using -`tf.python_io.TFRecordWriter`). Each `tf.train.Example` record contains one or -more "features", and the input pipeline typically converts these features into -tensors. - -```python -# Transforms a scalar string `example_proto` into a pair of a scalar string and -# a scalar integer, representing an image and its label, respectively. -def _parse_function(example_proto): - features = {"image": tf.FixedLenFeature((), tf.string, default_value=""), - "label": tf.FixedLenFeature((), tf.int32, default_value=0)} - parsed_features = tf.parse_single_example(example_proto, features) - return parsed_features["image"], parsed_features["label"] - -# Creates a dataset that reads all of the examples from two files, and extracts -# the image and label features. -filenames = ["/var/data/file1.tfrecord", "/var/data/file2.tfrecord"] -dataset = tf.data.TFRecordDataset(filenames) -dataset = dataset.map(_parse_function) -``` - -### Decoding image data and resizing it - -When training a neural network on real-world image data, it is often necessary -to convert images of different sizes to a common size, so that they may be -batched into a fixed size. - -```python -# Reads an image from a file, decodes it into a dense tensor, and resizes it -# to a fixed shape. 
-def _parse_function(filename, label): - image_string = tf.read_file(filename) - image_decoded = tf.image.decode_jpeg(image_string) - image_resized = tf.image.resize_images(image_decoded, [28, 28]) - return image_resized, label - -# A vector of filenames. -filenames = tf.constant(["/var/data/image1.jpg", "/var/data/image2.jpg", ...]) - -# `labels[i]` is the label for the image in `filenames[i]`. -labels = tf.constant([0, 37, ...]) - -dataset = tf.data.Dataset.from_tensor_slices((filenames, labels)) -dataset = dataset.map(_parse_function) -``` - -### Applying arbitrary Python logic with `tf.py_func()` - -For performance reasons, we encourage you to use TensorFlow operations for -preprocessing your data whenever possible. However, it is sometimes useful to -call upon external Python libraries when parsing your input data. To do so, -invoke the `tf.py_func()` operation in a `Dataset.map()` transformation. - -```python -import cv2 - -# Use a custom OpenCV function to read the image, instead of the standard -# TensorFlow `tf.read_file()` operation. -def _read_py_function(filename, label): - image_decoded = cv2.imread(filename.decode(), cv2.IMREAD_GRAYSCALE) - return image_decoded, label - -# Use standard TensorFlow operations to resize the image to a fixed shape. -def _resize_function(image_decoded, label): - image_decoded.set_shape([None, None, None]) - image_resized = tf.image.resize_images(image_decoded, [28, 28]) - return image_resized, label - -filenames = ["/var/data/image1.jpg", "/var/data/image2.jpg", ...] -labels = [0, 37, 29, 1, ...] - -dataset = tf.data.Dataset.from_tensor_slices((filenames, labels)) -dataset = dataset.map( - lambda filename, label: tuple(tf.py_func( - _read_py_function, [filename, label], [tf.uint8, label.dtype]))) -dataset = dataset.map(_resize_function) -``` - - - -## Batching dataset elements - -### Simple batching - -The simplest form of batching stacks `n` consecutive elements of a dataset into -a single element. 
The `Dataset.batch()` transformation does exactly this, with -the same constraints as the `tf.stack()` operator, applied to each component -of the elements: i.e. for each component *i*, all elements must have a tensor -of the exact same shape. - -```python -inc_dataset = tf.data.Dataset.range(100) -dec_dataset = tf.data.Dataset.range(0, -100, -1) -dataset = tf.data.Dataset.zip((inc_dataset, dec_dataset)) -batched_dataset = dataset.batch(4) - -iterator = batched_dataset.make_one_shot_iterator() -next_element = iterator.get_next() - -print(sess.run(next_element)) # ==> ([0, 1, 2, 3], [ 0, -1, -2, -3]) -print(sess.run(next_element)) # ==> ([4, 5, 6, 7], [-4, -5, -6, -7]) -print(sess.run(next_element)) # ==> ([8, 9, 10, 11], [-8, -9, -10, -11]) -``` - -### Batching tensors with padding - -The above recipe works for tensors that all have the same size. However, many -models (e.g. sequence models) work with input data that can have varying size -(e.g. sequences of different lengths). To handle this case, the -`Dataset.padded_batch()` transformation enables you to batch tensors of -different shape by specifying one or more dimensions in which they may be -padded. - -```python -dataset = tf.data.Dataset.range(100) -dataset = dataset.map(lambda x: tf.fill([tf.cast(x, tf.int32)], x)) -dataset = dataset.padded_batch(4, padded_shapes=[None]) - -iterator = dataset.make_one_shot_iterator() -next_element = iterator.get_next() - -print(sess.run(next_element)) # ==> [[0, 0, 0], [1, 0, 0], [2, 2, 0], [3, 3, 3]] -print(sess.run(next_element)) # ==> [[4, 4, 4, 4, 0, 0, 0], - # [5, 5, 5, 5, 5, 0, 0], - # [6, 6, 6, 6, 6, 6, 0], - # [7, 7, 7, 7, 7, 7, 7]] -``` - -The `Dataset.padded_batch()` transformation allows you to set different padding -for each dimension of each component, and it may be variable-length (signified -by `None` in the example above) or constant-length. It is also possible to -override the padding value, which defaults to 0. 
- - - -## Training workflows - -### Processing multiple epochs - -The `tf.data` API offers two main ways to process multiple epochs of the same -data. - -The simplest way to iterate over a dataset in multiple epochs is to use the -`Dataset.repeat()` transformation. For example, to create a dataset that repeats -its input for 10 epochs: - -```python -filenames = ["/var/data/file1.tfrecord", "/var/data/file2.tfrecord"] -dataset = tf.data.TFRecordDataset(filenames) -dataset = dataset.map(...) -dataset = dataset.repeat(10) -dataset = dataset.batch(32) -``` - -Applying the `Dataset.repeat()` transformation with no arguments will repeat -the input indefinitely. The `Dataset.repeat()` transformation concatenates its -arguments without signaling the end of one epoch and the beginning of the next -epoch. - -If you want to receive a signal at the end of each epoch, you can write a -training loop that catches the `tf.errors.OutOfRangeError` at the end of a -dataset. At that point you might collect some statistics (e.g. the validation -error) for the epoch. - -```python -filenames = ["/var/data/file1.tfrecord", "/var/data/file2.tfrecord"] -dataset = tf.data.TFRecordDataset(filenames) -dataset = dataset.map(...) -dataset = dataset.batch(32) -iterator = dataset.make_initializable_iterator() -next_element = iterator.get_next() - -# Compute for 100 epochs. -for _ in range(100): - sess.run(iterator.initializer) - while True: - try: - sess.run(next_element) - except tf.errors.OutOfRangeError: - break - - # [Perform end-of-epoch calculations here.] -``` - -### Randomly shuffling input data - -The `Dataset.shuffle()` transformation randomly shuffles the input dataset -using a similar algorithm to `tf.RandomShuffleQueue`: it maintains a fixed-size -buffer and chooses the next element uniformly at random from that buffer. - -```python -filenames = ["/var/data/file1.tfrecord", "/var/data/file2.tfrecord"] -dataset = tf.data.TFRecordDataset(filenames) -dataset = dataset.map(...) 
-dataset = dataset.shuffle(buffer_size=10000) -dataset = dataset.batch(32) -dataset = dataset.repeat() -``` - -### Using high-level APIs - -The `tf.train.MonitoredTrainingSession` API simplifies many aspects of running -TensorFlow in a distributed setting. `MonitoredTrainingSession` uses the -`tf.errors.OutOfRangeError` to signal that training has completed, so to use it -with the `tf.data` API, we recommend using -`Dataset.make_one_shot_iterator()`. For example: - -```python -filenames = ["/var/data/file1.tfrecord", "/var/data/file2.tfrecord"] -dataset = tf.data.TFRecordDataset(filenames) -dataset = dataset.map(...) -dataset = dataset.shuffle(buffer_size=10000) -dataset = dataset.batch(32) -dataset = dataset.repeat(num_epochs) -iterator = dataset.make_one_shot_iterator() - -next_example, next_label = iterator.get_next() -loss = model_function(next_example, next_label) - -training_op = tf.train.AdagradOptimizer(...).minimize(loss) - -with tf.train.MonitoredTrainingSession(...) as sess: - while not sess.should_stop(): - sess.run(training_op) -``` - -To use a `Dataset` in the `input_fn` of a `tf.estimator.Estimator`, simply -return the `Dataset` and the framework will take care of creating an iterator -and initializing it for you. For example: - -```python -def dataset_input_fn(): - filenames = ["/var/data/file1.tfrecord", "/var/data/file2.tfrecord"] - dataset = tf.data.TFRecordDataset(filenames) - - # Use `tf.parse_single_example()` to extract data from a `tf.Example` - # protocol buffer, and perform any additional per-record preprocessing. - def parser(record): - keys_to_features = { - "image_data": tf.FixedLenFeature((), tf.string, default_value=""), - "date_time": tf.FixedLenFeature((), tf.int64, default_value=""), - "label": tf.FixedLenFeature((), tf.int64, - default_value=tf.zeros([], dtype=tf.int64)), - } - parsed = tf.parse_single_example(record, keys_to_features) - - # Perform additional preprocessing on the parsed data. 
- image = tf.image.decode_jpeg(parsed["image_data"]) - image = tf.reshape(image, [299, 299, 1]) - label = tf.cast(parsed["label"], tf.int32) - - return {"image_data": image, "date_time": parsed["date_time"]}, label - - # Use `Dataset.map()` to build a pair of a feature dictionary and a label - # tensor for each example. - dataset = dataset.map(parser) - dataset = dataset.shuffle(buffer_size=10000) - dataset = dataset.batch(32) - dataset = dataset.repeat(num_epochs) - - # Each element of `dataset` is a tuple containing a dictionary of features - # (in which each value is a batch of values for that feature), and a batch of - # labels. - return dataset -``` diff --git a/tensorflow/docs_src/guide/datasets_for_estimators.md b/tensorflow/docs_src/guide/datasets_for_estimators.md deleted file mode 100644 index 09a3830ca9..0000000000 --- a/tensorflow/docs_src/guide/datasets_for_estimators.md +++ /dev/null @@ -1,387 +0,0 @@ -# Datasets for Estimators - -The `tf.data` module contains a collection of classes that allow you to -easily load data, manipulate it, and pipe it into your model. This document -introduces the API by walking through two simple examples: - -* Reading in-memory data from numpy arrays. -* Reading lines from a csv file. - - - -## Basic input - -Taking slices from an array is the simplest way to get started with `tf.data`. - -The [Premade Estimators](../guide/premade_estimators.md) chapter describes -the following `train_input_fn`, from -[`iris_data.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/iris_data.py), -to pipe the data into the Estimator: - -``` python -def train_input_fn(features, labels, batch_size): - """An input function for training""" - # Convert the inputs to a Dataset. - dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels)) - - # Shuffle, repeat, and batch the examples. - dataset = dataset.shuffle(1000).repeat().batch(batch_size) - - # Return the dataset. 
- return dataset -``` - -Let's look at this more closely. - -### Arguments - -This function expects three arguments. Arguments expecting an "array" can -accept nearly anything that can be converted to an array with `numpy.array`. -One exception is -[`tuple`](https://docs.python.org/3/tutorial/datastructures.html#tuples-and-sequences) -which, as we will see, has special meaning for `Datasets`. - -* `features`: A `{'feature_name':array}` dictionary (or - [`DataFrame`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html)) - containing the raw input features. -* `labels` : An array containing the - [label](https://developers.google.com/machine-learning/glossary/#label) - for each example. -* `batch_size` : An integer indicating the desired batch size. - -In [`premade_estimator.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/premade_estimator.py) -we retrieved the Iris data using the `iris_data.load_data()` function. -You can run it, and unpack the results as follows: - -``` python -import iris_data - -# Fetch the data -train, test = iris_data.load_data() -features, labels = train -``` - -Then we passed this data to the input function, with a line similar to this: - -``` python -batch_size=100 -iris_data.train_input_fn(features, labels, batch_size) -``` - -Let's walk through the `train_input_fn()`. - -### Slices - -The function starts by using the `tf.data.Dataset.from_tensor_slices` function -to create a `tf.data.Dataset` representing slices of the array. The array is -sliced across the first dimension. For example, an array containing the -MNIST training data has a shape of `(60000, 28, 28)`. Passing this to -`from_tensor_slices` returns a `Dataset` object containing 60000 slices, each one -a 28x28 image. 
- -The code that returns this `Dataset` is as follows: - -``` python -train, test = tf.keras.datasets.mnist.load_data() -mnist_x, mnist_y = train - -mnist_ds = tf.data.Dataset.from_tensor_slices(mnist_x) -print(mnist_ds) -``` - -This will print the following line, showing the -[shapes](../guide/tensors.md#shapes) and -[types](../guide/tensors.md#data_types) of the items in -the dataset. Note that a `Dataset` does not know how many items it contains. - -``` None - -``` - -The `Dataset` above represents a simple collection of arrays, but datasets are -much more powerful than this. A `Dataset` can transparently handle any nested -combination of dictionaries or tuples (or -[`namedtuple`](https://docs.python.org/2/library/collections.html#collections.namedtuple) -). - -For example after converting the iris `features` -to a standard python dictionary, you can then convert the dictionary of arrays -to a `Dataset` of dictionaries as follows: - -``` python -dataset = tf.data.Dataset.from_tensor_slices(dict(features)) -print(dataset) -``` -``` None - -``` - -Here we see that when a `Dataset` contains structured elements, the `shapes` -and `types` of the `Dataset` take on the same structure. This dataset contains -dictionaries of [scalars](../guide/tensors.md#rank), all of type -`tf.float64`. - -The first line of the iris `train_input_fn` uses the same functionality, but -adds another level of structure. It creates a dataset containing -`(features_dict, label)` pairs. - -The following code shows that the label is a scalar with type `int64`: - -``` python -# Convert the inputs to a Dataset. -dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels)) -print(dataset) -``` -``` - -``` - -### Manipulation - -Currently the `Dataset` would iterate over the data once, in a fixed order, and -only produce a single element at a time. It needs further processing before it -can be used for training. 
Fortunately, the `tf.data.Dataset` class provides -methods to better prepare the data for training. The next line of the input -function takes advantage of several of these methods: - -``` python -# Shuffle, repeat, and batch the examples. -dataset = dataset.shuffle(1000).repeat().batch(batch_size) -``` - -The `tf.data.Dataset.shuffle` method uses a fixed-size buffer to -shuffle the items as they pass through. In this case the `buffer_size` is -greater than the number of examples in the `Dataset`, ensuring that the data is -completely shuffled (The Iris data set only contains 150 examples). - -The `tf.data.Dataset.repeat` method restarts the `Dataset` when -it reaches the end. To limit the number of epochs, set the `count` argument. - -The `tf.data.Dataset.batch` method collects a number of examples and -stacks them, to create batches. This adds a dimension to their shape. The new -dimension is added as the first dimension. The following code uses -the `batch` method on the MNIST `Dataset`, from earlier. This results in a -`Dataset` containing 3D arrays representing stacks of `(28,28)` images: - -``` python -print(mnist_ds.batch(100)) -``` - -``` none - -``` -Note that the dataset has an unknown batch size because the last batch will -have fewer elements. - -In `train_input_fn`, after batching the `Dataset` contains 1D vectors of -elements where each scalar was previously: - -```python -print(dataset) -``` -``` - -``` - - -### Return - -At this point the `Dataset` contains `(features_dict, labels)` pairs. -This is the format expected by the `train` and `evaluate` methods, so the -`input_fn` returns the dataset. - -The `labels` can/should be omitted when using the `predict` method. - - - - -## Reading a CSV File - -The most common real-world use case for the `Dataset` class is to stream data -from files on disk. The `tf.data` module includes a variety of -file readers. Let's see how parsing the Iris dataset from the csv file looks -using a `Dataset`. 
- -The following call to the `iris_data.maybe_download` function downloads the -data if necessary, and returns the pathnames of the resulting files: - -``` python -import iris_data -train_path, test_path = iris_data.maybe_download() -``` - -The [`iris_data.csv_input_fn`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/iris_data.py) -function contains an alternative implementation that parses the csv files using -a `Dataset`. - -Let's look at how to build an Estimator-compatible input function that reads -from the local files. - -### Build the `Dataset` - -We start by building a `tf.data.TextLineDataset` object to -read the file one line at a time. Then, we call the -`tf.data.Dataset.skip` method to skip over the first line of the file, which contains a header, not an example: - -``` python -ds = tf.data.TextLineDataset(train_path).skip(1) -``` - -### Build a csv line parser - -We will start by building a function to parse a single line. - -The following `iris_data.parse_line` function accomplishes this task using the -`tf.decode_csv` function, and some simple python code: - -We must parse each of the lines in the dataset in order to generate the -necessary `(features, label)` pairs. The following `_parse_line` function -calls `tf.decode_csv` to parse a single line into its features -and the label. Since Estimators require that features be represented as a -dictionary, we rely on Python's built-in `dict` and `zip` functions to build -that dictionary. The feature names are the keys of that dictionary. 
-We then call the dictionary's `pop` method to remove the label field from -the features dictionary: - -``` python -# Metadata describing the text columns -COLUMNS = ['SepalLength', 'SepalWidth', - 'PetalLength', 'PetalWidth', - 'label'] -FIELD_DEFAULTS = [[0.0], [0.0], [0.0], [0.0], [0]] -def _parse_line(line): - # Decode the line into its fields - fields = tf.decode_csv(line, FIELD_DEFAULTS) - - # Pack the result into a dictionary - features = dict(zip(COLUMNS,fields)) - - # Separate the label from the features - label = features.pop('label') - - return features, label -``` - -### Parse the lines - -Datasets have many methods for manipulating the data while it is being piped -to a model. The most heavily-used method is `tf.data.Dataset.map`, which -applies a transformation to each element of the `Dataset`. - -The `map` method takes a `map_func` argument that describes how each item in the -`Dataset` should be transformed. - -
- -
-
-The `tf.data.Dataset.map` method applies the `map_func` to -transform each item in the Dataset. -
- -So to parse the lines as they are streamed out of the csv file, we pass our -`_parse_line` function to the `map` method: - -``` python -ds = ds.map(_parse_line) -print(ds) -``` -``` None - -``` - -Now instead of simple scalar strings, the dataset contains `(features, label)` -pairs. - -The remainder of the `iris_data.csv_input_fn` function is identical -to `iris_data.train_input_fn`, which was covered in the -[Basic input](#basic_input) section. - -### Try it out - -This function can be used as a replacement for -`iris_data.train_input_fn`. It can be used to feed an estimator as follows: - -``` python -train_path, test_path = iris_data.maybe_download() - -# All the inputs are numeric -feature_columns = [ - tf.feature_column.numeric_column(name) - for name in iris_data.CSV_COLUMN_NAMES[:-1]] - -# Build the estimator -est = tf.estimator.LinearClassifier(feature_columns, - n_classes=3) -# Train the estimator -batch_size = 100 -est.train( - steps=1000, - input_fn=lambda : iris_data.csv_input_fn(train_path, batch_size)) -``` - -Estimators expect an `input_fn` to take no arguments. To work around this -restriction, we use `lambda` to capture the arguments and provide the expected -interface. - -## Summary - -The `tf.data` module provides a collection of classes and functions for easily -reading data from a variety of sources. Furthermore, `tf.data` has simple, -powerful methods for applying a wide variety of standard and custom -transformations. - -Now you have the basic idea of how to efficiently load data into an -Estimator. Consider the following documents next: - - -* [Creating Custom Estimators](../guide/custom_estimators.md), which demonstrates how to build your own - custom `Estimator` model. -* The [Low Level Introduction](../guide/low_level_intro.md#datasets), which demonstrates - how to experiment directly with `tf.data.Datasets` using TensorFlow's low - level APIs.
-* [Importing Data](../guide/datasets.md) which goes into great detail about additional - functionality of `Datasets`. - diff --git a/tensorflow/docs_src/guide/debugger.md b/tensorflow/docs_src/guide/debugger.md deleted file mode 100644 index 5af27471a2..0000000000 --- a/tensorflow/docs_src/guide/debugger.md +++ /dev/null @@ -1,814 +0,0 @@ -# TensorFlow Debugger - - - -[TOC] - -`tfdbg` is a specialized debugger for TensorFlow. It lets you view the internal -structure and states of running TensorFlow graphs during training and inference, -which is difficult to debug with general-purpose debuggers such as Python's `pdb` -due to TensorFlow's computation-graph paradigm. - -This guide focuses on the command-line interface (CLI) of `tfdbg`. For guide on -how to use the graphical user interface (GUI) of tfdbg, i.e., the -**TensorBoard Debugger Plugin**, please visit -[its README](https://github.com/tensorflow/tensorboard/blob/master/tensorboard/plugins/debugger/README.md). - -Note: The TensorFlow debugger uses a -[curses](https://en.wikipedia.org/wiki/Curses_\(programming_library\))-based text -user interface. On Mac OS X, the `ncurses` library is required and can be -installed with `brew install ncurses`. On Windows, curses isn't as -well supported, so a [readline](https://en.wikipedia.org/wiki/GNU_Readline)-based -interface can be used with tfdbg by installing `pyreadline` with `pip`. If you -use Anaconda3, you can install it with a command such as -`"C:\Program Files\Anaconda3\Scripts\pip.exe" install pyreadline`. Unofficial -Windows curses packages can be downloaded -[here](https://www.lfd.uci.edu/~gohlke/pythonlibs/#curses), then subsequently -installed using `pip install .whl`, however curses on Windows may -not work as reliably as curses on Linux or Mac. 
- -This tutorial demonstrates how to use the **tfdbg** CLI to debug the appearance -of [`nan`s](https://en.wikipedia.org/wiki/NaN) -and [`inf`s](https://en.wikipedia.org/wiki/Infinity), a frequently-encountered -type of bug in TensorFlow model development. -The following example is for users who use the low-level -[`Session`](https://www.tensorflow.org/api_docs/python/tf/Session) API of -TensorFlow. Later sections of this document describe how to use **tfdbg** -with higher-level APIs of TensorFlow, including `tf.estimator`, -`tf.keras` / `keras` and `tf.contrib.slim`. -To *observe* such an issue, run the following command without the debugger (the -source code can be found -[here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/debug/examples/debug_mnist.py)): - -```none -python -m tensorflow.python.debug.examples.debug_mnist -``` - -This code trains a simple neural network for MNIST digit image recognition. -Notice that the accuracy increases slightly after the first training step, but -then gets stuck at a low (near-chance) level: - -```none -Accuracy at step 0: 0.1113 -Accuracy at step 1: 0.3183 -Accuracy at step 2: 0.098 -Accuracy at step 3: 0.098 -Accuracy at step 4: 0.098 -``` - -Wondering what might have gone wrong, you suspect that certain nodes in the -training graph generated bad numeric values such as `inf`s and `nan`s, because -this is a common cause of this type of training failure. -Let's use tfdbg to debug this issue and pinpoint the exact graph node where this -numeric problem first surfaced. - -## Wrapping TensorFlow Sessions with tfdbg - -To add support for tfdbg in our example, all that is needed is to add the -following lines of code and wrap the Session object with a debugger wrapper. -This code is already added in -[debug_mnist.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/debug/examples/debug_mnist.py), -so you can activate tfdbg CLI with the `--debug` flag at the command line. 
- -```python -# Let your BUILD target depend on "//tensorflow/python/debug:debug_py" -# (You don't need to worry about the BUILD dependency if you are using a pip -# install of open-source TensorFlow.) -from tensorflow.python import debug as tf_debug - -sess = tf_debug.LocalCLIDebugWrapperSession(sess) -``` - -This wrapper has the same interface as Session, so enabling debugging requires -no other changes to the code. The wrapper provides additional features, -including: - -* Bringing up a CLI before and after `Session.run()` calls, to let you -control the execution and inspect the graph's internal state. -* Allowing you to register special `filters` for tensor values, to facilitate -the diagnosis of issues. - -In this example, we have already registered a tensor filter called -`tfdbg.has_inf_or_nan`, -which simply determines if there are any `nan` or `inf` values in any -intermediate tensors (tensors that are neither inputs nor outputs of the -`Session.run()` call, but are in the path leading from the inputs to the -outputs). Filtering for `nan`s and `inf`s is a common enough use case that -we ship this filter with the -[`debug_data`](../api_guides/python/tfdbg.md#Classes_for_debug_dump_data_and_directories) -module. - -Note: You can also write your own custom filters. See `tfdbg.DebugDumpDir.find` -for additional information. - -## Debugging Model Training with tfdbg - -Let's try training the model again, but with the `--debug` flag added this time: - -```none -python -m tensorflow.python.debug.examples.debug_mnist --debug -``` - -The debug wrapper session will prompt you when it is about to execute the first -`Session.run()` call, with information regarding the fetched tensor and feed -dictionaries displayed on the screen. - -![tfdbg run-start UI](https://www.tensorflow.org/images/tfdbg_screenshot_run_start.png) - -This is what we refer to as the *run-start CLI*. It lists the feeds and fetches -to the current `Session.run` call, before executing anything.
- -If the screen size is too small to display the content of the message in its -entirety, you can resize it. - -Use the **PageUp** / **PageDown** / **Home** / **End** keys to navigate the -screen output. On most keyboards lacking those keys **Fn + Up** / -**Fn + Down** / **Fn + Right** / **Fn + Left** will work. - -Enter the `run` command (or just `r`) at the command prompt: - -``` -tfdbg> run -``` - -The `run` command causes tfdbg to execute until the end of the next -`Session.run()` call, which calculates the model's accuracy using a test data -set. tfdbg augments the runtime Graph to dump all intermediate tensors. -After the run ends, tfdbg displays all the dumped tensors values in the -*run-end CLI*. For example: - -![tfdbg run-end UI: accuracy](https://www.tensorflow.org/images/tfdbg_screenshot_run_end_accuracy.png) - -This list of tensors can also be obtained by running the command `lt` after you -executed `run`. - -### tfdbg CLI Frequently-Used Commands - -Try the following commands at the `tfdbg>` prompt (referencing the code at -`tensorflow/python/debug/examples/debug_mnist.py`): - -| Command | Syntax or Option | Explanation | Example | -|:-------------------|:---------------- |:------------ |:------------------------- | -| **`lt`** | | **List dumped tensors.** | `lt` | -| | `-n ` | List dumped tensors with names matching given regular-expression pattern. | `lt -n Softmax.*` | -| | `-t ` | List dumped tensors with op types matching given regular-expression pattern. | `lt -t MatMul` | -| | `-f ` | List only the tensors that pass a registered tensor filter. | `lt -f has_inf_or_nan` | -| | `-f -fenn ` | List only the tensors that pass a registered tensor filter, excluding nodes with names matching the regular expression. | `lt -f has_inf_or_nan` `-fenn .*Sqrt.*` | -| | `-s ` | Sort the output by given `sort_key`, whose possible values are `timestamp` (default), `dump_size`, `op_type` and `tensor_name`. | `lt -s dump_size` | -| | `-r` | Sort in reverse order. 
| `lt -r -s dump_size` | -| **`pt`** | | **Print value of a dumped tensor.** | | -| | `pt ` | Print tensor value. | `pt hidden/Relu:0` | -| | `pt [slicing]` | Print a subarray of tensor, using [numpy](http://www.numpy.org/)-style array slicing. | `pt hidden/Relu:0[0:50,:]` | -| | `-a` | Print the entirety of a large tensor, without using ellipses. (May take a long time for large tensors.) | `pt -a hidden/Relu:0[0:50,:]` | -| | `-r ` | Highlight elements falling into specified numerical range. Multiple ranges can be used in conjunction. | `pt hidden/Relu:0 -a -r [[-inf,-1],[1,inf]]` | -| | `-n ` | Print dump corresponding to specified 0-based dump number. Required for tensors with multiple dumps. | `pt -n 0 hidden/Relu:0` | -| | `-s` | Include a summary of the numeric values of the tensor (applicable only to non-empty tensors with Boolean and numeric types such as `int*` and `float*`.) | `pt -s hidden/Relu:0[0:50,:]` | -| | `-w` | Write the value of the tensor (possibly sliced) to a Numpy file using [`numpy.save()`](https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.save.html) | `pt -s hidden/Relu:0 -w /tmp/relu.npy` | -| **`@[coordinates]`** | | Navigate to specified element in `pt` output. | `@[10,0]` or `@10,0` | -| **`/regex`** | | [less](https://linux.die.net/man/1/less)-style search for given regular expression. | `/inf` | -| **`/`** | | Scroll to the next line with matches to the searched regex (if any). | `/` | -| **`pf`** | | **Print a value in the feed_dict to `Session.run`.** | | -| | `pf ` | Print the value of the feed. Also note that the `pf` command has the `-a`, `-r` and `-s` flags (not listed below), which have the same syntax and semantics as the identically-named flags of `pt`. | `pf input_xs:0` | -| **eval** | | **Evaluate arbitrary Python and numpy expression.** | | -| | `eval ` | Evaluate a Python / numpy expression, with numpy available as `np` and debug tensor names enclosed in backticks. 
| ``eval "np.matmul((`output/Identity:0` / `Softmax:0`).T, `Softmax:0`)"`` | -| | `-a` | Print a large-sized evaluation result in its entirety, i.e., without using ellipses. | ``eval -a 'np.sum(`Softmax:0`, axis=1)'`` | -| | `-w` | Write the result of the evaluation to a Numpy file using [`numpy.save()`](https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.save.html) | ``eval -a 'np.sum(`Softmax:0`, axis=1)' -w /tmp/softmax_sum.npy`` | -| **`ni`** | | **Display node information.** | | -| | `-a` | Include node attributes in the output. | `ni -a hidden/Relu` | -| | `-d` | List the debug dumps available from the node. | `ni -d hidden/Relu` | -| | `-t` | Display the Python stack trace of the node's creation. | `ni -t hidden/Relu` | -| **`li`** | | **List inputs to node** | | -| | `-r` | List the inputs to node, recursively (the input tree.) | `li -r hidden/Relu:0` | -| | `-d ` | Limit recursion depth under the `-r` mode. | `li -r -d 3 hidden/Relu:0` | -| | `-c` | Include control inputs. | `li -c -r hidden/Relu:0` | -| | `-t` | Show op types of input nodes. | `li -t -r hidden/Relu:0` | -| **`lo`** | | **List output recipients of node** | | -| | `-r` | List the output recipients of node, recursively (the output tree.) | `lo -r hidden/Relu:0` | -| | `-d ` | Limit recursion depth under the `-r` mode. | `lo -r -d 3 hidden/Relu:0` | -| | `-c` | Include recipients via control edges. | `lo -c -r hidden/Relu:0` | -| | `-t` | Show op types of recipient nodes. | `lo -t -r hidden/Relu:0` | -| **`ls`** | | **List Python source files involved in node creation.** | | -| | `-p ` | Limit output to source files matching given regular-expression path pattern. | `ls -p .*debug_mnist.*` | -| | `-n` | Limit output to node names matching given regular-expression pattern. | `ls -n Softmax.*` | -| **`ps`** | | **Print Python source file.** | | -| | `ps ` | Print given Python source file source.py, with the lines annotated with the nodes created at each of them (if any). 
| `ps /path/to/source.py` | -| | `-t` | Perform annotation with respect to Tensors, instead of the default, nodes. | `ps -t /path/to/source.py` | -| | `-b ` | Annotate source.py beginning at given line. | `ps -b 30 /path/to/source.py` | -| | `-m ` | Limit the number of elements in the annotation for each line. | `ps -m 100 /path/to/source.py` | -| **`run`** | | **Proceed to the next Session.run()** | `run` | -| | `-n` | Execute through the next `Session.run` without debugging, and drop to CLI right before the run after that. | `run -n` | -| | `-t ` | Execute `Session.run` `T - 1` times without debugging, followed by a run with debugging. Then drop to CLI right after the debugged run. | `run -t 10` | -| | `-f ` | Continue executing `Session.run` until any intermediate tensor triggers the specified Tensor filter (causes the filter to return `True`). | `run -f has_inf_or_nan` | -| | `-f -fenn ` | Continue executing `Session.run` until any intermediate tensor whose node names doesn't match the regular expression triggers the specified Tensor filter (causes the filter to return `True`). | `run -f has_inf_or_nan -fenn .*Sqrt.*` | -| | `--node_name_filter ` | Execute the next `Session.run`, watching only nodes with names matching the given regular-expression pattern. | `run --node_name_filter Softmax.*` | -| | `--op_type_filter ` | Execute the next `Session.run`, watching only nodes with op types matching the given regular-expression pattern. | `run --op_type_filter Variable.*` | -| | `--tensor_dtype_filter ` | Execute the next `Session.run`, dumping only Tensors with data types (`dtype`s) matching the given regular-expression pattern. | `run --tensor_dtype_filter int.*` | -| | `-p` | Execute the next `Session.run` call in profiling mode. 
| `run -p` | -| **`ri`** | | **Display information about the current run, including fetches and feeds.** | `ri` | -| **`config`** | | **Set or show persistent TFDBG UI configuration.** | | -| | `set` | Set the value of a config item: {`graph_recursion_depth`, `mouse_mode`}. | `config set graph_recursion_depth 3` | -| | `show` | Show current persistent UI configuration. | `config show` | -| **`version`** | | **Print the version of TensorFlow and its key dependencies.** | `version` | -| **`help`** | | **Print general help information** | `help` | -| | `help <command>` | Print help for given command. | `help lt` | - -Note that each time you enter a command, a new screen output -will appear. This is somewhat analogous to web pages in a browser. You can -navigate between these screens by clicking the `<--` and -`-->` text arrows near the top-left corner of the CLI. - -### Other Features of the tfdbg CLI - -In addition to the commands listed above, the tfdbg CLI provides the following -additional features: - -* To navigate through previous tfdbg commands, type in a few characters - followed by the Up or Down arrow keys. tfdbg will show you the history of - commands that started with those characters. -* To navigate through the history of screen outputs, do either of the - following: - * Use the `prev` and `next` commands. - * Click underlined `<--` and `-->` links near the top left corner of the - screen. -* Tab completion of commands and some command arguments. -* To redirect the screen output to a file instead of the screen, end the - command with bash-style redirection. For example, the following command - redirects the output of the pt command to the `/tmp/xent_value_slices.txt` - file: - - ```none - tfdbg> pt cross_entropy/Log:0[:, 0:10] > /tmp/xent_value_slices.txt - ``` - -### Finding `nan`s and `inf`s - -In this first `Session.run()` call, there happen to be no problematic numerical -values.
You can move on to the next run by using the command `run` or its -shorthand `r`. - -> TIP: If you enter `run` or `r` repeatedly, you will be able to move through -> the `Session.run()` calls in a sequential manner. -> -> You can also use the `-t` flag to move ahead a number of `Session.run()` calls -> at a time, for example: -> -> ``` -> tfdbg> run -t 10 -> ``` - -Instead of entering `run` repeatedly and manually searching for `nan`s and -`inf`s in the run-end UI after every `Session.run()` call (for example, by using -the `pt` command shown in the table above) , you can use the following -command to let the debugger repeatedly execute `Session.run()` calls without -stopping at the run-start or run-end prompt, until the first `nan` or `inf` -value shows up in the graph. This is analogous to *conditional breakpoints* in -some procedural-language debuggers: - -```none -tfdbg> run -f has_inf_or_nan -``` - -> NOTE: The preceding command works properly because a tensor filter called -> `has_inf_or_nan` has been registered for you when the wrapped session is -> created. This filter detects `nan`s and `inf`s (as explained previously). -> If you have registered any other filters, you can -> use "run -f" to have tfdbg run until any tensor triggers that filter (cause -> the filter to return True). -> -> ``` python -> def my_filter_callable(datum, tensor): -> # A filter that detects zero-valued scalars. -> return len(tensor.shape) == 0 and tensor == 0.0 -> -> sess.add_tensor_filter('my_filter', my_filter_callable) -> ``` -> -> Then at the tfdbg run-start prompt run until your filter is triggered: -> -> ``` -> tfdbg> run -f my_filter -> ``` - -See [this API document](https://www.tensorflow.org/api_docs/python/tfdbg/DebugDumpDir#find) -for more information on the expected signature and return value of the predicate -`Callable` used with `add_tensor_filter()`. 
- -![tfdbg run-end UI: infs and nans](https://www.tensorflow.org/images/tfdbg_screenshot_run_end_inf_nan.png) - -As the screen display indicates on the first line, the `has_inf_or_nan` filter is first triggered -during the fourth `Session.run()` call: an -[Adam optimizer](https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer) -forward-backward training pass on the graph. In this run, 36 (out of the total -95) intermediate tensors contain `nan` or `inf` values. These tensors are listed -in chronological order, with their timestamps displayed on the left. At the top -of the list, you can see the first tensor in which the bad numerical values -first surfaced: `cross_entropy/Log:0`. - -To view the value of the tensor, click the underlined tensor name -`cross_entropy/Log:0` or enter the equivalent command: - -```none -tfdbg> pt cross_entropy/Log:0 -``` - -Scroll down a little and you will notice some scattered `inf` values. If the -instances of `inf` and `nan` are difficult to spot by eye, you can use the -following command to perform a regex search and highlight the output: - -```none -tfdbg> /inf -``` - -Or, alternatively: - -```none -tfdbg> /(inf|nan) -``` - -You can also use the `-s` or `--numeric_summary` command to get a quick summary -of the types of numeric values in the tensor: - -``` none -tfdbg> pt -s cross_entropy/Log:0 -``` - -From the summary, you can see that several of the 1000 elements of the -`cross_entropy/Log:0` tensor are `-inf`s (negative infinities). - -Why did these infinities appear? To further debug, display more information -about the node `cross_entropy/Log` by clicking the underlined `node_info` menu -item on the top or entering the equivalent node_info (`ni`) command: - -```none -tfdbg> ni cross_entropy/Log -``` - -![tfdbg run-end UI: infs and nans](https://www.tensorflow.org/images/tfdbg_screenshot_run_end_node_info.png) - -You can see that this node has the op type `Log` -and that its input is the node `Softmax`. 
Run the following command to -take a closer look at the input tensor: - -```none -tfdbg> pt Softmax:0 -``` - -Examine the values in the input tensor, searching for zeros: - -```none -tfdbg> /0\.000 -``` - -Indeed, there are zeros. Now it is clear that the origin of the bad numerical -values is the node `cross_entropy/Log` taking logs of zeros. To find out the -culprit line in the Python source code, use the `-t` flag of the `ni` command -to show the traceback of the node's construction: - -```none -tfdbg> ni -t cross_entropy/Log -``` - -If you click "node_info" at the top of the screen, tfdbg automatically shows the -traceback of the node's construction. - -From the traceback, you can see that the op is constructed at the following -line of -[`debug_mnist.py`](https://www.tensorflow.org/code/tensorflow/python/debug/examples/debug_mnist.py): - -```python -diff = -(y_ * tf.log(y)) -``` - -**tfdbg** has a feature that makes it easy to trace Tensors and ops back to -lines in Python source files. It can annotate lines of a Python file with -the ops or Tensors created by them. To use this feature, -simply click the underlined line numbers in the stack trace output of the -`ni -t <op_name>` commands, or use the `ps` (or `print_source`) command such as: -`ps /path/to/source.py`. For example, the following screenshot shows the output -of a `ps` command.
- -![tfdbg run-end UI: annotated Python source file](https://www.tensorflow.org/images/tfdbg_screenshot_run_end_annotated_source.png) - -### Fixing the problem - -To fix the problem, edit `debug_mnist.py`, changing the original line: - -```python -diff = -(y_ * tf.log(y)) -``` - -to the built-in, numerically-stable implementation of softmax cross-entropy: - -```python -diff = tf.losses.softmax_cross_entropy(labels=y_, logits=logits) -``` - -Rerun with the `--debug` flag as follows: - -```none -python -m tensorflow.python.debug.examples.debug_mnist --debug -``` - -At the `tfdbg>` prompt, enter the following command: - -```none -run -f has_inf_or_nan -``` - -Confirm that no tensors are flagged as containing `nan` or `inf` values, and -accuracy now continues to rise rather than getting stuck. Success! - -## Debugging TensorFlow Estimators - -This section explains how to debug TensorFlow programs that use the `Estimator` -APIs. Part of the convenience provided by these APIs is that -they manage `Session`s internally. This makes the `LocalCLIDebugWrapperSession` -described in the preceding sections inapplicable. Fortunately, you can still -debug them by using special `hook`s provided by `tfdbg`. - -`tfdbg` can debug the -`tf.estimator.Estimator.train`, -`tf.estimator.Estimator.evaluate` and -`tf.estimator.Estimator.predict` -methods of tf-learn `Estimator`s. To debug `Estimator.train()`, -create a `LocalCLIDebugHook` and supply it in the `hooks` argument. For example: - -```python -# First, let your BUILD target depend on "//tensorflow/python/debug:debug_py" -# (You don't need to worry about the BUILD dependency if you are using a pip -# install of open-source TensorFlow.) -from tensorflow.python import debug as tf_debug - -# Create a LocalCLIDebugHook and use it as a hook when calling train().
-hooks = [tf_debug.LocalCLIDebugHook()] - -# To debug `train`: -classifier.train(input_fn, - steps=1000, - hooks=hooks) -``` - -Similarly, to debug `Estimator.evaluate()` and `Estimator.predict()`, assign -hooks to the `hooks` parameter, as in the following example: - -```python -# To debug `evaluate`: -accuracy_score = classifier.evaluate(eval_input_fn, - hooks=hooks)["accuracy"] - -# To debug `predict`: -predict_results = classifier.predict(predict_input_fn, hooks=hooks) -``` - -[debug_tflearn_iris.py](https://www.tensorflow.org/code/tensorflow/python/debug/examples/debug_tflearn_iris.py) -contains a full example of how to use tfdbg with `Estimator`s. -To run this example, do: - -```none -python -m tensorflow.python.debug.examples.debug_tflearn_iris --debug -``` - -The `LocalCLIDebugHook` also allows you to configure a `watch_fn` that can be -used to flexibly specify what `Tensor`s to watch on different `Session.run()` -calls, as a function of the `fetches` and `feed_dict` and other states. See -`tfdbg.DumpingDebugWrapperSession.__init__` -for more details. - -## Debugging Keras Models with TFDBG - -To use TFDBG with -[tf.keras](https://www.tensorflow.org/api_docs/python/tf/keras), -let the Keras backend use a TFDBG-wrapped Session object. For example, to use -the CLI wrapper: - -``` python -import tensorflow as tf -from tensorflow.python import debug as tf_debug - -tf.keras.backend.set_session(tf_debug.LocalCLIDebugWrapperSession(tf.Session())) - -# Define your keras model, called "model". - -# Calls to `fit()`, `evaluate()` and `predict()` methods will break into the -# TFDBG CLI. -model.fit(...) -model.evaluate(...) -model.predict(...) -``` - -With minor modification, the preceding code example also works for the -[non-TensorFlow version of Keras](https://keras.io/) running against a -TensorFlow backend. You just need to replace `tf.keras.backend` with -`keras.backend`.
- -## Debugging tf-slim with TFDBG - -TFDBG supports debugging of training and evaluation with -[tf-slim](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/slim). -As detailed below, training and evaluation require slightly different debugging -workflows. - -### Debugging training in tf-slim -To debug the training process, provide `LocalCLIDebugWrapperSession` to the -`session_wrapper` argument of `slim.learning.train()`. For example: - -``` python -import tensorflow as tf -from tensorflow.python import debug as tf_debug - -# ... Code that creates the graph and the train_op ... -tf.contrib.slim.learning.train( - train_op, - logdir, - number_of_steps=10, - session_wrapper=tf_debug.LocalCLIDebugWrapperSession) -``` - -### Debugging evaluation in tf-slim -To debug the evaluation process, provide `LocalCLIDebugHook` to the -`hooks` argument of `slim.evaluation.evaluate_once()`. For example: - -``` python -import tensorflow as tf -from tensorflow.python import debug as tf_debug - -# ... Code that creates the graph and the eval and final ops ... -tf.contrib.slim.evaluation.evaluate_once( - '', - checkpoint_path, - logdir, - eval_op=my_eval_op, - final_op=my_value_op, - hooks=[tf_debug.LocalCLIDebugHook()]) -``` - -## Offline Debugging of Remotely-Running Sessions - -Often, your model is running on a remote machine or a process that you don't -have terminal access to. To perform model debugging in such cases, you can use -the `offline_analyzer` binary of `tfdbg` (described below). It operates on -dumped data directories. This can be done to both the lower-level `Session` API -and the higher-level `Estimator` API. - -### Debugging Remote tf.Sessions - -If you interact directly with the `tf.Session` API in `python`, you can -configure the `RunOptions` proto that you call your `Session.run()` method -with, by using the method `tfdbg.watch_graph`. 
-This will cause the intermediate tensors and runtime graphs to be dumped to a -shared storage location of your choice when the `Session.run()` call occurs -(at the cost of slower performance). For example: - -```python -from tensorflow.python import debug as tf_debug - -# ... Code where your session and graph are set up... - -run_options = tf.RunOptions() -tf_debug.watch_graph( - run_options, - session.graph, - debug_urls=["file:///shared/storage/location/tfdbg_dumps_1"]) -# Be sure to specify different directories for different run() calls. - -session.run(fetches, feed_dict=feeds, options=run_options) -``` - -Later, in an environment that you have terminal access to (for example, a local -computer that can access the shared storage location specified in the code -above), you can load and inspect the data in the dump directory on the shared -storage by using the `offline_analyzer` binary of `tfdbg`. For example: - -```none -python -m tensorflow.python.debug.cli.offline_analyzer \ - --dump_dir=/shared/storage/location/tfdbg_dumps_1 -``` - -The `Session` wrapper `DumpingDebugWrapperSession` offers an easier and more -flexible way to generate file-system dumps that can be analyzed offline. -To use it, simply wrap your session in a `tf_debug.DumpingDebugWrapperSession`. -For example: - -```python -# Let your BUILD target depend on "//tensorflow/python/debug:debug_py -# (You don't need to worry about the BUILD dependency if you are using a pip -# install of open-source TensorFlow.) -from tensorflow.python import debug as tf_debug - -sess = tf_debug.DumpingDebugWrapperSession( - sess, "/shared/storage/location/tfdbg_dumps_1/", watch_fn=my_watch_fn) -``` - -The `watch_fn` argument accepts a `Callable` that allows you to configure what -`tensor`s to watch on different `Session.run()` calls, as a function of the -`fetches` and `feed_dict` to the `run()` call and other states. 
- -### C++ and other languages - -If your model code is written in C++ or other languages, you can also -modify the `debug_options` field of `RunOptions` to generate debug dumps that -can be inspected offline. See -[the proto definition](https://www.tensorflow.org/code/tensorflow/core/protobuf/debug.proto) -for more details. - -### Debugging Remotely-Running Estimators - -If your remote TensorFlow server runs `Estimator`s, -you can use the non-interactive `DumpingDebugHook`. For example: - -```python -# Let your BUILD target depend on "//tensorflow/python/debug:debug_py -# (You don't need to worry about the BUILD dependency if you are using a pip -# install of open-source TensorFlow.) -from tensorflow.python import debug as tf_debug - -hooks = [tf_debug.DumpingDebugHook("/shared/storage/location/tfdbg_dumps_1")] -``` - -Then this `hook` can be used in the same way as the `LocalCLIDebugHook` examples -described earlier in this document. -As the training, evaluation or prediction happens with `Estimator`, -tfdbg creates directories having the following name pattern: -`/shared/storage/location/tfdbg_dumps_1/run__`. -Each directory corresponds to a `Session.run()` call that underlies -the `fit()` or `evaluate()` call. You can load these directories and inspect -them in a command-line interface in an offline manner using the -`offline_analyzer` offered by tfdbg. For example: - -```bash -python -m tensorflow.python.debug.cli.offline_analyzer \ - --dump_dir="/shared/storage/location/tfdbg_dumps_1/run__" -``` - -## Frequently Asked Questions - -**Q**: _Do the timestamps on the left side of the `lt` output reflect actual - performance in a non-debugging session?_ - -**A**: No. The debugger inserts additional special-purpose debug nodes to the - graph to record the values of intermediate tensors. These nodes - slow down the graph execution. If you are interested in profiling your - model, check out - - 1. The profiling mode of tfdbg: `tfdbg> run -p`. - 2. 
[tfprof](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/profiler) - and other profiling tools for TensorFlow. - -**Q**: _How do I link tfdbg against my `Session` in Bazel? Why do I see an - error such as "ImportError: cannot import name debug"?_ - -**A**: In your BUILD rule, declare dependencies: - `"//tensorflow:tensorflow_py"` and `"//tensorflow/python/debug:debug_py"`. - The first is the dependency that you include to use TensorFlow even - without debugger support; the second enables the debugger. - Then, in your Python file, add: - -```python -from tensorflow.python import debug as tf_debug - -# Then wrap your TensorFlow Session with the local-CLI wrapper. -sess = tf_debug.LocalCLIDebugWrapperSession(sess) -``` - -**Q**: _Does tfdbg help debug runtime errors such as shape mismatches?_ - -**A**: Yes. tfdbg intercepts errors generated by ops during runtime and presents - the errors with some debug instructions to the user in the CLI. - See examples: - -```none -# Debugging shape mismatch during matrix multiplication. -python -m tensorflow.python.debug.examples.debug_errors \ - --error shape_mismatch --debug - -# Debugging uninitialized variable. -python -m tensorflow.python.debug.examples.debug_errors \ - --error uninitialized_variable --debug -``` - -**Q**: _How can I let my tfdbg-wrapped Sessions or Hooks run the debug mode -only from the main thread?_ - -**A**: -This is a common use case, in which the `Session` object is used from multiple -threads concurrently. Typically, the child threads take care of background tasks -such as running enqueue operations. Often, you want to debug only the main -thread (or less frequently, only one of the child threads). You can use the -`thread_name_filter` keyword argument of `LocalCLIDebugWrapperSession` to -achieve this type of thread-selective debugging.
For example, to debug from the -main thread only, construct a wrapped `Session` as follows: - -```python -sess = tf_debug.LocalCLIDebugWrapperSession(sess, thread_name_filter="MainThread$") -``` - -The above example relies on the fact that main threads in Python have the -default name `MainThread`. - -**Q**: _The model I am debugging is very large. The data dumped by tfdbg -fills up the free space of my disk. What can I do?_ - -**A**: -You might encounter this problem in any of the following situations: - -* models with many intermediate tensors -* very large intermediate tensors -* many `tf.while_loop` iterations - -There are three possible workarounds or solutions: - -* The constructors of `LocalCLIDebugWrapperSession` and `LocalCLIDebugHook` - provide a keyword argument, `dump_root`, to specify the path - to which tfdbg dumps the debug data. You can use it to let tfdbg dump the - debug data on a disk with larger free space. For example: - -```python -# For LocalCLIDebugWrapperSession -sess = tf_debug.LocalCLIDebugWrapperSession(dump_root="/with/lots/of/space") - -# For LocalCLIDebugHook -hooks = [tf_debug.LocalCLIDebugHook(dump_root="/with/lots/of/space")] -``` - Make sure that the directory pointed to by dump_root is empty or nonexistent. - `tfdbg` cleans up the dump directories before exiting. - -* Reduce the batch size used during the runs. -* Use the filtering options of tfdbg's `run` command to watch only specific - nodes in the graph. For example: - - ``` - tfdbg> run --node_name_filter .*hidden.* - tfdbg> run --op_type_filter Variable.* - tfdbg> run --tensor_dtype_filter int.* - ``` - - The first command above watches only nodes whose name match the - regular-expression pattern `.*hidden.*`. The second command watches only - operations whose name match the pattern `Variable.*`. The third one watches - only the tensors whose dtype match the pattern `int.*` (e.g., `int32`). 
- - -**Q**: _Why can't I select text in the tfdbg CLI?_ - -**A**: This is because the tfdbg CLI enables mouse events in the terminal by - default. This [mouse-mask](https://linux.die.net/man/3/mousemask) mode - overrides default terminal interactions, including text selection. You - can re-enable text selection by using the command `mouse off` or - `m off`. - -**Q**: _Why does the tfdbg CLI show no dumped tensors when I debug code like the following?_ - -``` python -a = tf.ones([10], name="a") -b = tf.add(a, a, name="b") -sess = tf.Session() -sess = tf_debug.LocalCLIDebugWrapperSession(sess) -sess.run(b) -``` - -**A**: The reason why you see no data dumped is because every node in the - executed TensorFlow graph is constant-folded by the TensorFlow runtime. - In this example, `a` is a constant tensor; therefore, the fetched - tensor `b` is effectively also a constant tensor. TensorFlow's graph - optimization folds the graph that contains `a` and `b` into a single - node to speed up future runs of the graph, which is why `tfdbg` does - not generate any intermediate tensor dumps. However, if `a` were a - `tf.Variable`, as in the following example: - -``` python -import numpy as np - -a = tf.Variable(np.ones(10), name="a") -b = tf.add(a, a, name="b") -sess = tf.Session() -sess.run(tf.global_variables_initializer()) -sess = tf_debug.LocalCLIDebugWrapperSession(sess) -sess.run(b) -``` - -the constant-folding would not occur and `tfdbg` should show the intermediate -tensor dumps. - - -**Q**: I am debugging a model that generates unwanted infinities or NaNs. But - there are some nodes in my model that are known to generate infinities - or NaNs in their output tensors even under completely normal conditions. - How can I skip those nodes during my `run -f has_inf_or_nan` actions? - -**A**: Use the `--filter_exclude_node_names` (`-fenn` for short) flag. 
For - example, if you know you have a node with name matching the regular - expression `.*Sqrt.*` that generates infinities or NaNs regardless - of whether the model is behaving correctly, you can exclude the nodes - from the infinity/NaN-finding runs with the command - `run -f has_inf_or_nan -fenn .*Sqrt.*`. - - -**Q**: Is there a GUI for tfdbg? - -**A**: Yes, the **TensorBoard Debugger Plugin** is the GUI of tfdbg. - It offers features such as inspection of the computation graph, - real-time visualization of tensor values, continuation to tensor - and conditional breakpoints, and tying tensors to their - graph-construction source code, all in the browser environment. - To get started, please visit - [its README](https://github.com/tensorflow/tensorboard/blob/master/tensorboard/plugins/debugger/README.md). diff --git a/tensorflow/docs_src/guide/eager.md b/tensorflow/docs_src/guide/eager.md deleted file mode 100644 index 3b5797a638..0000000000 --- a/tensorflow/docs_src/guide/eager.md +++ /dev/null @@ -1,854 +0,0 @@ -# Eager Execution - -TensorFlow's eager execution is an imperative programming environment that -evaluates operations immediately, without building graphs: operations return -concrete values instead of constructing a computational graph to run later. This -makes it easy to get started with TensorFlow and debug models, and it -reduces boilerplate as well. To follow along with this guide, run the code -samples below in an interactive `python` interpreter. - -Eager execution is a flexible machine learning platform for research and -experimentation, providing: - -* *An intuitive interface*—Structure your code naturally and use Python data - structures. Quickly iterate on small models and small data. -* *Easier debugging*—Call ops directly to inspect running models and test - changes. Use standard Python debugging tools for immediate error reporting.
-* *Natural control flow*—Use Python control flow instead of graph control - flow, simplifying the specification of dynamic models. - -Eager execution supports most TensorFlow operations and GPU acceleration. For a -collection of examples running in eager execution, see: -[tensorflow/contrib/eager/python/examples](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples). - -Note: Some models may experience increased overhead with eager execution -enabled. Performance improvements are ongoing, but please -[file a bug](https://github.com/tensorflow/tensorflow/issues) if you find a -problem and share your benchmarks. - -## Setup and basic usage - -Upgrade to the latest version of TensorFlow: - -``` -$ pip install --upgrade tensorflow -``` - -To start eager execution, add `tf.enable_eager_execution()` to the beginning of -the program or console session. Do not add this operation to other modules that -the program calls. - -```py -from __future__ import absolute_import, division, print_function - -import tensorflow as tf - -tf.enable_eager_execution() -``` - -Now you can run TensorFlow operations and the results will return immediately: - -```py -tf.executing_eagerly() # => True - -x = [[2.]] -m = tf.matmul(x, x) -print("hello, {}".format(m)) # => "hello, [[4.]]" -``` - -Enabling eager execution changes how TensorFlow operations behave—now they -immediately evaluate and return their values to Python. `tf.Tensor` objects -reference concrete values instead of symbolic handles to nodes in a computational -graph. Since there isn't a computational graph to build and run later in a -session, it's easy to inspect results using `print()` or a debugger. Evaluating, -printing, and checking tensor values does not break the flow for computing -gradients. - -Eager execution works nicely with [NumPy](http://www.numpy.org/). NumPy -operations accept `tf.Tensor` arguments. 
TensorFlow -[math operations](https://www.tensorflow.org/api_guides/python/math_ops) convert -Python objects and NumPy arrays to `tf.Tensor` objects. The -`tf.Tensor.numpy` method returns the object's value as a NumPy `ndarray`. - -```py -a = tf.constant([[1, 2], - [3, 4]]) -print(a) -# => tf.Tensor([[1 2] -# [3 4]], shape=(2, 2), dtype=int32) - -# Broadcasting support -b = tf.add(a, 1) -print(b) -# => tf.Tensor([[2 3] -# [4 5]], shape=(2, 2), dtype=int32) - -# Operator overloading is supported -print(a * b) -# => tf.Tensor([[ 2 6] -# [12 20]], shape=(2, 2), dtype=int32) - -# Use NumPy values -import numpy as np - -c = np.multiply(a, b) -print(c) -# => [[ 2 6] -# [12 20]] - -# Obtain numpy value from a tensor: -print(a.numpy()) -# => [[1 2] -# [3 4]] -``` - -The `tf.contrib.eager` module contains symbols available to both eager and graph execution -environments and is useful for writing code to [work with graphs](#work_with_graphs): - -```py -tfe = tf.contrib.eager -``` - -## Dynamic control flow - -A major benefit of eager execution is that all the functionality of the host -language is available while your model is executing. So, for example, -it is easy to write [fizzbuzz](https://en.wikipedia.org/wiki/Fizz_buzz): - -```py -def fizzbuzz(max_num): - counter = tf.constant(0) - max_num = tf.convert_to_tensor(max_num) - for num in range(max_num.numpy()): - num = tf.constant(num) - if int(num % 3) == 0 and int(num % 5) == 0: - print('FizzBuzz') - elif int(num % 3) == 0: - print('Fizz') - elif int(num % 5) == 0: - print('Buzz') - else: - print(num) - counter += 1 - return counter -``` - -This has conditionals that depend on tensor values and it prints these values -at runtime. - -## Build a model - -Many machine learning models are represented by composing layers. When -using TensorFlow with eager execution you can either write your own layers or -use a layer provided in the `tf.keras.layers` package. 
- -While you can use any Python object to represent a layer, -TensorFlow has `tf.keras.layers.Layer` as a convenient base class. Inherit from -it to implement your own layer: - -```py -class MySimpleLayer(tf.keras.layers.Layer): - def __init__(self, output_units): - super(MySimpleLayer, self).__init__() - self.output_units = output_units - - def build(self, input_shape): - # The build method gets called the first time your layer is used. - # Creating variables on build() allows you to make their shape depend - # on the input shape and hence removes the need for the user to specify - # full shapes. It is possible to create variables during __init__() if - # you already know their full shapes. - self.kernel = self.add_variable( - "kernel", [input_shape[-1], self.output_units]) - - def call(self, input): - # Override call() instead of __call__ so we can perform some bookkeeping. - return tf.matmul(input, self.kernel) -``` - -Use `tf.keras.layers.Dense` layer instead of `MySimpleLayer` above as it has -a superset of its functionality (it can also add a bias). - -When composing layers into models you can use `tf.keras.Sequential` to represent -models which are a linear stack of layers. It is easy to use for basic models: - -```py -model = tf.keras.Sequential([ - tf.keras.layers.Dense(10, input_shape=(784,)), # must declare input shape - tf.keras.layers.Dense(10) -]) -``` - -Alternatively, organize models in classes by inheriting from `tf.keras.Model`. -This is a container for layers that is a layer itself, allowing `tf.keras.Model` -objects to contain other `tf.keras.Model` objects. 
- -```py -class MNISTModel(tf.keras.Model): - def __init__(self): - super(MNISTModel, self).__init__() - self.dense1 = tf.keras.layers.Dense(units=10) - self.dense2 = tf.keras.layers.Dense(units=10) - - def call(self, input): - """Run the model.""" - result = self.dense1(input) - result = self.dense2(result) - result = self.dense2(result) # reuse variables from dense2 layer - return result - -model = MNISTModel() -``` - -It's not required to set an input shape for the `tf.keras.Model` class since -the parameters are set the first time input is passed to the layer. - -`tf.keras.layers` classes create and contain their own model variables that -are tied to the lifetime of their layer objects. To share layer variables, share -their objects. - - -## Eager training - -### Computing gradients - -[Automatic differentiation](https://en.wikipedia.org/wiki/Automatic_differentiation) -is useful for implementing machine learning algorithms such as -[backpropagation](https://en.wikipedia.org/wiki/Backpropagation) for training -neural networks. During eager execution, use `tf.GradientTape` to trace -operations for computing gradients later. - -`tf.GradientTape` is an opt-in feature to provide maximal performance when -not tracing. Since different operations can occur during each call, all -forward-pass operations get recorded to a "tape". To compute the gradient, play -the tape backwards and then discard. A particular `tf.GradientTape` can only -compute one gradient; subsequent calls throw a runtime error. 
- -```py -w = tf.Variable([[1.0]]) -with tf.GradientTape() as tape: - loss = w * w - -grad = tape.gradient(loss, w) -print(grad) # => tf.Tensor([[ 2.]], shape=(1, 1), dtype=float32) -``` - -Here's an example of `tf.GradientTape` that records forward-pass operations -to train a simple model: - -```py -# A toy dataset of points around 3 * x + 2 -NUM_EXAMPLES = 1000 -training_inputs = tf.random_normal([NUM_EXAMPLES]) -noise = tf.random_normal([NUM_EXAMPLES]) -training_outputs = training_inputs * 3 + 2 + noise - -def prediction(input, weight, bias): - return input * weight + bias - -# A loss function using mean-squared error -def loss(weights, biases): - error = prediction(training_inputs, weights, biases) - training_outputs - return tf.reduce_mean(tf.square(error)) - -# Return the derivative of loss with respect to weight and bias -def grad(weights, biases): - with tf.GradientTape() as tape: - loss_value = loss(weights, biases) - return tape.gradient(loss_value, [weights, biases]) - -train_steps = 200 -learning_rate = 0.01 -# Start with arbitrary values for W and B on the same batch of data -W = tf.Variable(5.) -B = tf.Variable(10.) - -print("Initial loss: {:.3f}".format(loss(W, B))) - -for i in range(train_steps): - dW, dB = grad(W, B) - W.assign_sub(dW * learning_rate) - B.assign_sub(dB * learning_rate) - if i % 20 == 0: - print("Loss at step {:03d}: {:.3f}".format(i, loss(W, B))) - -print("Final loss: {:.3f}".format(loss(W, B))) -print("W = {}, B = {}".format(W.numpy(), B.numpy())) -``` - -Output (exact numbers may vary): - -``` -Initial loss: 71.204 -Loss at step 000: 68.333 -Loss at step 020: 30.222 -Loss at step 040: 13.691 -Loss at step 060: 6.508 -Loss at step 080: 3.382 -Loss at step 100: 2.018 -Loss at step 120: 1.422 -Loss at step 140: 1.161 -Loss at step 160: 1.046 -Loss at step 180: 0.996 -Final loss: 0.974 -W = 3.01582956314, B = 2.1191945076 -``` - -Replay the `tf.GradientTape` to compute the gradients and apply them in a -training loop. 
This is demonstrated in an excerpt from the -[mnist_eager.py](https://github.com/tensorflow/models/blob/master/official/mnist/mnist_eager.py) -example: - -```py -dataset = tf.data.Dataset.from_tensor_slices((data.train.images, - data.train.labels)) -... -for (batch, (images, labels)) in enumerate(dataset): - ... - with tf.GradientTape() as tape: - logits = model(images, training=True) - loss_value = loss(logits, labels) - ... - grads = tape.gradient(loss_value, model.variables) - optimizer.apply_gradients(zip(grads, model.variables), - global_step=tf.train.get_or_create_global_step()) -``` - - -The following example creates a multi-layer model that classifies the standard -MNIST handwritten digits. It demonstrates the optimizer and layer APIs to build -trainable graphs in an eager execution environment. - -### Train a model - -Even without training, call the model and inspect the output in eager execution: - -```py -# Create a tensor representing a blank image -batch = tf.zeros([1, 1, 784]) -print(batch.shape) # => (1, 1, 784) - -result = model(batch) -# => tf.Tensor([[[ 0. 0., ..., 0.]]], shape=(1, 1, 10), dtype=float32) -``` - -This example uses the -[dataset.py module](https://github.com/tensorflow/models/blob/master/official/mnist/dataset.py) -from the -[TensorFlow MNIST example](https://github.com/tensorflow/models/tree/master/official/mnist); -download this file to your local directory. Run the following to download the -MNIST data files to your working directory and prepare a `tf.data.Dataset` -for training: - -```py -import dataset # download dataset.py file -dataset_train = dataset.train('./datasets').shuffle(60000).repeat(4).batch(32) -``` - -To train a model, define a loss function to optimize and then calculate -gradients. 
Use an optimizer to update the variables: - -```py -def loss(model, x, y): - prediction = model(x) - return tf.losses.sparse_softmax_cross_entropy(labels=y, logits=prediction) - -def grad(model, inputs, targets): - with tf.GradientTape() as tape: - loss_value = loss(model, inputs, targets) - return tape.gradient(loss_value, model.variables) - -optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001) - -x, y = iter(dataset_train).next() -print("Initial loss: {:.3f}".format(loss(model, x, y))) - -# Training loop -for (i, (x, y)) in enumerate(dataset_train): - # Calculate derivatives of the input function with respect to its parameters. - grads = grad(model, x, y) - # Apply the gradient to the model - optimizer.apply_gradients(zip(grads, model.variables), - global_step=tf.train.get_or_create_global_step()) - if i % 200 == 0: - print("Loss at step {:04d}: {:.3f}".format(i, loss(model, x, y))) - -print("Final loss: {:.3f}".format(loss(model, x, y))) -``` - -Output (exact numbers may vary): - -``` -Initial loss: 2.674 -Loss at step 0000: 2.593 -Loss at step 0200: 2.143 -Loss at step 0400: 2.009 -Loss at step 0600: 2.103 -Loss at step 0800: 1.621 -Loss at step 1000: 1.695 -... -Loss at step 6600: 0.602 -Loss at step 6800: 0.557 -Loss at step 7000: 0.499 -Loss at step 7200: 0.744 -Loss at step 7400: 0.681 -Final loss: 0.670 -``` - -And for faster training, move the computation to a GPU: - -```py -with tf.device("/gpu:0"): - for (i, (x, y)) in enumerate(dataset_train): - # minimize() is equivalent to the grad() and apply_gradients() calls. - optimizer.minimize(lambda: loss(model, x, y), - global_step=tf.train.get_or_create_global_step()) -``` - -### Variables and optimizers - -`tf.Variable` objects store mutable `tf.Tensor` values accessed during -training to make automatic differentiation easier. The parameters of a model can -be encapsulated in classes as variables. - -Better encapsulate model parameters by using `tf.Variable` with -`tf.GradientTape`. 
For example, the automatic differentiation example above -can be rewritten: - -```py -class Model(tf.keras.Model): - def __init__(self): - super(Model, self).__init__() - self.W = tf.Variable(5., name='weight') - self.B = tf.Variable(10., name='bias') - def call(self, inputs): - return inputs * self.W + self.B - -# A toy dataset of points around 3 * x + 2 -NUM_EXAMPLES = 2000 -training_inputs = tf.random_normal([NUM_EXAMPLES]) -noise = tf.random_normal([NUM_EXAMPLES]) -training_outputs = training_inputs * 3 + 2 + noise - -# The loss function to be optimized -def loss(model, inputs, targets): - error = model(inputs) - targets - return tf.reduce_mean(tf.square(error)) - -def grad(model, inputs, targets): - with tf.GradientTape() as tape: - loss_value = loss(model, inputs, targets) - return tape.gradient(loss_value, [model.W, model.B]) - -# Define: -# 1. A model. -# 2. Derivatives of a loss function with respect to model parameters. -# 3. A strategy for updating the variables based on the derivatives. 
-model = Model() -optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01) - -print("Initial loss: {:.3f}".format(loss(model, training_inputs, training_outputs))) - -# Training loop -for i in range(300): - grads = grad(model, training_inputs, training_outputs) - optimizer.apply_gradients(zip(grads, [model.W, model.B]), - global_step=tf.train.get_or_create_global_step()) - if i % 20 == 0: - print("Loss at step {:03d}: {:.3f}".format(i, loss(model, training_inputs, training_outputs))) - -print("Final loss: {:.3f}".format(loss(model, training_inputs, training_outputs))) -print("W = {}, B = {}".format(model.W.numpy(), model.B.numpy())) -``` - -Output (exact numbers may vary): - -``` -Initial loss: 69.066 -Loss at step 000: 66.368 -Loss at step 020: 30.107 -Loss at step 040: 13.959 -Loss at step 060: 6.769 -Loss at step 080: 3.567 -Loss at step 100: 2.141 -Loss at step 120: 1.506 -Loss at step 140: 1.223 -Loss at step 160: 1.097 -Loss at step 180: 1.041 -Loss at step 200: 1.016 -Loss at step 220: 1.005 -Loss at step 240: 1.000 -Loss at step 260: 0.998 -Loss at step 280: 0.997 -Final loss: 0.996 -W = 2.99431324005, B = 2.02129220963 -``` - -## Use objects for state during eager execution - -With graph execution, program state (such as the variables) is stored in global -collections and their lifetime is managed by the `tf.Session` object. In -contrast, during eager execution the lifetime of state objects is determined by -the lifetime of their corresponding Python object. - -### Variables are objects - -During eager execution, a variable persists until the last reference to the object -is removed, and is then deleted. - -```py -with tf.device("gpu:0"): - v = tf.Variable(tf.random_normal([1000, 1000])) - v = None # v no longer takes up GPU memory -``` - -### Object-based saving - -`tf.train.Checkpoint` can save and restore `tf.Variable`s to and from -checkpoints: - -```py -x = tf.Variable(10.) - -checkpoint = tf.train.Checkpoint(x=x) # save as "x" - -x.assign(2.)
# Assign a new value to the variables and save. -save_path = checkpoint.save('./ckpt/') - -x.assign(11.) # Change the variable after saving. - -# Restore values from the checkpoint -checkpoint.restore(save_path) - -print(x) # => 2.0 -``` - -To save and load models, `tf.train.Checkpoint` stores the internal state of objects, -without requiring hidden variables. To record the state of a `model`, -an `optimizer`, and a global step, pass them to a `tf.train.Checkpoint`: - -```py -model = MyModel() -optimizer = tf.train.AdamOptimizer(learning_rate=0.001) -checkpoint_dir = '/path/to/model_dir' -checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt") -root = tf.train.Checkpoint(optimizer=optimizer, - model=model, - optimizer_step=tf.train.get_or_create_global_step()) - -root.save(file_prefix=checkpoint_prefix) -# or -root.restore(tf.train.latest_checkpoint(checkpoint_dir)) -``` - -### Object-oriented metrics - -`tfe.metrics` are stored as objects. Update a metric by passing the new data to -the callable, and retrieve the result using the `tfe.metrics.result` method, -for example: - -```py -m = tfe.metrics.Mean("loss") -m(0) -m(5) -m.result() # => 2.5 -m([8, 9]) -m.result() # => 5.5 -``` - -#### Summaries and TensorBoard - -[TensorBoard](../guide/summaries_and_tensorboard.md) is a visualization tool for -understanding, debugging and optimizing the model training process. It uses -summary events that are written while executing the program. - -`tf.contrib.summary` is compatible with both eager and graph execution -environments. Summary operations, such as `tf.contrib.summary.scalar`, are -inserted during model construction.
For example, to record summaries once every -100 global steps: - -```py -global_step = tf.train.get_or_create_global_step() -writer = tf.contrib.summary.create_file_writer(logdir) -writer.set_as_default() - -for _ in range(iterations): - global_step.assign_add(1) - # Must include a record_summaries method - with tf.contrib.summary.record_summaries_every_n_global_steps(100): - # your model code goes here - tf.contrib.summary.scalar('loss', loss) - ... -``` - -## Advanced automatic differentiation topics - -### Dynamic models - -`tf.GradientTape` can also be used in dynamic models. This example for a -[backtracking line search](https://wikipedia.org/wiki/Backtracking_line_search) -algorithm looks like normal NumPy code, except there are gradients and is -differentiable, despite the complex control flow: - -```py -def line_search_step(fn, init_x, rate=1.0): - with tf.GradientTape() as tape: - # Variables are automatically recorded, but manually watch a tensor - tape.watch(init_x) - value = fn(init_x) - grad = tape.gradient(value, init_x) - grad_norm = tf.reduce_sum(grad * grad) - init_value = value - while value > init_value - rate * grad_norm: - x = init_x - rate * grad - value = fn(x) - rate /= 2.0 - return x, value -``` - -### Additional functions to compute gradients - -`tf.GradientTape` is a powerful interface for computing gradients, but there -is another [Autograd](https://github.com/HIPS/autograd)-style API available for -automatic differentiation. These functions are useful if writing math code with -only tensors and gradient functions, and without `tf.Variables`: - -* `tfe.gradients_function` —Returns a function that computes the derivatives - of its input function parameter with respect to its arguments. The input - function parameter must return a scalar value. When the returned function is - invoked, it returns a list of `tf.Tensor` objects: one element for each - argument of the input function. 
Since anything of interest must be passed as a - function parameter, this becomes unwieldy if there's a dependency on many - trainable parameters. -* `tfe.value_and_gradients_function` —Similar to - `tfe.gradients_function`, but when the returned function is invoked, it - returns the value from the input function in addition to the list of - derivatives of the input function with respect to its arguments. - -In the following example, `tfe.gradients_function` takes the `square` -function as an argument and returns a function that computes the partial -derivatives of `square` with respect to its inputs. To calculate the derivative -of `square` at `3`, `grad(3.0)` returns `6`. - -```py -def square(x): - return tf.multiply(x, x) - -grad = tfe.gradients_function(square) - -square(3.) # => 9.0 -grad(3.) # => [6.0] - -# The second-order derivative of square: -gradgrad = tfe.gradients_function(lambda x: grad(x)[0]) -gradgrad(3.) # => [2.0] - -# The third-order derivative is None: -gradgradgrad = tfe.gradients_function(lambda x: gradgrad(x)[0]) -gradgradgrad(3.) # => [None] - - -# With flow control: -def abs(x): - return x if x > 0. else -x - -grad = tfe.gradients_function(abs) - -grad(3.) # => [1.0] -grad(-3.) # => [-1.0] -``` - -### Custom gradients - -Custom gradients are an easy way to override gradients in eager and graph -execution. Within the forward function, define the gradient with respect to the -inputs, outputs, or intermediate results. 
For example, here's an easy way to clip -the norm of the gradients in the backward pass: - -```py -@tf.custom_gradient -def clip_gradient_by_norm(x, norm): - y = tf.identity(x) - def grad_fn(dresult): - return [tf.clip_by_norm(dresult, norm), None] - return y, grad_fn -``` - -Custom gradients are commonly used to provide a numerically stable gradient for a -sequence of operations: - -```py -def log1pexp(x): - return tf.log(1 + tf.exp(x)) -grad_log1pexp = tfe.gradients_function(log1pexp) - -# The gradient computation works fine at x = 0. -grad_log1pexp(0.) # => [0.5] - -# However, x = 100 fails because of numerical instability. -grad_log1pexp(100.) # => [nan] -``` - -Here, the `log1pexp` function can be analytically simplified with a custom -gradient. The implementation below reuses the value for `tf.exp(x)` that is -computed during the forward pass—making it more efficient by eliminating -redundant calculations: - -```py -@tf.custom_gradient -def log1pexp(x): - e = tf.exp(x) - def grad(dy): - return dy * (1 - 1 / (1 + e)) - return tf.log(1 + e), grad - -grad_log1pexp = tfe.gradients_function(log1pexp) - -# As before, the gradient computation works fine at x = 0. -grad_log1pexp(0.) # => [0.5] - -# And the gradient computation also works at x = 100. -grad_log1pexp(100.) # => [1.0] -``` - -## Performance - -Computation is automatically offloaded to GPUs during eager execution. If you -want control over where a computation runs you can enclose it in a -`tf.device('/gpu:0')` block (or the CPU equivalent): - -```py -import time - -def measure(x, steps): - # TensorFlow initializes a GPU the first time it's used, exclude from timing. - tf.matmul(x, x) - start = time.time() - for i in range(steps): - x = tf.matmul(x, x) - # tf.matmul can return before completing the matrix multiplication - # (e.g., can return after enqueuing the operation on a CUDA stream).
- # The x.numpy() call below will ensure that all enqueued operations - # have completed (and will also copy the result to host memory, - # so we're including a little more than just the matmul operation - # time). - _ = x.numpy() - end = time.time() - return end - start - -shape = (1000, 1000) -steps = 200 -print("Time to multiply a {} matrix by itself {} times:".format(shape, steps)) - -# Run on CPU: -with tf.device("/cpu:0"): - print("CPU: {} secs".format(measure(tf.random_normal(shape), steps))) - -# Run on GPU, if available: -if tfe.num_gpus() > 0: - with tf.device("/gpu:0"): - print("GPU: {} secs".format(measure(tf.random_normal(shape), steps))) -else: - print("GPU: not found") -``` - -Output (exact numbers depend on hardware): - -``` -Time to multiply a (1000, 1000) matrix by itself 200 times: -CPU: 1.46628093719 secs -GPU: 0.0593810081482 secs -``` - -A `tf.Tensor` object can be copied to a different device to execute its -operations: - -```py -x = tf.random_normal([10, 10]) - -x_gpu0 = x.gpu() -x_cpu = x.cpu() - -_ = tf.matmul(x_cpu, x_cpu) # Runs on CPU -_ = tf.matmul(x_gpu0, x_gpu0) # Runs on GPU:0 - -if tfe.num_gpus() > 1: - x_gpu1 = x.gpu(1) - _ = tf.matmul(x_gpu1, x_gpu1) # Runs on GPU:1 -``` - -### Benchmarks - -For compute-heavy models, such as -[ResNet50](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/resnet50) -training on a GPU, eager execution performance is comparable to graph execution. -But this gap grows larger for models with less computation and there is work to -be done for optimizing hot code paths for models with lots of small operations. - - -## Work with graphs - -While eager execution makes development and debugging more interactive, -TensorFlow graph execution has advantages for distributed training, performance -optimizations, and production deployment. However, writing graph code can feel -different than writing regular Python code and more difficult to debug. 
- -For building and training graph-constructed models, the Python program first -builds a graph representing the computation, then invokes `Session.run` to send -the graph for execution on the C++-based runtime. This provides: - -* Automatic differentiation using static autodiff. -* Simple deployment to a platform independent server. -* Graph-based optimizations (common subexpression elimination, constant-folding, etc.). -* Compilation and kernel fusion. -* Automatic distribution and replication (placing nodes on the distributed system). - -Deploying code written for eager execution is more difficult: either generate a -graph from the model, or run the Python runtime and code directly on the server. - -### Write compatible code - -The same code written for eager execution will also build a graph during graph -execution. Do this by simply running the same code in a new Python session where -eager execution is not enabled. - -Most TensorFlow operations work during eager execution, but there are some things -to keep in mind: - -* Use `tf.data` for input processing instead of queues. It's faster and easier. -* Use object-oriented layer APIs—like `tf.keras.layers` and - `tf.keras.Model`—since they have explicit storage for variables. -* Most model code works the same during eager and graph execution, but there are - exceptions. (For example, dynamic models using Python control flow to change the - computation based on inputs.) -* Once eager execution is enabled with `tf.enable_eager_execution`, it - cannot be turned off. Start a new Python session to return to graph execution. - -It's best to write code for both eager execution *and* graph execution. This -gives you eager's interactive experimentation and debuggability with the -distributed performance benefits of graph execution. - -Write, debug, and iterate in eager execution, then import the model graph for -production deployment. 
Use `tf.train.Checkpoint` to save and restore model -variables, this allows movement between eager and graph execution environments. -See the examples in: -[tensorflow/contrib/eager/python/examples](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples). - -### Use eager execution in a graph environment - -Selectively enable eager execution in a TensorFlow graph environment using -`tfe.py_func`. This is used when `tf.enable_eager_execution()` has *not* -been called. - -```py -def my_py_func(x): - x = tf.matmul(x, x) # You can use tf ops - print(x) # but it's eager! - return x - -with tf.Session() as sess: - x = tf.placeholder(dtype=tf.float32) - # Call eager function in graph! - pf = tfe.py_func(my_py_func, [x], tf.float32) - sess.run(pf, feed_dict={x: [[2.0]]}) # [[4.0]] -``` diff --git a/tensorflow/docs_src/guide/embedding.md b/tensorflow/docs_src/guide/embedding.md deleted file mode 100644 index 6007e6847b..0000000000 --- a/tensorflow/docs_src/guide/embedding.md +++ /dev/null @@ -1,262 +0,0 @@ -# Embeddings - -This document introduces the concept of embeddings, gives a simple example of -how to train an embedding in TensorFlow, and explains how to view embeddings -with the TensorBoard Embedding Projector -([live example](http://projector.tensorflow.org)). The first two parts target -newcomers to machine learning or TensorFlow, and the Embedding Projector how-to -is for users at all levels. - -An alternative tutorial on these concepts is available in the -[Embeddings section of Machine Learning Crash Course](https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture). - -[TOC] - -An **embedding** is a mapping from discrete objects, such as words, to vectors -of real numbers. 
For example, a 300-dimensional embedding for English words -could include: - -``` -blue: (0.01359, 0.00075997, 0.24608, ..., -0.2524, 1.0048, 0.06259) -blues: (0.01396, 0.11887, -0.48963, ..., 0.033483, -0.10007, 0.1158) -orange: (-0.24776, -0.12359, 0.20986, ..., 0.079717, 0.23865, -0.014213) -oranges: (-0.35609, 0.21854, 0.080944, ..., -0.35413, 0.38511, -0.070976) -``` - -The individual dimensions in these vectors typically have no inherent meaning. -Instead, it's the overall patterns of location and distance between vectors -that machine learning takes advantage of. - -Embeddings are important for input to machine learning. Classifiers, and neural -networks more generally, work on vectors of real numbers. They train best on -dense vectors, where all values contribute to define an object. However, many -important inputs to machine learning, such as words of text, do not have a -natural vector representation. Embedding functions are the standard and -effective way to transform such discrete input objects into useful -continuous vectors. - -Embeddings are also valuable as outputs of machine learning. Because embeddings -map objects to vectors, applications can use similarity in vector space (for -instance, Euclidean distance or the angle between vectors) as a robust and -flexible measure of object similarity. One common use is to find nearest -neighbors. Using the same word embeddings as above, for instance, here are the -three nearest neighbors for each word and the corresponding angles: - -``` -blue: (red, 47.6°), (yellow, 51.9°), (purple, 52.4°) -blues: (jazz, 53.3°), (folk, 59.1°), (bluegrass, 60.6°) -orange: (yellow, 53.5°), (colored, 58.0°), (bright, 59.9°) -oranges: (apples, 45.3°), (lemons, 48.3°), (mangoes, 50.4°) -``` - -This would tell an application that apples and oranges are in some way more -similar (45.3° apart) than lemons and oranges (48.3° apart). 
- -## Embeddings in TensorFlow - -To create word embeddings in TensorFlow, we first split the text into words -and then assign an integer to every word in the vocabulary. Let us assume that -this has already been done, and that `word_ids` is a vector of these integers. -For example, the sentence “I have a cat.” could be split into -`["I", "have", "a", "cat", "."]` and then the corresponding `word_ids` tensor -would have shape `[5]` and consist of 5 integers. To map these word ids -to vectors, we need to create the embedding variable and use the -`tf.nn.embedding_lookup` function as follows: - -``` -word_embeddings = tf.get_variable("word_embeddings", - [vocabulary_size, embedding_size]) -embedded_word_ids = tf.nn.embedding_lookup(word_embeddings, word_ids) -``` - -After this, the tensor `embedded_word_ids` will have shape `[5, embedding_size]` -in our example and contain the embeddings (dense vectors) for each of the 5 -words. At the end of training, `word_embeddings` will contain the embeddings -for all words in the vocabulary. - -Embeddings can be trained in many network types, and with various loss -functions and data sets. For example, one could use a recurrent neural network -to predict the next word from the previous one given a large corpus of -sentences, or one could train two networks to do multi-lingual translation. -These methods are described in the [Vector Representations of Words](../tutorials/representation/word2vec.md) -tutorial. - -## Visualizing Embeddings - -TensorBoard includes the **Embedding Projector**, a tool that lets you -interactively visualize embeddings. This tool can read embeddings from your -model and render them in two or three dimensions. - -The Embedding Projector has three panels: - -- *Data panel* on the top left, where you can choose the run, the embedding - variable and data columns to color and label points by. -- *Projections panel* on the bottom left, where you can choose the type of - projection.
-- *Inspector panel* on the right side, where you can search for particular - points and see a list of nearest neighbors. - -### Projections -The Embedding Projector provides three ways to reduce the dimensionality of a -data set. - -- *[t-SNE](https://en.wikipedia.org/wiki/T-distributed_stochastic_neighbor_embedding)*: - a nonlinear nondeterministic algorithm (T-distributed stochastic neighbor - embedding) that tries to preserve local neighborhoods in the data, often at - the expense of distorting global structure. You can choose whether to compute - two- or three-dimensional projections. - -- *[PCA](https://en.wikipedia.org/wiki/Principal_component_analysis)*: - a linear deterministic algorithm (principal component analysis) that tries to - capture as much of the data variability in as few dimensions as possible. PCA - tends to highlight large-scale structure in the data, but can distort local - neighborhoods. The Embedding Projector computes the top 10 principal - components, from which you can choose two or three to view. - -- *Custom*: a linear projection onto horizontal and vertical axes that you - specify using labels in the data. You define the horizontal axis, for - instance, by giving text patterns for "Left" and "Right". The Embedding - Projector finds all points whose label matches the "Left" pattern and - computes the centroid of that set; similarly for "Right". The line passing - through these two centroids defines the horizontal axis. The vertical axis is - likewise computed from the centroids for points matching the "Up" and "Down" - text patterns. - -Further useful articles are -[How to Use t-SNE Effectively](https://distill.pub/2016/misread-tsne/) and -[Principal Component Analysis Explained Visually](http://setosa.io/ev/principal-component-analysis/). - -### Exploration - -You can explore visually by zooming, rotating, and panning using natural -click-and-drag gestures. 
Hovering your mouse over a point will show any -[metadata](#metadata) for that point. You can also inspect nearest-neighbor -subsets. Clicking on a point causes the right pane to list the nearest -neighbors, along with distances to the current point. The nearest-neighbor -points are also highlighted in the projection. - -It is sometimes useful to restrict the view to a subset of points and perform -projections only on those points. To do so, you can select points in multiple -ways: - -- After clicking on a point, its nearest neighbors are also selected. -- After a search, the points matching the query are selected. -- Enabling selection, clicking on a point and dragging defines a selection - sphere. - -Then click the "Isolate *nnn* points" button at the top of the Inspector pane -on the right hand side. The following image shows 101 points selected and ready -for the user to click "Isolate 101 points": - -![Selection of nearest neighbors](https://www.tensorflow.org/images/embedding-nearest-points.png "Selection of nearest neighbors") - -*Selection of the nearest neighbors of “important” in a word embedding dataset.* - -Advanced tip: filtering with custom projection can be powerful. Below, we -filtered the 100 nearest neighbors of “politics” and projected them onto the -“worst” - “best” vector as an x axis. The y axis is random. As a result, one -finds on the right side “ideas”, “science”, “perspective”, “journalism” but on -the left “crisis”, “violence” and “conflict”. - - - - - - - - - - -
- Custom controls panel - - Custom projection -
- Custom projection controls. - - Custom projection of neighbors of "politics" onto "best" - "worst" vector. -
- -To share your findings, you can use the bookmark panel in the bottom right -corner and save the current state (including computed coordinates of any -projection) as a small file. The Projector can then be pointed to a set of one -or more of these files, producing the panel below. Other users can then walk -through a sequence of bookmarks. - -Bookmark panel - -### Metadata - -If you are working with an embedding, you'll probably want to attach -labels/images to the data points. You can do this by generating a metadata file -containing the labels for each point and clicking "Load data" in the data panel -of the Embedding Projector. - -The metadata can be either labels or images, which are -stored in a separate file. For labels, the format should -be a [TSV file](https://en.wikipedia.org/wiki/Tab-separated_values) -(tab characters shown in red) whose first line contains column headers -(shown in bold) and subsequent lines contain the metadata values. For example: - - -Word\tFrequency
- Airplane\t345
- Car\t241
- ... -
- -The order of lines in the metadata file is assumed to match the order of -vectors in the embedding variable, except for the header. Consequently, the -(i+1)-th line in the metadata file corresponds to the i-th row of the embedding -variable. If the TSV metadata file has only a single column, then we don’t -expect a header row, and assume each row is the label of the embedding. We -include this exception because it matches the commonly-used "vocab file" -format. - -To use images as metadata, you must produce a single -[sprite image](https://www.google.com/webhp#q=what+is+a+sprite+image), -consisting of small thumbnails, one for each vector in the embedding. The -sprite should store thumbnails in row-first order: the first data point placed -in the top left and the last data point in the bottom right, though the last -row doesn't have to be filled, as shown below. - - - - - - - - - - - - - - - - - -
012
345
67
- -Follow [this link](https://www.tensorflow.org/images/embedding-mnist.mp4) -to see a fun example of thumbnail images in the Embedding Projector. - - -## Mini-FAQ - -**Is "embedding" an action or a thing?** -Both. People talk about embedding words in a vector space (action) and about -producing word embeddings (things). Common to both is the notion of embedding -as a mapping from discrete objects to vectors. Creating or applying that -mapping is an action, but the mapping itself is a thing. - -**Are embeddings high-dimensional or low-dimensional?** -It depends. A 300-dimensional vector space of words and phrases, for instance, -is often called low-dimensional (and dense) when compared to the millions of -words and phrases it can contain. But mathematically it is high-dimensional, -displaying many properties that are dramatically different from what our human -intuition has learned about 2- and 3-dimensional spaces. - -**Is an embedding the same as an embedding layer?** -No. An *embedding layer* is a part of neural network, but an *embedding* is a more -general concept. diff --git a/tensorflow/docs_src/guide/estimators.md b/tensorflow/docs_src/guide/estimators.md deleted file mode 100644 index 3903bfd126..0000000000 --- a/tensorflow/docs_src/guide/estimators.md +++ /dev/null @@ -1,196 +0,0 @@ -# Estimators - -This document introduces `tf.estimator`--a high-level TensorFlow -API that greatly simplifies machine learning programming. Estimators encapsulate -the following actions: - -* training -* evaluation -* prediction -* export for serving - -You may either use the pre-made Estimators we provide or write your -own custom Estimators. All Estimators--whether pre-made or custom--are -classes based on the `tf.estimator.Estimator` class. - -For a quick example try [Estimator tutorials](../tutorials/estimators/linear). -To see each sub-topic in depth, see the [Estimator guides](premade_estimators).
- -Note: TensorFlow also includes a deprecated `Estimator` class at -`tf.contrib.learn.Estimator`, which you should not use. - - -## Advantages of Estimators - -Estimators provide the following benefits: - -* You can run Estimator-based models on a local host or on a - distributed multi-server environment without changing your model. - Furthermore, you can run Estimator-based models on CPUs, GPUs, - or TPUs without recoding your model. -* Estimators simplify sharing implementations between model developers. -* You can develop a state of the art model with high-level intuitive code. - In short, it is generally much easier to create models with Estimators - than with the low-level TensorFlow APIs. -* Estimators are themselves built on `tf.keras.layers`, which - simplifies customization. -* Estimators build the graph for you. -* Estimators provide a safe distributed training loop that controls how and - when to: - * build the graph - * initialize variables - * load data - * handle exceptions - * create checkpoint files and recover from failures - * save summaries for TensorBoard - -When writing an application with Estimators, you must separate the data input -pipeline from the model. This separation simplifies experiments with -different data sets. - - -## Pre-made Estimators - -Pre-made Estimators enable you to work at a much higher conceptual level -than the base TensorFlow APIs. You no longer have to worry about creating -the computational graph or sessions since Estimators handle all -the "plumbing" for you. That is, pre-made Estimators create and manage -`tf.Graph` and `tf.Session` objects for you. Furthermore, -pre-made Estimators let you experiment with different model architectures by -making only minimal code changes. `tf.estimator.DNNClassifier`, -for example, is a pre-made Estimator class that trains classification models -based on dense, feed-forward neural networks. 
- - -### Structure of a pre-made Estimators program - -A TensorFlow program relying on a pre-made Estimator typically consists -of the following four steps: - -1. **Write one or more dataset importing functions.** For example, you might - create one function to import the training set and another function to - import the test set. Each dataset importing function must return two - objects: - - * a dictionary in which the keys are feature names and the - values are Tensors (or SparseTensors) containing the corresponding - feature data - * a Tensor containing one or more labels - - For example, the following code illustrates the basic skeleton for - an input function: - - def input_fn(dataset): - ... # manipulate dataset, extracting the feature dict and the label - return feature_dict, label - - (See [Importing Data](../guide/datasets.md) for full details.) - -2. **Define the feature columns.** Each `tf.feature_column` - identifies a feature name, its type, and any input pre-processing. - For example, the following snippet creates three feature - columns that hold integer or floating-point data. The first two - feature columns simply identify the feature's name and type. The - third feature column also specifies a lambda the program will invoke - to scale the raw data: - - # Define three numeric feature columns. - population = tf.feature_column.numeric_column('population') - crime_rate = tf.feature_column.numeric_column('crime_rate') - median_education = tf.feature_column.numeric_column('median_education', - normalizer_fn=lambda x: x - global_education_mean) - -3. **Instantiate the relevant pre-made Estimator.** For example, here's - a sample instantiation of a pre-made Estimator named `LinearClassifier`: - - # Instantiate an estimator, passing the feature columns. - estimator = tf.estimator.LinearClassifier( - feature_columns=[population, crime_rate, median_education], - ) - -4. 
**Call a training, evaluation, or inference method.** - For example, all Estimators provide a `train` method, which trains a model. - - # my_training_set is the function created in Step 1 - estimator.train(input_fn=my_training_set, steps=2000) - - -### Benefits of pre-made Estimators - -Pre-made Estimators encode best practices, providing the following benefits: - -* Best practices for determining where different parts of the computational - graph should run, implementing strategies on a single machine or on a - cluster. -* Best practices for event (summary) writing and universally useful - summaries. - -If you don't use pre-made Estimators, you must implement the preceding -features yourself. - - -## Custom Estimators - -The heart of every Estimator--whether pre-made or custom--is its -**model function**, which is a method that builds graphs for training, -evaluation, and prediction. When you are using a pre-made Estimator, -someone else has already implemented the model function. When relying -on a custom Estimator, you must write the model function yourself. A -[companion document](../guide/custom_estimators.md) -explains how to write the model function. - - -## Recommended workflow - -We recommend the following workflow: - -1. Assuming a suitable pre-made Estimator exists, use it to build your - first model and use its results to establish a baseline. -2. Build and test your overall pipeline, including the integrity and - reliability of your data with this pre-made Estimator. -3. If suitable alternative pre-made Estimators are available, run - experiments to determine which pre-made Estimator produces the - best results. -4. Possibly, further improve your model by building your own custom Estimator. - - -## Creating Estimators from Keras models - -You can convert existing Keras models to Estimators. Doing so enables your Keras -model to access Estimator's strengths, such as distributed training. 
Call -`tf.keras.estimator.model_to_estimator` as in the -following sample: - -```python -# Instantiate a Keras inception v3 model. -keras_inception_v3 = tf.keras.applications.inception_v3.InceptionV3(weights=None) -# Compile model with the optimizer, loss, and metrics you'd like to train with. -keras_inception_v3.compile(optimizer=tf.keras.optimizers.SGD(lr=0.0001, momentum=0.9), - loss='categorical_crossentropy', - metrics=['accuracy']) -# Create an Estimator from the compiled Keras model. Note the initial model -# state of the keras model is preserved in the created Estimator. -est_inception_v3 = tf.keras.estimator.model_to_estimator(keras_model=keras_inception_v3) - -# Treat the derived Estimator as you would with any other Estimator. -# First, recover the input name(s) of Keras model, so we can use them as the -# feature column name(s) of the Estimator input function: -keras_inception_v3.input_names # print out: ['input_1'] -# Once we have the input name(s), we can create the input function, for example, -# for input(s) in the format of numpy ndarray: -train_input_fn = tf.estimator.inputs.numpy_input_fn( - x={"input_1": train_data}, - y=train_labels, - num_epochs=1, - shuffle=False) -# To train, we call Estimator's train function: -est_inception_v3.train(input_fn=train_input_fn, steps=2000) -``` -Note that the names of feature columns and labels of a keras estimator come from -the corresponding compiled keras model. For example, the input key names for -`train_input_fn` above can be obtained from `keras_inception_v3.input_names`, -and similarly, the predicted output names can be obtained from -`keras_inception_v3.output_names`. - -For more details, please refer to the documentation for -`tf.keras.estimator.model_to_estimator`.
diff --git a/tensorflow/docs_src/guide/faq.md b/tensorflow/docs_src/guide/faq.md deleted file mode 100644 index a02635ebba..0000000000 --- a/tensorflow/docs_src/guide/faq.md +++ /dev/null @@ -1,296 +0,0 @@ -# Frequently Asked Questions - -This document provides answers to some of the frequently asked questions about -TensorFlow. If you have a question that is not covered here, you might find an -answer on one of the TensorFlow [community resources](../about/index.md). - -[TOC] - -## Features and Compatibility - -#### Can I run distributed training on multiple computers? - -Yes! TensorFlow gained -[support for distributed computation](../deploy/distributed.md) in -version 0.8. TensorFlow now supports multiple devices (CPUs and GPUs) in one or -more computers. - -#### Does TensorFlow work with Python 3? - -As of the 0.6.0 release timeframe (Early December 2015), we do support Python -3.3+. - -## Building a TensorFlow graph - -See also the -[API documentation on building graphs](../api_guides/python/framework.md). - -#### Why does `c = tf.matmul(a, b)` not execute the matrix multiplication immediately? - -In the TensorFlow Python API, `a`, `b`, and `c` are -`tf.Tensor` objects. A `Tensor` object is -a symbolic handle to the result of an operation, but does not actually hold the -values of the operation's output. Instead, TensorFlow encourages users to build -up complicated expressions (such as entire neural networks and its gradients) as -a dataflow graph. You then offload the computation of the entire dataflow graph -(or a subgraph of it) to a TensorFlow -`tf.Session`, which is able to execute the -whole computation much more efficiently than executing the operations -one-by-one. - -#### How are devices named? - -The supported device names are `"/device:CPU:0"` (or `"/cpu:0"`) for the CPU -device, and `"/device:GPU:i"` (or `"/gpu:i"`) for the *i*th GPU device. - -#### How do I place operations on a particular device? 
- -To place a group of operations on a device, create them within a -`tf.device` context. See -the how-to documentation on -[using GPUs with TensorFlow](../guide/using_gpu.md) for details of how -TensorFlow assigns operations to devices, and the -[CIFAR-10 tutorial](../tutorials/images/deep_cnn.md) for an example model that -uses multiple GPUs. - - -## Running a TensorFlow computation - -See also the -[API documentation on running graphs](../api_guides/python/client.md). - -#### What's the deal with feeding and placeholders? - -Feeding is a mechanism in the TensorFlow Session API that allows you to -substitute different values for one or more tensors at run time. The `feed_dict` -argument to `tf.Session.run` is a -dictionary that maps `tf.Tensor` objects to -numpy arrays (and some other types), which will be used as the values of those -tensors in the execution of a step. - -#### What is the difference between `Session.run()` and `Tensor.eval()`? - -If `t` is a `tf.Tensor` object, -`t.eval()` (`tf.Tensor.eval`) is shorthand for -`sess.run(t)` (`tf.Session.run`), where `sess` is the -current default session (`tf.get_default_session`). The -two following snippets of code are equivalent: - -```python -# Using `Session.run()`. -sess = tf.Session() -c = tf.constant(5.0) -print(sess.run(c)) - -# Using `Tensor.eval()`. -c = tf.constant(5.0) -with tf.Session(): - print(c.eval()) -``` - -In the second example, the session acts as a -[context manager](https://docs.python.org/2.7/reference/compound_stmts.html#with), -which has the effect of installing it as the default session for the lifetime of -the `with` block. The context manager approach can lead to more concise code for -simple use cases (like unit tests); if your code deals with multiple graphs and -sessions, it may be more straightforward to make explicit calls to -`Session.run()`. - -#### Do Sessions have a lifetime? What about intermediate tensors? - -Sessions can own resources, such as -`tf.Variable`, -`tf.QueueBase`, and -`tf.ReaderBase`.
These resources can sometimes use -a significant amount of memory, and can be released when the session is closed by calling -`tf.Session.close`. - -The intermediate tensors that are created as part of a call to -[`Session.run()`](../api_guides/python/client.md) will be freed at or before the -end of the call. - -#### Does the runtime parallelize parts of graph execution? - -The TensorFlow runtime parallelizes graph execution across many different -dimensions: - -* The individual ops have parallel implementations, using multiple cores in a - CPU, or multiple threads in a GPU. -* Independent nodes in a TensorFlow graph can run in parallel on multiple - devices, which makes it possible to speed up - [CIFAR-10 training using multiple GPUs](../tutorials/images/deep_cnn.md). -* The Session API allows multiple concurrent steps (i.e. calls to - `tf.Session.run` in parallel). This - enables the runtime to get higher throughput, if a single step does not use - all of the resources in your computer. - -#### Which client languages are supported in TensorFlow? - -TensorFlow is designed to support multiple client languages. -Currently, the best-supported client language is [Python](../api_docs/python/index.md). Experimental interfaces for -executing and constructing graphs are also available for -[C++](../api_docs/cc/index.md), [Java](../api_docs/java/reference/org/tensorflow/package-summary.html) and [Go](https://godoc.org/github.com/tensorflow/tensorflow/tensorflow/go). - -TensorFlow also has a -[C-based client API](https://www.tensorflow.org/code/tensorflow/c/c_api.h) -to help build support for more client languages. We invite contributions of new -language bindings. 
- -Bindings for various other languages (such as [C#](https://github.com/migueldeicaza/TensorFlowSharp), [Julia](https://github.com/malmaud/TensorFlow.jl), [Ruby](https://github.com/somaticio/tensorflow.rb) and [Scala](https://github.com/eaplatanios/tensorflow_scala)) created and supported by the open source community build on top of the C API supported by the TensorFlow maintainers. - -#### Does TensorFlow make use of all the devices (GPUs and CPUs) available on my machine? - -TensorFlow supports multiple GPUs and CPUs. See the how-to documentation on -[using GPUs with TensorFlow](../guide/using_gpu.md) for details of how -TensorFlow assigns operations to devices, and the -[CIFAR-10 tutorial](../tutorials/images/deep_cnn.md) for an example model that -uses multiple GPUs. - -Note that TensorFlow only uses GPU devices with a compute capability greater -than 3.5. - -#### Why does `Session.run()` hang when using a reader or a queue? - -The `tf.ReaderBase` and -`tf.QueueBase` classes provide special operations that -can *block* until input (or free space in a bounded queue) becomes -available. These operations allow you to build sophisticated -[input pipelines](../api_guides/python/reading_data.md), at the cost of making the -TensorFlow computation somewhat more complicated. See the how-to documentation -for -[using `QueueRunner` objects to drive queues and readers](../api_guides/python/reading_data.md#creating_threads_to_prefetch_using_queuerunner_objects) -for more information on how to use them. - -## Variables - -See also the how-to documentation on [variables](../guide/variables.md) and -[the API documentation for variables](../api_guides/python/state_ops.md). - -#### What is the lifetime of a variable? - -A variable is created when you first run the -`tf.Variable.initializer` -operation for that variable in a session. It is destroyed when that -session is closed with `tf.Session.close`. 
- -#### How do variables behave when they are concurrently accessed?
- -Variables allow concurrent read and write operations. The value read from a -variable may change if it is concurrently updated. By default, concurrent -assignment operations to a variable are allowed to run with no mutual exclusion. -To acquire a lock when assigning to a variable, pass `use_locking=True` to -`tf.Variable.assign`. - -## Tensor shapes - -See also the -`tf.TensorShape`. - -#### How can I determine the shape of a tensor in Python? - -In TensorFlow, a tensor has both a static (inferred) shape and a dynamic (true) -shape. The static shape can be read using the -`tf.Tensor.get_shape` -method: this shape is inferred from the operations that were used to create the -tensor, and may be partially complete (the static-shape may contain `None`). If -the static shape is not fully defined, the dynamic shape of a `tf.Tensor`, `t` -can be determined using `tf.shape(t)`. - -#### What is the difference between `x.set_shape()` and `x = tf.reshape(x)`? - -The `tf.Tensor.set_shape` method updates -the static shape of a `Tensor` object, and it is typically used to provide -additional shape information when this cannot be inferred directly. It does not -change the dynamic shape of the tensor. - -The `tf.reshape` operation creates -a new tensor with a different dynamic shape. - -#### How do I build a graph that works with variable batch sizes? - -It is often useful to build a graph that works with variable batch sizes -so that the same code can be used for (mini-)batch training, and -single-instance inference. The resulting graph can be -serialized with `tf.Graph.as_graph_def` -and imported into another program with -`tf.import_graph_def`. - -When building a variable-size graph, the most important thing to remember is not -to encode the batch size as a Python constant, but instead to use a symbolic -`Tensor` to represent it.
The following tips may be useful: - -* Use [`batch_size = tf.shape(input)[0]`](../api_docs/python/array_ops.md#shape) - to extract the batch dimension from a `Tensor` called `input`, and store it in - a `Tensor` called `batch_size`. - -* Use `tf.reduce_mean` instead - of `tf.reduce_sum(...) / batch_size`. - - -## TensorBoard - -#### How can I visualize a TensorFlow graph? - -See the [graph visualization tutorial](../guide/graph_viz.md). - -#### What is the simplest way to send data to TensorBoard? - -Add summary ops to your TensorFlow graph, and write -these summaries to a log directory. Then, start TensorBoard using - - python tensorflow/tensorboard/tensorboard.py --logdir=path/to/log-directory - -For more details, see the -[Summaries and TensorBoard tutorial](../guide/summaries_and_tensorboard.md). - -#### Every time I launch TensorBoard, I get a network security popup! - -You can change TensorBoard to serve on localhost rather than '0.0.0.0' by -the flag --host=localhost. This should quiet any security warnings. - -## Extending TensorFlow - -See the how-to documentation for -[adding a new operation to TensorFlow](../extend/adding_an_op.md). - -#### My data is in a custom format. How do I read it using TensorFlow? - -There are three main options for dealing with data in a custom format. - -The easiest option is to write parsing code in Python that transforms the data -into a numpy array. Then, use `tf.data.Dataset.from_tensor_slices` to -create an input pipeline from the in-memory data. - -If your data doesn't fit in memory, try doing the parsing in the Dataset -pipeline. Start with an appropriate file reader, like -`tf.data.TextLineDataset`. Then convert the dataset by mapping -`tf.data.Dataset.map` appropriate operations over it. -Prefer predefined TensorFlow operations such as `tf.decode_raw`, -`tf.decode_csv`, `tf.parse_example`, or `tf.image.decode_png`. 
- -If your data is not easily parsable with the built-in TensorFlow operations, -consider converting it, offline, to a format that is easily parsable, such -as `tf.python_io.TFRecordWriter` format. - -The most efficient method to customize the parsing behavior is to -[add a new op written in C++](../extend/adding_an_op.md) that parses your -data format. The [guide to handling new data formats](../extend/new_data_formats.md) has -more information about the steps for doing this. - - -## Miscellaneous - -#### What is TensorFlow's coding style convention? - -The TensorFlow Python API adheres to the -[PEP8](https://www.python.org/dev/peps/pep-0008/) conventions.* In -particular, we use `CamelCase` names for classes, and `snake_case` names for -functions, methods, and properties. We also adhere to the -[Google Python style guide](https://google.github.io/styleguide/pyguide.html). - -The TensorFlow C++ code base adheres to the -[Google C++ style guide](https://google.github.io/styleguide/cppguide.html). - -(* With one exception: we use 2-space indentation instead of 4-space -indentation.) - diff --git a/tensorflow/docs_src/guide/feature_columns.md b/tensorflow/docs_src/guide/feature_columns.md deleted file mode 100644 index 3ad41855e4..0000000000 --- a/tensorflow/docs_src/guide/feature_columns.md +++ /dev/null @@ -1,572 +0,0 @@ -# Feature Columns - -This document details feature columns. Think of **feature columns** as the -intermediaries between raw data and Estimators. Feature columns are very rich, -enabling you to transform a diverse range of raw data into formats that -Estimators can use, allowing easy experimentation. - -In [Premade Estimators](../guide/premade_estimators.md), we used the premade -Estimator, `tf.estimator.DNNClassifier` to train a model to -predict different types of Iris flowers from four input features. That example -created only numerical feature columns (of type -`tf.feature_column.numeric_column`). 
Although numerical feature columns model -the lengths of petals and sepals effectively, real world data sets contain all -kinds of features, many of which are non-numerical. - -
- -
-
-Some real-world features (such as longitude) are numerical, but many are not. -
- -## Input to a Deep Neural Network - -What kind of data can a deep neural network operate on? The answer -is, of course, numbers (for example, `tf.float32`). After all, every neuron in -a neural network performs multiplication and addition operations on weights and -input data. Real-life input data, however, often contains non-numerical -(categorical) data. For example, consider a `product_class` feature that can -contain the following three non-numerical values: - -* `kitchenware` -* `electronics` -* `sports` - -ML models generally represent categorical values as simple vectors in which a -1 represents the presence of a value and a 0 represents the absence of a value. -For example, when `product_class` is set to `sports`, an ML model would usually -represent `product_class` as `[0, 0, 1]`, meaning: - -* `0`: `kitchenware` is absent -* `0`: `electronics` is absent -* `1`: `sports` is present - -So, although raw data can be numerical or categorical, an ML model represents -all features as numbers. - -## Feature Columns - -As the following figure suggests, you specify the input to a model through the -`feature_columns` argument of an Estimator (`DNNClassifier` for Iris). -Feature Columns bridge input data (as returned by `input_fn`) with your model. - -
- -
-
-Feature columns bridge raw data with the data your model needs. -
- -To create feature columns, call functions from the -`tf.feature_column` module. This document explains nine of the functions in -that module. As the following figure shows, all nine functions return either a -Categorical-Column or a Dense-Column object, except `bucketized_column`, which -inherits from both classes: - -
- -
-
-Feature column methods fall into two main categories and one hybrid category. -
- -Let's look at these functions in more detail. - -### Numeric column - -The Iris classifier calls the `tf.feature_column.numeric_column` function for -all input features: - - * `SepalLength` - * `SepalWidth` - * `PetalLength` - * `PetalWidth` - -Although `tf.numeric_column` provides optional arguments, calling -`tf.numeric_column` without any arguments, as follows, is a fine way to specify -a numerical value with the default data type (`tf.float32`) as input to your -model: - -```python -# Defaults to a tf.float32 scalar. -numeric_feature_column = tf.feature_column.numeric_column(key="SepalLength") -``` - -To specify a non-default numerical data type, use the `dtype` argument. For -example: - -``` python -# Represent a tf.float64 scalar. -numeric_feature_column = tf.feature_column.numeric_column(key="SepalLength", - dtype=tf.float64) -``` - -By default, a numeric column creates a single value (scalar). Use the shape -argument to specify another shape. For example: - - -```python -# Represent a 10-element vector in which each cell contains a tf.float32. -vector_feature_column = tf.feature_column.numeric_column(key="Bowling", - shape=10) - -# Represent a 10x5 matrix in which each cell contains a tf.float32. -matrix_feature_column = tf.feature_column.numeric_column(key="MyMatrix", - shape=[10,5]) -``` -### Bucketized column - -Often, you don't want to feed a number directly into the model, but instead -split its value into different categories based on numerical ranges. To do so, -create a `tf.feature_column.bucketized_column`. For -example, consider raw data that represents the year a house was built. Instead -of representing that year as a scalar numeric column, we could split the year -into the following four buckets: - -
- -
-
-Dividing year data into four buckets. -
- -The model will represent the buckets as follows: - -|Date Range |Represented as... | -|:----------|:-----------------| -|< 1960 | [1, 0, 0, 0] | -|>= 1960 but < 1980 | [0, 1, 0, 0] | -|>= 1980 but < 2000 | [0, 0, 1, 0] | -|>= 2000 | [0, 0, 0, 1] | - -Why would you want to split a number—a perfectly valid input to your -model—into a categorical value? Well, notice that the categorization splits a -single input number into a four-element vector. Therefore, the model now can -learn _four individual weights_ rather than just one; four weights creates a -richer model than one weight. More importantly, bucketizing enables the model -to clearly distinguish between different year categories since only one of the -elements is set (1) and the other three elements are cleared (0). For example, -when we just use a single number (a year) as input, a linear model can only -learn a linear relationship. So, bucketing provides the model with additional -flexibility that the model can use to learn. - -The following code demonstrates how to create a bucketized feature: - - -```python -# First, convert the raw input to a numeric column. -numeric_feature_column = tf.feature_column.numeric_column("Year") - -# Then, bucketize the numeric column on the years 1960, 1980, and 2000. -bucketized_feature_column = tf.feature_column.bucketized_column( - source_column = numeric_feature_column, - boundaries = [1960, 1980, 2000]) -``` -Note that specifying a _three_-element boundaries vector creates a -_four_-element bucketized vector. - - -### Categorical identity column - -**Categorical identity columns** can be seen as a special case of bucketized -columns. In traditional bucketized columns, each bucket represents a range of -values (for example, from 1960 to 1979). In a categorical identity column, each -bucket represents a single, unique integer. For example, let's say you want to -represent the integer range `[0, 4)`. That is, you want to represent the -integers 0, 1, 2, or 3. 
In this case, the categorical identity mapping looks -like this: - -
- -
-
-A categorical identity column mapping. Note that this is a one-hot -encoding, not a binary numerical encoding. -
- -As with bucketized columns, a model can learn a separate weight for each class -in a categorical identity column. For example, instead of using a string to -represent the `product_class`, let's represent each class with a unique integer -value. That is: - -* `0="kitchenware"` -* `1="electronics"` -* `2="sports"` - -Call `tf.feature_column.categorical_column_with_identity` to implement a -categorical identity column. For example: - -``` python -# Create categorical output for an integer feature named "my_feature_b", -# The values of my_feature_b must be >= 0 and < num_buckets -identity_feature_column = tf.feature_column.categorical_column_with_identity( - key='my_feature_b', - num_buckets=4) # Values [0, 4) - -# In order for the preceding call to work, the input_fn() must return -# a dictionary containing 'my_feature_b' as a key. Furthermore, the values -# assigned to 'my_feature_b' must belong to the set [0, 4). -def input_fn(): - ... - return ({ 'my_feature_a':[7, 9, 5, 2], 'my_feature_b':[3, 1, 2, 2] }, - [Label_values]) -``` - -### Categorical vocabulary column - -We cannot input strings directly to a model. Instead, we must first map strings -to numeric or categorical values. Categorical vocabulary columns provide a good -way to represent strings as a one-hot vector. For example: - -
- -
-
-Mapping string values to vocabulary columns. -
- -As you can see, categorical vocabulary columns are kind of an enum version of -categorical identity columns. TensorFlow provides two different functions to -create categorical vocabulary columns: - -* `tf.feature_column.categorical_column_with_vocabulary_list` -* `tf.feature_column.categorical_column_with_vocabulary_file` - -`categorical_column_with_vocabulary_list` maps each string to an integer based -on an explicit vocabulary list. For example: - -```python -# Given input "feature_name_from_input_fn" which is a string, -# create a categorical feature by mapping the input to one of -# the elements in the vocabulary list. -vocabulary_feature_column = - tf.feature_column.categorical_column_with_vocabulary_list( - key=feature_name_from_input_fn, - vocabulary_list=["kitchenware", "electronics", "sports"]) -``` - -The preceding function is pretty straightforward, but it has a significant -drawback. Namely, there's way too much typing when the vocabulary list is long. -For these cases, call -`tf.feature_column.categorical_column_with_vocabulary_file` instead, which lets -you place the vocabulary words in a separate file. For example: - -```python - -# Given input "feature_name_from_input_fn" which is a string, -# create a categorical feature to our model by mapping the input to one of -# the elements in the vocabulary file -vocabulary_feature_column = - tf.feature_column.categorical_column_with_vocabulary_file( - key=feature_name_from_input_fn, - vocabulary_file="product_class.txt", - vocabulary_size=3) -``` - -`product_class.txt` should contain one line for each vocabulary element. In our -case: - -```None -kitchenware -electronics -sports -``` - -### Hashed Column - -So far, we've worked with a naively small number of categories. For example, -our product_class example has only 3 categories. 
Often though, the number of -categories can be so big that it's not possible to have individual categories -for each vocabulary word or integer because that would consume too much memory. -For these cases, we can instead turn the question around and ask, "How many -categories am I willing to have for my input?" In fact, the -`tf.feature_column.categorical_column_with_hash_bucket` function enables you -to specify the number of categories. For this type of feature column the model -calculates a hash value of the input, then puts it into one of -the `hash_bucket_size` categories using the modulo operator, as in the following -pseudocode: - -```python -# pseudocode -feature_id = hash(raw_feature) % hash_bucket_size -``` - -The code to create the `feature_column` might look something like this: - -``` python -hashed_feature_column = - tf.feature_column.categorical_column_with_hash_bucket( - key = "some_feature", - hash_bucket_size = 100) # The number of categories -``` -At this point, you might rightfully think: "This is crazy!" After all, we are -forcing the different input values to a smaller set of categories. This means -that two probably unrelated inputs will be mapped to the same -category, and consequently mean the same thing to the neural network. The -following figure illustrates this dilemma, showing that kitchenware and sports -both get assigned to category (hash bucket) 12: - -
- -
-
-Representing data with hash buckets. -
- -As with many counterintuitive phenomena in machine learning, it turns out that -hashing often works well in practice. That's because hash categories provide -the model with some separation. The model can use additional features to further -separate kitchenware from sports. - -### Crossed column - -Combining features into a single feature, better known as -[feature crosses](https://developers.google.com/machine-learning/glossary/#feature_cross), -enables the model to learn separate weights for each combination of -features. - -More concretely, suppose we want our model to calculate real estate prices in -Atlanta, GA. Real-estate prices within this city vary greatly depending on -location. Representing latitude and longitude as separate features isn't very -useful in identifying real-estate location dependencies; however, crossing -latitude and longitude into a single feature can pinpoint locations. Suppose we -represent Atlanta as a grid of 100x100 rectangular sections, identifying each -of the 10,000 sections by a feature cross of latitude and longitude. This -feature cross enables the model to train on pricing conditions related to each -individual section, which is a much stronger signal than latitude and longitude -alone. - -The following figure shows our plan, with the latitude & longitude values for -the corners of the city in red text: - -
- -
-
-Map of Atlanta. Imagine this map divided into 10,000 sections of -equal size. -
- -For the solution, we used a combination of the `bucketized_column` we looked at -earlier, with the `tf.feature_column.crossed_column` function. - - - -``` python -def make_dataset(latitude, longitude, labels): - assert latitude.shape == longitude.shape == labels.shape - - features = {'latitude': latitude.flatten(), - 'longitude': longitude.flatten()} - labels=labels.flatten() - - return tf.data.Dataset.from_tensor_slices((features, labels)) - - -# Bucketize the latitude and longitude using the `edges` -latitude_bucket_fc = tf.feature_column.bucketized_column( - tf.feature_column.numeric_column('latitude'), - list(atlanta.latitude.edges)) - -longitude_bucket_fc = tf.feature_column.bucketized_column( - tf.feature_column.numeric_column('longitude'), - list(atlanta.longitude.edges)) - -# Cross the bucketized columns, using 5000 hash bins. -crossed_lat_lon_fc = tf.feature_column.crossed_column( - [latitude_bucket_fc, longitude_bucket_fc], 5000) - -fc = [ - latitude_bucket_fc, - longitude_bucket_fc, - crossed_lat_lon_fc] - -# Build and train the Estimator. -est = tf.estimator.LinearRegressor(fc, ...) -``` - -You may create a feature cross from either of the following: - -* Feature names; that is, names from the `dict` returned from `input_fn`. -* Any categorical column, except `categorical_column_with_hash_bucket` - (since `crossed_column` hashes the input). - -When the feature columns `latitude_bucket_fc` and `longitude_bucket_fc` are -crossed, TensorFlow will create `(latitude_fc, longitude_fc)` pairs for each -example. This would produce a full grid of possibilities as follows: - -``` None - (0,0), (0,1)... (0,99) - (1,0), (1,1)... (1,99) - ... ... ... -(99,0), (99,1)...(99, 99) -``` - -Except that a full grid would only be tractable for inputs with limited -vocabularies. Instead of building this, potentially huge, table of inputs, -the `crossed_column` only builds the number requested by the `hash_bucket_size` -argument. 
The feature column assigns an example to an index by running a hash -function on the tuple of inputs, followed by a modulo operation with -`hash_bucket_size`. - -As discussed earlier, performing the -hash and modulo function limits the number of categories, but can cause category -collisions; that is, multiple (latitude, longitude) feature crosses will end -up in the same hash bucket. In practice though, performing feature crosses -still adds significant value to the learning capability of your models. - -Somewhat counterintuitively, when creating feature crosses, you typically still -should include the original (uncrossed) features in your model (as in the -preceding code snippet). The independent latitude and longitude features help the -model distinguish between examples where a hash collision has occurred in the -crossed feature. - -## Indicator and embedding columns - -Indicator columns and embedding columns never work on features directly, but -instead take categorical columns as input. - -When using an indicator column, we're telling TensorFlow to do exactly what -we've seen in our categorical product_class example. That is, an -**indicator column** treats each category as an element in a one-hot vector, -where the matching category has value 1 and the rest have 0s: - -
- -
-
-Representing data in indicator columns. -
- -Here's how you create an indicator column by calling -`tf.feature_column.indicator_column`: - -``` python -categorical_column = ... # Create any type of categorical column. - -# Represent the categorical column as an indicator column. -indicator_column = tf.feature_column.indicator_column(categorical_column) -``` - -Now, suppose instead of having just three possible classes, we have a million. -Or maybe a billion. For a number of reasons, as the number of categories grows -large, it becomes infeasible to train a neural network using indicator columns. - -We can use an embedding column to overcome this limitation. Instead of -representing the data as a one-hot vector of many dimensions, an -**embedding column** represents that data as a lower-dimensional, ordinary -vector in which each cell can contain any number, not just 0 or 1. By -permitting a richer palette of numbers for every cell, an embedding column -contains far fewer cells than an indicator column. - -Let's look at an example comparing indicator and embedding columns. Suppose our -input examples consist of different words from a limited palette of only 81 -words. Further suppose that the data set provides the following input -words in 4 separate examples: - -* `"dog"` -* `"spoon"` -* `"scissors"` -* `"guitar"` - -In that case, the following figure illustrates the processing path for -embedding columns or indicator columns. - -
- -
-
-An embedding column stores categorical data in a lower-dimensional -vector than an indicator column. (We just placed random numbers into the -embedding vectors; training determines the actual numbers.) -
- -When an example is processed, one of the `categorical_column_with...` functions -maps the example string to a numerical categorical value. For example, a -function maps "spoon" to `[32]`. (The 32 comes from our imagination—the actual -values depend on the mapping function.) You may then represent these numerical -categorical values in either of the following two ways: - -* As an indicator column. A function converts each numeric categorical value - into an 81-element vector (because our palette consists of 81 words), placing - a 1 in the index of the categorical value (0, 32, 79, 80) and a 0 in all the - other positions. - -* As an embedding column. A function uses the numerical categorical values - `(0, 32, 79, 80)` as indices to a lookup table. Each slot in that lookup table - contains a 3-element vector. - -How do the values in the embeddings vectors magically get assigned? Actually, -the assignments happen during training. That is, the model learns the best way -to map your input numeric categorical values to the embeddings vector value in -order to solve your problem. Embedding columns increase your model's -capabilities, since an embeddings vector learns new relationships between -categories from the training data. - -Why is the embedding vector size 3 in our example? Well, the following "formula" -provides a general rule of thumb about the number of embedding dimensions: - -```python -embedding_dimensions = number_of_categories**0.25 -``` - -That is, the embedding vector dimension should be the 4th root of the number of -categories. Since our vocabulary size in this example is 81, the recommended -number of dimensions is 3: - -``` python -3 = 81**0.25 -``` -Note that this is just a general guideline; you can set the number of embedding -dimensions as you please. - -Call `tf.feature_column.embedding_column` to create an `embedding_column` as -suggested by the following snippet: - -``` python -categorical_column = ... 
# Create any categorical column - -# Represent the categorical column as an embedding column. -# This means creating an embedding vector lookup table with one element for each category. -embedding_column = tf.feature_column.embedding_column( - categorical_column=categorical_column, - dimension=embedding_dimensions) -``` - -[Embeddings](../guide/embedding.md) is a significant topic within machine -learning. This information was just to get you started using them as feature -columns. - -## Passing feature columns to Estimators - -As the following list indicates, not all Estimators permit all types of -`feature_columns` argument(s): - -* `tf.estimator.LinearClassifier` and - `tf.estimator.LinearRegressor`: Accept all types of - feature column. -* `tf.estimator.DNNClassifier` and - `tf.estimator.DNNRegressor`: Only accept dense columns. Other - column types must be wrapped in either an `indicator_column` or - `embedding_column`. -* `tf.estimator.DNNLinearCombinedClassifier` and - `tf.estimator.DNNLinearCombinedRegressor`: - * The `linear_feature_columns` argument accepts any feature column type. - * The `dnn_feature_columns` argument only accepts dense columns. - -## Other Sources - -For more examples on feature columns, view the following: - -* The [Low Level Introduction](../guide/low_level_intro.md#feature_columns) demonstrates how to - experiment directly with `feature_columns` using TensorFlow's low level APIs. -* The [Estimator wide and deep learning tutorial](https://github.com/tensorflow/models/tree/master/official/wide_deep) - solves a binary classification problem using `feature_columns` on a variety of - input data types.
- -To learn more about embeddings, see the following: - -* [Deep Learning, NLP, and representations](http://colah.github.io/posts/2014-07-NLP-RNNs-Representations/) - (Chris Olah's blog) -* The TensorFlow [Embedding Projector](http://projector.tensorflow.org) diff --git a/tensorflow/docs_src/guide/graph_viz.md b/tensorflow/docs_src/guide/graph_viz.md deleted file mode 100644 index 23f722bbe7..0000000000 --- a/tensorflow/docs_src/guide/graph_viz.md +++ /dev/null @@ -1,317 +0,0 @@ -# TensorBoard: Graph Visualization - -TensorFlow computation graphs are powerful but complicated. The graph visualization can help you understand and debug them. Here's an example of the visualization at work. - -![Visualization of a TensorFlow graph](https://www.tensorflow.org/images/graph_vis_animation.gif "Visualization of a TensorFlow graph") -*Visualization of a TensorFlow graph.* - -To see your own graph, run TensorBoard pointing it to the log directory of the job, click on the graph tab on the top pane and select the appropriate run using the menu at the upper left corner. For in depth information on how to run TensorBoard and make sure you are logging all the necessary information, see [TensorBoard: Visualizing Learning](../guide/summaries_and_tensorboard.md). - -## Name scoping and nodes - -Typical TensorFlow graphs can have many thousands of nodes--far too many to see -easily all at once, or even to lay out using standard graph tools. To simplify, -variable names can be scoped and the visualization uses this information to -define a hierarchy on the nodes in the graph. By default, only the top of this -hierarchy is shown. 
Here is an example that defines three operations under the -`hidden` name scope using -`tf.name_scope`: - -```python -import tensorflow as tf - -with tf.name_scope('hidden') as scope: - a = tf.constant(5, name='alpha') - W = tf.Variable(tf.random_uniform([1, 2], -1.0, 1.0), name='weights') - b = tf.Variable(tf.zeros([1]), name='biases') -``` - -This results in the following three op names: - -* `hidden/alpha` -* `hidden/weights` -* `hidden/biases` - -By default, the visualization will collapse all three into a node labeled `hidden`. -The extra detail isn't lost. You can double-click, or click -on the orange `+` sign in the top right to expand the node, and then you'll see -three subnodes for `alpha`, `weights` and `biases`. - -Here's a real-life example of a more complicated node in its initial and -expanded states. - - - - - - - - - - -
- Unexpanded name scope - - Expanded name scope -
- Initial view of top-level name scope pool_1. Clicking on the orange + button on the top right or double-clicking on the node itself will expand it. - - Expanded view of pool_1 name scope. Clicking on the orange - button on the top right or double-clicking on the node itself will collapse the name scope. -
- -Grouping nodes by name scopes is critical to making a legible graph. If you're -building a model, name scopes give you control over the resulting visualization. -**The better your name scopes, the better your visualization.** - -The figure above illustrates a second aspect of the visualization. TensorFlow -graphs have two kinds of connections: data dependencies and control -dependencies. Data dependencies show the flow of tensors between two ops and -are shown as solid arrows, while control dependencies use dotted lines. In the -expanded view (right side of the figure above) all the connections are data -dependencies with the exception of the dotted line connecting `CheckNumerics` -and `control_dependency`. - -There's a second trick to simplifying the layout. Most TensorFlow graphs have a -few nodes with many connections to other nodes. For example, many nodes might -have a control dependency on an initialization step. Drawing all edges between -the `init` node and its dependencies would create a very cluttered view. - -To reduce clutter, the visualization separates out all high-degree nodes to an -*auxiliary* area on the right and doesn't draw lines to represent their edges. -Instead of lines, we draw small *node icons* to indicate the connections. -Separating out the auxiliary nodes typically doesn't remove critical -information since these nodes are usually related to bookkeeping functions. -See [Interaction](#interaction) for how to move nodes between the main graph -and the auxiliary area. - - - - - - - - - - -
- conv_1 is part of the main graph - - save is extracted as auxiliary node -
- Node conv_1 is connected to save. Note the little save node icon on its right. - - save has a high degree, and will appear as an auxiliary node. The connection with conv_1 is shown as a node icon on its left. To further reduce clutter, since save has a lot of connections, we show the first 5 and abbreviate the others as ... 12 more. -
- -One last structural simplification is *series collapsing*. Sequential -motifs--that is, nodes whose names differ by a number at the end and have -isomorphic structures--are collapsed into a single *stack* of nodes, as shown -below. For networks with long sequences, this greatly simplifies the view. As -with hierarchical nodes, double-clicking expands the series. See -[Interaction](#interaction) for how to disable/enable series collapsing for a -specific set of nodes. - - - - - - - - - - -
- Sequence of nodes - - Expanded sequence of nodes -
- A collapsed view of a node sequence. - - A small piece of the expanded view, after double-click. -
- -Finally, as one last aid to legibility, the visualization uses special icons -for constants and summary nodes. To summarize, here's a table of node symbols: - -Symbol | Meaning ---- | --- -![Name scope](https://www.tensorflow.org/images/namespace_node.png "Name scope") | *High-level* node representing a name scope. Double-click to expand a high-level node. -![Sequence of unconnected nodes](https://www.tensorflow.org/images/horizontal_stack.png "Sequence of unconnected nodes") | Sequence of numbered nodes that are not connected to each other. -![Sequence of connected nodes](https://www.tensorflow.org/images/vertical_stack.png "Sequence of connected nodes") | Sequence of numbered nodes that are connected to each other. -![Operation node](https://www.tensorflow.org/images/op_node.png "Operation node") | An individual operation node. -![Constant node](https://www.tensorflow.org/images/constant.png "Constant node") | A constant. -![Summary node](https://www.tensorflow.org/images/summary.png "Summary node") | A summary node. -![Data flow edge](https://www.tensorflow.org/images/dataflow_edge.png "Data flow edge") | Edge showing the data flow between operations. -![Control dependency edge](https://www.tensorflow.org/images/control_edge.png "Control dependency edge") | Edge showing the control dependency between operations. -![Reference edge](https://www.tensorflow.org/images/reference_edge.png "Reference edge") | A reference edge showing that the outgoing operation node can mutate the incoming tensor. - -## Interaction {#interaction} - -Navigate the graph by panning and zooming. Click and drag to pan, and use a -scroll gesture to zoom. Double-click on a node, or click on its `+` button, to -expand a name scope that represents a group of operations. To easily keep -track of the current viewpoint when zooming and panning, there is a minimap in -the bottom right corner. - -To close an open node, double-click it again or click its `-` button. 
You can -also click once to select a node. It will turn a darker color, and details -about it and the nodes it connects to will appear in the info card at upper -right corner of the visualization. - - - - - - - - - - -
- Info card of a name scope - - Info card of operation node -
- Info card showing detailed information for the conv2 name scope. The inputs and outputs are combined from the inputs and outputs of the operation nodes inside the name scope. For name scopes no attributes are shown. - - Info card showing detailed information for the DecodeRaw operation node. In addition to inputs and outputs, the card shows the device and the attributes associated with the current operation. -
- -TensorBoard provides several ways to change the visual layout of the graph. This -doesn't change the graph's computational semantics, but it can bring some -clarity to the network's structure. By right clicking on a node or pressing -buttons on the bottom of that node's info card, you can make the following -changes to its layout: - -* Nodes can be moved between the main graph and the auxiliary area. -* A series of nodes can be ungrouped so that the nodes in the series do not -appear grouped together. Ungrouped series can likewise be regrouped. - -Selection can also be helpful in understanding high-degree nodes. Select any -high-degree node, and the corresponding node icons for its other connections -will be selected as well. This makes it easy, for example, to see which nodes -are being saved--and which aren't. - -Clicking on a node name in the info card will select it. If necessary, the -viewpoint will automatically pan so that the node is visible. - -Finally, you can choose two color schemes for your graph, using the color menu -above the legend. The default *Structure View* shows structure: when two -high-level nodes have the same structure, they appear in the same color of the -rainbow. Uniquely structured nodes are gray. There's a second view, which shows -what device the different operations run on. Name scopes are colored -proportionally to the fraction of devices for the operations inside them. - -The images below give an illustration for a piece of a real-life graph. - - - - - - - - - - -
- Color by structure - - Color by device -
- Structure view: The gray nodes have unique structure. The orange conv1 and conv2 nodes have the same structure, and analogously for nodes with other colors. - - Device view: Name scopes are colored proportionally to the fraction of devices of the operation nodes inside them. Here, purple means GPU and the green is CPU. -
- -## Tensor shape information - -When the serialized `GraphDef` includes tensor shapes, the graph visualizer -labels edges with tensor dimensions, and edge thickness reflects total tensor -size. To include tensor shapes in the `GraphDef` pass the actual graph object -(as in `sess.graph`) to the `FileWriter` when serializing the graph. -The images below show the CIFAR-10 model with tensor shape information: - - - - - - - -
- CIFAR-10 model with tensor shape information -
- CIFAR-10 model with tensor shape information. -
- -## Runtime statistics - -Often it is useful to collect runtime metadata for a run, such as total memory -usage, total compute time, and tensor shapes for nodes. The code example below -is a snippet from the train and test section of a modification of the -[Estimators MNIST tutorial](../tutorials/estimators/cnn.md), in which we have -recorded summaries and -runtime statistics. See the -[Summaries Tutorial](../guide/summaries_and_tensorboard.md#serializing-the-data) -for details on how to record summaries. -Full source is [here](https://www.tensorflow.org/code/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py). - -```python - # Train the model, and also write summaries. - # Every 10th step, measure test-set accuracy, and write test summaries - # All other steps, run train_step on training data, & add training summaries - - def feed_dict(train): - """Make a TensorFlow feed_dict: maps data onto Tensor placeholders.""" - if train or FLAGS.fake_data: - xs, ys = mnist.train.next_batch(100, fake_data=FLAGS.fake_data) - k = FLAGS.dropout - else: - xs, ys = mnist.test.images, mnist.test.labels - k = 1.0 - return {x: xs, y_: ys, keep_prob: k} - - for i in range(FLAGS.max_steps): - if i % 10 == 0: # Record summaries and test-set accuracy - summary, acc = sess.run([merged, accuracy], feed_dict=feed_dict(False)) - test_writer.add_summary(summary, i) - print('Accuracy at step %s: %s' % (i, acc)) - else: # Record train set summaries, and train - if i % 100 == 99: # Record execution stats - run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) - run_metadata = tf.RunMetadata() - summary, _ = sess.run([merged, train_step], - feed_dict=feed_dict(True), - options=run_options, - run_metadata=run_metadata) - train_writer.add_run_metadata(run_metadata, 'step%d' % i) - train_writer.add_summary(summary, i) - print('Adding run metadata for', i) - else: # Record a summary - summary, _ = sess.run([merged, train_step], feed_dict=feed_dict(True)) - 
train_writer.add_summary(summary, i) -``` - -This code will emit runtime statistics for every 100th step starting at step 99. - -When you launch tensorboard and go to the Graph tab, you will now see options -under "Session runs" which correspond to the steps where run metadata was added. -Selecting one of these runs will show you the snapshot of the network at that -step, fading out unused nodes. In the controls on the left hand side, you will -be able to color the nodes by total memory or total compute time. Additionally, -clicking on a node will display the exact total memory, compute time, and -tensor output sizes. - - - - - - - - -
- Color by compute time - - Run metadata graph - - Run metadata info card -
diff --git a/tensorflow/docs_src/guide/graphs.md b/tensorflow/docs_src/guide/graphs.md deleted file mode 100644 index c70479dba2..0000000000 --- a/tensorflow/docs_src/guide/graphs.md +++ /dev/null @@ -1,558 +0,0 @@ -# Graphs and Sessions - -TensorFlow uses a **dataflow graph** to represent your computation in terms of -the dependencies between individual operations. This leads to a low-level -programming model in which you first define the dataflow graph, then create a -TensorFlow **session** to run parts of the graph across a set of local and -remote devices. - -This guide will be most useful if you intend to use the low-level programming -model directly. Higher-level APIs such as `tf.estimator.Estimator` and Keras -hide the details of graphs and sessions from the end user, but this guide may -also be useful if you want to understand how these APIs are implemented. - -## Why dataflow graphs? - -![](../images/tensors_flowing.gif) - -[Dataflow](https://en.wikipedia.org/wiki/Dataflow_programming) is a common -programming model for parallel computing. In a dataflow graph, the nodes -represent units of computation, and the edges represent the data consumed or -produced by a computation. For example, in a TensorFlow graph, the `tf.matmul` -operation would correspond to a single node with two incoming edges (the -matrices to be multiplied) and one outgoing edge (the result of the -multiplication). - - - -Dataflow has several advantages that TensorFlow leverages when executing your -programs: - -* **Parallelism.** By using explicit edges to represent dependencies between - operations, it is easy for the system to identify operations that can execute - in parallel. - -* **Distributed execution.** By using explicit edges to represent the values - that flow between operations, it is possible for TensorFlow to partition your - program across multiple devices (CPUs, GPUs, and TPUs) attached to different - machines. 
TensorFlow inserts the necessary communication and coordination - between devices. - -* **Compilation.** TensorFlow's [XLA compiler](../performance/xla/index.md) can - use the information in your dataflow graph to generate faster code, for - example, by fusing together adjacent operations. - -* **Portability.** The dataflow graph is a language-independent representation - of the code in your model. You can build a dataflow graph in Python, store it - in a [SavedModel](../guide/saved_model.md), and restore it in a C++ program for - low-latency inference. - - -## What is a `tf.Graph`? - -A `tf.Graph` contains two relevant kinds of information: - -* **Graph structure.** The nodes and edges of the graph, indicating how - individual operations are composed together, but not prescribing how they - should be used. The graph structure is like assembly code: inspecting it can - convey some useful information, but it does not contain all of the useful - context that source code conveys. - -* **Graph collections.** TensorFlow provides a general mechanism for storing - collections of metadata in a `tf.Graph`. The `tf.add_to_collection` function - enables you to associate a list of objects with a key (where `tf.GraphKeys` - defines some of the standard keys), and `tf.get_collection` enables you to - look up all objects associated with a key. Many parts of the TensorFlow - library use this facility: for example, when you create a `tf.Variable`, it - is added by default to collections representing "global variables" and - "trainable variables". When you later come to create a `tf.train.Saver` or - `tf.train.Optimizer`, the variables in these collections are used as the - default arguments. - - -## Building a `tf.Graph` - -Most TensorFlow programs start with a dataflow graph construction phase. In this -phase, you invoke TensorFlow API functions that construct new `tf.Operation` -(node) and `tf.Tensor` (edge) objects and add them to a `tf.Graph` -instance. 
TensorFlow provides a **default graph** that is an implicit argument -to all API functions in the same context. For example: - -* Calling `tf.constant(42.0)` creates a single `tf.Operation` that produces the - value `42.0`, adds it to the default graph, and returns a `tf.Tensor` that - represents the value of the constant. - -* Calling `tf.matmul(x, y)` creates a single `tf.Operation` that multiplies - the values of `tf.Tensor` objects `x` and `y`, adds it to the default graph, - and returns a `tf.Tensor` that represents the result of the multiplication. - -* Executing `v = tf.Variable(0)` adds to the graph a `tf.Operation` that will - store a writeable tensor value that persists between `tf.Session.run` calls. - The `tf.Variable` object wraps this operation, and can be used [like a - tensor](#tensor-like_objects), which will read the current value of the - stored tensor. The `tf.Variable` object also has methods such as - `tf.Variable.assign` and `tf.Variable.assign_add` that - create `tf.Operation` objects that, when executed, update the stored value. - (See [Variables](../guide/variables.md) for more information about variables.) - -* Calling `tf.train.Optimizer.minimize` will add operations and tensors to the - default graph that calculate gradients, and return a `tf.Operation` that, - when run, will apply those gradients to a set of variables. - -Most programs rely solely on the default graph. However, -see [Programming with multiple graphs](#programming_with_multiple_graphs) for more -advanced use cases. High-level APIs such as the `tf.estimator.Estimator` API -manage the default graph on your behalf, and--for example--may create different -graphs for training and evaluation. - -Note: Calling most functions in the TensorFlow API merely adds operations -and tensors to the default graph, but **does not** perform the actual -computation.
Instead, you compose these functions until you have a `tf.Tensor` -or `tf.Operation` that represents the overall computation--such as performing -one step of gradient descent--and then pass that object to a `tf.Session` to -perform the computation. See the section "Executing a graph in a `tf.Session`" -for more details. - -## Naming operations - -A `tf.Graph` object defines a **namespace** for the `tf.Operation` objects it -contains. TensorFlow automatically chooses a unique name for each operation in -your graph, but giving operations descriptive names can make your program easier -to read and debug. The TensorFlow API provides two ways to override the name of -an operation: - -* Each API function that creates a new `tf.Operation` or returns a new - `tf.Tensor` accepts an optional `name` argument. For example, - `tf.constant(42.0, name="answer")` creates a new `tf.Operation` named - `"answer"` and returns a `tf.Tensor` named `"answer:0"`. If the default graph - already contains an operation named `"answer"`, then TensorFlow would append - `"_1"`, `"_2"`, and so on to the name, in order to make it unique. - -* The `tf.name_scope` function makes it possible to add a **name scope** prefix - to all operations created in a particular context. The current name scope - prefix is a `"/"`-delimited list of the names of all active `tf.name_scope` - context managers. If a name scope has already been used in the current - context, TensorFlow appends `"_1"`, `"_2"`, and so on. For example: - - ```python - c_0 = tf.constant(0, name="c") # => operation named "c" - - # Already-used names will be "uniquified". - c_1 = tf.constant(2, name="c") # => operation named "c_1" - - # Name scopes add a prefix to all operations created in the same context. - with tf.name_scope("outer"): - c_2 = tf.constant(2, name="c") # => operation named "outer/c" - - # Name scopes nest like paths in a hierarchical file system. 
- with tf.name_scope("inner"): - c_3 = tf.constant(3, name="c") # => operation named "outer/inner/c" - - # Exiting a name scope context will return to the previous prefix. - c_4 = tf.constant(4, name="c") # => operation named "outer/c_1" - - # Already-used name scopes will be "uniquified". - with tf.name_scope("inner"): - c_5 = tf.constant(5, name="c") # => operation named "outer/inner_1/c" - ``` - -The graph visualizer uses name scopes to group operations and reduce the visual -complexity of a graph. See [Visualizing your graph](#visualizing-your-graph) for -more information. - -Note that `tf.Tensor` objects are implicitly named after the `tf.Operation` -that produces the tensor as output. A tensor name has the form `"<OP_NAME>:<i>"` -where: - -* `"<OP_NAME>"` is the name of the operation that produces it. -* `"<i>"` is an integer representing the index of that tensor among the - operation's outputs. - -## Placing operations on different devices - -If you want your TensorFlow program to use multiple different devices, the -`tf.device` function provides a convenient way to request that all operations -created in a particular context are placed on the same device (or type of -device). - -A **device specification** has the following form: - -``` -/job:<JOB_NAME>/task:<TASK_INDEX>/device:<DEVICE_TYPE>:<DEVICE_INDEX> -``` - -where: - -* `<JOB_NAME>` is an alpha-numeric string that does not start with a number. -* `<DEVICE_TYPE>` is a registered device type (such as `GPU` or `CPU`). -* `<TASK_INDEX>` is a non-negative integer representing the index of the task - in the job named `<JOB_NAME>`. See `tf.train.ClusterSpec` for an explanation - of jobs and tasks. -* `<DEVICE_INDEX>` is a non-negative integer representing the index of the - device, for example, to distinguish between different GPU devices used in the - same process. - -You do not need to specify every part of a device specification.
For example, -if you are running in a single-machine configuration with a single GPU, you -might use `tf.device` to pin some operations to the CPU and GPU: - -```python -# Operations created outside either context will run on the "best possible" -# device. For example, if you have a GPU and a CPU available, and the operation -# has a GPU implementation, TensorFlow will choose the GPU. -weights = tf.random_normal(...) - -with tf.device("/device:CPU:0"): - # Operations created in this context will be pinned to the CPU. - img = tf.decode_jpeg(tf.read_file("img.jpg")) - -with tf.device("/device:GPU:0"): - # Operations created in this context will be pinned to the GPU. - result = tf.matmul(weights, img) -``` -If you are deploying TensorFlow in a [typical distributed configuration](../deploy/distributed.md), -you might specify the job name and task ID to place variables on -a task in the parameter server job (`"/job:ps"`), and the other operations on -a task in the worker job (`"/job:worker"`): - -```python -with tf.device("/job:ps/task:0"): - weights_1 = tf.Variable(tf.truncated_normal([784, 100])) - biases_1 = tf.Variable(tf.zeros([100])) - -with tf.device("/job:ps/task:1"): - weights_2 = tf.Variable(tf.truncated_normal([100, 10])) - biases_2 = tf.Variable(tf.zeros([10])) - -with tf.device("/job:worker"): - layer_1 = tf.matmul(train_batch, weights_1) + biases_1 - layer_2 = tf.matmul(layer_1, weights_2) + biases_2 -``` - -`tf.device` gives you a lot of flexibility to choose placements for individual -operations or broad regions of a TensorFlow graph. In many cases, there are -simple heuristics that work well. For example, the -`tf.train.replica_device_setter` API can be used with `tf.device` to place -operations for **data-parallel distributed training**.
For example, the -following code fragment shows how `tf.train.replica_device_setter` applies -different placement policies to `tf.Variable` objects and other operations: - -```python -with tf.device(tf.train.replica_device_setter(ps_tasks=3)): - # tf.Variable objects are, by default, placed on tasks in "/job:ps" in a - # round-robin fashion. - w_0 = tf.Variable(...) # placed on "/job:ps/task:0" - b_0 = tf.Variable(...) # placed on "/job:ps/task:1" - w_1 = tf.Variable(...) # placed on "/job:ps/task:2" - b_1 = tf.Variable(...) # placed on "/job:ps/task:0" - - input_data = tf.placeholder(tf.float32) # placed on "/job:worker" - layer_0 = tf.matmul(input_data, w_0) + b_0 # placed on "/job:worker" - layer_1 = tf.matmul(layer_0, w_1) + b_1 # placed on "/job:worker" -``` - -## Tensor-like objects - -Many TensorFlow operations take one or more `tf.Tensor` objects as arguments. -For example, `tf.matmul` takes two `tf.Tensor` objects, and `tf.add_n` takes -a list of `n` `tf.Tensor` objects. For convenience, these functions will accept -a **tensor-like object** in place of a `tf.Tensor`, and implicitly convert it -to a `tf.Tensor` using the `tf.convert_to_tensor` method. Tensor-like objects -include elements of the following types: - -* `tf.Tensor` -* `tf.Variable` -* [`numpy.ndarray`](https://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.html) -* `list` (and lists of tensor-like objects) -* Scalar Python types: `bool`, `float`, `int`, `str` - -You can register additional tensor-like types using -`tf.register_tensor_conversion_function`. - -Note: By default, TensorFlow will create a new `tf.Tensor` each time you use -the same tensor-like object. If the tensor-like object is large (e.g. a -`numpy.ndarray` containing a set of training examples) and you use it multiple -times, you may run out of memory. To avoid this, manually call -`tf.convert_to_tensor` on the tensor-like object once and use the returned -`tf.Tensor` instead. 
- -## Executing a graph in a `tf.Session` - -TensorFlow uses the `tf.Session` class to represent a connection between the -client program---typically a Python program, although a similar interface is -available in other languages---and the C++ runtime. A `tf.Session` object -provides access to devices in the local machine, and remote devices using the -distributed TensorFlow runtime. It also caches information about your -`tf.Graph` so that you can efficiently run the same computation multiple times. - -### Creating a `tf.Session` - -If you are using the low-level TensorFlow API, you can create a `tf.Session` -for the current default graph as follows: - -```python -# Create a default in-process session. -with tf.Session() as sess: - # ... - -# Create a remote session. -with tf.Session("grpc://example.org:2222"): - # ... -``` - -Since a `tf.Session` owns physical resources (such as GPUs and -network connections), it is typically used as a context manager (in a `with` -block) that automatically closes the session when you exit the block. It is -also possible to create a session without using a `with` block, but you should -explicitly call `tf.Session.close` when you are finished with it to free the -resources. - -Note: Higher-level APIs such as `tf.train.MonitoredTrainingSession` or -`tf.estimator.Estimator` will create and manage a `tf.Session` for you. These -APIs accept optional `target` and `config` arguments (either directly, or as -part of a `tf.estimator.RunConfig` object), with the same meaning as -described below. - -`tf.Session.__init__` accepts three optional arguments: - -* **`target`.** If this argument is left empty (the default), the session will - only use devices in the local machine. However, you may also specify a - `grpc://` URL to specify the address of a TensorFlow server, which gives the - session access to all devices on machines that this server controls. See - `tf.train.Server` for details of how to create a TensorFlow - server. 
For example, in the common **between-graph replication** - configuration, the `tf.Session` connects to a `tf.train.Server` in the same - process as the client. The [distributed TensorFlow](../deploy/distributed.md) - deployment guide describes other common scenarios. - -* **`graph`.** By default, a new `tf.Session` will be bound to---and only able - to run operations in---the current default graph. If you are using multiple - graphs in your program (see [Programming with multiple - graphs](#programming_with_multiple_graphs) for more details), you can specify - an explicit `tf.Graph` when you construct the session. - -* **`config`.** This argument allows you to specify a `tf.ConfigProto` that - controls the behavior of the session. For example, some of the configuration - options include: - - * `allow_soft_placement`. Set this to `True` to enable a "soft" device - placement algorithm, which ignores `tf.device` annotations that attempt - to place CPU-only operations on a GPU device, and places them on the CPU - instead. - - * `cluster_def`. When using distributed TensorFlow, this option allows you - to specify what machines to use in the computation, and provide a mapping - between job names, task indices, and network addresses. See - `tf.train.ClusterSpec.as_cluster_def` for details. - - * `graph_options.optimizer_options`. Provides control over the optimizations - that TensorFlow performs on your graph before executing it. - - * `gpu_options.allow_growth`. Set this to `True` to change the GPU memory - allocator so that it gradually increases the amount of memory allocated, - rather than allocating most of the memory at startup. - - -### Using `tf.Session.run` to execute operations - -The `tf.Session.run` method is the main mechanism for running a `tf.Operation` -or evaluating a `tf.Tensor`. You can pass one or more `tf.Operation` or -`tf.Tensor` objects to `tf.Session.run`, and TensorFlow will execute the -operations that are needed to compute the result. 
- -`tf.Session.run` requires you to specify a list of **fetches**, which determine -the return values, and may be a `tf.Operation`, a `tf.Tensor`, or -a [tensor-like type](#tensor-like_objects) such as `tf.Variable`. These fetches -determine what **subgraph** of the overall `tf.Graph` must be executed to -produce the result: this is the subgraph that contains all operations named in -the fetch list, plus all operations whose outputs are used to compute the value -of the fetches. For example, the following code fragment shows how different -arguments to `tf.Session.run` cause different subgraphs to be executed: - -```python -x = tf.constant([[37.0, -23.0], [1.0, 4.0]]) -w = tf.Variable(tf.random_uniform([2, 2])) -y = tf.matmul(x, w) -output = tf.nn.softmax(y) -init_op = w.initializer - -with tf.Session() as sess: - # Run the initializer on `w`. - sess.run(init_op) - - # Evaluate `output`. `sess.run(output)` will return a NumPy array containing - # the result of the computation. - print(sess.run(output)) - - # Evaluate `y` and `output`. Note that `y` will only be computed once, and its - # result used both to return `y_val` and as an input to the `tf.nn.softmax()` - # op. Both `y_val` and `output_val` will be NumPy arrays. - y_val, output_val = sess.run([y, output]) -``` - -`tf.Session.run` also optionally takes a dictionary of **feeds**, which is a -mapping from `tf.Tensor` objects (typically `tf.placeholder` tensors) to -values (typically Python scalars, lists, or NumPy arrays) that will be -substituted for those tensors in the execution. For example: - -```python -# Define a placeholder that expects a vector of three floating-point values, -# and a computation that depends on it. -x = tf.placeholder(tf.float32, shape=[3]) -y = tf.square(x) - -with tf.Session() as sess: - # Feeding a value changes the result that is returned when you evaluate `y`. 
- print(sess.run(y, {x: [1.0, 2.0, 3.0]})) # => "[1.0, 4.0, 9.0]" - print(sess.run(y, {x: [0.0, 0.0, 5.0]})) # => "[0.0, 0.0, 25.0]" - - # Raises `tf.errors.InvalidArgumentError`, because you must feed a value for - # a `tf.placeholder()` when evaluating a tensor that depends on it. - sess.run(y) - - # Raises `ValueError`, because the shape of `37.0` does not match the shape - # of placeholder `x`. - sess.run(y, {x: 37.0}) -``` - -`tf.Session.run` also accepts an optional `options` argument that enables you -to specify options about the call, and an optional `run_metadata` argument that -enables you to collect metadata about the execution. For example, you can use -these options together to collect tracing information about the execution: - -``` -y = tf.matmul([[37.0, -23.0], [1.0, 4.0]], tf.random_uniform([2, 2])) - -with tf.Session() as sess: - # Define options for the `sess.run()` call. - options = tf.RunOptions() - options.output_partition_graphs = True - options.trace_level = tf.RunOptions.FULL_TRACE - - # Define a container for the returned metadata. - metadata = tf.RunMetadata() - - sess.run(y, options=options, run_metadata=metadata) - - # Print the subgraphs that executed on each device. - print(metadata.partition_graphs) - - # Print the timings of each operation that executed. - print(metadata.step_stats) -``` - - -## Visualizing your graph - -TensorFlow includes tools that can help you to understand the code in a graph. -The **graph visualizer** is a component of TensorBoard that renders the -structure of your graph visually in a browser. The easiest way to create a -visualization is to pass a `tf.Graph` when creating the -`tf.summary.FileWriter`: - -```python -# Build your graph. -x = tf.constant([[37.0, -23.0], [1.0, 4.0]]) -w = tf.Variable(tf.random_uniform([2, 2])) -y = tf.matmul(x, w) -# ... -loss = ... 
-train_op = tf.train.AdagradOptimizer(0.01).minimize(loss) - -with tf.Session() as sess: - # `sess.graph` provides access to the graph used in a `tf.Session`. - writer = tf.summary.FileWriter("/tmp/log/...", sess.graph) - - # Perform your computation... - for i in range(1000): - sess.run(train_op) - # ... - - writer.close() -``` - -Note: If you are using a `tf.estimator.Estimator`, the graph (and any -summaries) will be logged automatically to the `model_dir` that you specified -when creating the estimator. - -You can then open the log in `tensorboard`, navigate to the "Graph" tab, and -see a high-level visualization of your graph's structure. Note that a typical -TensorFlow graph---especially training graphs with automatically computed -gradients---has too many nodes to visualize at once. The graph visualizer makes -use of name scopes to group related operations into "super" nodes. You can -click on the orange "+" button on any of these super nodes to expand the -subgraph inside. - -![](../images/mnist_deep.png) - -For more information about visualizing your TensorFlow application with -TensorBoard, see the [TensorBoard guide](./summaries_and_tensorboard.md). - -## Programming with multiple graphs - -Note: When training a model, a common way of organizing your code is to use one -graph for training your model, and a separate graph for evaluating or performing -inference with a trained model. In many cases, the inference graph will be -different from the training graph: for example, techniques like dropout and -batch normalization use different operations in each case. Furthermore, by -default utilities like `tf.train.Saver` use the names of `tf.Variable` objects -(which have names based on an underlying `tf.Operation`) to identify each -variable in a saved checkpoint. When programming this way, you can either use -completely separate Python processes to build and execute the graphs, or you can -use multiple graphs in the same process. 
This section describes how to use -multiple graphs in the same process. - -As noted above, TensorFlow provides a "default graph" that is implicitly passed -to all API functions in the same context. For many applications, a single graph -is sufficient. However, TensorFlow also provides methods for manipulating -the default graph, which can be useful in more advanced use cases. For example: - -* A `tf.Graph` defines the namespace for `tf.Operation` objects: each - operation in a single graph must have a unique name. TensorFlow will - "uniquify" the names of operations by appending `"_1"`, `"_2"`, and so on to - their names if the requested name is already taken. Using multiple explicitly - created graphs gives you more control over what name is given to each - operation. - -* The default graph stores information about every `tf.Operation` and - `tf.Tensor` that was ever added to it. If your program creates a large number - of unconnected subgraphs, it may be more efficient to use a different - `tf.Graph` to build each subgraph, so that unrelated state can be garbage - collected. - -You can install a different `tf.Graph` as the default graph, using the -`tf.Graph.as_default` context manager: - -```python -g_1 = tf.Graph() -with g_1.as_default(): - # Operations created in this scope will be added to `g_1`. - c = tf.constant("Node in g_1") - - # Sessions created in this scope will run operations from `g_1`. - sess_1 = tf.Session() - -g_2 = tf.Graph() -with g_2.as_default(): - # Operations created in this scope will be added to `g_2`. - d = tf.constant("Node in g_2") - -# Alternatively, you can pass a graph when constructing a `tf.Session`: -# `sess_2` will run operations from `g_2`. 
-sess_2 = tf.Session(graph=g_2) - -assert c.graph is g_1 -assert sess_1.graph is g_1 - -assert d.graph is g_2 -assert sess_2.graph is g_2 -``` - -To inspect the current default graph, call `tf.get_default_graph`, which -returns a `tf.Graph` object: - -```python -# Print all of the operations in the default graph. -g = tf.get_default_graph() -print(g.get_operations()) -``` diff --git a/tensorflow/docs_src/guide/index.md b/tensorflow/docs_src/guide/index.md deleted file mode 100644 index 50499582cc..0000000000 --- a/tensorflow/docs_src/guide/index.md +++ /dev/null @@ -1,82 +0,0 @@ -# TensorFlow Guide - -The documents in this unit dive into the details of how TensorFlow -works. The units are as follows: - -## High Level APIs - - * [Keras](../guide/keras.md), TensorFlow's high-level API for building and - training deep learning models. - * [Eager Execution](../guide/eager.md), an API for writing TensorFlow code - imperatively, like you would use Numpy. - * [Importing Data](../guide/datasets.md), easy input pipelines to bring your data into - your TensorFlow program. - * [Estimators](../guide/estimators.md), a high-level API that provides - fully-packaged models ready for large-scale training and production. - -## Estimators - -* [Premade Estimators](../guide/premade_estimators.md), the basics of premade Estimators. -* [Checkpoints](../guide/checkpoints.md), save training progress and resume where you left off. -* [Feature Columns](../guide/feature_columns.md), handle a variety of input data types without changes to the model. -* [Datasets for Estimators](../guide/datasets_for_estimators.md), use `tf.data` to input data. -* [Creating Custom Estimators](../guide/custom_estimators.md), write your own Estimator. - -## Accelerators - - * [Using GPUs](../guide/using_gpu.md) explains how TensorFlow assigns operations to - devices and how you can change the arrangement manually. 
- * [Using TPUs](../guide/using_tpu.md) explains how to modify `Estimator` programs to run on a TPU. - -## Low Level APIs - - * [Introduction](../guide/low_level_intro.md), which introduces the - basics of how you can use TensorFlow outside of the high Level APIs. - * [Tensors](../guide/tensors.md), which explains how to create, - manipulate, and access Tensors--the fundamental object in TensorFlow. - * [Variables](../guide/variables.md), which details how - to represent shared, persistent state in your program. - * [Graphs and Sessions](../guide/graphs.md), which explains: - * dataflow graphs, which are TensorFlow's representation of computations - as dependencies between operations. - * sessions, which are TensorFlow's mechanism for running dataflow graphs - across one or more local or remote devices. - If you are programming with the low-level TensorFlow API, this unit - is essential. If you are programming with a high-level TensorFlow API - such as Estimators or Keras, the high-level API creates and manages - graphs and sessions for you, but understanding graphs and sessions - can still be helpful. - * [Save and Restore](../guide/saved_model.md), which - explains how to save and restore variables and models. - -## ML Concepts - - * [Embeddings](../guide/embedding.md), which introduces the concept - of embeddings, provides a simple example of training an embedding in - TensorFlow, and explains how to view embeddings with the TensorBoard - Embedding Projector. - -## Debugging - - * [TensorFlow Debugger](../guide/debugger.md), which - explains how to use the TensorFlow debugger (tfdbg). - -## TensorBoard - -TensorBoard is a utility to visualize different aspects of machine learning. -The following guides explain how to use TensorBoard: - - * [TensorBoard: Visualizing Learning](../guide/summaries_and_tensorboard.md), - which introduces TensorBoard. 
- * [TensorBoard: Graph Visualization](../guide/graph_viz.md), which - explains how to visualize the computational graph. - * [TensorBoard Histogram Dashboard](../guide/tensorboard_histograms.md), which demonstrates how to - use TensorBoard's histogram dashboard. - - -## Misc - - * [TensorFlow Version Compatibility](../guide/version_compat.md), - which explains backward compatibility guarantees and non-guarantees. - * [Frequently Asked Questions](../guide/faq.md), which contains frequently asked - questions about TensorFlow. diff --git a/tensorflow/docs_src/guide/keras.md b/tensorflow/docs_src/guide/keras.md deleted file mode 100644 index 2330fa03c7..0000000000 --- a/tensorflow/docs_src/guide/keras.md +++ /dev/null @@ -1,623 +0,0 @@ -# Keras - -Keras is a high-level API to build and train deep learning models. It's used for -fast prototyping, advanced research, and production, with three key advantages: - -- *User friendly*
- Keras has a simple, consistent interface optimized for common use cases. It - provides clear and actionable feedback for user errors. -- *Modular and composable*
- Keras models are made by connecting configurable building blocks together, - with few restrictions. -- *Easy to extend*
Write custom building blocks to express new ideas for - research. Create new layers, loss functions, and develop state-of-the-art - models. - -## Import tf.keras - -`tf.keras` is TensorFlow's implementation of the -[Keras API specification](https://keras.io){:.external}. This is a high-level -API to build and train models that includes first-class support for -TensorFlow-specific functionality, such as [eager execution](#eager_execution), -`tf.data` pipelines, and [Estimators](./estimators.md). -`tf.keras` makes TensorFlow easier to use without sacrificing flexibility and -performance. - -To get started, import `tf.keras` as part of your TensorFlow program setup: - -```python -import tensorflow as tf -from tensorflow import keras -``` - -`tf.keras` can run any Keras-compatible code, but keep in mind: - -* The `tf.keras` version in the latest TensorFlow release might not be the same - as the latest `keras` version from PyPI. Check `tf.keras.__version__`. -* When [saving a model's weights](#weights_only), `tf.keras` defaults to the - [checkpoint format](./checkpoints.md). Pass `save_format='h5'` to - use HDF5. - -## Build a simple model - -### Sequential model - -In Keras, you assemble *layers* to build *models*. A model is (usually) a graph -of layers. The most common type of model is a stack of layers: the -`tf.keras.Sequential` model. - -To build a simple, fully-connected network (i.e. multi-layer perceptron): - -```python -model = keras.Sequential() -# Adds a densely-connected layer with 64 units to the model: -model.add(keras.layers.Dense(64, activation='relu')) -# Add another: -model.add(keras.layers.Dense(64, activation='relu')) -# Add a softmax layer with 10 output units: -model.add(keras.layers.Dense(10, activation='softmax')) -``` - -### Configure the layers - -There are many `tf.keras.layers` available with some common constructor -parameters: - -* `activation`: Set the activation function for the layer. 
This parameter is - specified by the name of a built-in function or as a callable object. By - default, no activation is applied. -* `kernel_initializer` and `bias_initializer`: The initialization schemes - that create the layer's weights (kernel and bias). This parameter is a name or - a callable object. This defaults to the `"Glorot uniform"` initializer. -* `kernel_regularizer` and `bias_regularizer`: The regularization schemes - that apply to the layer's weights (kernel and bias), such as L1 or L2 - regularization. By default, no regularization is applied. - -The following instantiates `tf.keras.layers.Dense` layers using constructor -arguments: - -```python -# Create a sigmoid layer: -layers.Dense(64, activation='sigmoid') -# Or: -layers.Dense(64, activation=tf.sigmoid) - -# A linear layer with L1 regularization of factor 0.01 applied to the kernel matrix: -layers.Dense(64, kernel_regularizer=keras.regularizers.l1(0.01)) -# A linear layer with L2 regularization of factor 0.01 applied to the bias vector: -layers.Dense(64, bias_regularizer=keras.regularizers.l2(0.01)) - -# A linear layer with a kernel initialized to a random orthogonal matrix: -layers.Dense(64, kernel_initializer='orthogonal') -# A linear layer with a bias vector initialized to 2.0s: -layers.Dense(64, bias_initializer=keras.initializers.constant(2.0)) -``` - -## Train and evaluate - -### Set up training - -After the model is constructed, configure its learning process by calling the -`compile` method: - -```python -model.compile(optimizer=tf.train.AdamOptimizer(0.001), - loss='categorical_crossentropy', - metrics=['accuracy']) -``` - -`tf.keras.Model.compile` takes three important arguments: - -* `optimizer`: This object specifies the training procedure.
Pass it optimizer - instances from the `tf.train` module, such as - [`AdamOptimizer`](/api_docs/python/tf/train/AdamOptimizer), - [`RMSPropOptimizer`](/api_docs/python/tf/train/RMSPropOptimizer), or - [`GradientDescentOptimizer`](/api_docs/python/tf/train/GradientDescentOptimizer). -* `loss`: The function to minimize during optimization. Common choices include - mean square error (`mse`), `categorical_crossentropy`, and - `binary_crossentropy`. Loss functions are specified by name or by - passing a callable object from the `tf.keras.losses` module. -* `metrics`: Used to monitor training. These are string names or callables from - the `tf.keras.metrics` module. - -The following shows a few examples of configuring a model for training: - -```python -# Configure a model for mean-squared error regression. -model.compile(optimizer=tf.train.AdamOptimizer(0.01), - loss='mse', # mean squared error - metrics=['mae']) # mean absolute error - -# Configure a model for categorical classification. -model.compile(optimizer=tf.train.RMSPropOptimizer(0.01), - loss=keras.losses.categorical_crossentropy, - metrics=[keras.metrics.categorical_accuracy]) -``` - -### Input NumPy data - -For small datasets, use in-memory [NumPy](https://www.numpy.org/){:.external} -arrays to train and evaluate a model. The model is "fit" to the training data -using the `fit` method: - -```python -import numpy as np - -data = np.random.random((1000, 32)) -labels = np.random.random((1000, 10)) - -model.fit(data, labels, epochs=10, batch_size=32) -``` - -`tf.keras.Model.fit` takes three important arguments: - -* `epochs`: Training is structured into *epochs*. An epoch is one iteration over - the entire input data (this is done in smaller batches). -* `batch_size`: When passed NumPy data, the model slices the data into smaller - batches and iterates over these batches during training. This integer - specifies the size of each batch. 
Be aware that the last batch may be smaller - if the total number of samples is not divisible by the batch size. -* `validation_data`: When prototyping a model, you want to easily monitor its - performance on some validation data. Passing this argument—a tuple of inputs - and labels—allows the model to display the loss and metrics in inference mode - for the passed data, at the end of each epoch. - -Here's an example using `validation_data`: - -```python -import numpy as np - -data = np.random.random((1000, 32)) -labels = np.random.random((1000, 10)) - -val_data = np.random.random((100, 32)) -val_labels = np.random.random((100, 10)) - -model.fit(data, labels, epochs=10, batch_size=32, - validation_data=(val_data, val_labels)) -``` - -### Input tf.data datasets - -Use the [Datasets API](./datasets.md) to scale to large datasets -or multi-device training. Pass a `tf.data.Dataset` instance to the `fit` -method: - -```python -# Instantiates a toy dataset instance: -dataset = tf.data.Dataset.from_tensor_slices((data, labels)) -dataset = dataset.batch(32) -dataset = dataset.repeat() - -# Don't forget to specify `steps_per_epoch` when calling `fit` on a dataset. -model.fit(dataset, epochs=10, steps_per_epoch=30) -``` - -Here, the `fit` method uses the `steps_per_epoch` argument—this is the number of -training steps the model runs before it moves to the next epoch. Since the -`Dataset` yields batches of data, this snippet does not require a `batch_size`. 
- -Datasets can also be used for validation: - -```python -dataset = tf.data.Dataset.from_tensor_slices((data, labels)) -dataset = dataset.batch(32).repeat() - -val_dataset = tf.data.Dataset.from_tensor_slices((val_data, val_labels)) -val_dataset = val_dataset.batch(32).repeat() - -model.fit(dataset, epochs=10, steps_per_epoch=30, - validation_data=val_dataset, - validation_steps=3) -``` - -### Evaluate and predict - -The `tf.keras.Model.evaluate` and `tf.keras.Model.predict` methods can use NumPy -data and a `tf.data.Dataset`. - -To *evaluate* the inference-mode loss and metrics for the data provided: - -```python -model.evaluate(x, y, batch_size=32) - -model.evaluate(dataset, steps=30) -``` - -And to *predict* the output of the last layer in inference for the data provided, -as a NumPy array: - -``` -model.predict(x, batch_size=32) - -model.predict(dataset, steps=30) -``` - - -## Build advanced models - -### Functional API - -The `tf.keras.Sequential` model is a simple stack of layers that cannot -represent arbitrary models. Use the -[Keras functional API](https://keras.io/getting-started/functional-api-guide/){:.external} -to build complex model topologies such as: - -* Multi-input models, -* Multi-output models, -* Models with shared layers (the same layer called several times), -* Models with non-sequential data flows (e.g. residual connections). - -Building a model with the functional API works like this: - -1. A layer instance is callable and returns a tensor. -2. Input tensors and output tensors are used to define a `tf.keras.Model` - instance. -3. This model is trained just like the `Sequential` model. - -The following example uses the functional API to build a simple, fully-connected -network: - -```python -inputs = keras.Input(shape=(32,)) # Returns a placeholder tensor - -# A layer instance is callable on a tensor, and returns a tensor. 
-x = keras.layers.Dense(64, activation='relu')(inputs) -x = keras.layers.Dense(64, activation='relu')(x) -predictions = keras.layers.Dense(10, activation='softmax')(x) - -# Instantiate the model given inputs and outputs. -model = keras.Model(inputs=inputs, outputs=predictions) - -# The compile step specifies the training configuration. -model.compile(optimizer=tf.train.RMSPropOptimizer(0.001), - loss='categorical_crossentropy', - metrics=['accuracy']) - -# Trains for 5 epochs -model.fit(data, labels, batch_size=32, epochs=5) -``` - -### Model subclassing - -Build a fully-customizable model by subclassing `tf.keras.Model` and defining -your own forward pass. Create layers in the `__init__` method and set them as -attributes of the class instance. Define the forward pass in the `call` method. - -Model subclassing is particularly useful when -[eager execution](./eager.md) is enabled since the forward pass -can be written imperatively. - -Key Point: Use the right API for the job. While model subclassing offers -flexibility, it comes at a cost of greater complexity and more opportunities for -user errors. If possible, prefer the functional API. - -The following example shows a subclassed `tf.keras.Model` using a custom forward -pass: - -```python -class MyModel(keras.Model): - - def __init__(self, num_classes=10): - super(MyModel, self).__init__(name='my_model') - self.num_classes = num_classes - # Define your layers here. - self.dense_1 = keras.layers.Dense(32, activation='relu') - self.dense_2 = keras.layers.Dense(num_classes, activation='sigmoid') - - def call(self, inputs): - # Define your forward pass here, - # using layers you previously defined (in `__init__`). - x = self.dense_1(inputs) - return self.dense_2(x) - - def compute_output_shape(self, input_shape): - # You need to override this function if you want to use the subclassed model - # as part of a functional-style model. - # Otherwise, this method is optional. 
- shape = tf.TensorShape(input_shape).as_list() - shape[-1] = self.num_classes - return tf.TensorShape(shape) - - -# Instantiates the subclassed model. -model = MyModel(num_classes=10) - -# The compile step specifies the training configuration. -model.compile(optimizer=tf.train.RMSPropOptimizer(0.001), - loss='categorical_crossentropy', - metrics=['accuracy']) - -# Trains for 5 epochs. -model.fit(data, labels, batch_size=32, epochs=5) -``` - - -### Custom layers - -Create a custom layer by subclassing `tf.keras.layers.Layer` and implementing -the following methods: - -* `build`: Create the weights of the layer. Add weights with the `add_weight` - method. -* `call`: Define the forward pass. -* `compute_output_shape`: Specify how to compute the output shape of the layer - given the input shape. -* Optionally, a layer can be serialized by implementing the `get_config` method - and the `from_config` class method. - -Here's an example of a custom layer that implements a `matmul` of an input with -a kernel matrix: - -```python -class MyLayer(keras.layers.Layer): - - def __init__(self, output_dim, **kwargs): - self.output_dim = output_dim - super(MyLayer, self).__init__(**kwargs) - - def build(self, input_shape): - shape = tf.TensorShape((input_shape[1], self.output_dim)) - # Create a trainable weight variable for this layer. 
- self.kernel = self.add_weight(name='kernel', - shape=shape, - initializer='uniform', - trainable=True) - # Be sure to call this at the end - super(MyLayer, self).build(input_shape) - - def call(self, inputs): - return tf.matmul(inputs, self.kernel) - - def compute_output_shape(self, input_shape): - shape = tf.TensorShape(input_shape).as_list() - shape[-1] = self.output_dim - return tf.TensorShape(shape) - - def get_config(self): - base_config = super(MyLayer, self).get_config() - base_config['output_dim'] = self.output_dim - - @classmethod - def from_config(cls, config): - return cls(**config) - - -# Create a model using the custom layer -model = keras.Sequential([MyLayer(10), - keras.layers.Activation('softmax')]) - -# The compile step specifies the training configuration -model.compile(optimizer=tf.train.RMSPropOptimizer(0.001), - loss='categorical_crossentropy', - metrics=['accuracy']) - -# Trains for 5 epochs. -model.fit(data, targets, batch_size=32, epochs=5) -``` - - -## Callbacks - -A callback is an object passed to a model to customize and extend its behavior -during training. You can write your own custom callback, or use the built-in -`tf.keras.callbacks` that include: - -* `tf.keras.callbacks.ModelCheckpoint`: Save checkpoints of your model at - regular intervals. -* `tf.keras.callbacks.LearningRateScheduler`: Dynamically change the learning - rate. -* `tf.keras.callbacks.EarlyStopping`: Interrupt training when validation - performance has stopped improving. -* `tf.keras.callbacks.TensorBoard`: Monitor the model's behavior using - [TensorBoard](./summaries_and_tensorboard.md). 
- -To use a `tf.keras.callbacks.Callback`, pass it to the model's `fit` method: - -```python -callbacks = [ - # Interrupt training if `val_loss` stops improving for over 2 epochs - keras.callbacks.EarlyStopping(patience=2, monitor='val_loss'), - # Write TensorBoard logs to `./logs` directory - keras.callbacks.TensorBoard(log_dir='./logs') -] -model.fit(data, labels, batch_size=32, epochs=5, callbacks=callbacks, - validation_data=(val_data, val_targets)) -``` - - -## Save and restore - -### Weights only - -Save and load the weights of a model using `tf.keras.Model.save_weights`: - -```python -# Save weights to a TensorFlow Checkpoint file -model.save_weights('./my_model') - -# Restore the model's state, -# this requires a model with the same architecture. -model.load_weights('my_model') -``` - -By default, this saves the model's weights in the -[TensorFlow checkpoint](./checkpoints.md) file format. Weights can -also be saved to the Keras HDF5 format (the default for the multi-backend -implementation of Keras): - -```python -# Save weights to a HDF5 file -model.save_weights('my_model.h5', save_format='h5') - -# Restore the model's state -model.load_weights('my_model.h5') -``` - - -### Configuration only - -A model's configuration can be saved—this serializes the model architecture -without any weights. A saved configuration can recreate and initialize the same -model, even without the code that defined the original model. Keras supports -JSON and YAML serialization formats: - -```python -# Serialize a model to JSON format -json_string = model.to_json() - -# Recreate the model (freshly initialized) -fresh_model = keras.models.model_from_json(json_string) - -# Serializes a model to YAML format -yaml_string = model.to_yaml() - -# Recreate the model -fresh_model = keras.models.model_from_yaml(yaml_string) -``` - -Caution: Subclassed models are not serializable because their architecture is -defined by the Python code in the body of the `call` method. 
- - -### Entire model - -The entire model can be saved to a file that contains the weight values, the -model's configuration, and even the optimizer's configuration. This allows you -to checkpoint a model and resume training later—from the exact same -state—without access to the original code. - -```python -# Create a trivial model -model = keras.Sequential([ - keras.layers.Dense(10, activation='softmax', input_shape=(32,)), - keras.layers.Dense(10, activation='softmax') -]) -model.compile(optimizer='rmsprop', - loss='categorical_crossentropy', - metrics=['accuracy']) -model.fit(data, targets, batch_size=32, epochs=5) - - -# Save entire model to a HDF5 file -model.save('my_model.h5') - -# Recreate the exact same model, including weights and optimizer. -model = keras.models.load_model('my_model.h5') -``` - - -## Eager execution - -[Eager execution](./eager.md) is an imperative programming -environment that evaluates operations immediately. This is not required for -Keras, but is supported by `tf.keras` and useful for inspecting your program and -debugging. - -All of the `tf.keras` model-building APIs are compatible with eager execution. -And while the `Sequential` and functional APIs can be used, eager execution -especially benefits *model subclassing* and building *custom layers*—the APIs -that require you to write the forward pass as code (instead of the APIs that -create models by assembling existing layers). - -See the [eager execution guide](./eager.md#build_a_model) for -examples of using Keras models with custom training loops and `tf.GradientTape`. - - -## Distribution - -### Estimators - -The [Estimators](./estimators.md) API is used for training models -for distributed environments. This targets industry use cases such as -distributed training on large datasets that can export a model for production. 
- -A `tf.keras.Model` can be trained with the `tf.estimator` API by converting the -model to a `tf.estimator.Estimator` object with -`tf.keras.estimator.model_to_estimator`. See -[Creating Estimators from Keras models](./estimators.md#creating_estimators_from_keras_models). - -```python -model = keras.Sequential([layers.Dense(10,activation='softmax'), - layers.Dense(10,activation='softmax')]) - -model.compile(optimizer=tf.train.RMSPropOptimizer(0.001), - loss='categorical_crossentropy', - metrics=['accuracy']) - -estimator = keras.estimator.model_to_estimator(model) -``` - -Note: Enable [eager execution](./eager.md) for debugging -[Estimator input functions](./premade_estimators.md#create_input_functions) -and inspecting data. - -### Multiple GPUs - -`tf.keras` models can run on multiple GPUs using -`tf.contrib.distribute.DistributionStrategy`. This API provides distributed -training on multiple GPUs with almost no changes to existing code. - -Currently, `tf.contrib.distribute.MirroredStrategy` is the only supported -distribution strategy. `MirroredStrategy` does in-graph replication with -synchronous training using all-reduce on a single machine. To use -`DistributionStrategy` with Keras, convert the `tf.keras.Model` to a -`tf.estimator.Estimator` with `tf.keras.estimator.model_to_estimator`, then -train the estimator. - -The following example distributes a `tf.keras.Model` across multiple GPUs on a -single machine. - -First, define a simple model: - -```python -model = keras.Sequential() -model.add(keras.layers.Dense(16, activation='relu', input_shape=(10,))) -model.add(keras.layers.Dense(1, activation='sigmoid')) - -optimizer = tf.train.GradientDescentOptimizer(0.2) - -model.compile(loss='binary_crossentropy', optimizer=optimizer) -model.summary() -``` - -Define an *input pipeline*. The `input_fn` returns a `tf.data.Dataset` object -used to distribute the data across multiple devices—with each device processing -a slice of the input batch.
- -```python -def input_fn(): - x = np.random.random((1024, 10)) - y = np.random.randint(2, size=(1024, 1)) - x = tf.cast(x, tf.float32) - dataset = tf.data.Dataset.from_tensor_slices((x, y)) - dataset = dataset.repeat(10) - dataset = dataset.batch(32) - return dataset -``` - -Next, create a `tf.estimator.RunConfig` and set the `train_distribute` argument -to the `tf.contrib.distribute.MirroredStrategy` instance. When creating -`MirroredStrategy`, you can specify a list of devices or set the `num_gpus` -argument. The default uses all available GPUs, like the following: - -```python -strategy = tf.contrib.distribute.MirroredStrategy() -config = tf.estimator.RunConfig(train_distribute=strategy) -``` - -Convert the Keras model to a `tf.estimator.Estimator` instance: - -```python -keras_estimator = keras.estimator.model_to_estimator( - keras_model=model, - config=config, - model_dir='/tmp/model_dir') -``` - -Finally, train the `Estimator` instance by providing the `input_fn` and `steps` -arguments: - -```python -keras_estimator.train(input_fn=input_fn, steps=10) -``` diff --git a/tensorflow/docs_src/guide/leftnav_files b/tensorflow/docs_src/guide/leftnav_files deleted file mode 100644 index 8e227e0c8f..0000000000 --- a/tensorflow/docs_src/guide/leftnav_files +++ /dev/null @@ -1,41 +0,0 @@ -index.md - -### High Level APIs -keras.md -eager.md -datasets.md -estimators.md: Introduction to Estimators - -### Estimators -premade_estimators.md -checkpoints.md -feature_columns.md -datasets_for_estimators.md -custom_estimators.md - -### Accelerators -using_gpu.md -using_tpu.md - -### Low Level APIs -low_level_intro.md -tensors.md -variables.md -graphs.md -saved_model.md -autograph.md : Control flow - -### ML Concepts -embedding.md - -### Debugging -debugger.md - -### TensorBoard -summaries_and_tensorboard.md: Visualizing Learning -graph_viz.md: Graphs -tensorboard_histograms.md: Histograms - -### Misc -version_compat.md -faq.md diff --git 
a/tensorflow/docs_src/guide/low_level_intro.md b/tensorflow/docs_src/guide/low_level_intro.md deleted file mode 100644 index d002f8af0b..0000000000 --- a/tensorflow/docs_src/guide/low_level_intro.md +++ /dev/null @@ -1,604 +0,0 @@ -# Introduction - -This guide gets you started programming in the low-level TensorFlow APIs -(TensorFlow Core), showing you how to: - - * Manage your own TensorFlow program (a `tf.Graph`) and TensorFlow - runtime (a `tf.Session`), instead of relying on Estimators to manage them. - * Run TensorFlow operations, using a `tf.Session`. - * Use high level components ([datasets](#datasets), [layers](#layers), and - [feature_columns](#feature_columns)) in this low level environment. - * Build your own training loop, instead of using the one - [provided by Estimators](../guide/premade_estimators.md). - -We recommend using the higher level APIs to build models when possible. -Knowing TensorFlow Core is valuable for the following reasons: - - * Experimentation and debugging are both more straightforward - when you can use low level TensorFlow operations directly. - * It gives you a mental model of how things work internally when - using the higher level APIs. - -## Setup - -Before using this guide, [install TensorFlow](../install/index.md). - -To get the most out of this guide, you should know the following: - -* How to program in Python. -* At least a little bit about arrays. -* Ideally, something about machine learning. - -Feel free to launch `python` and follow along with this walkthrough. -Run the following lines to set up your Python environment: - -```python -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np -import tensorflow as tf -``` - -## Tensor Values - -The central unit of data in TensorFlow is the **tensor**. A tensor consists of a -set of primitive values shaped into an array of any number of dimensions.
A -tensor's **rank** is its number of dimensions, while its **shape** is a tuple -of integers specifying the array's length along each dimension. Here are some -examples of tensor values: - -```python -3. # a rank 0 tensor; a scalar with shape [], -[1., 2., 3.] # a rank 1 tensor; a vector with shape [3] -[[1., 2., 3.], [4., 5., 6.]] # a rank 2 tensor; a matrix with shape [2, 3] -[[[1., 2., 3.]], [[7., 8., 9.]]] # a rank 3 tensor with shape [2, 1, 3] -``` - -TensorFlow uses numpy arrays to represent tensor **values**. - -## TensorFlow Core Walkthrough - -You might think of TensorFlow Core programs as consisting of two discrete -sections: - -1. Building the computational graph (a `tf.Graph`). -2. Running the computational graph (using a `tf.Session`). - -### Graph - -A **computational graph** is a series of TensorFlow operations arranged into a -graph. The graph is composed of two types of objects. - - * `tf.Operation` (or "ops"): The nodes of the graph. - Operations describe calculations that consume and produce tensors. - * `tf.Tensor`: The edges in the graph. These represent the values - that will flow through the graph. Most TensorFlow functions return - `tf.Tensors`. - -Important: `tf.Tensors` do not have values, they are just handles to elements -in the computation graph. - -Let's build a simple computational graph. The most basic operation is a -constant. The Python function that builds the operation takes a tensor value as -input. The resulting operation takes no inputs. When run, it outputs the -value that was passed to the constructor. 
We can create two floating point -constants `a` and `b` as follows: - -```python -a = tf.constant(3.0, dtype=tf.float32) -b = tf.constant(4.0) # also tf.float32 implicitly -total = a + b -print(a) -print(b) -print(total) -``` - -The print statements produce: - -``` -Tensor("Const:0", shape=(), dtype=float32) -Tensor("Const_1:0", shape=(), dtype=float32) -Tensor("add:0", shape=(), dtype=float32) -``` - -Notice that printing the tensors does not output the values `3.0`, `4.0`, and -`7.0` as you might expect. The above statements only build the computation -graph. These `tf.Tensor` objects just represent the results of the operations -that will be run. - -Each operation in a graph is given a unique name. This name is independent of -the names the objects are assigned to in Python. Tensors are named after the -operation that produces them followed by an output index, as in -`"add:0"` above. - -### TensorBoard - -TensorFlow provides a utility called TensorBoard. One of TensorBoard's many -capabilities is visualizing a computation graph. You can easily do this with -a few simple commands. - -First you save the computation graph to a TensorBoard summary file as -follows: - -``` -writer = tf.summary.FileWriter('.') -writer.add_graph(tf.get_default_graph()) -``` - -This will produce an `event` file in the current directory with a name in the -following format: - -``` -events.out.tfevents.{timestamp}.{hostname} -``` - -Now, in a new terminal, launch TensorBoard with the following shell command: - -```bsh -tensorboard --logdir . -``` - -Then open TensorBoard's [graphs page](http://localhost:6006/#graphs) in your -browser, and you should see a graph similar to the following: - -![TensorBoard screenshot](https://www.tensorflow.org/images/getting_started_add.png) - -For more about TensorBoard's graph visualization tools see [TensorBoard: Graph Visualization](../guide/graph_viz.md). 
- -### Session - -To evaluate tensors, instantiate a `tf.Session` object, informally known as a -**session**. A session encapsulates the state of the TensorFlow runtime, and -runs TensorFlow operations. If a `tf.Graph` is like a `.py` file, a `tf.Session` -is like the `python` executable. - -The following code creates a `tf.Session` object and then invokes its `run` -method to evaluate the `total` tensor we created above: - -```python -sess = tf.Session() -print(sess.run(total)) -``` - -When you request the output of a node with `Session.run` TensorFlow backtracks -through the graph and runs all the nodes that provide input to the requested -output node. So this prints the expected value of 7.0: - -``` -7.0 -``` - -You can pass multiple tensors to `tf.Session.run`. The `run` method -transparently handles any combination of tuples or dictionaries, as in the -following example: - -```python -print(sess.run({'ab':(a, b), 'total':total})) -``` - -which returns the results in a structure of the same layout: - -``` None -{'total': 7.0, 'ab': (3.0, 4.0)} -``` - -During a call to `tf.Session.run` any `tf.Tensor` only has a single value. -For example, the following code calls `tf.random_uniform` to produce a -`tf.Tensor` that generates a random 3-element vector (with values in `[0,1)`): - -```python -vec = tf.random_uniform(shape=(3,)) -out1 = vec + 1 -out2 = vec + 2 -print(sess.run(vec)) -print(sess.run(vec)) -print(sess.run((out1, out2))) -``` - -The result shows a different random value on each call to `run`, but -a consistent value during a single `run` (`out1` and `out2` receive the same -random input): - -``` -[ 0.52917576 0.64076328 0.68353939] -[ 0.66192627 0.89126778 0.06254101] -( - array([ 1.88408756, 1.87149239, 1.84057522], dtype=float32), - array([ 2.88408756, 2.87149239, 2.84057522], dtype=float32) -) -``` - -Some TensorFlow functions return `tf.Operations` instead of `tf.Tensors`. -The result of calling `run` on an Operation is `None`. 
You run an operation -to cause a side-effect, not to retrieve a value. Examples of this include the -[initialization](#Initializing Layers), and [training](#Training) ops -demonstrated later. - -### Feeding - -As it stands, this graph is not especially interesting because it always -produces a constant result. A graph can be parameterized to accept external -inputs, known as **placeholders**. A **placeholder** is a promise to provide a -value later, like a function argument. - -```python -x = tf.placeholder(tf.float32) -y = tf.placeholder(tf.float32) -z = x + y -``` - -The preceding three lines are a bit like a function in which we -define two input parameters (`x` and `y`) and then an operation on them. We can -evaluate this graph with multiple inputs by using the `feed_dict` argument of -the `tf.Session.run` method to feed concrete values to the placeholders: - -```python -print(sess.run(z, feed_dict={x: 3, y: 4.5})) -print(sess.run(z, feed_dict={x: [1, 3], y: [2, 4]})) -``` -This results in the following output: - -``` -7.5 -[ 3. 7.] -``` - -Also note that the `feed_dict` argument can be used to overwrite any tensor in -the graph. The only difference between placeholders and other `tf.Tensors` is -that placeholders throw an error if no value is fed to them. - -## Datasets - -Placeholders work for simple experiments, but `tf.data` are the -preferred method of streaming data into a model. - -To get a runnable `tf.Tensor` from a Dataset you must first convert it to a -`tf.data.Iterator`, and then call the Iterator's -`tf.data.Iterator.get_next` method. - -The simplest way to create an Iterator is with the -`tf.data.Dataset.make_one_shot_iterator` method. 
-For example, in the following code the `next_item` tensor will return a row from -the `my_data` array on each `run` call: - -``` python -my_data = [ - [0, 1,], - [2, 3,], - [4, 5,], - [6, 7,], -] -slices = tf.data.Dataset.from_tensor_slices(my_data) -next_item = slices.make_one_shot_iterator().get_next() -``` - -Reaching the end of the data stream causes `Dataset` to throw an -`tf.errors.OutOfRangeError`. For example, the following code -reads the `next_item` until there is no more data to read: - -``` python -while True: - try: - print(sess.run(next_item)) - except tf.errors.OutOfRangeError: - break -``` - -If the `Dataset` depends on stateful operations you may need to -initialize the iterator before using it, as shown below: - -``` python -r = tf.random_normal([10,3]) -dataset = tf.data.Dataset.from_tensor_slices(r) -iterator = dataset.make_initializable_iterator() -next_row = iterator.get_next() - -sess.run(iterator.initializer) -while True: - try: - print(sess.run(next_row)) - except tf.errors.OutOfRangeError: - break -``` - -For more details on Datasets and Iterators see: [Importing Data](../guide/datasets.md). - -## Layers - -A trainable model must modify the values in the graph to get new outputs with -the same input. `tf.layers` are the preferred way to add trainable -parameters to a graph. - -Layers package together both the variables and the operations that act -on them. For example a -[densely-connected layer](https://developers.google.com/machine-learning/glossary/#fully_connected_layer) -performs a weighted sum across all inputs -for each output and applies an optional -[activation function](https://developers.google.com/machine-learning/glossary/#activation_function). -The connection weights and biases are managed by the layer object. - -### Creating Layers - -The following code creates a `tf.layers.Dense` layer that takes a -batch of input vectors, and produces a single output value for each. 
To apply a -layer to an input, call the layer as if it were a function. For example: - -```python -x = tf.placeholder(tf.float32, shape=[None, 3]) -linear_model = tf.layers.Dense(units=1) -y = linear_model(x) -``` - -The layer inspects its input to determine sizes for its internal variables. So -here we must set the shape of the `x` placeholder so that the layer can -build a weight matrix of the correct size. - -Now that we have defined the calculation of the output, `y`, there is one more -detail we need to take care of before we run the calculation. - -### Initializing Layers - -The layer contains variables that must be **initialized** before they can be -used. While it is possible to initialize variables individually, you can easily -initialize all the variables in a TensorFlow graph as follows: - -```python -init = tf.global_variables_initializer() -sess.run(init) -``` - -Important: Calling `tf.global_variables_initializer` only -creates and returns a handle to a TensorFlow operation. That op -will initialize all the global variables when we run it with `tf.Session.run`. - -Also note that this `global_variables_initializer` only initializes variables -that existed in the graph when the initializer was created. So the initializer -should be one of the last things added during graph construction. - -### Executing Layers - -Now that the layer is initialized, we can evaluate the `linear_model`'s output -tensor as we would any other tensor. For example, the following code: - -```python -print(sess.run(y, {x: [[1, 2, 3],[4, 5, 6]]})) -``` - -will generate a two-element output vector such as the following: - -``` -[[-3.41378999] - [-9.14999008]] -``` - -### Layer Function shortcuts - -For each layer class (like `tf.layers.Dense`) TensorFlow also supplies a -shortcut function (like `tf.layers.dense`). The only difference is that the -shortcut function versions create and run the layer in a single call. 
For -example, the following code is equivalent to the earlier version: - -```python -x = tf.placeholder(tf.float32, shape=[None, 3]) -y = tf.layers.dense(x, units=1) - -init = tf.global_variables_initializer() -sess.run(init) - -print(sess.run(y, {x: [[1, 2, 3], [4, 5, 6]]})) -``` - -While convenient, this approach allows no access to the `tf.layers.Layer` -object. This makes introspection and debugging more difficult, -and layer reuse impossible. - -## Feature columns - -The easiest way to experiment with feature columns is using the -`tf.feature_column.input_layer` function. This function only accepts -[dense columns](../guide/feature_columns.md) as inputs, so to view the result -of a categorical column you must wrap it in an -`tf.feature_column.indicator_column`. For example: - -``` python -features = { - 'sales' : [[5], [10], [8], [9]], - 'department': ['sports', 'sports', 'gardening', 'gardening']} - -department_column = tf.feature_column.categorical_column_with_vocabulary_list( - 'department', ['sports', 'gardening']) -department_column = tf.feature_column.indicator_column(department_column) - -columns = [ - tf.feature_column.numeric_column('sales'), - department_column -] - -inputs = tf.feature_column.input_layer(features, columns) -``` - -Running the `inputs` tensor will parse the `features` into a batch of vectors. - -Feature columns can have internal state, like layers, so they often need to be -initialized. Categorical columns use `tf.contrib.lookup` -internally and these require a separate initialization op, -`tf.tables_initializer`. 
- -``` python -var_init = tf.global_variables_initializer() -table_init = tf.tables_initializer() -sess = tf.Session() -sess.run((var_init, table_init)) -``` - -Once the internal state has been initialized you can run `inputs` like any -other `tf.Tensor`: - -```python -print(sess.run(inputs)) -``` - -This shows how the feature columns have packed the input vectors, with the -one-hot "department" as the first two indices and "sales" as the third. - -```None -[[ 1. 0. 5.] - [ 1. 0. 10.] - [ 0. 1. 8.] - [ 0. 1. 9.]] -``` - -## Training - -Now that you're familiar with the basics of core TensorFlow, let's train a -small regression model manually. - -### Define the data - -First let's define some inputs, `x`, and the expected output for each input, -`y_true`: - -```python -x = tf.constant([[1], [2], [3], [4]], dtype=tf.float32) -y_true = tf.constant([[0], [-1], [-2], [-3]], dtype=tf.float32) -``` - -### Define the model - -Next, build a simple linear model, with 1 output: - -``` python -linear_model = tf.layers.Dense(units=1) - -y_pred = linear_model(x) -``` - -You can evaluate the predictions as follows: - -``` python -sess = tf.Session() -init = tf.global_variables_initializer() -sess.run(init) - -print(sess.run(y_pred)) -``` - -The model hasn't yet been trained, so the four "predicted" values aren't very -good. Here's what we got; your own output will almost certainly differ: - -``` None -[[ 0.02631879] - [ 0.05263758] - [ 0.07895637] - [ 0.10527515]] -``` - -### Loss - -To optimize a model, you first need to define the loss. We'll use the mean -square error, a standard loss for regression problems. - -While you could do this manually with lower level math operations, -the `tf.losses` module provides a set of common loss functions. 
You can use it -to calculate the mean square error as follows: - -``` python -loss = tf.losses.mean_squared_error(labels=y_true, predictions=y_pred) - -print(sess.run(loss)) -``` -This will produce a loss value, something like: - -``` None -2.23962 -``` - -### Training - -TensorFlow provides -[**optimizers**](https://developers.google.com/machine-learning/glossary/#optimizer) -implementing standard optimization algorithms. These are implemented as -sub-classes of `tf.train.Optimizer`. They incrementally change each -variable in order to minimize the loss. The simplest optimization algorithm is -[**gradient descent**](https://developers.google.com/machine-learning/glossary/#gradient_descent), -implemented by `tf.train.GradientDescentOptimizer`. It modifies each -variable according to the magnitude of the derivative of loss with respect to -that variable. For example: - -```python -optimizer = tf.train.GradientDescentOptimizer(0.01) -train = optimizer.minimize(loss) -``` - -This code builds all the graph components necessary for the optimization, and -returns a training operation. When run, the training op will update variables -in the graph. You might run it as follows: - -```python -for i in range(100): - _, loss_value = sess.run((train, loss)) - print(loss_value) -``` - -Since `train` is an op, not a tensor, it doesn't return a value when run. -To see the progression of the loss during training, we run the loss tensor at -the same time, producing output like the following: - -``` None -1.35659 -1.00412 -0.759167 -0.588829 -0.470264 -0.387626 -0.329918 -0.289511 -0.261112 -0.241046 -... 
-``` - -### Complete program - -```python -x = tf.constant([[1], [2], [3], [4]], dtype=tf.float32) -y_true = tf.constant([[0], [-1], [-2], [-3]], dtype=tf.float32) - -linear_model = tf.layers.Dense(units=1) - -y_pred = linear_model(x) -loss = tf.losses.mean_squared_error(labels=y_true, predictions=y_pred) - -optimizer = tf.train.GradientDescentOptimizer(0.01) -train = optimizer.minimize(loss) - -init = tf.global_variables_initializer() - -sess = tf.Session() -sess.run(init) -for i in range(100): - _, loss_value = sess.run((train, loss)) - print(loss_value) - -print(sess.run(y_pred)) -``` - -## Next steps - -To learn more about building models with TensorFlow consider the following: - -* [Custom Estimators](../guide/custom_estimators.md), to learn how to build - customized models with TensorFlow. Your knowledge of TensorFlow Core will - help you understand and debug your own models. - -If you want to learn more about the inner workings of TensorFlow consider the -following documents, which go into more depth on many of the topics discussed -here: - -* [Graphs and Sessions](../guide/graphs.md) -* [Tensors](../guide/tensors.md) -* [Variables](../guide/variables.md) - - diff --git a/tensorflow/docs_src/guide/premade_estimators.md b/tensorflow/docs_src/guide/premade_estimators.md deleted file mode 100644 index 9b64d51b98..0000000000 --- a/tensorflow/docs_src/guide/premade_estimators.md +++ /dev/null @@ -1,432 +0,0 @@ -# Premade Estimators - -This document introduces the TensorFlow programming environment and shows you -how to solve the Iris classification problem in TensorFlow. - -## Prerequisites - -Prior to using the sample code in this document, you'll need to do the -following: - -* [Install TensorFlow](../install/index.md). -* If you installed TensorFlow with virtualenv or Anaconda, activate your - TensorFlow environment. 
-* Install or upgrade pandas by issuing the following command: - - pip install pandas - -## Getting the sample code - -Take the following steps to get the sample code we'll be going through: - -1. Clone the TensorFlow Models repository from GitHub by entering the following - command: - - git clone https://github.com/tensorflow/models - -1. Change directory within that branch to the location containing the examples - used in this document: - - cd models/samples/core/get_started/ - -The program described in this document is -[`premade_estimator.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/premade_estimator.py). -This program uses -[`iris_data.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/iris_data.py) -to fetch its training data. - -### Running the program - -You run TensorFlow programs as you would run any Python program. For example: - -``` bsh -python premade_estimator.py -``` - -The program should output training logs followed by some predictions against -the test set. For example, the first line in the following output shows that -the model thinks there is a 99.6% chance that the first example in the test -set is a Setosa. Since the test set expected Setosa, this appears to be -a good prediction. - -``` None -... -Prediction is "Setosa" (99.6%), expected "Setosa" - -Prediction is "Versicolor" (99.8%), expected "Versicolor" - -Prediction is "Virginica" (97.9%), expected "Virginica" -``` - -If the program generates errors instead of answers, ask yourself the following -questions: - -* Did you install TensorFlow properly? -* Are you using the correct version of TensorFlow? -* Did you activate the environment you installed TensorFlow in? (This is - only relevant in certain installation mechanisms.) - -## The programming stack - -Before getting into the details of the program itself, let's investigate the -programming environment. 
As the following illustration shows, TensorFlow -provides a programming stack consisting of multiple API layers: - -
- -
- -We strongly recommend writing TensorFlow programs with the following APIs: - -* [Estimators](../guide/estimators.md), which represent a complete model. - The Estimator API provides methods to train the model, to judge the model's - accuracy, and to generate predictions. -* [Datasets for Estimators](../guide/datasets_for_estimators.md), which build a data input - pipeline. The Dataset API has methods to load and manipulate data, and feed - it into your model. The Dataset API meshes well with the Estimators API. - -## Classifying irises: an overview - -The sample program in this document builds and tests a model that -classifies Iris flowers into three different species based on the size of their -[sepals](https://en.wikipedia.org/wiki/Sepal) and -[petals](https://en.wikipedia.org/wiki/Petal). - -
-Petal geometry compared for three iris species: Iris setosa, Iris virginica, and Iris versicolor -
- -**From left to right, -[*Iris setosa*](https://commons.wikimedia.org/w/index.php?curid=170298) (by -[Radomil](https://commons.wikimedia.org/wiki/User:Radomil), CC BY-SA 3.0), -[*Iris versicolor*](https://commons.wikimedia.org/w/index.php?curid=248095) (by -[Dlanglois](https://commons.wikimedia.org/wiki/User:Dlanglois), CC BY-SA 3.0), -and [*Iris virginica*](https://www.flickr.com/photos/33397993@N05/3352169862) -(by [Frank Mayfield](https://www.flickr.com/photos/33397993@N05), CC BY-SA -2.0).** - -### The data set - -The Iris data set contains four features and one -[label](https://developers.google.com/machine-learning/glossary/#label). -The four features identify the following botanical characteristics of -individual Iris flowers: - -* sepal length -* sepal width -* petal length -* petal width - -Our model will represent these features as `float32` numerical data. - -The label identifies the Iris species, which must be one of the following: - -* Iris setosa (0) -* Iris versicolor (1) -* Iris virginica (2) - -Our model will represent the label as `int32` categorical data. - -The following table shows three examples in the data set: - -|sepal length | sepal width | petal length | petal width| species (label) | -|------------:|------------:|-------------:|-----------:|:---------------:| -| 5.1 | 3.3 | 1.7 | 0.5 | 0 (Setosa) | -| 5.0 | 2.3 | 3.3 | 1.0 | 1 (versicolor)| -| 6.4 | 2.8 | 5.6 | 2.2 | 2 (virginica) | - -### The algorithm - -The program trains a Deep Neural Network classifier model having the following -topology: - -* 2 hidden layers. -* Each hidden layer contains 10 nodes. - -The following figure illustrates the features, hidden layers, and predictions -(not all of the nodes in the hidden layers are shown): - -
-A diagram of the network architecture: Inputs, 2 hidden layers, and outputs -
- -### Inference - -Running the trained model on an unlabeled example yields three predictions, -namely, the likelihood that this flower is the given Iris species. The sum of -those output predictions will be 1.0. For example, the prediction on an -unlabeled example might be something like the following: - -* 0.03 for Iris Setosa -* 0.95 for Iris Versicolor -* 0.02 for Iris Virginica - -The preceding prediction indicates a 95% probability that the given unlabeled -example is an Iris Versicolor. - -## Overview of programming with Estimators - -An Estimator is TensorFlow's high-level representation of a complete model. It -handles the details of initialization, logging, saving and restoring, and many -other features so you can concentrate on your model. For more details see -[Estimators](../guide/estimators.md). - -An Estimator is any class derived from `tf.estimator.Estimator`. TensorFlow -provides a collection of -`tf.estimator` -(for example, `LinearRegressor`) to implement common ML algorithms. Beyond -those, you may write your own -[custom Estimators](../guide/custom_estimators.md). -We recommend using pre-made Estimators when just getting started. - -To write a TensorFlow program based on pre-made Estimators, you must perform the -following tasks: - -* Create one or more input functions. -* Define the model's feature columns. -* Instantiate an Estimator, specifying the feature columns and various - hyperparameters. -* Call one or more methods on the Estimator object, passing the appropriate - input function as the source of the data. - -Let's see how those tasks are implemented for Iris classification. - -## Create input functions - -You must create input functions to supply data for training, -evaluating, and prediction. 
- -An **input function** is a function that returns a `tf.data.Dataset` object -which outputs the following two-element tuple: - -* [`features`](https://developers.google.com/machine-learning/glossary/#feature) - A Python dictionary in which: - * Each key is the name of a feature. - * Each value is an array containing all of that feature's values. -* `label` - An array containing the values of the - [label](https://developers.google.com/machine-learning/glossary/#label) for - every example. - -Just to demonstrate the format of the input function, here's a simple -implementation: - -```python -def input_evaluation_set(): - features = {'SepalLength': np.array([6.4, 5.0]), - 'SepalWidth': np.array([2.8, 2.3]), - 'PetalLength': np.array([5.6, 3.3]), - 'PetalWidth': np.array([2.2, 1.0])} - labels = np.array([2, 1]) - return features, labels -``` - -Your input function may generate the `features` dictionary and `label` list any -way you like. However, we recommend using TensorFlow's Dataset API, which can -parse all sorts of data. At a high level, the Dataset API consists of the -following classes: - -
-A diagram showing subclasses of the Dataset class -
- -Where the individual members are: - -* `Dataset` - Base class containing methods to create and transform - datasets. Also allows you to initialize a dataset from data in memory, or from - a Python generator. -* `TextLineDataset` - Reads lines from text files. -* `TFRecordDataset` - Reads records from TFRecord files. -* `FixedLengthRecordDataset` - Reads fixed size records from binary files. -* `Iterator` - Provides a way to access one data set element at a time. - -The Dataset API can handle a lot of common cases for you. For example, -using the Dataset API, you can easily read in records from a large collection -of files in parallel and join them into a single stream. - -To keep things simple in this example we are going to load the data with -[pandas](https://pandas.pydata.org/), and build our input pipeline from this -in-memory data. - -Here is the input function used for training in this program, which is available -in [`iris_data.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/iris_data.py): - -``` python -def train_input_fn(features, labels, batch_size): - """An input function for training""" - # Convert the inputs to a Dataset. - dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels)) - - # Shuffle, repeat, and batch the examples. - return dataset.shuffle(1000).repeat().batch(batch_size) -``` - -## Define the feature columns - -A [**feature column**](https://developers.google.com/machine-learning/glossary/#feature_columns) -is an object describing how the model should use raw input data from the -features dictionary. When you build an Estimator model, you pass it a list of -feature columns that describes each of the features you want the model to use. -The `tf.feature_column` module provides many options for representing data -to the model. 
- -For Iris, the 4 raw features are numeric values, so we'll build a list of -feature columns to tell the Estimator model to represent each of the four -features as 32-bit floating-point values. Therefore, the code to create the -feature column is: - -```python -# Feature columns describe how to use the input. -my_feature_columns = [] -for key in train_x.keys(): - my_feature_columns.append(tf.feature_column.numeric_column(key=key)) -``` - -Feature columns can be far more sophisticated than those we're showing here. We -detail feature columns [later on](../guide/feature_columns.md) in our Getting -Started guide. - -Now that we have the description of how we want the model to represent the raw -features, we can build the estimator. - - -## Instantiate an estimator - -The Iris problem is a classic classification problem. Fortunately, TensorFlow -provides several pre-made classifier Estimators, including: - -* `tf.estimator.DNNClassifier` for deep models that perform multi-class - classification. -* `tf.estimator.DNNLinearCombinedClassifier` for wide & deep models. -* `tf.estimator.LinearClassifier` for classifiers based on linear models. - -For the Iris problem, `tf.estimator.DNNClassifier` seems like the best choice. -Here's how we instantiated this Estimator: - -```python -# Build a DNN with 2 hidden layers and 10 nodes in each hidden layer. -classifier = tf.estimator.DNNClassifier( - feature_columns=my_feature_columns, - # Two hidden layers of 10 nodes each. - hidden_units=[10, 10], - # The model must choose between 3 classes. - n_classes=3) -``` - -## Train, Evaluate, and Predict - -Now that we have an Estimator object, we can call methods to do the following: - -* Train the model. -* Evaluate the trained model. -* Use the trained model to make predictions. - -### Train the model - -Train the model by calling the Estimator's `train` method as follows: - -```python -# Train the Model. 
-classifier.train( - input_fn=lambda:iris_data.train_input_fn(train_x, train_y, args.batch_size), - steps=args.train_steps) -``` - -Here we wrap up our `input_fn` call in a -[`lambda`](https://docs.python.org/3/tutorial/controlflow.html) -to capture the arguments while providing an input function that takes no -arguments, as expected by the Estimator. The `steps` argument tells the method -to stop training after a number of training steps. - -### Evaluate the trained model - -Now that the model has been trained, we can get some statistics on its -performance. The following code block evaluates the accuracy of the trained -model on the test data: - -```python -# Evaluate the model. -eval_result = classifier.evaluate( - input_fn=lambda:iris_data.eval_input_fn(test_x, test_y, args.batch_size)) - -print('\nTest set accuracy: {accuracy:0.3f}\n'.format(**eval_result)) -``` - -Unlike our call to the `train` method, we did not pass the `steps` -argument to evaluate. Our `eval_input_fn` only yields a single -[epoch](https://developers.google.com/machine-learning/glossary/#epoch) of data. - -Running this code yields the following output (or something similar): - -```none -Test set accuracy: 0.967 -``` - -The `eval_result` dictionary also contains the `average_loss` (mean loss per sample), the `loss` (mean loss per mini-batch) and the value of the estimator's `global_step` (the number of training iterations it underwent). - -### Making predictions (inferring) from the trained model - -We now have a trained model that produces good evaluation results. -We can now use the trained model to predict the species of an Iris flower -based on some unlabeled measurements. 
As with training and evaluation, we make -predictions using a single function call: - -```python -# Generate predictions from the model -expected = ['Setosa', 'Versicolor', 'Virginica'] -predict_x = { - 'SepalLength': [5.1, 5.9, 6.9], - 'SepalWidth': [3.3, 3.0, 3.1], - 'PetalLength': [1.7, 4.2, 5.4], - 'PetalWidth': [0.5, 1.5, 2.1], -} - -predictions = classifier.predict( - input_fn=lambda:iris_data.eval_input_fn(predict_x, - batch_size=args.batch_size)) -``` - -The `predict` method returns a Python iterable, yielding a dictionary of -prediction results for each example. The following code prints a few -predictions and their probabilities: - - -``` python -template = ('\nPrediction is "{}" ({:.1f}%), expected "{}"') - -for pred_dict, expec in zip(predictions, expected): - class_id = pred_dict['class_ids'][0] - probability = pred_dict['probabilities'][class_id] - - print(template.format(iris_data.SPECIES[class_id], - 100 * probability, expec)) -``` - -Running the preceding code yields the following output: - -``` None -... -Prediction is "Setosa" (99.6%), expected "Setosa" - -Prediction is "Versicolor" (99.8%), expected "Versicolor" - -Prediction is "Virginica" (97.9%), expected "Virginica" -``` - - -## Summary - -Pre-made Estimators are an effective way to quickly create standard models. - -Now that you've gotten started writing TensorFlow programs, consider the -following material: - -* [Checkpoints](../guide/checkpoints.md) to learn how to save and restore models. -* [Datasets for Estimators](../guide/datasets_for_estimators.md) to learn more about importing - data into your model. -* [Creating Custom Estimators](../guide/custom_estimators.md) to learn how to - write your own Estimator, customized for a particular problem. 
diff --git a/tensorflow/docs_src/guide/saved_model.md b/tensorflow/docs_src/guide/saved_model.md deleted file mode 100644 index 33ab891861..0000000000 --- a/tensorflow/docs_src/guide/saved_model.md +++ /dev/null @@ -1,999 +0,0 @@ -# Save and Restore - -The `tf.train.Saver` class provides methods to save and restore models. The -`tf.saved_model.simple_save` function is an easy way to build a -`tf.saved_model` suitable for serving. [Estimators](../guide/estimators.md) -automatically save and restore variables in the `model_dir`. - -## Save and restore variables - -TensorFlow [Variables](../guide/variables.md) are the best way to represent shared, persistent state -manipulated by your program. The `tf.train.Saver` constructor adds `save` and -`restore` ops to the graph for all, or a specified list, of the variables in the -graph. The `Saver` object provides methods to run these ops, specifying paths -for the checkpoint files to write to or read from. - -`Saver` restores all variables already defined in your model. If you're -loading a model without knowing how to build its graph (for example, if you're -writing a generic program to load models), then read the -[Overview of saving and restoring models](#models) section -later in this document. - -TensorFlow saves variables in binary *checkpoint files* that map variable -names to tensor values. - -Caution: TensorFlow model files are code. Be careful with untrusted code. -See [Using TensorFlow Securely](https://github.com/tensorflow/tensorflow/blob/master/SECURITY.md) -for details. - -### Save variables - -Create a `Saver` with `tf.train.Saver()` to manage all variables in the -model. For example, the following snippet demonstrates how to call the -`tf.train.Saver.save` method to save variables to checkpoint files: - -```python -# Create some variables. 
-v1 = tf.get_variable("v1", shape=[3], initializer = tf.zeros_initializer) -v2 = tf.get_variable("v2", shape=[5], initializer = tf.zeros_initializer) - -inc_v1 = v1.assign(v1+1) -dec_v2 = v2.assign(v2-1) - -# Add an op to initialize the variables. -init_op = tf.global_variables_initializer() - -# Add ops to save and restore all the variables. -saver = tf.train.Saver() - -# Later, launch the model, initialize the variables, do some work, and save the -# variables to disk. -with tf.Session() as sess: - sess.run(init_op) - # Do some work with the model. - inc_v1.op.run() - dec_v2.op.run() - # Save the variables to disk. - save_path = saver.save(sess, "/tmp/model.ckpt") - print("Model saved in path: %s" % save_path) -``` - -### Restore variables - -The `tf.train.Saver` object not only saves variables to checkpoint files, it -also restores variables. Note that when you restore variables you do not have -to initialize them beforehand. For example, the following snippet demonstrates -how to call the `tf.train.Saver.restore` method to restore variables from the -checkpoint files: - -```python -tf.reset_default_graph() - -# Create some variables. -v1 = tf.get_variable("v1", shape=[3]) -v2 = tf.get_variable("v2", shape=[5]) - -# Add ops to save and restore all the variables. -saver = tf.train.Saver() - -# Later, launch the model, use the saver to restore variables from disk, and -# do some work with the model. -with tf.Session() as sess: - # Restore variables from disk. - saver.restore(sess, "/tmp/model.ckpt") - print("Model restored.") - # Check the values of the variables - print("v1 : %s" % v1.eval()) - print("v2 : %s" % v2.eval()) -``` - -Note: There is not a physical file called `/tmp/model.ckpt`. It is the *prefix* of -filenames created for the checkpoint. Users only interact with the prefix -instead of physical checkpoint files. 
- -### Choose variables to save and restore - -If you do not pass any arguments to `tf.train.Saver()`, the saver handles all -variables in the graph. Each variable is saved under the name that was passed -when the variable was created. - -It is sometimes useful to explicitly specify names for variables in the -checkpoint files. For example, you may have trained a model with a variable -named `"weights"` whose value you want to restore into a variable named -`"params"`. - -It is also sometimes useful to only save or restore a subset of the variables -used by a model. For example, you may have trained a neural net with five -layers, and you now want to train a new model with six layers that reuses the -existing weights of the five trained layers. You can use the saver to restore -the weights of just the first five layers. - -You can easily specify the names and variables to save or load by passing to the -`tf.train.Saver()` constructor either of the following: - -* A list of variables (which will be stored under their own names). -* A Python dictionary in which keys are the names to use and the values are the -variables to manage. - -Continuing from the save/restore examples shown earlier: - -```python -tf.reset_default_graph() -# Create some variables. -v1 = tf.get_variable("v1", [3], initializer = tf.zeros_initializer) -v2 = tf.get_variable("v2", [5], initializer = tf.zeros_initializer) - -# Add ops to save and restore only `v2` using the name "v2" -saver = tf.train.Saver({"v2": v2}) - -# Use the saver object normally after that. -with tf.Session() as sess: - # Initialize v1 since the saver will not. - v1.initializer.run() - saver.restore(sess, "/tmp/model.ckpt") - - print("v1 : %s" % v1.eval()) - print("v2 : %s" % v2.eval()) -``` - -Notes: - -* You can create as many `Saver` objects as you want if you need to save and - restore different subsets of the model variables. 
The same variable can be - listed in multiple saver objects; its value is only changed when the - `Saver.restore()` method is run. - -* If you only restore a subset of the model variables at the start of a - session, you have to run an initialize op for the other variables. See - `tf.variables_initializer` for more information. - -* To inspect the variables in a checkpoint, you can use the - [`inspect_checkpoint`](https://www.tensorflow.org/code/tensorflow/python/tools/inspect_checkpoint.py) - library, particularly the `print_tensors_in_checkpoint_file` function. - -* By default, `Saver` uses the value of the `tf.Variable.name` property - for each variable. However, when you create a `Saver` object, you may - optionally choose names for the variables in the checkpoint files. - - -### Inspect variables in a checkpoint - -We can quickly inspect variables in a checkpoint with the -[`inspect_checkpoint`](https://www.tensorflow.org/code/tensorflow/python/tools/inspect_checkpoint.py) library. - -Continuing from the save/restore examples shown earlier: - -```python -# import the inspect_checkpoint library -from tensorflow.python.tools import inspect_checkpoint as chkp - -# print all tensors in checkpoint file -chkp.print_tensors_in_checkpoint_file("/tmp/model.ckpt", tensor_name='', all_tensors=True) - -# tensor_name: v1 -# [ 1. 1. 1.] -# tensor_name: v2 -# [-1. -1. -1. -1. -1.] - -# print only tensor v1 in checkpoint file -chkp.print_tensors_in_checkpoint_file("/tmp/model.ckpt", tensor_name='v1', all_tensors=False) - -# tensor_name: v1 -# [ 1. 1. 1.] - -# print only tensor v2 in checkpoint file -chkp.print_tensors_in_checkpoint_file("/tmp/model.ckpt", tensor_name='v2', all_tensors=False) - -# tensor_name: v2 -# [-1. -1. -1. -1. -1.] -``` - - - -## Save and restore models - -Use `SavedModel` to save and load your model—variables, the graph, and the -graph's metadata. 
This is a language-neutral, recoverable, hermetic -serialization format that enables higher-level systems and tools to produce, -consume, and transform TensorFlow models. TensorFlow provides several ways to -interact with `SavedModel`, including the `tf.saved_model` APIs, -`tf.estimator.Estimator`, and a command-line interface. - - -## Build and load a SavedModel - -### Simple save - -The easiest way to create a `SavedModel` is to use the `tf.saved_model.simple_save` -function: - -```python -simple_save(session, - export_dir, - inputs={"x": x, "y": y}, - outputs={"z": z}) -``` - -This configures the `SavedModel` so it can be loaded by -[TensorFlow serving](/serving/serving_basic) and supports the -[Predict API](https://github.com/tensorflow/serving/blob/master/tensorflow_serving/apis/predict.proto). -To access the classify, regress, or multi-inference APIs, use the manual -`SavedModel` builder APIs or an `tf.estimator.Estimator`. - -### Manually build a SavedModel - -If your use case isn't covered by `tf.saved_model.simple_save`, use the manual -`tf.saved_model.builder` to create a `SavedModel`. - -The `tf.saved_model.builder.SavedModelBuilder` class provides functionality to -save multiple `MetaGraphDef`s. A **MetaGraph** is a dataflow graph, plus -its associated variables, assets, and signatures. A **`MetaGraphDef`** -is the protocol buffer representation of a MetaGraph. A **signature** is -the set of inputs to and outputs from a graph. - -If assets need to be saved and written or copied to disk, they can be provided -when the first `MetaGraphDef` is added. If multiple `MetaGraphDef`s are -associated with an asset of the same name, only the first version is retained. - -Each `MetaGraphDef` added to the SavedModel must be annotated with -user-specified tags. The tags provide a means to identify the specific -`MetaGraphDef` to load and restore, along with the shared set of variables -and assets. 
These tags -typically annotate a `MetaGraphDef` with its functionality (for example, -serving or training), and optionally with hardware-specific aspects (for -example, GPU). - -For example, the following code suggests a typical way to use -`SavedModelBuilder` to build a SavedModel: - -```python -export_dir = ... -... -builder = tf.saved_model.builder.SavedModelBuilder(export_dir) -with tf.Session(graph=tf.Graph()) as sess: - ... - builder.add_meta_graph_and_variables(sess, - [tag_constants.TRAINING], - signature_def_map=foo_signatures, - assets_collection=foo_assets, - strip_default_attrs=True) -... -# Add a second MetaGraphDef for inference. -with tf.Session(graph=tf.Graph()) as sess: - ... - builder.add_meta_graph([tag_constants.SERVING], strip_default_attrs=True) -... -builder.save() -``` - - -#### Forward compatibility via `strip_default_attrs=True` - -Following the guidance below gives you forward compatibility only if the set of -Ops has not changed. - -The `tf.saved_model.builder.SavedModelBuilder` class allows -users to control whether default-valued attributes must be stripped from the -[`NodeDefs`](../extend/tool_developers/index.md#nodes) -while adding a meta graph to the SavedModel bundle. Both -`tf.saved_model.builder.SavedModelBuilder.add_meta_graph_and_variables` -and `tf.saved_model.builder.SavedModelBuilder.add_meta_graph` -methods accept a Boolean flag `strip_default_attrs` that controls this behavior. - -If `strip_default_attrs` is `False`, the exported `tf.MetaGraphDef` will have -the default valued attributes in all its `tf.NodeDef` instances. -This can break forward compatibility with a sequence of events such as the -following: - -* An existing Op (`Foo`) is updated to include a new attribute (`T`) with a - default (`bool`) at version 101. -* A model producer such as a "trainer binary" picks up this change (version 101) - to the `OpDef` and re-exports an existing model that uses Op `Foo`. 
-* A model consumer (such as [Tensorflow Serving](/serving)) running an older - binary (version 100) doesn't have attribute `T` for Op `Foo`, but tries to - import this model. The model consumer doesn't recognize attribute `T` in a - `NodeDef` that uses Op `Foo` and therefore fails to load the model. -* By setting `strip_default_attrs` to True, the model producers can strip away - any default valued attributes in the `NodeDefs`. This helps ensure that newly - added attributes with defaults don't cause older model consumers to fail - loading models regenerated with newer training binaries. - -See [compatibility guidance](./version_compat.md) -for more information. - -### Loading a SavedModel in Python - -The Python version of the SavedModel -`tf.saved_model.loader` -provides load and restore capability for a SavedModel. The `load` operation -requires the following information: - -* The session in which to restore the graph definition and variables. -* The tags used to identify the MetaGraphDef to load. -* The location (directory) of the SavedModel. - -Upon a load, the subset of variables, assets, and signatures supplied as part of -the specific MetaGraphDef will be restored into the supplied session. - - -```python -export_dir = ... -... -with tf.Session(graph=tf.Graph()) as sess: - tf.saved_model.loader.load(sess, [tag_constants.TRAINING], export_dir) - ... -``` - - -### Load a SavedModel in C++ - -The C++ version of the SavedModel -[loader](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/cc/saved_model/loader.h) -provides an API to load a SavedModel from a path, while allowing -`SessionOptions` and `RunOptions`. -You have to specify the tags associated with the graph to be loaded. -The loaded version of SavedModel is referred to as `SavedModelBundle` -and contains the MetaGraphDef and the session within which it is loaded. - -```c++ -const string export_dir = ... -SavedModelBundle bundle; -... 
-LoadSavedModel(session_options, run_options, export_dir, {kSavedModelTagTrain}, - &bundle); -``` - -### Load and serve a SavedModel in TensorFlow serving - -You can easily load and serve a SavedModel with the TensorFlow Serving Model -Server binary. See [instructions](https://www.tensorflow.org/serving/setup#installing_using_apt-get) -on how to install the server, or build it if you wish. - -Once you have the Model Server, run it with: -``` -tensorflow_model_server --port=port-numbers --model_name=your-model-name --model_base_path=your_model_base_path -``` -Set the port and model_name flags to values of your choosing. The -model_base_path flag expects a base directory, with each version of -your model residing in a numerically named subdirectory. If you only have a -single version of your model, simply place it in a subdirectory like so: -* Place the model in /tmp/model/0001 -* Set model_base_path to /tmp/model - -Store different versions of your model in numerically named subdirectories of a -common base directory. For example, suppose the base directory is `/tmp/model`. -If you have only one version of your model, store it in `/tmp/model/0001`. If -you have two versions of your model, store the second version in -`/tmp/model/0002`, and so on. Set the `--model_base_path` flag to the base -directory (`/tmp/model`, in this example). TensorFlow Model Server will serve -the model in the highest numbered subdirectory of that base directory. - -### Standard constants - -SavedModel offers the flexibility to build and load TensorFlow graphs for a -variety of use-cases. For the most common use-cases, SavedModel's APIs -provide a set of constants in Python and C++ that are easy to -reuse and share across tools consistently. - -#### Standard MetaGraphDef tags - -You may use sets of tags to uniquely identify a `MetaGraphDef` saved in a -SavedModel.
A subset of commonly used tags is specified in: - -* [Python](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/tag_constants.py) -* [C++](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/cc/saved_model/tag_constants.h) - - -#### Standard SignatureDef constants - -A [**SignatureDef**](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/protobuf/meta_graph.proto) -is a protocol buffer that defines the signature of a computation -supported by a graph. -Commonly used input keys, output keys, and method names are -defined in: - -* [Python](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/signature_constants.py) -* [C++](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/cc/saved_model/signature_constants.h) - -## Using SavedModel with Estimators - -After training an `Estimator` model, you may want to create a service -from that model that takes requests and returns a result. You can run such a -service locally on your machine or deploy it in the cloud. - -To prepare a trained Estimator for serving, you must export it in the standard -SavedModel format. This section explains how to: - -* Specify the output nodes and the corresponding - [APIs](https://github.com/tensorflow/serving/blob/master/tensorflow_serving/apis/prediction_service.proto) - that can be served (Classify, Regress, or Predict). -* Export your model to the SavedModel format. -* Serve the model from a local server and request predictions. - - -### Prepare serving inputs - -During training, an [`input_fn()`](../guide/premade_estimators.md#input_fn) ingests data -and prepares it for use by the model. At serving time, similarly, a -`serving_input_receiver_fn()` accepts inference requests and prepares them for -the model. This function has the following purposes: - -* To add placeholders to the graph that the serving system will feed - with inference requests. 
-* To add any additional ops needed to convert data from the input format - into the feature `Tensor`s expected by the model. - -The function returns a `tf.estimator.export.ServingInputReceiver` object, -which packages the placeholders and the resulting feature `Tensor`s together. - -A typical pattern is that inference requests arrive in the form of serialized -`tf.Example`s, so the `serving_input_receiver_fn()` creates a single string -placeholder to receive them. The `serving_input_receiver_fn()` is then also -responsible for parsing the `tf.Example`s by adding a `tf.parse_example` op to -the graph. - -When writing such a `serving_input_receiver_fn()`, you must pass a parsing -specification to `tf.parse_example` to tell the parser what feature names to -expect and how to map them to `Tensor`s. A parsing specification takes the -form of a dict from feature names to `tf.FixedLenFeature`, `tf.VarLenFeature`, -and `tf.SparseFeature`. Note this parsing specification should not include -any label or weight columns, since those will not be available at serving -time—in contrast to a parsing specification used in the `input_fn()` at -training time. - -In combination, then: - -```py -feature_spec = {'foo': tf.FixedLenFeature(...), - 'bar': tf.VarLenFeature(...)} - -def serving_input_receiver_fn(): - """An input receiver that expects a serialized tf.Example.""" - serialized_tf_example = tf.placeholder(dtype=tf.string, - shape=[default_batch_size], - name='input_example_tensor') - receiver_tensors = {'examples': serialized_tf_example} - features = tf.parse_example(serialized_tf_example, feature_spec) - return tf.estimator.export.ServingInputReceiver(features, receiver_tensors) -``` - -The `tf.estimator.export.build_parsing_serving_input_receiver_fn` utility -function provides that input receiver for the common case. 
- -> Note: when training a model to be served using the Predict API with a local -> server, the parsing step is not needed because the model will receive raw -> feature data. - -Even if you require no parsing or other input processing—that is, if the -serving system will feed feature `Tensor`s directly—you must still provide -a `serving_input_receiver_fn()` that creates placeholders for the feature -`Tensor`s and passes them through. The -`tf.estimator.export.build_raw_serving_input_receiver_fn` utility provides for -this. - -If these utilities do not meet your needs, you are free to write your own -`serving_input_receiver_fn()`. One case where this may be needed is if your -training `input_fn()` incorporates some preprocessing logic that must be -recapitulated at serving time. To reduce the risk of training-serving skew, we -recommend encapsulating such processing in a function which is then called -from both `input_fn()` and `serving_input_receiver_fn()`. - -Note that the `serving_input_receiver_fn()` also determines the *input* -portion of the signature. That is, when writing a -`serving_input_receiver_fn()`, you must tell the parser what signatures -to expect and how to map them to your model's expected inputs. -By contrast, the *output* portion of the signature is determined by the model. - - -### Specify the outputs of a custom model - -When writing a custom `model_fn`, you must populate the `export_outputs` element -of the `tf.estimator.EstimatorSpec` return value. This is a dict of -`{name: output}` describing the output signatures to be exported and used during -serving. - -In the usual case of making a single prediction, this dict contains -one element, and the `name` is immaterial. In a multi-headed model, each head -is represented by an entry in this dict. In this case the `name` is a string -of your choice that can be used to request a specific head at serving time. 
- -Each `output` value must be an `ExportOutput` object such as -`tf.estimator.export.ClassificationOutput`, -`tf.estimator.export.RegressionOutput`, or -`tf.estimator.export.PredictOutput`. - -These output types map straightforwardly to the -[TensorFlow Serving APIs](https://github.com/tensorflow/serving/blob/master/tensorflow_serving/apis/prediction_service.proto), -and so determine which request types will be honored. - -Note: In the multi-headed case, a `SignatureDef` will be generated for each -element of the `export_outputs` dict returned from the model_fn, named using -the same keys. These `SignatureDef`s differ only in their outputs, as -provided by the corresponding `ExportOutput` entry. The inputs are always -those provided by the `serving_input_receiver_fn`. -An inference request may specify the head by name. One head must be named -using [`signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY`](https://www.tensorflow.org/code/tensorflow/python/saved_model/signature_constants.py) -indicating which `SignatureDef` will be served when an inference request -does not specify one. - - -### Perform the export - -To export your trained Estimator, call -`tf.estimator.Estimator.export_savedmodel` with the export base path and -the `serving_input_receiver_fn`. - -```py -estimator.export_savedmodel(export_dir_base, serving_input_receiver_fn, - strip_default_attrs=True) -``` - -This method builds a new graph by first calling the -`serving_input_receiver_fn()` to obtain feature `Tensor`s, and then calling -this `Estimator`'s `model_fn()` to generate the model graph based on those -features. It starts a fresh `Session`, and, by default, restores the most recent -checkpoint into it. (A different checkpoint may be passed, if needed.) -Finally it creates a time-stamped export directory below the given -`export_dir_base` (i.e., `export_dir_base/<timestamp>`), and writes a -SavedModel into it containing a single `MetaGraphDef` saved from this -Session.
- -> Note: It is your responsibility to garbage-collect old exports. -> Otherwise, successive exports will accumulate under `export_dir_base`. - -### Serve the exported model locally - -For local deployment, you can serve your model using -[TensorFlow Serving](https://github.com/tensorflow/serving), an open-source project that loads a -SavedModel and exposes it as a [gRPC](https://www.grpc.io/) service. - -First, [install TensorFlow Serving](https://github.com/tensorflow/serving). - -Then build and run the local model server, substituting `$export_dir_base` with -the path to the SavedModel you exported above: - -```sh -bazel build //tensorflow_serving/model_servers:tensorflow_model_server -bazel-bin/tensorflow_serving/model_servers/tensorflow_model_server --port=9000 --model_base_path=$export_dir_base -``` - -Now you have a server listening for inference requests via gRPC on port 9000! - - -### Request predictions from a local server - -The server responds to gRPC requests according to the -[PredictionService](https://github.com/tensorflow/serving/blob/master/tensorflow_serving/apis/prediction_service.proto#L15) -gRPC API service definition. (The nested protocol buffers are defined in -various [neighboring files](https://github.com/tensorflow/serving/blob/master/tensorflow_serving/apis)). - -From the API service definition, the gRPC framework generates client libraries -in various languages providing remote access to the API. 
In a project using the -Bazel build tool, these libraries are built automatically and provided via -dependencies like these (using Python for example): - -```build - deps = [ - "//tensorflow_serving/apis:classification_proto_py_pb2", - "//tensorflow_serving/apis:regression_proto_py_pb2", - "//tensorflow_serving/apis:predict_proto_py_pb2", - "//tensorflow_serving/apis:prediction_service_proto_py_pb2" - ] -``` - -Python client code can then import the libraries thus: - -```py -from tensorflow_serving.apis import classification_pb2 -from tensorflow_serving.apis import regression_pb2 -from tensorflow_serving.apis import predict_pb2 -from tensorflow_serving.apis import prediction_service_pb2 -``` - -> Note: `prediction_service_pb2` defines the service as a whole and so -> is always required. However a typical client will need only one of -> `classification_pb2`, `regression_pb2`, and `predict_pb2`, depending on the -> type of requests being made. - -Sending a gRPC request is then accomplished by assembling a protocol buffer -containing the request data and passing it to the service stub. Note how the -request protocol buffer is created empty and then populated via the -[generated protocol buffer API](https://developers.google.com/protocol-buffers/docs/reference/python-generated). - -```py -from grpc.beta import implementations - -channel = implementations.insecure_channel(host, int(port)) -stub = prediction_service_pb2.beta_create_PredictionService_stub(channel) - -request = classification_pb2.ClassificationRequest() -example = request.input.example_list.examples.add() -example.features.feature['x'].float_list.value.extend(image[0].astype(float)) - -result = stub.Classify(request, 10.0) # 10 secs timeout -``` - -The returned result in this example is a `ClassificationResponse` protocol -buffer. 
- -This is a skeletal example; please see the [Tensorflow Serving](../deploy/index.md) -documentation and [examples](https://github.com/tensorflow/serving/tree/master/tensorflow_serving/example) -for more details. - -> Note: `ClassificationRequest` and `RegressionRequest` contain a -> `tensorflow.serving.Input` protocol buffer, which in turn contains a list of -> `tensorflow.Example` protocol buffers. `PredictRequest`, by contrast, -> contains a mapping from feature names to values encoded via `TensorProto`. -> Correspondingly: When using the `Classify` and `Regress` APIs, TensorFlow -> Serving feeds serialized `tf.Example`s to the graph, so your -> `serving_input_receiver_fn()` should include a `tf.parse_example()` Op. -> When using the generic `Predict` API, however, TensorFlow Serving feeds raw -> feature data to the graph, so a pass through `serving_input_receiver_fn()` -> should be used. - - - - - - - - - -## CLI to inspect and execute SavedModel - -You can use the SavedModel Command Line Interface (CLI) to inspect and -execute a SavedModel. -For example, you can use the CLI to inspect the model's `SignatureDef`s. -The CLI enables you to quickly confirm that the input -[Tensor dtype and shape](../guide/tensors.md) match the model. Moreover, if you -want to test your model, you can use the CLI to do a sanity check by -passing in sample inputs in various formats (for example, Python -expressions) and then fetching the output. - - -### Install the SavedModel CLI - -Broadly speaking, you can install TensorFlow in either of the following -two ways: - -* By installing a pre-built TensorFlow binary. -* By building TensorFlow from source code. - -If you installed TensorFlow through a pre-built TensorFlow binary, -then the SavedModel CLI is already installed on your system -at pathname `bin\saved_model_cli`. 
- -If you built TensorFlow from source code, you must run the following -additional command to build `saved_model_cli`: - -``` -$ bazel build tensorflow/python/tools:saved_model_cli -``` - -### Overview of commands - -The SavedModel CLI supports the following two commands on a -`MetaGraphDef` in a SavedModel: - -* `show`, which shows a computation on a `MetaGraphDef` in a SavedModel. -* `run`, which runs a computation on a `MetaGraphDef`. - - -### `show` command - -A SavedModel contains one or more `MetaGraphDef`s, identified by their tag-sets. -To serve a model, you -might wonder what kind of `SignatureDef`s are in each model, and what are their -inputs and outputs. The `show` command lets you examine the contents of the -SavedModel in hierarchical order. Here's the syntax: - -``` -usage: saved_model_cli show [-h] --dir DIR [--all] -[--tag_set TAG_SET] [--signature_def SIGNATURE_DEF_KEY] -``` - -For example, the following command shows all available -MetaGraphDef tag-sets in the SavedModel: - -``` -$ saved_model_cli show --dir /tmp/saved_model_dir -The given SavedModel contains the following tag-sets: -serve -serve, gpu -``` - -The following command shows all available `SignatureDef` keys in -a `MetaGraphDef`: - -``` -$ saved_model_cli show --dir /tmp/saved_model_dir --tag_set serve -The given SavedModel `MetaGraphDef` contains `SignatureDefs` with the -following keys: -SignatureDef key: "classify_x2_to_y3" -SignatureDef key: "classify_x_to_y" -SignatureDef key: "regress_x2_to_y3" -SignatureDef key: "regress_x_to_y" -SignatureDef key: "regress_x_to_y2" -SignatureDef key: "serving_default" -``` - -If a `MetaGraphDef` has *multiple* tags in the tag-set, you must specify -all tags, each tag separated by a comma. For example: - -```none -$ saved_model_cli show --dir /tmp/saved_model_dir --tag_set serve,gpu -``` - -To show all inputs and outputs TensorInfo for a specific `SignatureDef`, pass in -the `SignatureDef` key to the `signature_def` option.
This is very useful when you -want to know the tensor key value, dtype and shape of the input tensors for -executing the computation graph later. For example: - -``` -$ saved_model_cli show --dir \ -/tmp/saved_model_dir --tag_set serve --signature_def serving_default -The given SavedModel SignatureDef contains the following input(s): - inputs['x'] tensor_info: - dtype: DT_FLOAT - shape: (-1, 1) - name: x:0 -The given SavedModel SignatureDef contains the following output(s): - outputs['y'] tensor_info: - dtype: DT_FLOAT - shape: (-1, 1) - name: y:0 -Method name is: tensorflow/serving/predict -``` - -To show all available information in the SavedModel, use the `--all` option. -For example: - -```none -$ saved_model_cli show --dir /tmp/saved_model_dir --all -MetaGraphDef with tag-set: 'serve' contains the following SignatureDefs: - -signature_def['classify_x2_to_y3']: - The given SavedModel SignatureDef contains the following input(s): - inputs['inputs'] tensor_info: - dtype: DT_FLOAT - shape: (-1, 1) - name: x2:0 - The given SavedModel SignatureDef contains the following output(s): - outputs['scores'] tensor_info: - dtype: DT_FLOAT - shape: (-1, 1) - name: y3:0 - Method name is: tensorflow/serving/classify - -... - -signature_def['serving_default']: - The given SavedModel SignatureDef contains the following input(s): - inputs['x'] tensor_info: - dtype: DT_FLOAT - shape: (-1, 1) - name: x:0 - The given SavedModel SignatureDef contains the following output(s): - outputs['y'] tensor_info: - dtype: DT_FLOAT - shape: (-1, 1) - name: y:0 - Method name is: tensorflow/serving/predict -``` - - -### `run` command - -Invoke the `run` command to run a graph computation, passing -inputs and then displaying (and optionally saving) the outputs. 
-Here's the syntax: - -``` -usage: saved_model_cli run [-h] --dir DIR --tag_set TAG_SET --signature_def - SIGNATURE_DEF_KEY [--inputs INPUTS] - [--input_exprs INPUT_EXPRS] - [--input_examples INPUT_EXAMPLES] [--outdir OUTDIR] - [--overwrite] [--tf_debug] -``` - -The `run` command provides the following three ways to pass inputs to the model: - -* `--inputs` option enables you to pass numpy ndarray in files. -* `--input_exprs` option enables you to pass Python expressions. -* `--input_examples` option enables you to pass `tf.train.Example`. - - -#### `--inputs` - -To pass input data in files, specify the `--inputs` option, which takes the -following general format: - -```bsh ---inputs <INPUTS> -``` - -where *INPUTS* is either of the following formats: - -* `<input_key>=<filename>` -* `<input_key>=<filename>[<variable_name>]` - -You may pass multiple *INPUTS*. If you do pass multiple inputs, use a semicolon -to separate each of the *INPUTS*. - -`saved_model_cli` uses `numpy.load` to load the *filename*. -The *filename* may be in any of the following formats: - -* `.npy` -* `.npz` -* pickle format - -A `.npy` file always contains a numpy ndarray. Therefore, when loading from -a `.npy` file, the content will be directly assigned to the specified input -tensor. If you specify a *variable_name* with that `.npy` file, the -*variable_name* will be ignored and a warning will be issued. - -When loading from a `.npz` (zip) file, you may optionally specify a -*variable_name* to identify the variable within the zip file to load for -the input tensor key. If you don't specify a *variable_name*, the SavedModel -CLI will check that only one file is included in the zip file and load it -for the specified input tensor key. - -When loading from a pickle file, if no `variable_name` is specified in the -square brackets, whatever is inside the pickle file will be passed to the -specified input tensor key.
Otherwise, the SavedModel CLI will assume a -dictionary is stored in the pickle file and the value corresponding to -the *variable_name* will be used. - - -#### `--input_exprs` - -To pass inputs through Python expressions, specify the `--input_exprs` option. -This can be useful when you don't have data -files lying around, but still want to sanity check the model with some simple -inputs that match the dtype and shape of the model's `SignatureDef`s. -For example: - -```bsh -`<input_key>=[[1],[2],[3]]` -``` - -In addition to Python expressions, you may also pass numpy functions. For -example: - -```bsh -`<input_key>=np.ones((32,32,3))` -``` - -(Note that the `numpy` module is already available to you as `np`.) - - -#### `--input_examples` - -To pass `tf.train.Example` as inputs, specify the `--input_examples` option. -For each input key, it takes a list of dictionaries, where each dictionary is an -instance of `tf.train.Example`. The dictionary keys are the features and the -values are the value lists for each feature. -For example: - -```bsh -`<input_key>=[{"age":[22,24],"education":["BS","MS"]}]` -``` - -#### Save output - -By default, the SavedModel CLI writes output to stdout. If a directory is -passed to the `--outdir` option, the outputs will be saved as npy files named after -output tensor keys under the given directory. - -Use `--overwrite` to overwrite existing output files. - - -#### TensorFlow debugger (tfdbg) integration - -If the `--tf_debug` option is set, the SavedModel CLI will use the -TensorFlow Debugger (tfdbg) to watch the intermediate Tensors and runtime -graphs or subgraphs while running the SavedModel. - - -#### Full examples of `run` - -Given: - -* Your model simply adds `x1` and `x2` to get output `y`. -* All tensors in the model have shape `(-1, 1)`. -* You have two `npy` files: - * `/tmp/my_data1.npy`, which contains a numpy ndarray `[[1], [2], [3]]`. - * `/tmp/my_data2.npy`, which contains another numpy - ndarray `[[0.5], [0.5], [0.5]]`.
- -To run these two `npy` files through the model to get output `y`, issue -the following command: - -``` -$ saved_model_cli run --dir /tmp/saved_model_dir --tag_set serve \ ---signature_def x1_x2_to_y --inputs x1=/tmp/my_data1.npy;x2=/tmp/my_data2.npy \ ---outdir /tmp/out -Result for output key y: -[[ 1.5] - [ 2.5] - [ 3.5]] -``` - -Let's change the preceding example slightly. This time, instead of two -`.npy` files, you now have an `.npz` file and a pickle file. Furthermore, -you want to overwrite any existing output file. Here's the command: - -``` -$ saved_model_cli run --dir /tmp/saved_model_dir --tag_set serve \ ---signature_def x1_x2_to_y \ ---inputs x1=/tmp/my_data1.npz[x];x2=/tmp/my_data2.pkl --outdir /tmp/out \ ---overwrite -Result for output key y: -[[ 1.5] - [ 2.5] - [ 3.5]] -``` - -You may specify a Python expression instead of an input file. For example, -the following command replaces input `x2` with a Python expression: - -``` -$ saved_model_cli run --dir /tmp/saved_model_dir --tag_set serve \ ---signature_def x1_x2_to_y --inputs x1=/tmp/my_data1.npz[x] \ ---input_exprs 'x2=np.ones((3,1))' -Result for output key y: -[[ 2] - [ 3] - [ 4]] -``` - -To run the model with the TensorFlow Debugger on, issue the -following command: - -``` -$ saved_model_cli run --dir /tmp/saved_model_dir --tag_set serve \ ---signature_def serving_default --inputs x=/tmp/data.npz[x] --tf_debug -``` - - - -## Structure of a SavedModel directory - -When you save a model in SavedModel format, TensorFlow creates -a SavedModel directory consisting of the following subdirectories -and files: - -```bsh -assets/ -assets.extra/ -variables/ - variables.data-?????-of-????? - variables.index -saved_model.pb|saved_model.pbtxt -``` - -where: - -* `assets` is a subfolder containing auxiliary (external) files, - such as vocabularies. Assets are copied to the SavedModel location - and can be read when loading a specific `MetaGraphDef`.
-* `assets.extra` is a subfolder where higher-level libraries and users can - add their own assets that co-exist with the model, but are not loaded by - the graph. This subfolder is not managed by the SavedModel libraries. -* `variables` is a subfolder that includes output from - `tf.train.Saver`. -* `saved_model.pb` or `saved_model.pbtxt` is the SavedModel protocol buffer. - It includes the graph definitions as `MetaGraphDef` protocol buffers. - -A single SavedModel can represent multiple graphs. In this case, all the -graphs in the SavedModel share a *single* set of checkpoints (variables) -and assets. For example, the following diagram shows one SavedModel -containing three `MetaGraphDef`s, all three of which share the same set -of checkpoints and assets: - -![SavedModel represents checkpoints, assets, and one or more MetaGraphDefs](../images/SavedModel.svg) - -Each graph is associated with a specific set of tags, which enables -identification during a load or restore operation. diff --git a/tensorflow/docs_src/guide/summaries_and_tensorboard.md b/tensorflow/docs_src/guide/summaries_and_tensorboard.md deleted file mode 100644 index 788c556b9d..0000000000 --- a/tensorflow/docs_src/guide/summaries_and_tensorboard.md +++ /dev/null @@ -1,225 +0,0 @@ -# TensorBoard: Visualizing Learning - -The computations you'll use TensorFlow for - like training a massive -deep neural network - can be complex and confusing. To make it easier to -understand, debug, and optimize TensorFlow programs, we've included a suite of -visualization tools called TensorBoard. You can use TensorBoard to visualize -your TensorFlow graph, plot quantitative metrics about the execution of your -graph, and show additional data like images that pass through it. When -TensorBoard is fully configured, it looks like this: - -![MNIST TensorBoard](https://www.tensorflow.org/images/mnist_tensorboard.png "MNIST TensorBoard") - -
- -
- -This 30-minute tutorial is intended to get you started with simple TensorBoard -usage. It assumes a basic understanding of TensorFlow. - -There are other resources available as well! The [TensorBoard GitHub](https://github.com/tensorflow/tensorboard) -has a lot more information on using individual dashboards within TensorBoard -including tips & tricks and debugging information. - -## Setup - -[Install TensorFlow](https://www.tensorflow.org/install/). Installing TensorFlow -via pip should also automatically install TensorBoard. - -## Serializing the data - -TensorBoard operates by reading TensorFlow events files, which contain summary -data that you can generate when running TensorFlow. Here's the general -lifecycle for summary data within TensorBoard. - -First, create the TensorFlow graph that you'd like to collect summary -data from, and decide which nodes you would like to annotate with -[summary operations](../api_guides/python/summary.md). - -For example, suppose you are training a convolutional neural network for -recognizing MNIST digits. You'd like to record how the learning rate -varies over time, and how the objective function is changing. Collect these by -attaching `tf.summary.scalar` ops -to the nodes that output the learning rate and loss respectively. Then, give -each `scalar_summary` a meaningful `tag`, like `'learning rate'` or `'loss -function'`. - -Perhaps you'd also like to visualize the distributions of activations coming -off a particular layer, or the distribution of gradients or weights. Collect -this data by attaching -`tf.summary.histogram` ops to -the gradient outputs and to the variable that holds your weights, respectively. - -For details on all of the summary operations available, check out the docs on -[summary operations](../api_guides/python/summary.md). - -Operations in TensorFlow don't do anything until you run them, or an op that -depends on their output. 
And the summary nodes that we've just created are -peripheral to your graph: none of the ops you are currently running depend on -them. So, to generate summaries, we need to run all of these summary nodes. -Managing them by hand would be tedious, so use -`tf.summary.merge_all` -to combine them into a single op that generates all the summary data. - -Then, you can just run the merged summary op, which will generate a serialized -`Summary` protobuf object with all of your summary data at a given step. -Finally, to write this summary data to disk, pass the summary protobuf to a -`tf.summary.FileWriter`. - -The `FileWriter` takes a logdir in its constructor - this logdir is quite -important, it's the directory where all of the events will be written out. -Also, the `FileWriter` can optionally take a `Graph` in its constructor. -If it receives a `Graph` object, then TensorBoard will visualize your graph -along with tensor shape information. This will give you a much better sense of -what flows through the graph: see -[Tensor shape information](../guide/graph_viz.md#tensor-shape-information). - -Now that you've modified your graph and have a `FileWriter`, you're ready to -start running your network! If you want, you could run the merged summary op -every single step, and record a ton of training data. That's likely to be more -data than you need, though. Instead, consider running the merged summary op -every `n` steps. - -The code example below is a modification of the -[simple MNIST tutorial](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/tutorials/mnist/mnist.py), -in which we have added some summary ops, and run them every ten steps. If you -run this and then launch `tensorboard --logdir=/tmp/tensorflow/mnist`, you'll be able -to visualize statistics, such as how the weights or accuracy varied during -training. 
The code below is an excerpt; full source is -[here](https://www.tensorflow.org/code/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py). - -```python -def variable_summaries(var): - """Attach a lot of summaries to a Tensor (for TensorBoard visualization).""" - with tf.name_scope('summaries'): - mean = tf.reduce_mean(var) - tf.summary.scalar('mean', mean) - with tf.name_scope('stddev'): - stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean))) - tf.summary.scalar('stddev', stddev) - tf.summary.scalar('max', tf.reduce_max(var)) - tf.summary.scalar('min', tf.reduce_min(var)) - tf.summary.histogram('histogram', var) - -def nn_layer(input_tensor, input_dim, output_dim, layer_name, act=tf.nn.relu): - """Reusable code for making a simple neural net layer. - - It does a matrix multiply, bias add, and then uses relu to nonlinearize. - It also sets up name scoping so that the resultant graph is easy to read, - and adds a number of summary ops. - """ - # Adding a name scope ensures logical grouping of the layers in the graph. - with tf.name_scope(layer_name): - # This Variable will hold the state of the weights for the layer - with tf.name_scope('weights'): - weights = weight_variable([input_dim, output_dim]) - variable_summaries(weights) - with tf.name_scope('biases'): - biases = bias_variable([output_dim]) - variable_summaries(biases) - with tf.name_scope('Wx_plus_b'): - preactivate = tf.matmul(input_tensor, weights) + biases - tf.summary.histogram('pre_activations', preactivate) - activations = act(preactivate, name='activation') - tf.summary.histogram('activations', activations) - return activations - -hidden1 = nn_layer(x, 784, 500, 'layer1') - -with tf.name_scope('dropout'): - keep_prob = tf.placeholder(tf.float32) - tf.summary.scalar('dropout_keep_probability', keep_prob) - dropped = tf.nn.dropout(hidden1, keep_prob) - -# Do not apply softmax activation yet, see below. 
-y = nn_layer(dropped, 500, 10, 'layer2', act=tf.identity) - -with tf.name_scope('cross_entropy'): - # The raw formulation of cross-entropy, - # - # tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.softmax(y)), - # reduction_indices=[1])) - # - # can be numerically unstable. - # - # So here we use tf.losses.sparse_softmax_cross_entropy on the - # raw logit outputs of the nn_layer above. - with tf.name_scope('total'): - cross_entropy = tf.losses.sparse_softmax_cross_entropy(labels=y_, logits=y) -tf.summary.scalar('cross_entropy', cross_entropy) - -with tf.name_scope('train'): - train_step = tf.train.AdamOptimizer(FLAGS.learning_rate).minimize( - cross_entropy) - -with tf.name_scope('accuracy'): - with tf.name_scope('correct_prediction'): - correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1)) - with tf.name_scope('accuracy'): - accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) -tf.summary.scalar('accuracy', accuracy) - -# Merge all the summaries and write them out to /tmp/mnist_logs (by default) -merged = tf.summary.merge_all() -train_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/train', - sess.graph) -test_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/test') -tf.global_variables_initializer().run() -``` - -After we've initialized the `FileWriters`, we have to add summaries to the -`FileWriters` as we train and test the model. - -```python -# Train the model, and also write summaries. 
-# Every 10th step, measure test-set accuracy, and write test summaries -# All other steps, run train_step on training data, & add training summaries - -def feed_dict(train): - """Make a TensorFlow feed_dict: maps data onto Tensor placeholders.""" - if train or FLAGS.fake_data: - xs, ys = mnist.train.next_batch(100, fake_data=FLAGS.fake_data) - k = FLAGS.dropout - else: - xs, ys = mnist.test.images, mnist.test.labels - k = 1.0 - return {x: xs, y_: ys, keep_prob: k} - -for i in range(FLAGS.max_steps): - if i % 10 == 0: # Record summaries and test-set accuracy - summary, acc = sess.run([merged, accuracy], feed_dict=feed_dict(False)) - test_writer.add_summary(summary, i) - print('Accuracy at step %s: %s' % (i, acc)) - else: # Record train set summaries, and train - summary, _ = sess.run([merged, train_step], feed_dict=feed_dict(True)) - train_writer.add_summary(summary, i) -``` - -You're now all set to visualize this data using TensorBoard. - - -## Launching TensorBoard - -To run TensorBoard, use the following command (alternatively `python -m -tensorboard.main`) - -```bash -tensorboard --logdir=path/to/log-directory -``` - -where `logdir` points to the directory where the `FileWriter` serialized its -data. If this `logdir` directory contains subdirectories which contain -serialized data from separate runs, then TensorBoard will visualize the data -from all of those runs. Once TensorBoard is running, navigate your web browser -to `localhost:6006` to view the TensorBoard. - -When looking at TensorBoard, you will see the navigation tabs in the top right -corner. Each tab represents a set of serialized data that can be visualized. - -For in depth information on how to use the *graph* tab to visualize your graph, -see [TensorBoard: Graph Visualization](../guide/graph_viz.md). - -For more usage information on TensorBoard in general, see the -[TensorBoard GitHub](https://github.com/tensorflow/tensorboard). 
diff --git a/tensorflow/docs_src/guide/tensorboard_histograms.md b/tensorflow/docs_src/guide/tensorboard_histograms.md deleted file mode 100644 index af8f2cadd1..0000000000 --- a/tensorflow/docs_src/guide/tensorboard_histograms.md +++ /dev/null @@ -1,245 +0,0 @@ -# TensorBoard Histogram Dashboard - -The TensorBoard Histogram Dashboard displays how the distribution of some -`Tensor` in your TensorFlow graph has changed over time. It does this by showing -many histograms visualizations of your tensor at different points in time. - -## A Basic Example - -Let's start with a simple case: a normally-distributed variable, where the mean -shifts over time. -TensorFlow has an op -[`tf.random_normal`](https://www.tensorflow.org/api_docs/python/tf/random_normal) -which is perfect for this purpose. As is usually the case with TensorBoard, we -will ingest data using a summary op; in this case, -['tf.summary.histogram'](https://www.tensorflow.org/api_docs/python/tf/summary/histogram). -For a primer on how summaries work, please see the -[TensorBoard guide](./summaries_and_tensorboard.md). - -Here is a code snippet that will generate some histogram summaries containing -normally distributed data, where the mean of the distribution increases over -time. 
- -```python -import tensorflow as tf - -k = tf.placeholder(tf.float32) - -# Make a normal distribution, with a shifting mean -mean_moving_normal = tf.random_normal(shape=[1000], mean=(5*k), stddev=1) -# Record that distribution into a histogram summary -tf.summary.histogram("normal/moving_mean", mean_moving_normal) - -# Setup a session and summary writer -sess = tf.Session() -writer = tf.summary.FileWriter("/tmp/histogram_example") - -summaries = tf.summary.merge_all() - -# Setup a loop and write the summaries to disk -N = 400 -for step in range(N): - k_val = step/float(N) - summ = sess.run(summaries, feed_dict={k: k_val}) - writer.add_summary(summ, global_step=step) -``` - -Once that code runs, we can load the data into TensorBoard via the command line: - - -```sh -tensorboard --logdir=/tmp/histogram_example -``` - -Once TensorBoard is running, load it in Chrome or Firefox and navigate to the -Histogram Dashboard. Then we can see a histogram visualization for our normally -distributed data. - -![](https://www.tensorflow.org/images/tensorboard/histogram_dashboard/1_moving_mean.png) - -`tf.summary.histogram` takes an arbitrarily sized and shaped Tensor, and -compresses it into a histogram data structure consisting of many bins with -widths and counts. For example, let's say we want to organize the numbers -`[0.5, 1.1, 1.3, 2.2, 2.9, 2.99]` into bins. We could make three bins: -* a bin -containing everything from 0 to 1 (it would contain one element, 0.5), -* a bin -containing everything from 1-2 (it would contain two elements, 1.1 and 1.3), -* a bin containing everything from 2-3 (it would contain three elements: 2.2, -2.9 and 2.99). - -TensorFlow uses a similar approach to create bins, but unlike in our example, it -doesn't create integer bins. For large, sparse datasets, that might result in -many thousands of bins. 
-Instead, [the bins are exponentially distributed, with many bins close to 0 and -comparatively few bins for very large numbers.](https://github.com/tensorflow/tensorflow/blob/c8b59c046895fa5b6d79f73e0b5817330fcfbfc1/tensorflow/core/lib/histogram/histogram.cc#L28) -However, visualizing exponentially-distributed bins is tricky; if height is used -to encode count, then wider bins take more space, even if they have the same -number of elements. Conversely, encoding count in the area makes height -comparisons impossible. Instead, the histograms [resample the data](https://github.com/tensorflow/tensorflow/blob/17c47804b86e340203d451125a721310033710f1/tensorflow/tensorboard/components/tf_backend/backend.ts#L400) -into uniform bins. This can lead to unfortunate artifacts in some cases. - -Each slice in the histogram visualizer displays a single histogram. -The slices are organized by step; -older slices (e.g. step 0) are further "back" and darker, while newer slices -(e.g. step 400) are close to the foreground, and lighter in color. -The y-axis on the right shows the step number. - -You can mouse over the histogram to see tooltips with some more detailed -information. For example, in the following image we can see that the histogram -at timestep 176 has a bin centered at 2.25 with 177 elements in that bin. - -![](https://www.tensorflow.org/images/tensorboard/histogram_dashboard/2_moving_mean_tooltip.png) - -Also, you may note that the histogram slices are not always evenly spaced in -step count or time. This is because TensorBoard uses -[reservoir sampling](https://en.wikipedia.org/wiki/Reservoir_sampling) to keep a -subset of all the histograms, to save on memory. Reservoir sampling guarantees -that every sample has an equal likelihood of being included, but because it is -a randomized algorithm, the samples chosen don't occur at even steps. 
- -## Overlay Mode - -There is a control on the left of the dashboard that allows you to toggle the -histogram mode from "offset" to "overlay": - -![](https://www.tensorflow.org/images/tensorboard/histogram_dashboard/3_overlay_offset.png) - -In "overlay" mode, the visualization rotates 45 degrees, so that the individual -histogram slices are no longer spread out in time, but instead are all plotted -on the same y-axis. - -![](https://www.tensorflow.org/images/tensorboard/histogram_dashboard/4_overlay.png) -Now, each slice is a separate line on the chart, and the y-axis shows the item -count within each bucket. Darker lines are older, earlier steps, and lighter -lines are more recent, later steps. Once again, you can mouse over the chart to -see some additional information. - -![](https://www.tensorflow.org/images/tensorboard/histogram_dashboard/5_overlay_tooltips.png) - -In general, the overlay visualization is useful if you want to directly compare -the counts of different histograms. - -## Multimodal Distributions - -The Histogram Dashboard is great for visualizing multimodal -distributions. Let's construct a simple bimodal distribution by concatenating -the outputs from two different normal distributions. 
The code will look like -this: - -```python -import tensorflow as tf - -k = tf.placeholder(tf.float32) - -# Make a normal distribution, with a shifting mean -mean_moving_normal = tf.random_normal(shape=[1000], mean=(5*k), stddev=1) -# Record that distribution into a histogram summary -tf.summary.histogram("normal/moving_mean", mean_moving_normal) - -# Make a normal distribution with shrinking variance -variance_shrinking_normal = tf.random_normal(shape=[1000], mean=0, stddev=1-(k)) -# Record that distribution too -tf.summary.histogram("normal/shrinking_variance", variance_shrinking_normal) - -# Let's combine both of those distributions into one dataset -normal_combined = tf.concat([mean_moving_normal, variance_shrinking_normal], 0) -# We add another histogram summary to record the combined distribution -tf.summary.histogram("normal/bimodal", normal_combined) - -summaries = tf.summary.merge_all() - -# Setup a session and summary writer -sess = tf.Session() -writer = tf.summary.FileWriter("/tmp/histogram_example") - -# Setup a loop and write the summaries to disk -N = 400 -for step in range(N): - k_val = step/float(N) - summ = sess.run(summaries, feed_dict={k: k_val}) - writer.add_summary(summ, global_step=step) -``` - -You already remember our "moving mean" normal distribution from the example -above. Now we also have a "shrinking variance" distribution. Side-by-side, they -look like this: -![](https://www.tensorflow.org/images/tensorboard/histogram_dashboard/6_two_distributions.png) - -When we concatenate them, we get a chart that clearly reveals the divergent, -bimodal structure: -![](https://www.tensorflow.org/images/tensorboard/histogram_dashboard/7_bimodal.png) - -## Some more distributions - -Just for fun, let's generate and visualize a few more distributions, and then -combine them all into one chart. 
Here's the code we'll use: - -```python -import tensorflow as tf - -k = tf.placeholder(tf.float32) - -# Make a normal distribution, with a shifting mean -mean_moving_normal = tf.random_normal(shape=[1000], mean=(5*k), stddev=1) -# Record that distribution into a histogram summary -tf.summary.histogram("normal/moving_mean", mean_moving_normal) - -# Make a normal distribution with shrinking variance -variance_shrinking_normal = tf.random_normal(shape=[1000], mean=0, stddev=1-(k)) -# Record that distribution too -tf.summary.histogram("normal/shrinking_variance", variance_shrinking_normal) - -# Let's combine both of those distributions into one dataset -normal_combined = tf.concat([mean_moving_normal, variance_shrinking_normal], 0) -# We add another histogram summary to record the combined distribution -tf.summary.histogram("normal/bimodal", normal_combined) - -# Add a gamma distribution -gamma = tf.random_gamma(shape=[1000], alpha=k) -tf.summary.histogram("gamma", gamma) - -# And a poisson distribution -poisson = tf.random_poisson(shape=[1000], lam=k) -tf.summary.histogram("poisson", poisson) - -# And a uniform distribution -uniform = tf.random_uniform(shape=[1000], maxval=k*10) -tf.summary.histogram("uniform", uniform) - -# Finally, combine everything together! 
-all_distributions = [mean_moving_normal, variance_shrinking_normal, - gamma, poisson, uniform] -all_combined = tf.concat(all_distributions, 0) -tf.summary.histogram("all_combined", all_combined) - -summaries = tf.summary.merge_all() - -# Setup a session and summary writer -sess = tf.Session() -writer = tf.summary.FileWriter("/tmp/histogram_example") - -# Setup a loop and write the summaries to disk -N = 400 -for step in range(N): - k_val = step/float(N) - summ = sess.run(summaries, feed_dict={k: k_val}) - writer.add_summary(summ, global_step=step) -``` -### Gamma Distribution -![](https://www.tensorflow.org/images/tensorboard/histogram_dashboard/8_gamma.png) - -### Uniform Distribution -![](https://www.tensorflow.org/images/tensorboard/histogram_dashboard/9_uniform.png) - -### Poisson Distribution -![](https://www.tensorflow.org/images/tensorboard/histogram_dashboard/10_poisson.png) -The poisson distribution is defined over the integers. So, all of the values -being generated are perfect integers. The histogram compression moves the data -into floating-point bins, causing the visualization to show little -bumps over the integer values rather than perfect spikes. - -### All Together Now -Finally, we can concatenate all of the data into one funny-looking curve. -![](https://www.tensorflow.org/images/tensorboard/histogram_dashboard/11_all_combined.png) - diff --git a/tensorflow/docs_src/guide/tensors.md b/tensorflow/docs_src/guide/tensors.md deleted file mode 100644 index 4f0ddb21b5..0000000000 --- a/tensorflow/docs_src/guide/tensors.md +++ /dev/null @@ -1,330 +0,0 @@ -# Tensors - -TensorFlow, as the name indicates, is a framework to define and run computations -involving tensors. A **tensor** is a generalization of vectors and matrices to -potentially higher dimensions. Internally, TensorFlow represents tensors as -n-dimensional arrays of base datatypes. - -When writing a TensorFlow program, the main object you manipulate and pass -around is the `tf.Tensor`. 
A `tf.Tensor` object represents a partially defined -computation that will eventually produce a value. TensorFlow programs work by -first building a graph of `tf.Tensor` objects, detailing how each tensor is -computed based on the other available tensors and then by running parts of this -graph to achieve the desired results. - -A `tf.Tensor` has the following properties: - - * a data type (`float32`, `int32`, or `string`, for example) - * a shape - - -Each element in the Tensor has the same data type, and the data type is always -known. The shape (that is, the number of dimensions it has and the size of each -dimension) might be only partially known. Most operations produce tensors of -fully-known shapes if the shapes of their inputs are also fully known, but in -some cases it's only possible to find the shape of a tensor at graph execution -time. - -Some types of tensors are special, and these will be covered in other -units of the TensorFlow guide. The main ones are: - - * `tf.Variable` - * `tf.constant` - * `tf.placeholder` - * `tf.SparseTensor` - -With the exception of `tf.Variable`, the value of a tensor is immutable, which -means that in the context of a single execution tensors only have a single -value. However, evaluating the same tensor twice can return different values; -for example that tensor can be the result of reading data from disk, or -generating a random number. - -## Rank - -The **rank** of a `tf.Tensor` object is its number of dimensions. Synonyms for -rank include **order** or **degree** or **n-dimension**. -Note that rank in TensorFlow is not the same as matrix rank in mathematics. 
-As the following table shows, each rank in TensorFlow corresponds to a -different mathematical entity: - -Rank | Math entity ---- | --- -0 | Scalar (magnitude only) -1 | Vector (magnitude and direction) -2 | Matrix (table of numbers) -3 | 3-Tensor (cube of numbers) -n | n-Tensor (you get the idea) - - -### Rank 0 - -The following snippet demonstrates creating a few rank 0 variables: - -```python -mammal = tf.Variable("Elephant", tf.string) -ignition = tf.Variable(451, tf.int16) -floating = tf.Variable(3.14159265359, tf.float64) -its_complicated = tf.Variable(12.3 - 4.85j, tf.complex64) -``` - -Note: A string is treated as a single item in TensorFlow, not as a sequence of -characters. It is possible to have scalar strings, vectors of strings, etc. - -### Rank 1 - -To create a rank 1 `tf.Tensor` object, you can pass a list of items as the -initial value. For example: - -```python -mystr = tf.Variable(["Hello"], tf.string) -cool_numbers = tf.Variable([3.14159, 2.71828], tf.float32) -first_primes = tf.Variable([2, 3, 5, 7, 11], tf.int32) -its_very_complicated = tf.Variable([12.3 - 4.85j, 7.5 - 6.23j], tf.complex64) -``` - - -### Higher ranks - -A rank 2 `tf.Tensor` object consists of at least one row and at least -one column: - -```python -mymat = tf.Variable([[7],[11]], tf.int16) -myxor = tf.Variable([[False, True],[True, False]], tf.bool) -linear_squares = tf.Variable([[4], [9], [16], [25]], tf.int32) -squarish_squares = tf.Variable([ [4, 9], [16, 25] ], tf.int32) -rank_of_squares = tf.rank(squarish_squares) -mymatC = tf.Variable([[7],[11]], tf.int32) -``` - -Higher-rank Tensors, similarly, consist of an n-dimensional array. For example, -during image processing, many tensors of rank 4 are used, with dimensions -corresponding to example-in-batch, image width, image height, and color channel. 
- -``` python -my_image = tf.zeros([10, 299, 299, 3]) # batch x height x width x color -``` - -### Getting a `tf.Tensor` object's rank - -To determine the rank of a `tf.Tensor` object, call the `tf.rank` method. -For example, the following method programmatically determines the rank -of the `tf.Tensor` defined in the previous section: - -```python -r = tf.rank(my_image) -# After the graph runs, r will hold the value 4. -``` - -### Referring to `tf.Tensor` slices - -Since a `tf.Tensor` is an n-dimensional array of cells, to access a single cell -in a `tf.Tensor` you need to specify n indices. - -For a rank 0 tensor (a scalar), no indices are necessary, since it is already a -single number. - -For a rank 1 tensor (a vector), passing a single index allows you to access a -number: - -```python -my_scalar = my_vector[2] -``` - -Note that the index passed inside the `[]` can itself be a scalar `tf.Tensor`, if -you want to dynamically choose an element from the vector. - -For tensors of rank 2 or higher, the situation is more interesting. For a -`tf.Tensor` of rank 2, passing two numbers returns a scalar, as expected: - - -```python -my_scalar = my_matrix[1, 2] -``` - - -Passing a single number, however, returns a subvector of a matrix, as follows: - - -```python -my_row_vector = my_matrix[2] -my_column_vector = my_matrix[:, 3] -``` - -The `:` notation is python slicing syntax for "leave this dimension alone". This -is useful in higher-rank Tensors, as it allows you to access its subvectors, -submatrices, and even other subtensors. - - -## Shape - -The **shape** of a tensor is the number of elements in each dimension. -TensorFlow automatically infers shapes during graph construction. These inferred -shapes might have known or unknown rank. If the rank is known, the sizes of each -dimension might be known or unknown. - -The TensorFlow documentation uses three notational conventions to describe -tensor dimensionality: rank, shape, and dimension number. 
The following table -shows how these relate to one another: - -Rank | Shape | Dimension number | Example ---- | --- | --- | --- -0 | [] | 0-D | A 0-D tensor. A scalar. -1 | [D0] | 1-D | A 1-D tensor with shape [5]. -2 | [D0, D1] | 2-D | A 2-D tensor with shape [3, 4]. -3 | [D0, D1, D2] | 3-D | A 3-D tensor with shape [1, 4, 3]. -n | [D0, D1, ... Dn-1] | n-D | A tensor with shape [D0, D1, ... Dn-1]. - -Shapes can be represented via Python lists / tuples of ints, or with the -`tf.TensorShape`. - -### Getting a `tf.Tensor` object's shape - -There are two ways of accessing the shape of a `tf.Tensor`. While building the -graph, it is often useful to ask what is already known about a tensor's -shape. This can be done by reading the `shape` property of a `tf.Tensor` object. -This property returns a `TensorShape` object, which is a convenient way of -representing partially-specified shapes (since, when building the graph, not all -shapes will be fully known). - -It is also possible to get a `tf.Tensor` that will represent the fully-defined -shape of another `tf.Tensor` at runtime. This is done by calling the `tf.shape` -operation. This way, you can build a graph that manipulates the shapes of -tensors by building other tensors that depend on the dynamic shape of the input -`tf.Tensor`. - -For example, here is how to make a vector of zeros with the same size as the -number of columns in a given matrix: - -``` python -zeros = tf.zeros(my_matrix.shape[1]) -``` - -### Changing the shape of a `tf.Tensor` - -The **number of elements** of a tensor is the product of the sizes of all its -dimensions. The number of elements of a scalar is always `1`. Since there are often -many different shapes that have the same number of elements, it's often -convenient to be able to change the shape of a `tf.Tensor`, keeping its elements -fixed. This can be done with `tf.reshape`. 
- -The following examples demonstrate how to reshape tensors: - -```python -rank_three_tensor = tf.ones([3, 4, 5]) -matrix = tf.reshape(rank_three_tensor, [6, 10]) # Reshape existing content into - # a 6x10 matrix -matrixB = tf.reshape(matrix, [3, -1]) # Reshape existing content into a 3x20 - # matrix. -1 tells reshape to calculate - # the size of this dimension. -matrixAlt = tf.reshape(matrixB, [4, 3, -1]) # Reshape existing content into a - #4x3x5 tensor - -# Note that the number of elements of the reshaped Tensors has to match the -# original number of elements. Therefore, the following example generates an -# error because no possible value for the last dimension will match the number -# of elements. -yet_another = tf.reshape(matrixAlt, [13, 2, -1]) # ERROR! -``` - -## Data types - -In addition to dimensionality, Tensors have a data type. Refer to the -`tf.DType` page for a complete list of the data types. - -It is not possible to have a `tf.Tensor` with more than one data type. It is -possible, however, to serialize arbitrary data structures as `string`s and store -those in `tf.Tensor`s. - -It is possible to cast `tf.Tensor`s from one datatype to another using -`tf.cast`: - -``` python -# Cast a constant integer tensor into floating point. -float_tensor = tf.cast(tf.constant([1, 2, 3]), dtype=tf.float32) -``` - -To inspect a `tf.Tensor`'s data type use the `Tensor.dtype` property. - -When creating a `tf.Tensor` from a python object you may optionally specify the -datatype. If you don't, TensorFlow chooses a datatype that can represent your -data. TensorFlow converts Python integers to `tf.int32` and python floating -point numbers to `tf.float32`. Otherwise TensorFlow uses the same rules numpy -uses when converting to arrays. - -## Evaluating Tensors - -Once the computation graph has been built, you can run the computation that -produces a particular `tf.Tensor` and fetch the value assigned to it. 
This is -often useful for debugging as well as being required for much of TensorFlow to -work. - -The simplest way to evaluate a Tensor is using the `Tensor.eval` method. For -example: - -```python -constant = tf.constant([1, 2, 3]) -tensor = constant * constant -print(tensor.eval()) -``` - -The `eval` method only works when a default `tf.Session` is active (see -Graphs and Sessions for more information). - -`Tensor.eval` returns a numpy array with the same contents as the tensor. - -Sometimes it is not possible to evaluate a `tf.Tensor` with no context because -its value might depend on dynamic information that is not available. For -example, tensors that depend on `placeholder`s can't be evaluated without -providing a value for the `placeholder`. - -``` python -p = tf.placeholder(tf.float32) -t = p + 1.0 -t.eval() # This will fail, since the placeholder did not get a value. -t.eval(feed_dict={p:2.0}) # This will succeed because we're feeding a value - # to the placeholder. -``` - -Note that it is possible to feed any `tf.Tensor`, not just placeholders. - -Other model constructs might make evaluating a `tf.Tensor` -complicated. TensorFlow can't directly evaluate `tf.Tensor`s defined inside -functions or inside control flow constructs. If a `tf.Tensor` depends on a value -from a queue, evaluating the `tf.Tensor` will only work once something has been -enqueued; otherwise, evaluating it will hang. When working with queues, remember -to call `tf.train.start_queue_runners` before evaluating any `tf.Tensor`s. - -## Printing Tensors - -For debugging purposes you might want to print the value of a `tf.Tensor`. While - [tfdbg](../guide/debugger.md) provides advanced debugging support, TensorFlow also has an - operation to directly print the value of a `tf.Tensor`. - -Note that you rarely want to use the following pattern when printing a -`tf.Tensor`: - -``` python -t = <> -print(t) # This will print the symbolic tensor when the graph is being built. 
- # This tensor does not have a value in this context. -``` - -This code prints the `tf.Tensor` object (which represents deferred computation) -and not its value. Instead, TensorFlow provides the `tf.Print` operation, which -returns its first tensor argument unchanged while printing the set of -`tf.Tensor`s it is passed as the second argument. - -To correctly use `tf.Print` its return value must be used. See the example below - -``` python -t = <> -tf.Print(t, [t]) # This does nothing -t = tf.Print(t, [t]) # Here we are using the value returned by tf.Print -result = t + 1 # Now when result is evaluated the value of `t` will be printed. -``` - -When you evaluate `result` you will evaluate everything `result` depends -upon. Since `result` depends upon `t`, and evaluating `t` has the side effect of -printing its input (the old value of `t`), `t` gets printed. - diff --git a/tensorflow/docs_src/guide/using_gpu.md b/tensorflow/docs_src/guide/using_gpu.md deleted file mode 100644 index 8cb9b354c7..0000000000 --- a/tensorflow/docs_src/guide/using_gpu.md +++ /dev/null @@ -1,215 +0,0 @@ -# Using GPUs - -## Supported devices - -On a typical system, there are multiple computing devices. In TensorFlow, the -supported device types are `CPU` and `GPU`. They are represented as `strings`. -For example: - -* `"/cpu:0"`: The CPU of your machine. -* `"/device:GPU:0"`: The GPU of your machine, if you have one. -* `"/device:GPU:1"`: The second GPU of your machine, etc. - -If a TensorFlow operation has both CPU and GPU implementations, the GPU devices -will be given priority when the operation is assigned to a device. For example, -`matmul` has both CPU and GPU kernels. On a system with devices `cpu:0` and -`gpu:0`, `gpu:0` will be selected to run `matmul`. - -## Logging Device placement - -To find out which devices your operations and tensors are assigned to, create -the session with `log_device_placement` configuration option set to `True`. - -```python -# Creates a graph. 
-a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='a') -b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b') -c = tf.matmul(a, b) -# Creates a session with log_device_placement set to True. -sess = tf.Session(config=tf.ConfigProto(log_device_placement=True)) -# Runs the op. -print(sess.run(c)) -``` - -You should see the following output: - -``` -Device mapping: -/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: Tesla K40c, pci bus -id: 0000:05:00.0 -b: /job:localhost/replica:0/task:0/device:GPU:0 -a: /job:localhost/replica:0/task:0/device:GPU:0 -MatMul: /job:localhost/replica:0/task:0/device:GPU:0 -[[ 22. 28.] - [ 49. 64.]] - -``` - -## Manual device placement - -If you would like a particular operation to run on a device of your choice -instead of what's automatically selected for you, you can use `with tf.device` -to create a device context such that all the operations within that context will -have the same device assignment. - -```python -# Creates a graph. -with tf.device('/cpu:0'): - a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='a') - b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b') -c = tf.matmul(a, b) -# Creates a session with log_device_placement set to True. -sess = tf.Session(config=tf.ConfigProto(log_device_placement=True)) -# Runs the op. -print(sess.run(c)) -``` - -You will see that now `a` and `b` are assigned to `cpu:0`. Since a device was -not explicitly specified for the `MatMul` operation, the TensorFlow runtime will -choose one based on the operation and available devices (`gpu:0` in this -example) and automatically copy tensors between devices if required. - -``` -Device mapping: -/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: Tesla K40c, pci bus -id: 0000:05:00.0 -b: /job:localhost/replica:0/task:0/cpu:0 -a: /job:localhost/replica:0/task:0/cpu:0 -MatMul: /job:localhost/replica:0/task:0/device:GPU:0 -[[ 22. 28.] - [ 49. 
64.]] -``` - -## Allowing GPU memory growth - -By default, TensorFlow maps nearly all of the GPU memory of all GPUs (subject to -[`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars)) -visible to the process. This is done to more efficiently use the relatively -precious GPU memory resources on the devices by reducing [memory -fragmentation](https://en.wikipedia.org/wiki/Fragmentation_\(computing\)). - -In some cases it is desirable for the process to only allocate a subset of the -available memory, or to only grow the memory usage as is needed by the process. -TensorFlow provides two Config options on the Session to control this. - -The first is the `allow_growth` option, which attempts to allocate only as much -GPU memory based on runtime allocations: it starts out allocating very little -memory, and as Sessions get run and more GPU memory is needed, we extend the GPU -memory region needed by the TensorFlow process. Note that we do not release -memory, since that can lead to even worse memory fragmentation. To turn this -option on, set the option in the ConfigProto by: - -```python -config = tf.ConfigProto() -config.gpu_options.allow_growth = True -session = tf.Session(config=config, ...) -``` - -The second method is the `per_process_gpu_memory_fraction` option, which -determines the fraction of the overall amount of memory that each visible GPU -should be allocated. For example, you can tell TensorFlow to only allocate 40% -of the total memory of each GPU by: - -```python -config = tf.ConfigProto() -config.gpu_options.per_process_gpu_memory_fraction = 0.4 -session = tf.Session(config=config, ...) -``` - -This is useful if you want to truly bound the amount of GPU memory available to -the TensorFlow process. - -## Using a single GPU on a multi-GPU system - -If you have more than one GPU in your system, the GPU with the lowest ID will be -selected by default. 
If you would like to run on a different GPU, you will need -to specify the preference explicitly: - -```python -# Creates a graph. -with tf.device('/device:GPU:2'): - a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='a') - b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b') - c = tf.matmul(a, b) -# Creates a session with log_device_placement set to True. -sess = tf.Session(config=tf.ConfigProto(log_device_placement=True)) -# Runs the op. -print(sess.run(c)) -``` - -If the device you have specified does not exist, you will get -`InvalidArgumentError`: - -``` -InvalidArgumentError: Invalid argument: Cannot assign a device to node 'b': -Could not satisfy explicit device specification '/device:GPU:2' - [[{{node b}} = Const[dtype=DT_FLOAT, value=Tensor, _device="/device:GPU:2"]()]] -``` - -If you would like TensorFlow to automatically choose an existing and supported -device to run the operations in case the specified one doesn't exist, you can -set `allow_soft_placement` to `True` in the configuration option when creating -the session. - -```python -# Creates a graph. -with tf.device('/device:GPU:2'): - a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='a') - b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b') - c = tf.matmul(a, b) -# Creates a session with allow_soft_placement and log_device_placement set -# to True. -sess = tf.Session(config=tf.ConfigProto( - allow_soft_placement=True, log_device_placement=True)) -# Runs the op. -print(sess.run(c)) -``` - -## Using multiple GPUs - -If you would like to run TensorFlow on multiple GPUs, you can construct your -model in a multi-tower fashion where each tower is assigned to a different GPU. -For example: - -``` python -# Creates a graph. 
-c = [] -for d in ['/device:GPU:2', '/device:GPU:3']: - with tf.device(d): - a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3]) - b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2]) - c.append(tf.matmul(a, b)) -with tf.device('/cpu:0'): - sum = tf.add_n(c) -# Creates a session with log_device_placement set to True. -sess = tf.Session(config=tf.ConfigProto(log_device_placement=True)) -# Runs the op. -print(sess.run(sum)) -``` - -You will see the following output. - -``` -Device mapping: -/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: Tesla K20m, pci bus -id: 0000:02:00.0 -/job:localhost/replica:0/task:0/device:GPU:1 -> device: 1, name: Tesla K20m, pci bus -id: 0000:03:00.0 -/job:localhost/replica:0/task:0/device:GPU:2 -> device: 2, name: Tesla K20m, pci bus -id: 0000:83:00.0 -/job:localhost/replica:0/task:0/device:GPU:3 -> device: 3, name: Tesla K20m, pci bus -id: 0000:84:00.0 -Const_3: /job:localhost/replica:0/task:0/device:GPU:3 -Const_2: /job:localhost/replica:0/task:0/device:GPU:3 -MatMul_1: /job:localhost/replica:0/task:0/device:GPU:3 -Const_1: /job:localhost/replica:0/task:0/device:GPU:2 -Const: /job:localhost/replica:0/task:0/device:GPU:2 -MatMul: /job:localhost/replica:0/task:0/device:GPU:2 -AddN: /job:localhost/replica:0/task:0/cpu:0 -[[ 44. 56.] - [ 98. 128.]] -``` - -The [cifar10 tutorial](../tutorials/images/deep_cnn.md) is a good example -demonstrating how to do training with multiple GPUs. diff --git a/tensorflow/docs_src/guide/using_tpu.md b/tensorflow/docs_src/guide/using_tpu.md deleted file mode 100644 index 59b34e19e0..0000000000 --- a/tensorflow/docs_src/guide/using_tpu.md +++ /dev/null @@ -1,395 +0,0 @@ -# Using TPUs - -This document walks through the principal TensorFlow APIs necessary to make -effective use of a [Cloud TPU](https://cloud.google.com/tpu/), and highlights -the differences between regular TensorFlow usage, and usage on a TPU. 
- -This doc is aimed at users who: - -* Are familiar with TensorFlow's `Estimator` and `Dataset` APIs -* Have maybe [tried out a Cloud TPU](https://cloud.google.com/tpu/docs/quickstart) - using an existing model. -* Have, perhaps, skimmed the code of an example TPU model - [[1]](https://github.com/tensorflow/models/blob/master/official/mnist/mnist_tpu.py) - [[2]](https://github.com/tensorflow/tpu/tree/master/models). -* Are interested in porting an existing `Estimator` model to - run on Cloud TPUs - -## TPUEstimator - -`tf.estimator.Estimator` are TensorFlow's model-level abstraction. -Standard `Estimators` can drive models on CPU and GPUs. You must use -`tf.contrib.tpu.TPUEstimator` to drive a model on TPUs. - -Refer to TensorFlow's Getting Started section for an introduction to the basics -of using a [pre-made `Estimator`](../guide/premade_estimators.md), and -[custom `Estimator`s](../guide/custom_estimators.md). - -The `TPUEstimator` class differs somewhat from the `Estimator` class. - -The simplest way to maintain a model that can be run both on CPU/GPU or on a -Cloud TPU is to define the model's inference phase (from inputs to predictions) -outside of the `model_fn`. Then maintain separate implementations of the -`Estimator` setup and `model_fn`, both wrapping this inference step. For an -example of this pattern compare the `mnist.py` and `mnist_tpu.py` implementation in -[tensorflow/models](https://github.com/tensorflow/models/tree/master/official/mnist). - -### Running a `TPUEstimator` locally - -To create a standard `Estimator` you call the constructor, and pass it a -`model_fn`, for example: - -``` -my_estimator = tf.estimator.Estimator( - model_fn=my_model_fn) -``` - -The changes required to use a `tf.contrib.tpu.TPUEstimator` on your local -machine are relatively minor. The constructor requires two additional arguments. 
-You should set the `use_tpu` argument to `False`, and pass a -`tf.contrib.tpu.RunConfig` as the `config` argument, as shown below: - -``` python -my_tpu_estimator = tf.contrib.tpu.TPUEstimator( - model_fn=my_model_fn, - config=tf.contrib.tpu.RunConfig(), - use_tpu=False) -``` - -Just this simple change will allow you to run a `TPUEstimator` locally. -The majority of example TPU models can be run in this local mode, -by setting the command line flags as follows: - - -``` -$> python mnist_tpu.py --use_tpu=false --master='' -``` - -Note: This `use_tpu=False` argument is useful for trying out the `TPUEstimator` -API. It is not meant to be a complete TPU compatibility test. Successfully -running a model locally in a `TPUEstimator` does not guarantee that it will -work on a TPU. - - -### Building a `tpu.RunConfig` - -While the default `RunConfig` is sufficient for local training, these settings -cannot be ignored in real usage. - -A more typical setup for a `RunConfig`, that can be switched to use a Cloud -TPU, might be as follows: - -``` python -import tempfile -import subprocess - -class FLAGS(object): - use_tpu=False - tpu_name=None - # Use a local temporary path for the `model_dir` - model_dir = tempfile.mkdtemp() - # Number of training steps to run on the Cloud TPU before returning control. - iterations = 50 - # A single Cloud TPU has 8 shards.
- num_shards = 8 - -if FLAGS.use_tpu: - my_project_name = subprocess.check_output([ - 'gcloud','config','get-value','project']) - my_zone = subprocess.check_output([ - 'gcloud','config','get-value','compute/zone']) - cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( - tpu_names=[FLAGS.tpu_name], - zone=my_zone, - project=my_project_name) - master = cluster_resolver.get_master() -else: - master = '' - -my_tpu_run_config = tf.contrib.tpu.RunConfig( - master=master, - evaluation_master=master, - model_dir=FLAGS.model_dir, - session_config=tf.ConfigProto( - allow_soft_placement=True, log_device_placement=True), - tpu_config=tf.contrib.tpu.TPUConfig(FLAGS.iterations, - FLAGS.num_shards), -) -``` - -Then you must pass the `tf.contrib.tpu.RunConfig` to the constructor: - -``` python -my_tpu_estimator = tf.contrib.tpu.TPUEstimator( - model_fn=my_model_fn, - config = my_tpu_run_config, - use_tpu=FLAGS.use_tpu) -``` - -Typically the `FLAGS` would be set by command line arguments. To switch from -training locally to training on a cloud TPU you would need to: - -* Set `FLAGS.use_tpu` to `True` -* Set `FLAGS.tpu_name` so the `tf.contrib.cluster_resolver.TPUClusterResolver` can find it -* Set `FLAGS.model_dir` to a Google Cloud Storage bucket url (`gs://`). - - -## Optimizer - -When training on a cloud TPU you **must** wrap the optimizer in a -`tf.contrib.tpu.CrossShardOptimizer`, which uses an `allreduce` to aggregate -gradients and broadcast the result to each shard (each TPU core). - -The `CrossShardOptimizer` is not compatible with local training.
So, to have -the same code run both locally and on a Cloud TPU, add lines like the following: - -``` python -optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate) -if FLAGS.use_tpu: - optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer) -``` - -If you prefer to avoid a global `FLAGS` variable in your model code, one -approach is to set the optimizer as one of the `Estimator`'s params, -as follows: - -``` python -my_tpu_estimator = tf.contrib.tpu.TPUEstimator( - model_fn=my_model_fn, - config = my_tpu_run_config, - use_tpu=FLAGS.use_tpu, - params={'optimizer':optimizer}) -``` - -## Model Function - -This section details the changes you must make to the model function -(`model_fn()`) to make it `TPUEstimator` compatible. - -### Static shapes - -During regular usage TensorFlow attempts to determine the shapes of each -`tf.Tensor` during graph construction. During execution any unknown shape -dimensions are determined dynamically, -see [Tensor Shapes](../guide/tensors.md#shape) for more details. - -To run on Cloud TPUs TensorFlow models are compiled using [XLA](../performance/xla/index.md). -XLA uses a similar system for determining shapes at compile time. XLA requires -that all tensor dimensions be statically defined at compile time. All shapes -must evaluate to a constant, and not depend on external data, or stateful -operations like variables or a random number generator. - - -### Summaries - -Remove any use of `tf.summary` from your model. - -[TensorBoard summaries](../guide/summaries_and_tensorboard.md) are a great way to see inside -your model. A minimal set of basic summaries is automatically recorded by the -`TPUEstimator`, to `event` files in the `model_dir`. Custom summaries, however, -are currently unsupported when training on a Cloud TPU. So while the -`TPUEstimator` will still run locally with summaries, it will fail if used on a -TPU. - -### Metrics - -Build your evaluation metrics dictionary in a stand-alone `metric_fn`.
- - - -Evaluation metrics are an essential part of training a model. These are fully -supported on Cloud TPUs, but with a slightly different syntax. - -A standard `tf.metrics` returns two tensors. The first returns the running -average of the metric value, while the second updates the running average and -returns the value for this batch: - -``` -running_average, current_batch = tf.metrics.accuracy(labels, predictions) -``` - -In a standard `Estimator` you create a dictionary of these pairs, and return it -as part of the `EstimatorSpec`. - -```python -my_metrics = {'accuracy': tf.metrics.accuracy(labels, predictions)} - -return tf.estimator.EstimatorSpec( - ... - eval_metric_ops=my_metrics -) -``` - -In a `TPUEstimator` you instead pass a function (which returns a metrics -dictionary) and a list of argument tensors, as shown below: - -```python -def my_metric_fn(labels, predictions): - return {'accuracy': tf.metrics.accuracy(labels, predictions)} - -return tf.contrib.tpu.TPUEstimatorSpec( - ... - eval_metrics=(my_metric_fn, [labels, predictions]) -) -``` - -### Use `TPUEstimatorSpec` - -`TPUEstimatorSpec` do not support hooks, and require function wrappers for -some fields. - -An `Estimator`'s `model_fn` must return an `EstimatorSpec`. An `EstimatorSpec` -is a simple structure of named fields containing all the `tf.Tensors` of the -model that the `Estimator` may need to interact with. - -`TPUEstimators` use a `tf.contrib.tpu.TPUEstimatorSpec`. There are a few -differences between it and a standard `tf.estimator.EstimatorSpec`: - - -* The `eval_metric_ops` must be wrapped into a `metrics_fn`, this field is - renamed `eval_metrics` ([see above](#metrics)). -* The `tf.train.SessionRunHook` are unsupported, so these fields are - omitted. -* The `tf.train.Scaffold`, if used, must also be wrapped in a - function. This field is renamed to `scaffold_fn`. - -`Scaffold` and `Hooks` are for advanced usage, and can typically be omitted. 
- -## Input functions - -Input functions work mainly unchanged as they run on the host computer, not the -Cloud TPU itself. This section explains the two necessary adjustments. - -### Params argument - - - -The `input_fn` for a standard `Estimator` _can_ include a -`params` argument; the `input_fn` for a `TPUEstimator` *must* include a -`params` argument. This is necessary to allow the estimator to set the batch -size for each replica of the input stream. So the minimum signature for an -`input_fn` for a `TPUEstimator` is: - -``` -def my_input_fn(params): - pass -``` - -Where `params['batch_size']` will contain the batch size. - -### Static shapes and batch size - -The input pipeline generated by your `input_fn` is run on CPU. So it is mostly -free from the strict static shape requirements imposed by the XLA/TPU environment. -The one requirement is that the batches of data fed from your input pipeline to -the TPU have a static shape, as determined by the standard TensorFlow shape -inference algorithm. Intermediate tensors are free to have dynamic shapes. -If shape inference has failed, but the shape is known it is possible to -impose the correct shape using `tf.Tensor.set_shape()`. - -In the example below the shape -inference algorithm fails, but it is corrected using `set_shape`: - -``` ->>> x = tf.zeros(tf.constant([1,2,3])+1) ->>> x.shape - -TensorShape([Dimension(None), Dimension(None), Dimension(None)]) - ->>> x.set_shape([2,3,4]) -``` - -In many cases the batch size is the only unknown dimension. - -A typical input pipeline, using `tf.data`, will usually produce batches of a -fixed size. The last batch of a finite `Dataset`, however, is typically smaller, -containing just the remaining elements.
Since a `Dataset` does not know its own -length or finiteness, the standard `tf.data.Dataset.batch` method -cannot determine on its own whether all batches will have a fixed size: - -``` ->>> params = {'batch_size':32} ->>> ds = tf.data.Dataset.from_tensors([0, 1, 2]) ->>> ds = ds.repeat().batch(params['batch_size']) ->>> ds - - -``` - -The most straightforward fix is to -`tf.data.Dataset.apply` `tf.contrib.data.batch_and_drop_remainder` -as follows: - -``` ->>> params = {'batch_size':32} ->>> ds = tf.data.Dataset.from_tensors([0, 1, 2]) ->>> ds = ds.repeat().apply( -... tf.contrib.data.batch_and_drop_remainder(params['batch_size'])) ->>> ds - - <_RestructuredDataset shapes: (32, 3), types: tf.int32> -``` - -The one downside to this approach is that, as the name implies, this batching -method throws out any fractional batch at the end of the dataset. This is fine -for an infinitely repeating dataset being used for training, but could be a -problem if you want to train for an exact number of epochs. - -To do an exact 1-epoch of _evaluation_ you can work around this by manually -padding the length of the batches, and setting the padding entries to have zero -weight when creating your `tf.metrics`. - -## Datasets - -Efficient use of the `tf.data.Dataset` API is critical when using a Cloud -TPU, as it is impossible to use the Cloud TPU unless you can feed it data -quickly enough. See [Input Pipeline Performance Guide](../performance/datasets_performance.md) for details on dataset performance. - -For all but the simplest experimentation (using -`tf.data.Dataset.from_tensor_slices` or other in-graph data) you will need to -store all data files read by the `TPUEstimator`'s `Dataset` in Google Cloud -Storage Buckets. - - - -For most use-cases, we recommend converting your data into `TFRecord` -format and using a `tf.data.TFRecordDataset` to read it.
This, however, is not -a hard requirement and you can use other dataset readers -(`FixedLengthRecordDataset` or `TextLineDataset`) if you prefer. - -Small datasets can be loaded entirely into memory using -`tf.data.Dataset.cache`. - -Regardless of the data format used, it is strongly recommended that you -[use large files](../performance/performance_guide.md#use_large_files), on the order of -100MB. This is especially important in this networked setting as the overhead -of opening a file is significantly higher. - -It is also important, regardless of the type of reader used, to enable buffering -using the `buffer_size` argument to the constructor. This argument is specified -in bytes. A minimum of a few MB (`buffer_size=8*1024*1024`) is recommended so -that data is available when needed. - -The TPU-demos repo includes -[a script](https://github.com/tensorflow/tpu/blob/master/tools/datasets/imagenet_to_gcs.py) -for downloading the imagenet dataset and converting it to an appropriate format. -This together with the imagenet -[models](https://github.com/tensorflow/tpu/tree/master/models) -included in the repo demonstrate all of these best-practices. - - -## What Next - -For details on how to actually set up and run a Cloud TPU see: - - * [Google Cloud TPU Documentation](https://cloud.google.com/tpu/docs/) - -This document is by no means exhaustive. 
The best source of more detail on how -to make a Cloud TPU compatible model are the example models published in: - - * The [TPU Demos Repository.](https://github.com/tensorflow/tpu) - -For more information about tuning TensorFlow code for performance see: - - * The [Performance Section.](../performance/index.md) - diff --git a/tensorflow/docs_src/guide/variables.md b/tensorflow/docs_src/guide/variables.md deleted file mode 100644 index 5d5d73394c..0000000000 --- a/tensorflow/docs_src/guide/variables.md +++ /dev/null @@ -1,319 +0,0 @@ -# Variables - -A TensorFlow **variable** is the best way to represent shared, persistent state -manipulated by your program. - -Variables are manipulated via the `tf.Variable` class. A `tf.Variable` -represents a tensor whose value can be changed by running ops on it. Unlike -`tf.Tensor` objects, a `tf.Variable` exists outside the context of a single -`session.run` call. - -Internally, a `tf.Variable` stores a persistent tensor. Specific ops allow you -to read and modify the values of this tensor. These modifications are visible -across multiple `tf.Session`s, so multiple workers can see the same values for a -`tf.Variable`. - -## Creating a Variable - -The best way to create a variable is to call the `tf.get_variable` -function. This function requires you to specify the Variable's name. This name -will be used by other replicas to access the same variable, as well as to name -this variable's value when checkpointing and exporting models. `tf.get_variable` -also allows you to reuse a previously created variable of the same name, making it -easy to define models which reuse layers. - -To create a variable with `tf.get_variable`, simply provide the name and shape - -``` python -my_variable = tf.get_variable("my_variable", [1, 2, 3]) -``` - -This creates a variable named "my_variable" which is a three-dimensional tensor -with shape `[1, 2, 3]`. 
This variable will, by default, have the `dtype` -`tf.float32` and its initial value will be randomized via -`tf.glorot_uniform_initializer`. - -You may optionally specify the `dtype` and initializer to `tf.get_variable`. For -example: - -``` python -my_int_variable = tf.get_variable("my_int_variable", [1, 2, 3], dtype=tf.int32, - initializer=tf.zeros_initializer) -``` - -TensorFlow provides many convenient initializers. Alternatively, you may -initialize a `tf.Variable` to have the value of a `tf.Tensor`. For example: - -``` python -other_variable = tf.get_variable("other_variable", dtype=tf.int32, - initializer=tf.constant([23, 42])) -``` - -Note that when the initializer is a `tf.Tensor` you should not specify the -variable's shape, as the shape of the initializer tensor will be used. - - - -### Variable collections - -Because disconnected parts of a TensorFlow program might want to create -variables, it is sometimes useful to have a single way to access all of -them. For this reason TensorFlow provides **collections**, which are named lists -of tensors or other objects, such as `tf.Variable` instances. - -By default every `tf.Variable` gets placed in the following two collections: - - * `tf.GraphKeys.GLOBAL_VARIABLES` --- variables that can be shared across - multiple devices, - * `tf.GraphKeys.TRAINABLE_VARIABLES` --- variables for which TensorFlow will - calculate gradients. - -If you don't want a variable to be trainable, add it to the -`tf.GraphKeys.LOCAL_VARIABLES` collection instead. For example, the following -snippet demonstrates how to add a variable named `my_local` to this collection: - -``` python -my_local = tf.get_variable("my_local", shape=(), -collections=[tf.GraphKeys.LOCAL_VARIABLES]) -``` - -Alternatively, you can specify `trainable=False` as an argument to -`tf.get_variable`: - -``` python -my_non_trainable = tf.get_variable("my_non_trainable", - shape=(), - trainable=False) -``` - - -You can also use your own collections. 
Any string is a valid collection name, -and there is no need to explicitly create a collection. To add a variable (or -any other object) to a collection after creating the variable, call -`tf.add_to_collection`. For example, the following code adds an existing -variable named `my_local` to a collection named `my_collection_name`: - -``` python -tf.add_to_collection("my_collection_name", my_local) -``` - -And to retrieve a list of all the variables (or other objects) you've placed in -a collection you can use: - -``` python -tf.get_collection("my_collection_name") -``` - -### Device placement - -Just like any other TensorFlow operation, you can place variables on particular -devices. For example, the following snippet creates a variable named `v` and -places it on the second GPU device: - -``` python -with tf.device("/device:GPU:1"): - v = tf.get_variable("v", [1]) -``` - -It is particularly important for variables to be in the correct device in -distributed settings. Accidentally putting variables on workers instead of -parameter servers, for example, can severely slow down training or, in the worst -case, let each worker blithely forge ahead with its own independent copy of each -variable. For this reason we provide `tf.train.replica_device_setter`, which -can automatically place variables in parameter servers. For example: - -``` python -cluster_spec = { - "ps": ["ps0:2222", "ps1:2222"], - "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]} -with tf.device(tf.train.replica_device_setter(cluster=cluster_spec)): - v = tf.get_variable("v", shape=[20, 20]) # this variable is placed - # in the parameter server - # by the replica_device_setter -``` - -## Initializing variables - -Before you can use a variable, it must be initialized. If you are programming in -the low-level TensorFlow API (that is, you are explicitly creating your own -graphs and sessions), you must explicitly initialize the variables. 
Most -high-level frameworks such as `tf.contrib.slim`, `tf.estimator.Estimator` and -`Keras` automatically initialize variables for you before training a model. - -Explicit initialization is otherwise useful because it allows you not to rerun -potentially expensive initializers when reloading a model from a checkpoint as -well as allowing determinism when randomly-initialized variables are shared in a -distributed setting. - -To initialize all trainable variables in one go, before training starts, call -`tf.global_variables_initializer()`. This function returns a single operation -responsible for initializing all variables in the -`tf.GraphKeys.GLOBAL_VARIABLES` collection. Running this operation initializes -all variables. For example: - -``` python -session.run(tf.global_variables_initializer()) -# Now all variables are initialized. -``` - -If you do need to initialize variables yourself, you can run the variable's -initializer operation. For example: - -``` python -session.run(my_variable.initializer) -``` - - -You can also ask which variables have still not been initialized. For example, -the following code prints the names of all variables which have not yet been -initialized: - -``` python -print(session.run(tf.report_uninitialized_variables())) -``` - - -Note that by default `tf.global_variables_initializer` does not specify the -order in which variables are initialized. Therefore, if the initial value of a -variable depends on another variable's value, it's likely that you'll get an -error. 
Any time you use the value of a variable in a context in which not all -variables are initialized (say, if you use a variable's value while initializing -another variable), it is best to use `variable.initialized_value()` instead of -`variable`: - -``` python -v = tf.get_variable("v", shape=(), initializer=tf.zeros_initializer()) -w = tf.get_variable("w", initializer=v.initialized_value() + 1) -``` - -## Using variables - -To use the value of a `tf.Variable` in a TensorFlow graph, simply treat it like -a normal `tf.Tensor`: - -``` python -v = tf.get_variable("v", shape=(), initializer=tf.zeros_initializer()) -w = v + 1 # w is a tf.Tensor which is computed based on the value of v. - # Any time a variable is used in an expression it gets automatically - # converted to a tf.Tensor representing its value. -``` - -To assign a value to a variable, use the methods `assign`, `assign_add`, and -friends in the `tf.Variable` class. For example, here is how you can call these -methods: - -``` python -v = tf.get_variable("v", shape=(), initializer=tf.zeros_initializer()) -assignment = v.assign_add(1) -tf.global_variables_initializer().run() -sess.run(assignment) # or assignment.op.run(), or assignment.eval() -``` - -Most TensorFlow optimizers have specialized ops that efficiently update the -values of variables according to some gradient descent-like algorithm. See -`tf.train.Optimizer` for an explanation of how to use optimizers. - -Because variables are mutable it's sometimes useful to know what version of a -variable's value is being used at any point in time. To force a re-read of the -value of a variable after something has happened, you can use -`tf.Variable.read_value`. For example: - -``` python -v = tf.get_variable("v", shape=(), initializer=tf.zeros_initializer()) -assignment = v.assign_add(1) -with tf.control_dependencies([assignment]): - w = v.read_value() # w is guaranteed to reflect v's value after the - # assign_add operation. 
-``` - - -## Sharing variables - -TensorFlow supports two ways of sharing variables: - - * Explicitly passing `tf.Variable` objects around. - * Implicitly wrapping `tf.Variable` objects within `tf.variable_scope` objects. - -While code which explicitly passes variables around is very clear, it is -sometimes convenient to write TensorFlow functions that implicitly use -variables in their implementations. Most of the functional layers from -`tf.layers` use this approach, as well as all `tf.metrics`, and a few other -library utilities. - -Variable scopes allow you to control variable reuse when calling functions which -implicitly create and use variables. They also allow you to name your variables -in a hierarchical and understandable way. - -For example, let's say we write a function to create a convolutional / relu -layer: - -```python -def conv_relu(input, kernel_shape, bias_shape): - # Create variable named "weights". - weights = tf.get_variable("weights", kernel_shape, - initializer=tf.random_normal_initializer()) - # Create variable named "biases". - biases = tf.get_variable("biases", bias_shape, - initializer=tf.constant_initializer(0.0)) - conv = tf.nn.conv2d(input, weights, - strides=[1, 1, 1, 1], padding='SAME') - return tf.nn.relu(conv + biases) -``` - -This function uses short names `weights` and `biases`, which is good for -clarity. In a real model, however, we want many such convolutional layers, and -calling this function repeatedly would not work: - -``` python -input1 = tf.random_normal([1,10,10,32]) -input2 = tf.random_normal([1,20,20,32]) -x = conv_relu(input1, kernel_shape=[5, 5, 32, 32], bias_shape=[32]) -x = conv_relu(x, kernel_shape=[5, 5, 32, 32], bias_shape = [32]) # This fails. -``` - -Since the desired behavior is unclear (create new variables or reuse the -existing ones?) TensorFlow will fail. 
Calling `conv_relu` in different scopes, -however, clarifies that we want to create new variables: - -```python -def my_image_filter(input_images): - with tf.variable_scope("conv1"): - # Variables created here will be named "conv1/weights", "conv1/biases". - relu1 = conv_relu(input_images, [5, 5, 32, 32], [32]) - with tf.variable_scope("conv2"): - # Variables created here will be named "conv2/weights", "conv2/biases". - return conv_relu(relu1, [5, 5, 32, 32], [32]) -``` - -If you do want the variables to be shared, you have two options. First, you can -create a scope with the same name using `reuse=True`: - -``` python -with tf.variable_scope("model"): - output1 = my_image_filter(input1) -with tf.variable_scope("model", reuse=True): - output2 = my_image_filter(input2) - -``` - -You can also call `scope.reuse_variables()` to trigger a reuse: - -``` python -with tf.variable_scope("model") as scope: - output1 = my_image_filter(input1) - scope.reuse_variables() - output2 = my_image_filter(input2) - -``` - -Since depending on exact string names of scopes can feel dangerous, it's also -possible to initialize a variable scope based on another one: - -``` python -with tf.variable_scope("model") as scope: - output1 = my_image_filter(input1) -with tf.variable_scope(scope, reuse=True): - output2 = my_image_filter(input2) - -``` - diff --git a/tensorflow/docs_src/guide/version_compat.md b/tensorflow/docs_src/guide/version_compat.md deleted file mode 100644 index de93d225e3..0000000000 --- a/tensorflow/docs_src/guide/version_compat.md +++ /dev/null @@ -1,327 +0,0 @@ -# TensorFlow Version Compatibility - -This document is for users who need backwards compatibility across different -versions of TensorFlow (either for code or data), and for developers who want -to modify TensorFlow while preserving compatibility. - -## Semantic Versioning 2.0 - -TensorFlow follows Semantic Versioning 2.0 ([semver](http://semver.org)) for its -public API. 
Each release version of TensorFlow has the form `MAJOR.MINOR.PATCH`. -For example, TensorFlow version 1.2.3 has `MAJOR` version 1, `MINOR` version 2, -and `PATCH` version 3. Changes to each number have the following meaning: - -* **MAJOR**: Potentially backwards incompatible changes. Code and data that - worked with a previous major release will not necessarily work with the new - release. However, in some cases existing TensorFlow graphs and checkpoints - may be migratable to the newer release; see - [Compatibility of graphs and checkpoints](#compatibility_of_graphs_and_checkpoints) - for details on data compatibility. - -* **MINOR**: Backwards compatible features, speed improvements, etc. Code and - data that worked with a previous minor release *and* which depends only on the - public API will continue to work unchanged. For details on what is and is - not the public API, see [What is covered](#what_is_covered). - -* **PATCH**: Backwards compatible bug fixes. - -For example, release 1.0.0 introduced backwards *incompatible* changes from -release 0.12.1. However, release 1.1.1 was backwards *compatible* with release -1.0.0. - -## What is covered - -Only the public APIs of TensorFlow are backwards compatible across minor and -patch versions. The public APIs consist of - -* All the documented [Python](../api_docs/python) functions and classes in the - `tensorflow` module and its submodules, except for - * functions and classes in `tf.contrib` - * functions and classes whose names start with `_` (as these are private) - * functions, arguments, properties and classes whose name starts with - `experimental`, or whose fully qualified name includes a module called - `experimental` - Note that the code in the `examples/` and `tools/` directories is not - reachable through the `tensorflow` Python module and is thus not covered by - the compatibility guarantee. 
- - If a symbol is available through the `tensorflow` Python module or its - submodules, but is not documented, then it is **not** considered part of the - public API. - -* The [C API](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/c/c_api.h). - -* The following protocol buffer files: - * [`attr_value`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/attr_value.proto) - * [`config`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/protobuf/config.proto) - * [`event`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/util/event.proto) - * [`graph`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/graph.proto) - * [`op_def`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/op_def.proto) - * [`reader_base`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/reader_base.proto) - * [`summary`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/summary.proto) - * [`tensor`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/tensor.proto) - * [`tensor_shape`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/tensor_shape.proto) - * [`types`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/types.proto) - - -## What is *not* covered - -Some API functions are explicitly marked as "experimental" and can change in -backward incompatible ways between minor releases. These include: - -* **Experimental APIs**: The `tf.contrib` module and its submodules in Python - and any functions in the C API or fields in protocol buffers that are - explicitly commented as being experimental. In particular, any field in a - protocol buffer which is called "experimental" and all its fields and - submessages can change at any time. 
- -* **Other languages**: TensorFlow APIs in languages other than Python and C, - such as: - - - [C++](../api_guides/cc/guide.md) (exposed through header files in - [`tensorflow/cc`](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/cc)). - - [Java](../api_docs/java/reference/org/tensorflow/package-summary), - - [Go](https://godoc.org/github.com/tensorflow/tensorflow/tensorflow/go) - - [JavaScript](https://js.tensorflow.org) - -* **Details of composite ops:** Many public functions in Python expand to - several primitive ops in the graph, and these details will be part of any - graphs saved to disk as `GraphDef`s. These details may change for - minor releases. In particular, regressions tests that check for exact - matching between graphs are likely to break across minor releases, even - though the behavior of the graph should be unchanged and existing - checkpoints will still work. - -* **Floating point numerical details:** The specific floating point values - computed by ops may change at any time. Users should rely only on - approximate accuracy and numerical stability, not on the specific bits - computed. Changes to numerical formulas in minor and patch releases should - result in comparable or improved accuracy, with the caveat that in machine - learning improved accuracy of specific formulas may result in decreased - accuracy for the overall system. - -* **Random numbers:** The specific random numbers computed by the - [random ops](../api_guides/python/constant_op.md#Random_Tensors) may change at any time. - Users should rely only on approximately correct distributions and - statistical strength, not the specific bits computed. However, we will make - changes to random bits rarely (or perhaps never) for patch releases. We - will, of course, document all such changes. - -* **Version skew in distributed Tensorflow:** Running two different versions - of TensorFlow in a single cluster is unsupported. 
There are no guarantees - about backwards compatibility of the wire protocol. - -* **Bugs:** We reserve the right to make backwards incompatible behavior - (though not API) changes if the current implementation is clearly broken, - that is, if it contradicts the documentation or if a well-known and - well-defined intended behavior is not properly implemented due to a bug. - For example, if an optimizer claims to implement a well-known optimization - algorithm but does not match that algorithm due to a bug, then we will fix - the optimizer. Our fix may break code relying on the wrong behavior for - convergence. We will note such changes in the release notes. - -* **Error messages:** We reserve the right to change the text of error - messages. In addition, the type of an error may change unless the type is - specified in the documentation. For example, a function documented to - raise an `InvalidArgument` exception will continue to - raise `InvalidArgument`, but the human-readable message contents can change. - -## Compatibility of graphs and checkpoints - -You'll sometimes need to preserve graphs and checkpoints. -Graphs describe the data flow of ops to be run during training and -inference, and checkpoints contain the saved tensor values of variables in a -graph. - -Many TensorFlow users save graphs and trained models to disk for -later evaluation or additional training, but end up running their saved graphs -or models on a later release. In compliance with semver, any graph or checkpoint -written out with one version of TensorFlow can be loaded and evaluated with a -later version of TensorFlow with the same major release. However, we will -endeavor to preserve backwards compatibility even across major releases when -possible, so that the serialized files are usable over long periods of time. - - -Graphs are serialized via the `GraphDef` protocol buffer. 
To facilitate (rare) -backwards incompatible changes to graphs, each `GraphDef` has a version number -separate from the TensorFlow version. For example, `GraphDef` version 17 -deprecated the `inv` op in favor of `reciprocal`. The semantics are: - -* Each version of TensorFlow supports an interval of `GraphDef` versions. This - interval will be constant across patch releases, and will only grow across - minor releases. Dropping support for a `GraphDef` version will only occur - for a major release of TensorFlow. - -* Newly created graphs are assigned the latest `GraphDef` version number. - -* If a given version of TensorFlow supports the `GraphDef` version of a graph, - it will load and evaluate with the same behavior as the TensorFlow version - used to generate it (except for floating point numerical details and random - numbers), regardless of the major version of TensorFlow. In particular, all - checkpoint files will be compatible. - -* If the `GraphDef` *upper* bound is increased to X in a (minor) release, there - will be at least six months before the *lower* bound is increased to X. For - example (we're using hypothetical version numbers here): - * TensorFlow 1.2 might support `GraphDef` versions 4 to 7. - * TensorFlow 1.3 could add `GraphDef` version 8 and support versions 4 to 8. - * At least six months later, TensorFlow 2.0.0 could drop support for - versions 4 to 7, leaving version 8 only. - -Finally, when support for a `GraphDef` version is dropped, we will attempt to -provide tools for automatically converting graphs to a newer supported -`GraphDef` version. - -## Graph and checkpoint compatibility when extending TensorFlow - -This section is relevant only when making incompatible changes to the `GraphDef` -format, such as when adding ops, removing ops, or changing the functionality -of existing ops. The previous section should suffice for most users. 
- - - -### Backward and partial forward compatibility - -Our versioning scheme has three requirements: - -* **Backward compatibility** to support loading graphs and checkpoints - created with older versions of TensorFlow. -* **Forward compatibility** to support scenarios where the producer of a - graph or checkpoint is upgraded to a newer version of TensorFlow before - the consumer. -* Enable evolving TensorFlow in incompatible ways. For example, removing ops, - adding attributes, and removing attributes. - -Note that while the `GraphDef` version mechanism is separate from the TensorFlow -version, backwards incompatible changes to the `GraphDef` format are still -restricted by Semantic Versioning. This means functionality can only be removed -or changed between `MAJOR` versions of TensorFlow (such as `1.7` to `2.0`). -Additionally, forward compatibility is enforced within Patch releases (`1.x.1` -to `1.x.2` for example). - -To achieve backward and forward compatibility and to know when to enforce changes -in formats, graphs and checkpoints have metadata that describes when they -were produced. The sections below detail the TensorFlow implementation and -guidelines for evolving `GraphDef` versions. - -### Independent data version schemes - -There are different data versions for graphs and checkpoints. The two data -formats evolve at different rates from each other and also at different rates -from TensorFlow. Both versioning systems are defined in -[`core/public/version.h`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/public/version.h). -Whenever a new version is added, a note is added to the header detailing what -changed and the date. - -### Data, producers, and consumers - -We distinguish between the following kinds of data version information: -* **producers**: binaries that produce data. Producers have a version - (`producer`) and a minimum consumer version that they are compatible with - (`min_consumer`). 
-* **consumers**: binaries that consume data. Consumers have a version - (`consumer`) and a minimum producer version that they are compatible with - (`min_producer`). - -Each piece of versioned data has a [`VersionDef -versions`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/versions.proto) -field which records the `producer` that made the data, the `min_consumer` -that it is compatible with, and a list of `bad_consumers` versions that are -disallowed. - -By default, when a producer makes some data, the data inherits the producer's -`producer` and `min_consumer` versions. `bad_consumers` can be set if specific -consumer versions are known to contain bugs and must be avoided. A consumer can -accept a piece of data if the following are all true: - -* `consumer` >= data's `min_consumer` -* data's `producer` >= consumer's `min_producer` -* `consumer` not in data's `bad_consumers` - -Since both producers and consumers come from the same TensorFlow code base, -[`core/public/version.h`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/public/version.h) -contains a main data version which is treated as either `producer` or -`consumer` depending on context and both `min_consumer` and `min_producer` -(needed by producers and consumers, respectively). Specifically, - -* For `GraphDef` versions, we have `TF_GRAPH_DEF_VERSION`, - `TF_GRAPH_DEF_VERSION_MIN_CONSUMER`, and - `TF_GRAPH_DEF_VERSION_MIN_PRODUCER`. -* For checkpoint versions, we have `TF_CHECKPOINT_VERSION`, - `TF_CHECKPOINT_VERSION_MIN_CONSUMER`, and - `TF_CHECKPOINT_VERSION_MIN_PRODUCER`. - -### Add a new attribute with default to an existing op - -Following the guidance below gives you forward compatibility only if the set of -ops has not changed: - -1. 
If forward compatibility is desired, set `strip_default_attrs` to `True` - while exporting the model using either the - `tf.saved_model.builder.SavedModelBuilder.add_meta_graph_and_variables` - and `tf.saved_model.builder.SavedModelBuilder.add_meta_graph` - methods of the `SavedModelBuilder` class, or - `tf.estimator.Estimator.export_savedmodel` -2. This strips off the default valued attributes at the time of - producing/exporting the models. This makes sure that the exported - `tf.MetaGraphDef` does not contain the new op-attribute when the default - value is used. -3. Having this control could allow out-of-date consumers (for example, serving - binaries that lag behind training binaries) to continue loading the models - and prevent interruptions in model serving. - -### Evolving GraphDef versions - -This section explains how to use this versioning mechanism to make different -types of changes to the `GraphDef` format. - -#### Add an op - -Add the new op to both consumers and producers at the same time, and do not -change any `GraphDef` versions. This type of change is automatically -backward compatible, and does not impact forward compatibility plan since -existing producer scripts will not suddenly use the new functionality. - -#### Add an op and switch existing Python wrappers to use it - -1. Implement new consumer functionality and increment the `GraphDef` version. -2. If it is possible to make the wrappers use the new functionality only in - cases that did not work before, the wrappers can be updated now. -3. Change Python wrappers to use the new functionality. Do not increment - `min_consumer`, since models that do not use this op should not break. - -#### Remove or restrict an op's functionality - -1. Fix all producer scripts (not TensorFlow itself) to not use the banned op or - functionality. -2. 
Increment the `GraphDef` version and implement new consumer functionality - that bans the removed op or functionality for GraphDefs at the new version - and above. If possible, make TensorFlow stop producing `GraphDefs` with the - banned functionality. To do so, add the - [`REGISTER_OP(...).Deprecated(deprecated_at_version, - message)`](https://github.com/tensorflow/tensorflow/blob/b289bc7a50fc0254970c60aaeba01c33de61a728/tensorflow/core/ops/array_ops.cc#L1009). -3. Wait for a major release for backward compatibility purposes. -4. Increase `min_producer` to the GraphDef version from (2) and remove the - functionality entirely. - -#### Change an op's functionality - -1. Add a new similar op named `SomethingV2` or similar and go through the - process of adding it and switching existing Python wrappers to use it. - To ensure forward compatibility use the checks suggested in - [compat.py](https://www.tensorflow.org/code/tensorflow/python/compat/compat.py) - when changing the Python wrappers. -2. Remove the old op (Can only take place with a major version change due to - backward compatibility). -3. Increase `min_consumer` to rule out consumers with the old op, add back the - old op as an alias for `SomethingV2`, and go through the process to switch - existing Python wrappers to use it. -4. Go through the process to remove `SomethingV2`. - -#### Ban a single unsafe consumer version - -1. Bump the `GraphDef` version and add the bad version to `bad_consumers` for - all new GraphDefs. If possible, add to `bad_consumers` only for GraphDefs - which contain a certain op or similar. -2. If existing consumers have the bad version, push them out as soon as - possible. 
diff --git a/tensorflow/docs_src/install/index.md b/tensorflow/docs_src/install/index.md deleted file mode 100644 index 76e590e1e1..0000000000 --- a/tensorflow/docs_src/install/index.md +++ /dev/null @@ -1,39 +0,0 @@ -# Install TensorFlow - -Note: Run the [TensorFlow tutorials](../tutorials) in a pre-configured -[Colab notebook environment](https://colab.research.google.com/notebooks/welcome.ipynb){: .external}, -without installation. - -TensorFlow is built and tested on the following 64-bit operating systems: - - * macOS 10.12.6 (Sierra) or later. - * Ubuntu 16.04 or later - * Windows 7 or later. - * Raspbian 9.0 or later. - -While TensorFlow may work on other systems, we only support—and fix issues in—the -systems listed above. - -The following guides explain how to install a version of TensorFlow -that enables you to write applications in Python: - - * [Install TensorFlow on Ubuntu](../install/install_linux.md) - * [Install TensorFlow on macOS](../install/install_mac.md) - * [Install TensorFlow on Windows](../install/install_windows.md) - * [Install TensorFlow on a Raspberry Pi](../install/install_raspbian.md) - * [Install TensorFlow from source code](../install/install_sources.md) - -Many aspects of the Python TensorFlow API changed from version 0.n to 1.0. -The following guide explains how to migrate older TensorFlow applications -to Version 1.0: - - * [Transition to TensorFlow 1.0](../install/migration.md) - -The following guides explain how to install TensorFlow libraries for use in -other programming languages. These APIs are aimed at deploying TensorFlow -models in applications and are not as extensive as the Python APIs. 
- - * [Install TensorFlow for Java](../install/install_java.md) - * [Install TensorFlow for C](../install/install_c.md) - * [Install TensorFlow for Go](../install/install_go.md) - diff --git a/tensorflow/docs_src/install/install_c.md b/tensorflow/docs_src/install/install_c.md deleted file mode 100644 index 084634bc9c..0000000000 --- a/tensorflow/docs_src/install/install_c.md +++ /dev/null @@ -1,118 +0,0 @@ -# Install TensorFlow for C - -TensorFlow provides a C API defined in -[`c_api.h`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/c/c_api.h), -which is suitable for -[building bindings for other languages](https://www.tensorflow.org/extend/language_bindings). -The API leans towards simplicity and uniformity rather than convenience. - - -## Supported Platforms - -This guide explains how to install TensorFlow for C. Although these -instructions might also work on other variants, we have only tested -(and we only support) these instructions on machines meeting the -following requirements: - - * Linux, 64-bit, x86 - * macOS X, Version 10.12.6 (Sierra) or higher - - -## Installation - -Take the following steps to install the TensorFlow for C library and -enable TensorFlow for C: - - 1. Decide whether you will run TensorFlow for C on CPU(s) only or - with the help of GPU(s). To help you decide, read the section - entitled "Determine which TensorFlow to install" in one of the - following guides: - - * [Installing TensorFlow on Linux](../install/install_linux.md#determine_which_tensorflow_to_install) - * [Installing TensorFlow on macOS](../install/install_mac.md#determine_which_tensorflow_to_install) - - 2. 
Download and extract the TensorFlow C library into `/usr/local/lib` by - invoking the following shell commands: - - TF_TYPE="cpu" # Change to "gpu" for GPU support - OS="linux" # Change to "darwin" for macOS - TARGET_DIRECTORY="/usr/local" - curl -L \ - "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.10.0.tar.gz" | - sudo tar -C $TARGET_DIRECTORY -xz - - The `tar` command extracts the TensorFlow C library into the `lib` - subdirectory of `TARGET_DIRECTORY`. For example, specifying `/usr/local` - as `TARGET_DIRECTORY` causes `tar` to extract the TensorFlow C library - into `/usr/local/lib`. - - If you'd prefer to extract the library into a different directory, - adjust `TARGET_DIRECTORY` accordingly. - - 3. In Step 2, if you specified a system directory (for example, `/usr/local`) - as the `TARGET_DIRECTORY`, then run `ldconfig` to configure the linker. - For example: - -
sudo ldconfig
- - If you assigned a `TARGET_DIRECTORY` other than a system - directory (for example, `~/mydir`), then you must append the extraction - directory (for example, `~/mydir/lib`) to two environment variables. - For example: - -
 export LIBRARY_PATH=$LIBRARY_PATH:~/mydir/lib # For both Linux and macOS X
-     export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/mydir/lib # For Linux only
-     export DYLD_LIBRARY_PATH=$DYLD_LIBRARY_PATH:~/mydir/lib # For macOS X only
- - - -## Validate your installation - -After installing TensorFlow for C, enter the following code into a file named -`hello_tf.c`: - -```c -#include -#include - -int main() { - printf("Hello from TensorFlow C library version %s\n", TF_Version()); - return 0; -} -``` - -### Build and Run - -Build `hello_tf.c` by invoking the following command: - - -
gcc hello_tf.c -ltensorflow
- - -Running the resulting executable should output the following message: - - -
./a.out
-Hello from TensorFlow C library version number
- - -### Troubleshooting - -If building the program fails, the most likely culprit is that `gcc` cannot -find the TensorFlow C library. One way to fix this problem is to specify -the `-I` and `-L` options to `gcc`. For example, if the `TARGET_DIRECTORY` -was `/usr/local`, you would invoke `gcc` as follows: - -
gcc -I/usr/local/include -L/usr/local/lib hello_tf.c -ltensorflow
- -If executing `a.out` fails, ask yourself the following questions: - - * Did the program build without error? - * Have you assigned the correct directory to the environment variables - noted in Step 3 of [Installation](#installation)? - * Did you export those environment variables? - -If you are still seeing build or execution error messages, search (or post to) -[StackOverflow](https://stackoverflow.com/questions/tagged/tensorflow) for -possible solutions. - diff --git a/tensorflow/docs_src/install/install_go.md b/tensorflow/docs_src/install/install_go.md deleted file mode 100644 index 0c604d7713..0000000000 --- a/tensorflow/docs_src/install/install_go.md +++ /dev/null @@ -1,142 +0,0 @@ -# Install TensorFlow for Go - -TensorFlow provides APIs for use in Go programs. These APIs are particularly -well-suited to loading models created in Python and executing them within -a Go application. This guide explains how to install and set up the -[TensorFlow Go package](https://godoc.org/github.com/tensorflow/tensorflow/tensorflow/go). - -Warning: The TensorFlow Go API is *not* covered by the TensorFlow -[API stability guarantees](../guide/version_compat.md). - - -## Supported Platforms - -This guide explains how to install TensorFlow for Go. Although these -instructions might also work on other variants, we have only tested -(and we only support) these instructions on machines meeting the -following requirements: - - * Linux, 64-bit, x86 - * macOS X, 10.12.6 (Sierra) or higher - - -## Installation - -TensorFlow for Go depends on the TensorFlow C library. Take the following -steps to install this library and enable TensorFlow for Go: - - 1. Decide whether you will run TensorFlow for Go on CPU(s) only or with - the help of GPU(s). 
To help you decide, read the section entitled - "Determine which TensorFlow to install" in one of the following guides: - - * [Installing TensorFlow on Linux](../install/install_linux.md#determine_which_tensorflow_to_install) - * [Installing TensorFlow on macOS](../install/install_mac.md#determine_which_tensorflow_to_install) - - 2. Download and extract the TensorFlow C library into `/usr/local/lib` by - invoking the following shell commands: - - TF_TYPE="cpu" # Change to "gpu" for GPU support - TARGET_DIRECTORY='/usr/local' - curl -L \ - "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.10.0.tar.gz" | - sudo tar -C $TARGET_DIRECTORY -xz - - The `tar` command extracts the TensorFlow C library into the `lib` - subdirectory of `TARGET_DIRECTORY`. For example, specifying `/usr/local` - as `TARGET_DIRECTORY` causes `tar` to extract the TensorFlow C library - into `/usr/local/lib`. - - If you'd prefer to extract the library into a different directory, - adjust `TARGET_DIRECTORY` accordingly. - - 3. In Step 2, if you specified a system directory (for example, `/usr/local`) - as the `TARGET_DIRECTORY`, then run `ldconfig` to configure the linker. - For example: - -
sudo ldconfig
- - If you assigned a `TARGET_DIRECTORY` other than a system - directory (for example, `~/mydir`), then you must append the extraction - directory (for example, `~/mydir/lib`) to two environment variables - as follows: - -
 export LIBRARY_PATH=$LIBRARY_PATH:~/mydir/lib # For both Linux and macOS X
-     export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/mydir/lib # For Linux only
-     export DYLD_LIBRARY_PATH=$DYLD_LIBRARY_PATH:~/mydir/lib # For macOS X only
- - 4. Now that the TensorFlow C library is installed, invoke `go get` as follows - to download the appropriate packages and their dependencies: - -
go get github.com/tensorflow/tensorflow/tensorflow/go
- - 5. Invoke `go test` as follows to validate the TensorFlow for Go - installation: - -
go test github.com/tensorflow/tensorflow/tensorflow/go
- -If `go get` or `go test` generate error messages, search (or post to) -[StackOverflow](http://www.stackoverflow.com/questions/tagged/tensorflow) -for possible solutions. - - -## Hello World - -After installing TensorFlow for Go, enter the following code into a -file named `hello_tf.go`: - -```go -package main - -import ( - tf "github.com/tensorflow/tensorflow/tensorflow/go" - "github.com/tensorflow/tensorflow/tensorflow/go/op" - "fmt" -) - -func main() { - // Construct a graph with an operation that produces a string constant. - s := op.NewScope() - c := op.Const(s, "Hello from TensorFlow version " + tf.Version()) - graph, err := s.Finalize() - if err != nil { - panic(err) - } - - // Execute the graph in a session. - sess, err := tf.NewSession(graph, nil) - if err != nil { - panic(err) - } - output, err := sess.Run(nil, []tf.Output{c}, nil) - if err != nil { - panic(err) - } - fmt.Println(output[0].Value()) -} -``` - -For a more advanced example of TensorFlow in Go, look at the -[example in the API documentation](https://godoc.org/github.com/tensorflow/tensorflow/tensorflow/go#ex-package), -which uses a pre-trained TensorFlow model to label contents of an image. - - -### Running - -Run `hello_tf.go` by invoking the following command: - -
go run hello_tf.go
-Hello from TensorFlow version number
- -The program might also generate multiple warning messages of the -following form, which you can ignore: - -
W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library
-wasn't compiled to use *Type* instructions, but these are available on your
-machine and could speed up CPU computations.
- - -## Building from source code - -TensorFlow is open-source. You may build TensorFlow for Go from the -TensorFlow source code by following the instructions in a -[separate document](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/go/README.md). diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md deleted file mode 100644 index c411cb78fe..0000000000 --- a/tensorflow/docs_src/install/install_java.md +++ /dev/null @@ -1,268 +0,0 @@ -# Install TensorFlow for Java - -TensorFlow provides APIs for use in Java programs. These APIs are particularly -well-suited to loading models created in Python and executing them within a -Java application. This guide explains how to install -[TensorFlow for Java](https://www.tensorflow.org/api_docs/java/reference/org/tensorflow/package-summary) -and use it in a Java application. - -Warning: The TensorFlow Java API is *not* covered by the TensorFlow -[API stability guarantees](../guide/version_compat.md). - - -## Supported Platforms - -This guide explains how to install TensorFlow for Java. Although these -instructions might also work on other variants, we have only tested -(and we only support) these instructions on machines meeting the -following requirements: - - * Ubuntu 16.04 or higher; 64-bit, x86 - * macOS 10.12.6 (Sierra) or higher - * Windows 7 or higher; 64-bit, x86 - -The installation instructions for Android are in a separate -[Android TensorFlow Support page](https://www.tensorflow.org/code/tensorflow/contrib/android). -After installation, please see this -[complete example](https://www.tensorflow.org/code/tensorflow/examples/android) -of TensorFlow on Android. - -## Using TensorFlow with a Maven project - -If your project uses [Apache Maven](https://maven.apache.org), then add the -following to the project's `pom.xml` to use the TensorFlow Java APIs: - -```xml - - org.tensorflow - tensorflow - 1.10.0 - -``` - -That's all.
- -### Example - -As an example, these steps will create a Maven project that uses TensorFlow: - - 1. Create the project's `pom.xml`: - - - - 4.0.0 - org.myorg - hellotf - 1.0-SNAPSHOT - - HelloTF - - - 1.7 - 1.7 - - - - org.tensorflow - tensorflow - 1.10.0 - - - - - - 2. Create the source file (`src/main/java/HelloTF.java`): - - - import org.tensorflow.Graph; - import org.tensorflow.Session; - import org.tensorflow.Tensor; - import org.tensorflow.TensorFlow; - - public class HelloTF { - public static void main(String[] args) throws Exception { - try (Graph g = new Graph()) { - final String value = "Hello from " + TensorFlow.version(); - - // Construct the computation graph with a single operation, a constant - // named "MyConst" with a value "value". - try (Tensor t = Tensor.create(value.getBytes("UTF-8"))) { - // The Java API doesn't yet include convenience functions for adding operations. - g.opBuilder("Const", "MyConst").setAttr("dtype", t.dataType()).setAttr("value", t).build(); - } - - // Execute the "MyConst" operation in a Session. - try (Session s = new Session(g); - // Generally, there may be multiple output tensors, all of them must be closed to prevent resource leaks. - Tensor output = s.runner().fetch("MyConst").run().get(0)) { - System.out.println(new String(output.bytesValue(), "UTF-8")); - } - } - } - } - - - 3. Compile and execute: - -
 # Use -q to hide logging from the mvn tool
-     mvn -q compile exec:java
- - -The preceding command should output Hello from version. If it -does, you've successfully set up TensorFlow for Java and are ready to use it in -Maven projects. If not, check -[Stack Overflow](http://stackoverflow.com/questions/tagged/tensorflow) -for possible solutions. You can skip reading the rest of this document. - -### GPU support - -If your Linux system has an NVIDIA® GPU and your TensorFlow Java program -requires GPU acceleration, then add the following to the project's `pom.xml` -instead: - -```xml - - org.tensorflow - libtensorflow - 1.10.0 - - - org.tensorflow - libtensorflow_jni_gpu - 1.10.0 - -``` - -GPU acceleration is available via Maven only for Linux and only if your system -meets the -[requirements for GPU](../install/install_linux.md#determine_which_tensorflow_to_install). - -## Using TensorFlow with JDK - -This section describes how to use TensorFlow using the `java` and `javac` -commands from a JDK installation. If your project uses Apache Maven, then -refer to the simpler instructions above instead. - -### Install on Linux or macOS - -Take the following steps to install TensorFlow for Java on Linux or macOS: - - 1. Download - [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.10.0.jar), - which is the TensorFlow Java Archive (JAR). - - 2. Decide whether you will run TensorFlow for Java on CPU(s) only or with - the help of GPU(s). To help you decide, read the section entitled - "Determine which TensorFlow to install" in one of the following guides: - - * [Installing TensorFlow on Linux](../install/install_linux.md#determine_which_tensorflow_to_install) - * [Installing TensorFlow on macOS](../install/install_mac.md#determine_which_tensorflow_to_install) - - 3. Download and extract the appropriate Java Native Interface (JNI) - file for your operating system and processor support by running the - following shell commands: - - - TF_TYPE="cpu" # Default processor is CPU. 
If you want GPU, set to "gpu" - OS=$(uname -s | tr '[:upper:]' '[:lower:]') - mkdir -p ./jni - curl -L \ - "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.10.0.tar.gz" | - tar -xz -C ./jni - -### Install on Windows - -Take the following steps to install TensorFlow for Java on Windows: - - 1. Download - [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.10.0.jar), - which is the TensorFlow Java Archive (JAR). - 2. Download the following Java Native Interface (JNI) file appropriate for - [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.10.0.zip). - 3. Extract this .zip file. - -__Note__: The native library (`tensorflow_jni.dll`) requires `msvcp140.dll` at runtime, which is included in the [Visual C++ 2015 Redistributable](https://www.microsoft.com/en-us/download/details.aspx?id=48145) package. - -### Validate the installation - -After installing TensorFlow for Java, validate your installation by entering -the following code into a file named `HelloTF.java`: - -```java -import org.tensorflow.Graph; -import org.tensorflow.Session; -import org.tensorflow.Tensor; -import org.tensorflow.TensorFlow; - -public class HelloTF { - public static void main(String[] args) throws Exception { - try (Graph g = new Graph()) { - final String value = "Hello from " + TensorFlow.version(); - - // Construct the computation graph with a single operation, a constant - // named "MyConst" with a value "value". - try (Tensor t = Tensor.create(value.getBytes("UTF-8"))) { - // The Java API doesn't yet include convenience functions for adding operations. - g.opBuilder("Const", "MyConst").setAttr("dtype", t.dataType()).setAttr("value", t).build(); - } - - // Execute the "MyConst" operation in a Session. 
- try (Session s = new Session(g); - // Generally, there may be multiple output tensors, all of them must be closed to prevent resource leaks. - Tensor output = s.runner().fetch("MyConst").run().get(0)) { - System.out.println(new String(output.bytesValue(), "UTF-8")); - } - } - } -} -``` - -And use the instructions below to compile and run `HelloTF.java`. - - -### Compiling - -When compiling a Java program that uses TensorFlow, the downloaded `.jar` -must be part of your `classpath`. For example, you can include the -downloaded `.jar` in your `classpath` by using the `-cp` compilation flag -as follows: - -
javac -cp libtensorflow-1.10.0.jar HelloTF.java
- - -### Running - -To execute a Java program that depends on TensorFlow, ensure that the following -two files are available to the JVM: - - * the downloaded `.jar` file - * the extracted JNI library - -For example, the following command line executes the `HelloTF` program on Linux -and macOS: - -
java -cp libtensorflow-1.10.0.jar:. -Djava.library.path=./jni HelloTF
- -And the following command line executes the `HelloTF` program on Windows: - -
java -cp libtensorflow-1.10.0.jar;. -Djava.library.path=jni HelloTF
- -If the program prints Hello from version, you've successfully -installed TensorFlow for Java and are ready to use the API. If the program -outputs something else, check -[Stack Overflow](http://stackoverflow.com/questions/tagged/tensorflow) for -possible solutions. - - -### Advanced Example - -For a more sophisticated example, see -[LabelImage.java](https://www.tensorflow.org/code/tensorflow/java/src/main/java/org/tensorflow/examples/LabelImage.java), -which recognizes objects in an image. - - -## Building from source code - -TensorFlow is open-source. You may build TensorFlow for Java from the -TensorFlow source code by following the instructions in a -[separate document](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/java/README.md). diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md deleted file mode 100644 index 5fcfa4b988..0000000000 --- a/tensorflow/docs_src/install/install_linux.md +++ /dev/null @@ -1,714 +0,0 @@ -# Install TensorFlow on Ubuntu - -This guide explains how to install TensorFlow on Ubuntu Linux. While these -instructions may work on other Linux variants, they are tested and supported -with the following system requirements: - -* 64-bit desktops or laptops -* Ubuntu 16.04 or higher - -## Choose which TensorFlow to install - -The following TensorFlow variants are available for installation: - -* __TensorFlow with CPU support only__. If your system does not have a - NVIDIA® GPU, you must install this version. This version of TensorFlow - is usually easier to install, so even if you have an NVIDIA GPU, we - recommend installing this version first. -* __TensorFlow with GPU support__. TensorFlow programs usually run much faster - on a GPU instead of a CPU. If you run performance-critical applications and - your system has an NVIDIA® GPU that meets the prerequisites, you should - install this version. See [TensorFlow GPU support](#NVIDIARequirements) for - details. 
- -## How to install TensorFlow - -There are a few options to install TensorFlow on your machine: - -* [Use pip in a virtual environment](#InstallingVirtualenv) *(recommended)* -* [Use pip in your system environment](#InstallingNativePip) -* [Configure a Docker container](#InstallingDocker) -* [Use pip in Anaconda](#InstallingAnaconda) -* [Install TensorFlow from source](/install/install_sources) - -
- -### Use `pip` in a virtual environment - -Key Point: Using a virtual environment is the recommended install method. - -The [Virtualenv](https://virtualenv.pypa.io/en/stable/) tool creates virtual -Python environments that are isolated from other Python development on the same -machine. In this scenario, you install TensorFlow and its dependencies within a -virtual environment that is available when *activated*. Virtualenv provides a -reliable way to install and run TensorFlow while avoiding conflicts with the -rest of the system. - -##### 1. Install Python, `pip`, and `virtualenv`. - -On Ubuntu, Python is automatically installed and `pip` is *usually* installed. -Confirm the `python` and `pip` versions: - -
-  python -V  # or: python3 -V
-  pip -V     # or: pip3 -V
-
- -To install these packages on Ubuntu: - -
-  sudo apt-get install python-pip python-dev python-virtualenv   # for Python 2.7
-  sudo apt-get install python3-pip python3-dev python-virtualenv # for Python 3.n
-
- -We *recommend* using `pip` version 8.1 or higher. If using a release before -version 8.1, upgrade `pip`: - -
-  pip install --upgrade pip
-
- -If not using Ubuntu and [setuptools](https://pypi.org/project/setuptools/) is -installed, use `easy_install` to install `pip`: - -
-  easy_install -U pip
-
- -##### 2. Create a directory for the virtual environment and choose a Python interpreter. - -
-  mkdir ~/tensorflow  # somewhere to work out of
-  cd ~/tensorflow
-  # Choose one of the following Python environments for the ./venv directory:
-  virtualenv --system-site-packages venv            # Use python default (Python 2.7)
-  virtualenv --system-site-packages -p python3 venv # Use Python 3.n
-
- -##### 3. Activate the Virtualenv environment. - -Use one of these shell-specific commands to activate the virtual environment: - -
-  source ~/tensorflow/venv/bin/activate      # bash, sh, ksh, or zsh
-  source ~/tensorflow/venv/bin/activate.csh  # csh or tcsh
-  . ~/tensorflow/venv/bin/activate.fish      # fish
-
- -When the Virtualenv is activated, the shell prompt displays as `(venv) $`. - -##### 4. Upgrade `pip` in the virtual environment. - -Within the active virtual environment, upgrade `pip`: - -
-(venv)$ pip install --upgrade pip
-
- -You can install other Python packages within the virtual environment without -affecting packages outside the `virtualenv`. - -##### 5. Install TensorFlow in the virtual environment. - -Choose one of the available TensorFlow packages for installation: - -* `tensorflow` —Current release for CPU -* `tensorflow-gpu` —Current release with GPU support -* `tf-nightly` —Nightly build for CPU -* `tf-nightly-gpu` —Nightly build with GPU support - -Within an active Virtualenv environment, use `pip` to install the package: - -
-  pip install --upgrade tensorflow
-
- -Use `pip list` to show the packages installed in the virtual environment. -[Validate the install](#ValidateYourInstallation) and test the version: - -
-(venv)$ python -c "import tensorflow as tf; print(tf.__version__)"
-
- -Success: TensorFlow is now installed. - -Use the `deactivate` command to stop the Python virtual environment. - -#### Problems - -If the above steps failed, try installing the TensorFlow binary using the remote -URL of the `pip` package: - -
-(venv)$ pip install --upgrade remote-pkg-URL   # Python 2.7
-(venv)$ pip3 install --upgrade remote-pkg-URL  # Python 3.n
-
- -The remote-pkg-URL depends on the operating system, Python version, -and GPU support. See [here](#the_url_of_the_tensorflow_python_package) for the -URL naming scheme and location. - -See [Common Installation Problems](#common_installation_problems) if you -encounter problems. - -#### Uninstall TensorFlow - -To uninstall TensorFlow, remove the Virtualenv directory you created in step 2: - -
-  deactivate  # stop the virtualenv
-  rm -r ~/tensorflow/venv
-
- - - -### Use `pip` in your system environment - -Use `pip` to install the TensorFlow package directly on your system without -using a container or virtual environment for isolation. This method is -recommended for system administrators that want a TensorFlow installation that -is available to everyone on a multi-user system. - -Since a system install is not isolated, it could interfere with other -Python-based installations. But if you understand `pip` and your Python -environment, a system `pip` install is straightforward. - -See the -[REQUIRED_PACKAGES section of setup.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/pip_package/setup.py) -for a list of packages that TensorFlow installs. - -##### 1. Install Python, `pip`, and `virtualenv`. - -On Ubuntu, Python is automatically installed and `pip` is *usually* installed. -Confirm the `python` and `pip` versions: - -
-  python -V  # or: python3 -V
-  pip -V     # or: pip3 -V
-
- -To install these packages on Ubuntu: - -
-  sudo apt-get install python-pip python-dev   # for Python 2.7
-  sudo apt-get install python3-pip python3-dev # for Python 3.n
-
- -We *recommend* using `pip` version 8.1 or higher. If using a release before -version 8.1, upgrade `pip`: - -
-  pip install --upgrade pip
-
- -If not using Ubuntu and [setuptools](https://pypi.org/project/setuptools/) is -installed, use `easy_install` to install `pip`: - -
-  easy_install -U pip
-
- -##### 2. Install TensorFlow on system. - -Choose one of the available TensorFlow packages for installation: - -* `tensorflow` —Current release for CPU -* `tensorflow-gpu` —Current release with GPU support -* `tf-nightly` —Nightly build for CPU -* `tf-nightly-gpu` —Nightly build with GPU support - -And use `pip` to install the package for Python 2 or 3: - -
-  pip install --upgrade --user tensorflow   # Python 2.7
-  pip3 install --upgrade --user tensorflow  # Python 3.n
-
- -Use `pip list` to show the packages installed on the system. -[Validate the install](#ValidateYourInstallation) and test the version: - -
-  python -c "import tensorflow as tf; print(tf.__version__)"
-
- -Success: TensorFlow is now installed. - -#### Problems - -If the above steps failed, try installing the TensorFlow binary using the remote -URL of the `pip` package: - -
-  pip install --user --upgrade remote-pkg-URL   # Python 2.7
-  pip3 install --user --upgrade remote-pkg-URL  # Python 3.n
-
- -The remote-pkg-URL depends on the operating system, Python version, -and GPU support. See [here](#the_url_of_the_tensorflow_python_package) for the -URL naming scheme and location. - -See [Common Installation Problems](#common_installation_problems) if you -encounter problems. - -#### Uninstall TensorFlow - -To uninstall TensorFlow on your system, use one of the following commands: - -
-  pip uninstall tensorflow   # for Python 2.7
-  pip3 uninstall tensorflow  # for Python 3.n
-
- - - -### Configure a Docker container - -Docker completely isolates the TensorFlow installation from pre-existing -packages on your machine. The Docker container contains TensorFlow and all its -dependencies. Note that the Docker image can be quite large (hundreds of MBs). -You might choose the Docker installation if you are incorporating TensorFlow -into a larger application architecture that already uses Docker. - -Take the following steps to install TensorFlow through Docker: - -1. Install Docker on your machine as described in the - [Docker documentation](http://docs.docker.com/engine/installation/). -2. Optionally, create a Linux group called docker to allow - launching containers without sudo as described in the - [Docker documentation](https://docs.docker.com/engine/installation/linux/linux-postinstall/). - (If you don't do this step, you'll have to use sudo each time you invoke - Docker.) -3. To install a version of TensorFlow that supports GPUs, you must first - install [nvidia-docker](https://github.com/NVIDIA/nvidia-docker), which is - stored in github. -4. Launch a Docker container that contains one of the - [TensorFlow binary images](https://hub.docker.com/r/tensorflow/tensorflow/tags/). - -The remainder of this section explains how to launch a Docker container. - -#### CPU-only - -To launch a Docker container with CPU-only support (that is, without GPU -support), enter a command of the following format: - -
-$ docker run -it -p hostPort:containerPort TensorFlowCPUImage
-
- -where: - -* -p hostPort:containerPort is optional. If you plan to run - TensorFlow programs from the shell, omit this option. If you plan to run - TensorFlow programs as Jupyter notebooks, set both hostPort - and containerPort to 8888. If you'd like to run - TensorBoard inside the container, add a second `-p` flag, setting both - hostPort and containerPort to 6006. -* TensorFlowCPUImage is required. It identifies the Docker - container. Specify one of the following values: - - * tensorflow/tensorflow, which is the TensorFlow CPU binary - image. - * tensorflow/tensorflow:latest-devel, which is the latest - TensorFlow CPU Binary image plus source code. - * tensorflow/tensorflow:version, which is the specified - version (for example, 1.1.0rc1) of the TensorFlow CPU binary image. - * tensorflow/tensorflow:version-devel, which is the - specified version (for example, 1.1.0rc1) of the TensorFlow CPU binary - image plus source code. - - TensorFlow images are available at - [dockerhub](https://hub.docker.com/r/tensorflow/tensorflow/). - -For example, the following command launches the latest TensorFlow CPU binary -image in a Docker container from which you can run TensorFlow programs in a -shell: - -
-$ docker run -it tensorflow/tensorflow bash
-
- -The following command also launches the latest TensorFlow CPU binary image in a -Docker container. However, in this Docker container, you can run TensorFlow -programs in a Jupyter notebook: - -
-$ docker run -it -p 8888:8888 tensorflow/tensorflow
-
- -Docker will download the TensorFlow binary image the first time you launch it. - -#### GPU support - -To launch a Docker container with NVidia GPU support, enter a command of the -following format (this -[does not require any local CUDA installation](https://github.com/nvidia/nvidia-docker/wiki/CUDA#requirements)): - -
-$ nvidia-docker run -it -p hostPort:containerPort TensorFlowGPUImage
-
- -where: - -* -p hostPort:containerPort is optional. If you plan to run - TensorFlow programs from the shell, omit this option. If you plan to run - TensorFlow programs as Jupyter notebooks, set both hostPort - and containerPort to `8888`. -* TensorFlowGPUImage specifies the Docker container. You must specify - one of the following values: - * tensorflow/tensorflow:latest-gpu, which is the latest - TensorFlow GPU binary image. - * tensorflow/tensorflow:latest-devel-gpu, which is the latest - TensorFlow GPU Binary image plus source code. - * tensorflow/tensorflow:version-gpu, which is the - specified version (for example, 0.12.1) of the TensorFlow GPU binary - image. - * tensorflow/tensorflow:version-devel-gpu, which is the - specified version (for example, 0.12.1) of the TensorFlow GPU binary - image plus source code. - -We recommend installing one of the `latest` versions. For example, the following -command launches the latest TensorFlow GPU binary image in a Docker container -from which you can run TensorFlow programs in a shell: - -
-$ nvidia-docker run -it tensorflow/tensorflow:latest-gpu bash
-
- -The following command also launches the latest TensorFlow GPU binary image in a -Docker container. In this Docker container, you can run TensorFlow programs in a -Jupyter notebook: - -
-$ nvidia-docker run -it -p 8888:8888 tensorflow/tensorflow:latest-gpu
-
- -The following command installs an older TensorFlow version (0.12.1): - -
-$ nvidia-docker run -it -p 8888:8888 tensorflow/tensorflow:0.12.1-gpu
-
- -Docker will download the TensorFlow binary image the first time you launch it. -For more details see the -[TensorFlow docker readme](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/tools/docker). - -#### Next Steps - -You should now [validate your installation](#ValidateYourInstallation). - - - -### Use `pip` in Anaconda - -Anaconda provides the `conda` utility to create a virtual environment. However, -within Anaconda, we recommend installing TensorFlow using the `pip install` -command and *not* with the `conda install` command. - -Caution: `conda` is a community supported package that is not officially -maintained by the TensorFlow team. Use this package at your own risk since it is -not tested on new TensorFlow releases. - -Take the following steps to install TensorFlow in an Anaconda environment: - -1. Follow the instructions on the - [Anaconda download site](https://www.continuum.io/downloads) to download and - install Anaconda. - -2. Create a conda environment named tensorflow to run a version of - Python by invoking the following command: - -
$ conda create -n tensorflow pip python=2.7 # or python=3.3, etc.
- -3. Activate the conda environment by issuing the following command: - -
$ source activate tensorflow
-     (tensorflow)$  # Your prompt should change 
- -4. Issue a command of the following format to install TensorFlow inside your - conda environment: - -
(tensorflow)$ pip install --ignore-installed --upgrade tfBinaryURL
- - where tfBinaryURL is the - [URL of the TensorFlow Python package](#the_url_of_the_tensorflow_python_package). - For example, the following command installs the CPU-only version of - TensorFlow for Python 3.4: - -
-     (tensorflow)$ pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.10.0-cp34-cp34m-linux_x86_64.whl
- - - -## Validate your installation - -To validate your TensorFlow installation, do the following: - -1. Ensure that your environment is prepared to run TensorFlow programs. -2. Run a short TensorFlow program. - -### Prepare your environment - -If you installed on native pip, Virtualenv, or Anaconda, then do the following: - -1. Start a terminal. -2. If you installed with Virtualenv or Anaconda, activate your container. -3. If you installed TensorFlow source code, navigate to any directory *except* - one containing TensorFlow source code. - -If you installed through Docker, start a Docker container from which you can run -bash. For example: - -
-$ docker run -it tensorflow/tensorflow bash
-
- -### Run a short TensorFlow program - -Invoke python from your shell as follows: - -
$ python
- -Enter the following short program inside the python interactive shell: - -```python -# Python -import tensorflow as tf -hello = tf.constant('Hello, TensorFlow!') -sess = tf.Session() -print(sess.run(hello)) -``` - -If the system outputs the following, then you are ready to begin writing -TensorFlow programs: - -
Hello, TensorFlow!
- -If the system outputs an error message instead of a greeting, see -[Common installation problems](#common_installation_problems). - -To learn more, see the [TensorFlow tutorials](../tutorials/). - - - -## TensorFlow GPU support - -Note: Due to the number of libraries required, using [Docker](#InstallingDocker) -is recommended over installing directly on the host system. - -The following NVIDIA® hardware must be installed on your system: - -* GPU card with CUDA Compute Capability 3.5 or higher. See - [NVIDIA documentation](https://developer.nvidia.com/cuda-gpus) for a list of - supported GPU cards. - -The following NVIDIA® software must be installed on your system: - -* [GPU drivers](http://nvidia.com/driver). CUDA 9.0 requires 384.x or higher. -* [CUDA Toolkit 9.0](http://nvidia.com/cuda). -* [cuDNN SDK](http://developer.nvidia.com/cudnn) (>= 7.0). Version 7.1 is - recommended. -* [CUPTI](http://docs.nvidia.com/cuda/cupti/) ships with the CUDA Toolkit, but - you also need to append its path to the `LD_LIBRARY_PATH` environment - variable: `export - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/extras/CUPTI/lib64` -* *OPTIONAL*: [NCCL 2.2](https://developer.nvidia.com/nccl) to use TensorFlow - with multiple GPUs. -* *OPTIONAL*: - [TensorRT](http://docs.nvidia.com/deeplearning/sdk/tensorrt-install-guide/index.html) - which can improve latency and throughput for inference for some models. - -To use a GPU with CUDA Compute Capability 3.0, or different versions of the -preceding NVIDIA libraries see -[installing TensorFlow from Sources](../install/install_sources.md). If using Ubuntu 16.04 -and possibly other Debian based linux distros, `apt-get` can be used with the -NVIDIA repository to simplify installation. - -```bash -# Adds NVIDIA package repository. 
-sudo apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub -wget http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/cuda-repo-ubuntu1604_9.1.85-1_amd64.deb -wget http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64/nvidia-machine-learning-repo-ubuntu1604_1.0.0-1_amd64.deb -sudo dpkg -i cuda-repo-ubuntu1604_9.1.85-1_amd64.deb -sudo dpkg -i nvidia-machine-learning-repo-ubuntu1604_1.0.0-1_amd64.deb -sudo apt-get update -# Includes optional NCCL 2.x. -sudo apt-get install cuda9.0 cuda-cublas-9-0 cuda-cufft-9-0 cuda-curand-9-0 \ - cuda-cusolver-9-0 cuda-cusparse-9-0 libcudnn7=7.1.4.18-1+cuda9.0 \ - libnccl2=2.2.13-1+cuda9.0 cuda-command-line-tools-9-0 -# Optionally install TensorRT runtime, must be done after above cuda install. -sudo apt-get update -sudo apt-get install libnvinfer4=4.1.2-1+cuda9.0 -``` - -## Common installation problems - -We are relying on Stack Overflow to document TensorFlow installation problems -and their remedies. The following table contains links to Stack Overflow answers -for some common installation problems. If you encounter an error message or -other installation problem not listed in the following table, search for it on -Stack Overflow. If Stack Overflow doesn't show the error message, ask a new -question about it on Stack Overflow and specify the `tensorflow` tag. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Link to GitHub or Stack Overflow Error Message
36159194
ImportError: libcudart.so.Version: cannot open shared object file:
-  No such file or directory
41991101
ImportError: libcudnn.Version: cannot open shared object file:
-  No such file or directory
36371137 and - here
libprotobuf ERROR google/protobuf/src/google/protobuf/io/coded_stream.cc:207] A
-  protocol message was rejected because it was too big (more than 67108864 bytes).
-  To increase the limit (or to disable these warnings), see
-  CodedInputStream::SetTotalBytesLimit() in google/protobuf/io/coded_stream.h.
35252888
Error importing tensorflow. Unless you are using bazel, you should
-  not try to import tensorflow from its source directory; please exit the
-  tensorflow source tree, and relaunch your python interpreter from
-  there.
33623453
IOError: [Errno 2] No such file or directory:
-  '/tmp/pip-o6Tpui-build/setup.py'
-
42006320
ImportError: Traceback (most recent call last):
-  File ".../tensorflow/core/framework/graph_pb2.py", line 6, in <module>
-  from google.protobuf import descriptor as _descriptor
-  ImportError: cannot import name 'descriptor'
-
35190574
SSLError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify
-  failed
42009190
-  Installing collected packages: setuptools, protobuf, wheel, numpy, tensorflow
-  Found existing installation: setuptools 1.1.6
-  Uninstalling setuptools-1.1.6:
-  Exception:
-  ...
-  [Errno 1] Operation not permitted:
-  '/tmp/pip-a1DXRT-uninstall/.../lib/python/_markerlib' 
36933958
-  ...
-  Installing collected packages: setuptools, protobuf, wheel, numpy, tensorflow
-  Found existing installation: setuptools 1.1.6
-  Uninstalling setuptools-1.1.6:
-  Exception:
-  ...
-  [Errno 1] Operation not permitted:
-  '/tmp/pip-a1DXRT-uninstall/System/Library/Frameworks/Python.framework/
-   Versions/2.7/Extras/lib/python/_markerlib'
-
- - - -## The URL of the TensorFlow Python package - -A few installation mechanisms require the URL of the TensorFlow Python package. -The value you specify depends on three factors: - -* operating system -* Python version -* CPU only vs. GPU support - -This section documents the relevant values for Linux installations. - -### Python 2.7 - -CPU only: - -
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.10.0-cp27-none-linux_x86_64.whl
-
- -GPU support: - -
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.10.0-cp27-none-linux_x86_64.whl
-
- -Note that GPU support requires the NVIDIA hardware and software described in -[NVIDIA requirements to run TensorFlow with GPU support](#NVIDIARequirements). - -### Python 3.4 - -CPU only: - -
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.10.0-cp34-cp34m-linux_x86_64.whl
-
- -GPU support: - -
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.10.0-cp34-cp34m-linux_x86_64.whl
-
- -Note that GPU support requires the NVIDIA hardware and software described in -[NVIDIA requirements to run TensorFlow with GPU support](#NVIDIARequirements). - -### Python 3.5 - -CPU only: - -
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.10.0-cp35-cp35m-linux_x86_64.whl
-
- -GPU support: - -
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.10.0-cp35-cp35m-linux_x86_64.whl
-
- -Note that GPU support requires the NVIDIA hardware and software described in -[NVIDIA requirements to run TensorFlow with GPU support](#NVIDIARequirements). - -### Python 3.6 - -CPU only: - -
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.10.0-cp36-cp36m-linux_x86_64.whl
-
- -GPU support: - -
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.10.0-cp36-cp36m-linux_x86_64.whl
-
- -Note that GPU support requires the NVIDIA hardware and software described in -[NVIDIA requirements to run TensorFlow with GPU support](#NVIDIARequirements). diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md deleted file mode 100644 index c4d63cc107..0000000000 --- a/tensorflow/docs_src/install/install_mac.md +++ /dev/null @@ -1,529 +0,0 @@ -# Install TensorFlow on macOS - -This guide explains how to install TensorFlow on macOS. Although these -instructions might also work on other macOS variants, we have only -tested (and we only support) these instructions on machines meeting the -following requirements: - - * macOS 10.12.6 (Sierra) or higher - -Note: There are known, accuracy-affecting numerical issues before macOS 10.12.6 -(Sierra) that are described in -[GitHub#15933](https://github.com/tensorflow/tensorflow/issues/15933#issuecomment-366331383). - -Note: As of version 1.2, TensorFlow no longer provides GPU support on macOS. - -## Determine how to install TensorFlow - -You must pick the mechanism by which you install TensorFlow. The supported choices are as follows: - - * Virtualenv - * "native" pip - * Docker - * installing from sources, which is documented in - [a separate guide](https://www.tensorflow.org/install/install_sources). - -**We recommend the Virtualenv installation.** -[Virtualenv](https://virtualenv.pypa.io/en/stable) -is a virtual Python environment isolated from other Python development, -incapable of interfering with or being affected by other Python programs -on the same machine. During the Virtualenv installation process, -you will install not only TensorFlow but also all the packages that -TensorFlow requires. (This is actually pretty easy.) -To start working with TensorFlow, you simply need to "activate" the -virtual environment. All in all, Virtualenv provides a safe and -reliable mechanism for installing and running TensorFlow. 
- -Native pip installs TensorFlow directly on your system without going through -any container or virtual environment system. Since a native pip installation -is not walled-off, the pip installation might interfere with or be influenced -by other Python-based installations on your system. Furthermore, you might need -to disable System Integrity Protection (SIP) in order to install through native -pip. However, if you understand SIP, pip, and your Python environment, a -native pip installation is relatively easy to perform. - -[Docker](http://docker.com) completely isolates the TensorFlow installation -from pre-existing packages on your machine. The Docker container contains -TensorFlow and all its dependencies. Note that the Docker image can be quite -large (hundreds of MBs). You might choose the Docker installation if you are -incorporating TensorFlow into a larger application architecture that -already uses Docker. - -In Anaconda, you may use conda to create a virtual environment. -However, within Anaconda, we recommend installing TensorFlow with the -`pip install` command, not with the `conda install` command. - -**NOTE:** The conda package is community supported, not officially supported. -That is, the TensorFlow team neither tests nor maintains the conda package. -Use that package at your own risk. - -## Installing with Virtualenv - -Take the following steps to install TensorFlow with Virtualenv: - - 1. Start a terminal (a shell). You'll perform all subsequent steps - in this shell. - - 2. Install pip and Virtualenv by issuing the following commands: - -
 $ sudo easy_install pip
-     $ pip install --upgrade virtualenv 
- - 3. Create a Virtualenv environment by issuing a command of one - of the following formats: - -
 $ virtualenv --system-site-packages targetDirectory # for Python 2.7
-     $ virtualenv --system-site-packages -p python3 targetDirectory # for Python 3.n
-     
- - where targetDirectory identifies the top of the Virtualenv tree. - Our instructions assume that targetDirectory - is `~/tensorflow`, but you may choose any directory. - - 4. Activate the Virtualenv environment by issuing one of the - following commands: - -
$ cd targetDirectory
-    $ source ./bin/activate      # If using bash, sh, ksh, or zsh
-    $ source ./bin/activate.csh  # If using csh or tcsh 
- - The preceding `source` command should change your prompt to the following: - -
 (targetDirectory)$ 
- - 5. Ensure pip ≥8.1 is installed: - -
 (targetDirectory)$ easy_install -U pip
- - 6. Issue one of the following commands to install TensorFlow and all the - packages that TensorFlow requires into the active Virtualenv environment: - -
 (targetDirectory)$ pip install --upgrade tensorflow      # for Python 2.7
-     (targetDirectory)$ pip3 install --upgrade tensorflow     # for Python 3.n
-
-  7. Optional. If Step 6 failed (typically because you invoked a pip version
-     lower than 8.1), install TensorFlow in the active
-     Virtualenv environment by issuing a command of the following format:
-
-     
 $ pip install --upgrade tfBinaryURL   # Python 2.7
-     $ pip3 install --upgrade tfBinaryURL  # Python 3.n 
- - where tfBinaryURL identifies the URL - of the TensorFlow Python package. The appropriate value of - tfBinaryURL depends on the operating system and - Python version. Find the appropriate value for - tfBinaryURL for your system - [here](#the_url_of_the_tensorflow_python_package). - For example, if you are installing TensorFlow for macOS, - Python 3.n, the command to install - TensorFlow in the active Virtualenv is as follows: - -
 $ pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.10.0-py3-none-any.whl
- -If you encounter installation problems, see -[Common Installation Problems](#common-installation-problems). - - -### Next Steps - -After installing TensorFlow, -[validate your installation](#ValidateYourInstallation) -to confirm that the installation worked properly. - -Note that you must activate the Virtualenv environment each time you -use TensorFlow in a new shell. If the Virtualenv environment is not -currently active (that is, the prompt is not `(targetDirectory)`), invoke -one of the following commands: - -
$ cd targetDirectory
-$ source ./bin/activate      # If using bash, sh, ksh, or zsh
-$ source ./bin/activate.csh  # If using csh or tcsh 
- - -Your prompt will transform to the following to indicate that your -tensorflow environment is active: - -
 (targetDirectory)$ 
- -When the Virtualenv environment is active, you may run -TensorFlow programs from this shell. - -When you are done using TensorFlow, you may deactivate the -environment by issuing the following command: - -
 (targetDirectory)$ deactivate 
- -The prompt will revert back to your default prompt (as defined by `PS1`). - - -### Uninstalling TensorFlow - -If you want to uninstall TensorFlow, simply remove the tree you created. For example: - -
 $ rm -r ~/tensorflow 
- - -## Installing with native pip - -We have uploaded the TensorFlow binaries to PyPI. -Therefore, you can install TensorFlow through pip. - -The -[REQUIRED_PACKAGES section of setup.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/pip_package/setup.py) -lists the packages that pip will install or upgrade. - - -### Prerequisite: Python - -In order to install TensorFlow, your system must contain one of the following Python versions: - - * Python 2.7 - * Python 3.3+ - -If your system does not already have one of the preceding Python versions, -[install](https://wiki.python.org/moin/BeginnersGuide/Download) it now. - -When installing Python, you might need to disable -System Integrity Protection (SIP) to permit any entity other than -Mac App Store to install software. - - -### Prerequisite: pip - -[Pip](https://en.wikipedia.org/wiki/Pip_(package_manager)) installs -and manages software packages written in Python. If you intend to install -with native pip, then one of the following flavors of pip must be -installed on your system: - - * `pip`, for Python 2.7 - * `pip3`, for Python 3.n. - -`pip` or `pip3` was probably installed on your system when you -installed Python. To determine whether pip or pip3 is actually -installed on your system, issue one of the following commands: - -
$ pip -V  # for Python 2.7
-$ pip3 -V # for Python 3.n 
- -We strongly recommend pip or pip3 version 8.1 or higher in order -to install TensorFlow. If pip or pip3 8.1 or later is not -installed, issue the following commands to install or upgrade: - -
$ sudo easy_install --upgrade pip
-$ sudo easy_install --upgrade six 
- - -### Install TensorFlow - -Assuming the prerequisite software is installed on your Mac, -take the following steps: - - 1. Install TensorFlow by invoking **one** of the following commands: - -
 $ pip install tensorflow      # Python 2.7; CPU support
-     $ pip3 install tensorflow     # Python 3.n; CPU support
-
-     If the preceding command runs to completion, you should now
-     [validate your installation](#ValidateYourInstallation).
-
-  2. (Optional.) If Step 1 failed, install the latest version of TensorFlow
-     by issuing a command of the following format:
-
-     
 $ sudo pip  install --upgrade tfBinaryURL   # Python 2.7
-     $ sudo pip3 install --upgrade tfBinaryURL   # Python 3.n 
- - where tfBinaryURL identifies the URL of the TensorFlow Python - package. The appropriate value of tfBinaryURL depends on the - operating system and Python version. Find the appropriate - value for tfBinaryURL - [here](#the_url_of_the_tensorflow_python_package). For example, if - you are installing TensorFlow for macOS and Python 3.n, - issue the following command: - -
 $ sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.10.0-py3-none-any.whl 
- - If the preceding command fails, see - [installation problems](#common-installation-problems). - - - -### Next Steps - -After installing TensorFlow, -[validate your installation](#ValidateYourInstallation) -to confirm that the installation worked properly. - - -### Uninstalling TensorFlow - -To uninstall TensorFlow, issue one of the following commands: - -
$ pip uninstall tensorflow
-$ pip3 uninstall tensorflow 
- - -## Installing with Docker - -Follow these steps to install TensorFlow through Docker. - - 1. Install Docker on your machine as described in the - [Docker documentation](https://docs.docker.com/engine/installation/#/on-macos-and-windows). - - 2. Launch a Docker container that contains one of the TensorFlow - binary images. - -The remainder of this section explains how to launch a Docker container. - -To launch a Docker container that holds the TensorFlow binary image, -enter a command of the following format: - -
 $ docker run -it -p hostPort:containerPort TensorFlowImage 
- -where: - - * -p hostPort:containerPort is optional. If you'd like to run - TensorFlow programs from the shell, omit this option. If you'd like - to run TensorFlow programs from Jupyter notebook, set both - hostPort and containerPort to 8888. - If you'd like to run TensorBoard inside the container, add - a second `-p` flag, setting both hostPort and containerPort - to 6006. - * TensorFlowImage is required. It identifies the Docker container. - You must specify one of the following values: - * tensorflow/tensorflow: TensorFlow binary image. - * tensorflow/tensorflow:latest-devel: TensorFlow - Binary image plus source code. - -The TensorFlow images are available at -[dockerhub](https://hub.docker.com/r/tensorflow/tensorflow/). - -For example, the following command launches a TensorFlow CPU binary image -in a Docker container from which you can run TensorFlow programs in a shell: - -
$ docker run -it tensorflow/tensorflow bash
- -The following command also launches a TensorFlow CPU binary image in a -Docker container. However, in this Docker container, you can run -TensorFlow programs in a Jupyter notebook: - -
$ docker run -it -p 8888:8888 tensorflow/tensorflow
- -Docker will download the TensorFlow binary image the first time you launch it. - - -### Next Steps - -You should now -[validate your installation](#ValidateYourInstallation). - - -## Installing with Anaconda - -**The Anaconda installation is community supported, not officially supported.** - -Take the following steps to install TensorFlow in an Anaconda environment: - - 1. Follow the instructions on the - [Anaconda download site](https://www.continuum.io/downloads) - to download and install Anaconda. - - 2. Create a conda environment named `tensorflow` - by invoking the following command: - -
$ conda create -n tensorflow pip python=2.7 # or python=3.3, etc.
- - 3. Activate the conda environment by issuing the following command: - -
$ source activate tensorflow
-     (targetDirectory)$  # Your prompt should change
- - 4. Issue a command of the following format to install - TensorFlow inside your conda environment: - -
(targetDirectory)$ pip install --ignore-installed --upgrade TF_PYTHON_URL
- - where TF_PYTHON_URL is the - [URL of the TensorFlow Python package](#the_url_of_the_tensorflow_python_package). - For example, the following command installs the CPU-only version of - TensorFlow for Python 2.7: - -
 (targetDirectory)$ pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.10.0-py2-none-any.whl
- - - -## Validate your installation - -To validate your TensorFlow installation, do the following: - - 1. Ensure that your environment is prepared to run TensorFlow programs. - 2. Run a short TensorFlow program. - - -### Prepare your environment - -If you installed on native pip, Virtualenv, or Anaconda, then -do the following: - - 1. Start a terminal. - 2. If you installed with Virtualenv or Anaconda, activate your container. - 3. If you installed TensorFlow source code, navigate to any - directory *except* one containing TensorFlow source code. - -If you installed through Docker, start a Docker container that runs bash. -For example: - -
$ docker run -it tensorflow/tensorflow bash
- - - -### Run a short TensorFlow program - -Invoke python from your shell as follows: - -
$ python
- -Enter the following short program inside the python interactive shell: - -```python -# Python -import tensorflow as tf -hello = tf.constant('Hello, TensorFlow!') -sess = tf.Session() -print(sess.run(hello)) -``` - -If the system outputs the following, then you are ready to begin -writing TensorFlow programs: - -
Hello, TensorFlow!
- -If the system outputs an error message instead of a greeting, see -[Common installation problems](#common_installation_problems). - -To learn more, see the [TensorFlow tutorials](../tutorials/). - -## Common installation problems - -We are relying on Stack Overflow to document TensorFlow installation problems -and their remedies. The following table contains links to Stack Overflow -answers for some common installation problems. -If you encounter an error message or other -installation problem not listed in the following table, search for it -on Stack Overflow. If Stack Overflow doesn't show the error message, -ask a new question about it on Stack Overflow and specify -the `tensorflow` tag. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Stack Overflow Link Error Message
42006320
ImportError: Traceback (most recent call last):
-File ".../tensorflow/core/framework/graph_pb2.py", line 6, in 
-from google.protobuf import descriptor as _descriptor
-ImportError: cannot import name 'descriptor'
-
33623453
IOError: [Errno 2] No such file or directory:
-  '/tmp/pip-o6Tpui-build/setup.py'
-
35190574
SSLError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify
-  failed
42009190
-  Installing collected packages: setuptools, protobuf, wheel, numpy, tensorflow
-  Found existing installation: setuptools 1.1.6
-  Uninstalling setuptools-1.1.6:
-  Exception:
-  ...
-  [Errno 1] Operation not permitted:
-  '/tmp/pip-a1DXRT-uninstall/.../lib/python/_markerlib' 
33622019
ImportError: No module named copyreg
37810228During a pip install operation, the system returns: -
OSError: [Errno 1] Operation not permitted
-
33622842An import tensorflow statement triggers an error such as the - following:
Traceback (most recent call last):
-  File "", line 1, in 
-  File "/usr/local/lib/python2.7/site-packages/tensorflow/__init__.py",
-    line 4, in 
-    from tensorflow.python import *
-    ...
-  File "/usr/local/lib/python2.7/site-packages/tensorflow/core/framework/tensor_shape_pb2.py",
-    line 22, in 
-    serialized_pb=_b('\n,tensorflow/core/framework/tensor_shape.proto\x12\ntensorflow\"d\n\x10TensorShapeProto\x12-\n\x03\x64im\x18\x02
-      \x03(\x0b\x32
-      .tensorflow.TensorShapeProto.Dim\x1a!\n\x03\x44im\x12\x0c\n\x04size\x18\x01
-      \x01(\x03\x12\x0c\n\x04name\x18\x02 \x01(\tb\x06proto3')
-  TypeError: __init__() got an unexpected keyword argument 'syntax'
-
42075397A pip install command triggers the following error: -
...
-You have not agreed to the Xcode license agreements, please run
-'xcodebuild -license' (for user-level acceptance) or
-'sudo xcodebuild -license' (for system-wide acceptance) from within a
-Terminal window to review and agree to the Xcode license agreements.
-...
-  File "numpy/core/setup.py", line 653, in get_mathlib_info
-
-    raise RuntimeError("Broken toolchain: cannot link a simple C program")
-
-RuntimeError: Broken toolchain: cannot link a simple C program
-
- - - - - -## The URL of the TensorFlow Python package - -A few installation mechanisms require the URL of the TensorFlow Python package. -The value you specify depends on your Python version. - -### Python 2.7 - - -
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.10.0-py2-none-any.whl
-
- - -### Python 3.4, 3.5, or 3.6 - - -
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.10.0-py3-none-any.whl
-
diff --git a/tensorflow/docs_src/install/install_raspbian.md b/tensorflow/docs_src/install/install_raspbian.md deleted file mode 100644 index cf6b6b4f79..0000000000 --- a/tensorflow/docs_src/install/install_raspbian.md +++ /dev/null @@ -1,313 +0,0 @@ -# Install TensorFlow on Raspbian - -This guide explains how to install TensorFlow on a Raspberry Pi running -Raspbian. Although these instructions might also work on other Pi variants, we -have only tested (and we only support) these instructions on machines meeting -the following requirements: - -* Raspberry Pi devices running Raspbian 9.0 or higher - -## Determine how to install TensorFlow - -You must pick the mechanism by which you install TensorFlow. The supported -choices are as follows: - -* "Native" pip. -* Cross-compiling from sources. - -**We recommend pip installation.** - -## Installing with native pip - -We have uploaded the TensorFlow binaries to piwheels.org. Therefore, you can -install TensorFlow through pip. - -The [REQUIRED_PACKAGES section of -setup.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/pip_package/setup.py) -lists the packages that pip will install or upgrade. - -### Prerequisite: Python - -In order to install TensorFlow, your system must contain one of the following -Python versions: - -* Python 2.7 -* Python 3.4+ - -If your system does not already have one of the preceding Python versions, -[install](https://wiki.python.org/moin/BeginnersGuide/Download) it now. It -should already be included when Raspbian was installed though, so no extra steps -should be needed. - -### Prerequisite: pip - -[Pip](https://en.wikipedia.org/wiki/Pip_\(package_manager\)) installs and -manages software packages written in Python. If you intend to install with -native pip, then one of the following flavors of pip must be installed on your -system: - -* `pip3`, for Python 3.n (preferred). -* `pip`, for Python 2.7. 
- -`pip` or `pip3` was probably installed on your system when you installed Python. -To determine whether pip or pip3 is actually installed on your system, issue one -of the following commands: - -
$ pip3 -V # for Python 3.n
-$ pip -V  # for Python 2.7
- -If it gives the error "Command not found", then the package has not been -installed yet. To install it for the first time, run: - -
$ sudo apt-get install python3-pip # for Python 3.n
-$ sudo apt-get install python-pip # for Python 2.7
- -You can find more help on installing and upgrading pip in -[the Raspberry Pi documentation](https://www.raspberrypi.org/documentation/linux/software/python.md). - -### Prerequisite: Atlas - -[Atlas](http://math-atlas.sourceforge.net/) is a linear algebra library that -numpy depends on, and so needs to be installed before TensorFlow. To add it to -your system, run the following command: - -
$ sudo apt install libatlas-base-dev
- -### Install TensorFlow - -Assuming the prerequisite software is installed on your Pi, install TensorFlow -by invoking **one** of the following commands: - -
$ pip3 install tensorflow     # Python 3.n
-$ pip install tensorflow      # Python 2.7
- -This can take some time on certain platforms like the Pi Zero, where some Python -packages like scipy that TensorFlow depends on need to be compiled before the -installation can complete. The Python 3 version will typically be faster to -install because piwheels.org has pre-built versions of the dependencies -available, so this is our recommended option. - -### Next Steps - -After installing TensorFlow, [validate your -installation](#ValidateYourInstallation) to confirm that the installation worked -properly. - -### Uninstalling TensorFlow - -To uninstall TensorFlow, issue one of the following commands: - -
$ pip uninstall tensorflow
-$ pip3 uninstall tensorflow 
- -## Cross-compiling from sources - -Cross-compilation means building on a different machine than you'll be -deploying on. Since Raspberry Pis only have limited RAM and comparatively slow -processors, and TensorFlow has a large amount of source code to compile, it's -easier to use a macOS or Linux desktop or laptop to handle the build process. -Because it can take over 24 hours to build on a Pi, and requires external swap -space to cope with the memory shortage, we recommend using cross-compilation if -you do need to compile TensorFlow from source. To make the dependency management -process easier, we also recommend using Docker to help simplify building. - -Note that we provide well-tested, pre-built TensorFlow binaries for Raspbian -systems. So, don't build a TensorFlow binary yourself unless you are very -comfortable building complex packages from source and dealing with the -inevitable aftermath should things not go exactly as documented. - -### Prerequisite: Docker - -Install Docker on your machine as described in the [Docker -documentation](https://docs.docker.com/engine/installation/#/on-macos-and-windows). - -### Clone the TensorFlow repository - -Start the process of building TensorFlow by cloning a TensorFlow repository. - -To clone **the latest** TensorFlow repository, issue the following command: - -
$ git clone https://github.com/tensorflow/tensorflow 
- -The preceding git clone command creates a subdirectory named -`tensorflow`. After cloning, you may optionally build a **specific branch** -(such as a release branch) by invoking the following commands: - -
-$ cd tensorflow
-$ git checkout Branch # where Branch is the desired branch
-
- -For example, to work with the `r1.0` release instead of the master release, -issue the following command: - -
$ git checkout r1.0
- -### Build from source - -To compile TensorFlow and produce a binary pip can install, do the following: - -1. Start a terminal. -2. Navigate to the directory containing the tensorflow source code. -3. Run a command to cross-compile the library, for example: - -
$ CI_DOCKER_EXTRA_PARAMS="-e CI_BUILD_PYTHON=python3 -e CROSSTOOL_PYTHON_INCLUDE_PATH=/usr/include/python3.4" \
-tensorflow/tools/ci_build/ci_build.sh PI-PYTHON3 tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
- 
- -This will build a pip .whl file for Python 3.4, with Arm v7 instructions that -will only work on the Pi models 2 or 3. These NEON instructions are required for -the fastest operation on those devices, but you can build a library that will -run across all Pi devices by passing `PI_ONE` at the end of the command line. -You can also target Python 2.7 by omitting the initial docker parameters. Here's -an example of building for Python 2.7 and Raspberry Pi model Zero or One -devices: - -
$ tensorflow/tools/ci_build/ci_build.sh PI tensorflow/tools/ci_build/pi/build_raspberry_pi.sh PI_ONE
- -This will take some time to complete, typically twenty or thirty minutes, and -should produce a .whl file in an output-artifacts sub-folder inside your source -tree at the end. This wheel file can be installed through pip or pip3 (depending -on your Python version) by copying it to a Raspberry Pi and running a terminal -command like this (with the name of your actual file substituted): - -
$ pip3 install tensorflow-1.9.0-cp34-none-linux_armv7l.whl
- -### Troubleshooting the build - -The build script uses Docker internally to create a Linux virtual machine to -handle the compilation. If you do have problems running the script, first check -that you're able to run Docker tests like `docker run hello-world` on your -system. - -If you're building from the latest development branch, try syncing to an older -version that's known to work, for example release 1.9, with a command like this: - -
$ git checkout r1.9
- - - -## Validate your installation - -To validate your TensorFlow installation, do the following: - -1. Ensure that your environment is prepared to run TensorFlow programs. -2. Run a short TensorFlow program. - -### Prepare your environment - -If you installed on native pip, Virtualenv, or Anaconda, then do the following: - -1. Start a terminal. -2. If you installed TensorFlow source code, navigate to any directory *except* - one containing TensorFlow source code. - -### Run a short TensorFlow program - -Invoke python from your shell as follows: - -
$ python
- -Enter the following short program inside the python interactive shell: - -```python -# Python -import tensorflow as tf -hello = tf.constant('Hello, TensorFlow!') -sess = tf.Session() -print(sess.run(hello)) -``` - -If the system outputs the following, then you are ready to begin writing -TensorFlow programs: - -
Hello, TensorFlow!
- -If you're running with Python 3.5, you may see a warning when you first import -TensorFlow. This is not an error, and TensorFlow should continue to run with no -problems, despite the log message. - -If the system outputs an error message instead of a greeting, see [Common -installation problems](#common_installation_problems). - -To learn more, see the [TensorFlow tutorials](../tutorials/). - -## Common installation problems - -We are relying on Stack Overflow to document TensorFlow installation problems -and their remedies. The following table contains links to Stack Overflow answers -for some common installation problems. If you encounter an error message or -other installation problem not listed in the following table, search for it on -Stack Overflow. If Stack Overflow doesn't show the error message, ask a new -question about it on Stack Overflow and specify the `tensorflow` tag. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Stack Overflow Link Error Message
42006320
ImportError: Traceback (most recent call last):
-File ".../tensorflow/core/framework/graph_pb2.py", line 6, in 
-from google.protobuf import descriptor as _descriptor
-ImportError: cannot import name 'descriptor'
-
33623453
IOError: [Errno 2] No such file or directory:
-  '/tmp/pip-o6Tpui-build/setup.py'
-
35190574
SSLError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify
-  failed
42009190
-  Installing collected packages: setuptools, protobuf, wheel, numpy, tensorflow
-  Found existing installation: setuptools 1.1.6
-  Uninstalling setuptools-1.1.6:
-  Exception:
-  ...
-  [Errno 1] Operation not permitted:
-  '/tmp/pip-a1DXRT-uninstall/.../lib/python/_markerlib' 
33622019
ImportError: No module named copyreg
37810228During a pip install operation, the system returns: -
OSError: [Errno 1] Operation not permitted
-
33622842An import tensorflow statement triggers an error such as the - following:
Traceback (most recent call last):
-  File "", line 1, in 
-  File "/usr/local/lib/python2.7/site-packages/tensorflow/__init__.py",
-    line 4, in 
-    from tensorflow.python import *
-    ...
-  File "/usr/local/lib/python2.7/site-packages/tensorflow/core/framework/tensor_shape_pb2.py",
-    line 22, in 
-    serialized_pb=_b('\n,tensorflow/core/framework/tensor_shape.proto\x12\ntensorflow\"d\n\x10TensorShapeProto\x12-\n\x03\x64im\x18\x02
-      \x03(\x0b\x32
-      .tensorflow.TensorShapeProto.Dim\x1a!\n\x03\x44im\x12\x0c\n\x04size\x18\x01
-      \x01(\x03\x12\x0c\n\x04name\x18\x02 \x01(\tb\x06proto3')
-  TypeError: __init__() got an unexpected keyword argument 'syntax'
-
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md deleted file mode 100644 index 44ea18fa7b..0000000000 --- a/tensorflow/docs_src/install/install_sources.md +++ /dev/null @@ -1,579 +0,0 @@ -# Install TensorFlow from Sources - -This guide explains how to build TensorFlow sources into a TensorFlow binary and -how to install that TensorFlow binary. Note that we provide well-tested, -pre-built TensorFlow binaries for Ubuntu, macOS, and Windows systems. In -addition, there are pre-built TensorFlow -[docker images](https://hub.docker.com/r/tensorflow/tensorflow/). So, don't -build a TensorFlow binary yourself unless you are very comfortable building -complex packages from source and dealing with the inevitable aftermath should -things not go exactly as documented. - -If the last paragraph didn't scare you off, welcome. This guide explains how to -build TensorFlow on 64-bit desktops and laptops running either of the following -operating systems: - -* Ubuntu -* macOS X - -Note: Some users have successfully built and installed TensorFlow from sources -on non-supported systems. Please remember that we do not fix issues stemming -from these attempts. - -We **do not support** building TensorFlow on Windows. That said, if you'd like -to try to build TensorFlow on Windows anyway, use either of the following: - -* [Bazel on Windows](https://bazel.build/versions/master/docs/windows.html) -* [TensorFlow CMake build](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/cmake) - -Note: Starting from 1.6 release, our prebuilt binaries will use AVX -instructions. Older CPUs may not be able to execute these binaries. - -## Determine which TensorFlow to install - -You must choose one of the following types of TensorFlow to build and install: - -* **TensorFlow with CPU support only**. If your system does not have a NVIDIA® - GPU, build and install this version. 
Note that this version of TensorFlow is - typically easier to build and install, so even if you have an NVIDIA GPU, we - recommend building and installing this version first. -* **TensorFlow with GPU support**. TensorFlow programs typically run - significantly faster on a GPU than on a CPU. Therefore, if your system has a - NVIDIA GPU and you need to run performance-critical applications, you should - ultimately build and install this version. Beyond the NVIDIA GPU itself, - your system must also fulfill the NVIDIA software requirements described in - one of the following documents: - - * @ {$install_linux#NVIDIARequirements$Installing TensorFlow on Ubuntu} - * @ {$install_mac#NVIDIARequirements$Installing TensorFlow on macOS} - -## Clone the TensorFlow repository - -Start the process of building TensorFlow by cloning a TensorFlow repository. - -To clone **the latest** TensorFlow repository, issue the following command: - -
$ git clone https://github.com/tensorflow/tensorflow 
- -The preceding git clone command creates a subdirectory named -`tensorflow`. After cloning, you may optionally build a **specific branch** -(such as a release branch) by invoking the following commands: - -
-$ cd tensorflow
-$ git checkout Branch # where Branch is the desired branch
-
- -For example, to work with the `r1.0` release instead of the master release, -issue the following command: - -
$ git checkout r1.0
- -Next, you must prepare your environment for [Linux](#PrepareLinux) or -[macOS](#PrepareMac) - - - -## Prepare environment for Linux - -Before building TensorFlow on Linux, install the following build tools on your -system: - -* bazel -* TensorFlow Python dependencies -* optionally, NVIDIA packages to support TensorFlow for GPU. - -### Install Bazel - -If bazel is not installed on your system, install it now by following -[these directions](https://bazel.build/versions/master/docs/install.html). - -### Install TensorFlow Python dependencies - -To install TensorFlow, you must install the following packages: - -* `numpy`, which is a numerical processing package that TensorFlow requires. -* `dev`, which enables adding extensions to Python. -* `pip`, which enables you to install and manage certain Python packages. -* `wheel`, which enables you to manage Python compressed packages in the wheel - (.whl) format. - -To install these packages for Python 2.7, issue the following command: - -
-$ sudo apt-get install python-numpy python-dev python-pip python-wheel
-
- -To install these packages for Python 3.n, issue the following command: - -
-$ sudo apt-get install python3-numpy python3-dev python3-pip python3-wheel
-
- -### Optional: install TensorFlow for GPU prerequisites - -If you are building TensorFlow without GPU support, skip this section. - -The following NVIDIA® hardware must be installed on your system: - -* GPU card with CUDA Compute Capability 3.5 or higher. See - [NVIDIA documentation](https://developer.nvidia.com/cuda-gpus) for a list of - supported GPU cards. - -The following NVIDIA® software must be installed on your system: - -* [GPU drivers](http://nvidia.com/driver). CUDA 9.0 requires 384.x or higher. -* [CUDA Toolkit](http://nvidia.com/cuda) (>= 8.0). We recommend version 9.0. -* [cuDNN SDK](http://developer.nvidia.com/cudnn) (>= 6.0). We recommend - version 7.1.x. -* [CUPTI](http://docs.nvidia.com/cuda/cupti/) ships with the CUDA Toolkit, but - you also need to append its path to the `LD_LIBRARY_PATH` environment - variable: `export - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/extras/CUPTI/lib64` -* *OPTIONAL*: [NCCL 2.2](https://developer.nvidia.com/nccl) to use TensorFlow - with multiple GPUs. -* *OPTIONAL*: - [TensorRT](http://docs.nvidia.com/deeplearning/sdk/tensorrt-install-guide/index.html) - which can improve latency and throughput for inference for some models. - -While it is possible to install the NVIDIA libraries via `apt-get` from the -NVIDIA repository, the libraries and headers are installed in locations that -make it difficult to configure and debug build issues. Downloading and -installing the libraries manually or using docker -([latest-devel-gpu](https://hub.docker.com/r/tensorflow/tensorflow/tags/)) is -recommended. - -### Next - -After preparing the environment, you must now -[configure the installation](#ConfigureInstallation). - - - -## Prepare environment for macOS - -Before building TensorFlow, you must install the following on your system: - -* bazel -* TensorFlow Python dependencies. -* optionally, NVIDIA packages to support TensorFlow for GPU. 
- -### Install bazel - -If bazel is not installed on your system, install it now by following -[these directions](https://bazel.build/versions/master/docs/install.html#mac-os-x). - -### Install python dependencies - -To build TensorFlow, you must install the following packages: - -* six -* mock -* numpy, which is a numerical processing package that TensorFlow requires. -* wheel, which enables you to manage Python compressed packages in the wheel - (.whl) format. - -You may install the python dependencies using pip. If you don't have pip on your -machine, we recommend using homebrew to install Python and pip as -[documented here](http://docs.python-guide.org/en/latest/starting/install/osx/). -If you follow these instructions, you will not need to disable SIP. - -After installing pip, invoke the following commands: - -
 $ pip install six numpy wheel mock h5py
- $ pip install keras_applications==1.0.5 --no-deps
- $ pip install keras_preprocessing==1.0.3 --no-deps
-
- -Note: These are just the minimum requirements to _build_ tensorflow. Installing -the pip package will download additional packages required to _run_ it. If you -plan on executing tasks directly with `bazel` , without the pip installation, -you may need to install additional python packages. For example, you should `pip -install enum34` before running TensorFlow's tests with bazel. - - - -## Configure the installation - -The root of the source tree contains a bash script named configure. -This script asks you to identify the pathname of all relevant TensorFlow -dependencies and specify other build configuration options such as compiler -flags. You must run this script *prior* to creating the pip package and -installing TensorFlow. - -If you wish to build TensorFlow with GPU, `configure` will ask you to specify -the version numbers of CUDA and cuDNN. If several versions of CUDA or cuDNN are -installed on your system, explicitly select the desired version instead of -relying on the default. - -One of the questions that `configure` will ask is as follows: - -
-Please specify optimization flags to use during compilation when bazel option "--config=opt" is specified [Default is -march=native]
-
- -This question refers to a later phase in which you'll use bazel to -[build the pip package](#build-the-pip-package) or the -[C/Java libraries](#BuildCorJava). We recommend accepting the default -(`-march=native`), which will optimize the generated code for your local -machine's CPU type. However, if you are building TensorFlow on one CPU type but -will run TensorFlow on a different CPU type, then consider specifying a more -specific optimization flag as described in -[the gcc documentation](https://gcc.gnu.org/onlinedocs/gcc-4.5.3/gcc/i386-and-x86_002d64-Options.html). - -Here is an example execution of the `configure` script. Note that your own input -will likely differ from our sample input: - -
-$ cd tensorflow  # cd to the top-level directory created
-$ ./configure
-You have bazel 0.15.0 installed.
-Please specify the location of python. [Default is /usr/bin/python]: /usr/bin/python2.7
-
-
-Found possible Python library paths:
-  /usr/local/lib/python2.7/dist-packages
-  /usr/lib/python2.7/dist-packages
-Please input the desired Python library path to use.  Default is [/usr/lib/python2.7/dist-packages]
-
-Do you wish to build TensorFlow with jemalloc as malloc support? [Y/n]:
-jemalloc as malloc support will be enabled for TensorFlow.
-
-Do you wish to build TensorFlow with Google Cloud Platform support? [Y/n]:
-Google Cloud Platform support will be enabled for TensorFlow.
-
-Do you wish to build TensorFlow with Hadoop File System support? [Y/n]:
-Hadoop File System support will be enabled for TensorFlow.
-
-Do you wish to build TensorFlow with Amazon AWS Platform support? [Y/n]:
-Amazon AWS Platform support will be enabled for TensorFlow.
-
-Do you wish to build TensorFlow with Apache Kafka Platform support? [Y/n]:
-Apache Kafka Platform support will be enabled for TensorFlow.
-
-Do you wish to build TensorFlow with XLA JIT support? [y/N]:
-No XLA JIT support will be enabled for TensorFlow.
-
-Do you wish to build TensorFlow with GDR support? [y/N]:
-No GDR support will be enabled for TensorFlow.
-
-Do you wish to build TensorFlow with VERBS support? [y/N]:
-No VERBS support will be enabled for TensorFlow.
-
-Do you wish to build TensorFlow with OpenCL SYCL support? [y/N]:
-No OpenCL SYCL support will be enabled for TensorFlow.
-
-Do you wish to build TensorFlow with CUDA support? [y/N]: Y
-CUDA support will be enabled for TensorFlow.
-
-Please specify the CUDA SDK version you want to use. [Leave empty to default to CUDA 9.0]: 9.0
-
-
-Please specify the location where CUDA 9.0 toolkit is installed. Refer to README.md for more details. [Default is /usr/local/cuda]:
-
-
-Please specify the cuDNN version you want to use. [Leave empty to default to cuDNN 7.0]: 7.0
-
-
-Please specify the location where cuDNN 7 library is installed. Refer to README.md for more details. [Default is /usr/local/cuda]:
-
-
-Do you wish to build TensorFlow with TensorRT support? [y/N]:
-No TensorRT support will be enabled for TensorFlow.
-
-Please specify the NCCL version you want to use. If NCLL 2.2 is not installed, then you can use version 1.3 that can be fetched automatically but it may have worse performance with multiple GPUs. [Default is 2.2]: 1.3
-
-
-Please specify a list of comma-separated Cuda compute capabilities you want to build with.
-You can find the compute capability of your device at: https://developer.nvidia.com/cuda-gpus.
-Please note that each additional compute capability significantly increases your
-build time and binary size. [Default is: 3.5,7.0] 6.1
-
-
-Do you want to use clang as CUDA compiler? [y/N]:
-nvcc will be used as CUDA compiler.
-
-Please specify which gcc should be used by nvcc as the host compiler. [Default is /usr/bin/gcc]:
-
-
-Do you wish to build TensorFlow with MPI support? [y/N]:
-No MPI support will be enabled for TensorFlow.
-
-Please specify optimization flags to use during compilation when bazel option "--config=opt" is specified [Default is -march=native]:
-
-
-Would you like to interactively configure ./WORKSPACE for Android builds? [y/N]:
-Not configuring the WORKSPACE for Android builds.
-
-Preconfigured Bazel build configs. You can use any of the below by adding "--config=<>" to your build command. See tools/bazel.rc for more details.
-    --config=mkl            # Build with MKL support.
-    --config=monolithic     # Config for mostly static monolithic build.
-Configuration finished
-
- -If you told `configure` to build for GPU support, then `configure` will create a -canonical set of symbolic links to the CUDA libraries on your system. Therefore, -every time you change the CUDA library paths, you must rerun the `configure` -script before re-invoking the bazel build command. - -Note the following: - -* Although it is possible to build both CUDA and non-CUDA configs under the - same source tree, we recommend running `bazel clean` when switching between - these two configurations in the same source tree. -* If you don't run the `configure` script *before* running the `bazel build` - command, the `bazel build` command will fail. - -## Build the pip package - -Note: If you're only interested in building the libraries for the TensorFlow C -or Java APIs, see [Build the C or Java libraries](#BuildCorJava), you do not -need to build the pip package in that case. - -### CPU-only support - -To build a pip package for TensorFlow with CPU-only support: - -
-$ bazel build --config=opt //tensorflow/tools/pip_package:build_pip_package
-
- -To build a pip package for TensorFlow with CPU-only support for the Intel® -MKL-DNN: - -
-$ bazel build --config=mkl --config=opt //tensorflow/tools/pip_package:build_pip_package
-
- -### GPU support - -To build a pip package for TensorFlow with GPU support: - -
-$ bazel build --config=opt --config=cuda //tensorflow/tools/pip_package:build_pip_package
-
- -**NOTE on gcc 5 or later:** the binary pip packages available on the TensorFlow -website are built with gcc 4, which uses the older ABI. To make your build -compatible with the older ABI, you need to add -`--cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0"` to your `bazel build` command. ABI -compatibility allows custom ops built against the TensorFlow pip package to -continue to work against your built package. - -Tip: By default, building TensorFlow from sources consumes a lot of RAM. -If RAM is an issue on your system, you may limit RAM usage by specifying ---local_resources 2048,.5,1.0 while invoking `bazel`. - -### Run the build_pip_package script - -The bazel build command builds a script named `build_pip_package`. -Running this script as follows will build a `.whl` file within the -`/tmp/tensorflow_pkg` directory: - -
-$ bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/tensorflow_pkg
-
- -## Install the pip package - -Invoke `pip install` to install that pip package. The filename of the `.whl` -file depends on your platform. For example, the following command will install -the pip package - -for TensorFlow 1.10.0 on Linux: - -
-$ sudo pip install /tmp/tensorflow_pkg/tensorflow-1.10.0-py2-none-any.whl
-
- -## Validate your installation - -Validate your TensorFlow installation by doing the following: - -Start a terminal. - -Change directory (`cd`) to any directory on your system other than the -`tensorflow` subdirectory from which you invoked the `configure` command. - -Invoke python: - -
$ python
- -Enter the following short program inside the python interactive shell: - -```python -# Python -import tensorflow as tf -hello = tf.constant('Hello, TensorFlow!') -sess = tf.Session() -print(sess.run(hello)) -``` - -If the system outputs the following, then you are ready to begin writing -TensorFlow programs: - -
Hello, TensorFlow!
- -To learn more, see the [TensorFlow tutorials](../tutorials/). - -If the system outputs an error message instead of a greeting, see -[Common installation problems](#common_installation_problems). - -## Common build and installation problems - -The build and installation problems you encounter typically depend on the -operating system. See the "Common installation problems" section of one of the -following guides: - -* @ - {$install_linux#common_installation_problems$Installing TensorFlow on Linux} -* @ - {$install_mac#common_installation_problems$Installing TensorFlow on Mac OS} -* @ - {$install_windows#common_installation_problems$Installing TensorFlow on Windows} - -Beyond the errors documented in those two guides, the following table notes -additional errors specific to building TensorFlow. Note that we are relying on -Stack Overflow as the repository for build and installation problems. If you -encounter an error message not listed in the preceding two guides or in the -following table, search for it on Stack Overflow. If Stack Overflow doesn't show -the error message, ask a new question on Stack Overflow and specify the -`tensorflow` tag. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Stack Overflow Link Error Message
41293077
W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow
-  library wasn't compiled to use SSE4.1 instructions, but these are available on
-  your machine and could speed up CPU computations.
42013316
ImportError: libcudart.so.8.0: cannot open shared object file:
-  No such file or directory
42013316
ImportError: libcudnn.5: cannot open shared object file:
-  No such file or directory
35953210Invoking `python` or `ipython` generates the following error: -
ImportError: cannot import name pywrap_tensorflow
45276830
external/local_config_cc/BUILD:50:5: in apple_cc_toolchain rule
-  @local_config_cc//:cc-compiler-darwin_x86_64: Xcode version must be specified
-  to use an Apple CROSSTOOL.
-
47080760
undefined reference to `cublasGemmEx@libcublas.so.9.0'
- -## Tested source configurations - -**Linux** - - - - - - - - - - - - - - - - - - - - - - - - -
Version:CPU/GPU:Python Version:Compiler:Build Tools:cuDNN:CUDA:
tensorflow-1.10.0CPU2.7, 3.3-3.6GCC 4.8Bazel 0.15.0N/AN/A
tensorflow_gpu-1.10.0GPU2.7, 3.3-3.6GCC 4.8Bazel 0.15.079
tensorflow-1.9.0CPU2.7, 3.3-3.6GCC 4.8Bazel 0.11.0N/AN/A
tensorflow_gpu-1.9.0GPU2.7, 3.3-3.6GCC 4.8Bazel 0.11.079
tensorflow-1.8.0CPU2.7, 3.3-3.6GCC 4.8Bazel 0.10.0N/AN/A
tensorflow_gpu-1.8.0GPU2.7, 3.3-3.6GCC 4.8Bazel 0.9.079
tensorflow-1.7.0CPU2.7, 3.3-3.6GCC 4.8Bazel 0.10.0N/AN/A
tensorflow_gpu-1.7.0GPU2.7, 3.3-3.6GCC 4.8Bazel 0.9.079
tensorflow-1.6.0CPU2.7, 3.3-3.6GCC 4.8Bazel 0.9.0N/AN/A
tensorflow_gpu-1.6.0GPU2.7, 3.3-3.6GCC 4.8Bazel 0.9.079
tensorflow-1.5.0CPU2.7, 3.3-3.6GCC 4.8Bazel 0.8.0N/AN/A
tensorflow_gpu-1.5.0GPU2.7, 3.3-3.6GCC 4.8Bazel 0.8.079
tensorflow-1.4.0CPU2.7, 3.3-3.6GCC 4.8Bazel 0.5.4N/AN/A
tensorflow_gpu-1.4.0GPU2.7, 3.3-3.6GCC 4.8Bazel 0.5.468
tensorflow-1.3.0CPU2.7, 3.3-3.6GCC 4.8Bazel 0.4.5N/AN/A
tensorflow_gpu-1.3.0GPU2.7, 3.3-3.6GCC 4.8Bazel 0.4.568
tensorflow-1.2.0CPU2.7, 3.3-3.6GCC 4.8Bazel 0.4.5N/AN/A
tensorflow_gpu-1.2.0GPU2.7, 3.3-3.6GCC 4.8Bazel 0.4.55.18
tensorflow-1.1.0CPU2.7, 3.3-3.6GCC 4.8Bazel 0.4.2N/AN/A
tensorflow_gpu-1.1.0GPU2.7, 3.3-3.6GCC 4.8Bazel 0.4.25.18
tensorflow-1.0.0CPU2.7, 3.3-3.6GCC 4.8Bazel 0.4.2N/AN/A
tensorflow_gpu-1.0.0GPU2.7, 3.3-3.6GCC 4.8Bazel 0.4.25.18
- -**Mac** - - - - - - - - - - - - - - - -
Version:CPU/GPU:Python Version:Compiler:Build Tools:cuDNN:CUDA:
tensorflow-1.10.0CPU2.7, 3.3-3.6Clang from xcodeBazel 0.15.0N/AN/A
tensorflow-1.9.0CPU2.7, 3.3-3.6Clang from xcodeBazel 0.11.0N/AN/A
tensorflow-1.8.0CPU2.7, 3.3-3.6Clang from xcodeBazel 0.10.1N/AN/A
tensorflow-1.7.0CPU2.7, 3.3-3.6Clang from xcodeBazel 0.10.1N/AN/A
tensorflow-1.6.0CPU2.7, 3.3-3.6Clang from xcodeBazel 0.8.1N/AN/A
tensorflow-1.5.0CPU2.7, 3.3-3.6Clang from xcodeBazel 0.8.1N/AN/A
tensorflow-1.4.0CPU2.7, 3.3-3.6Clang from xcodeBazel 0.5.4N/AN/A
tensorflow-1.3.0CPU2.7, 3.3-3.6Clang from xcodeBazel 0.4.5N/AN/A
tensorflow-1.2.0CPU2.7, 3.3-3.6Clang from xcodeBazel 0.4.5N/AN/A
tensorflow-1.1.0CPU2.7, 3.3-3.6Clang from xcodeBazel 0.4.2N/AN/A
tensorflow_gpu-1.1.0GPU2.7, 3.3-3.6Clang from xcodeBazel 0.4.25.18
tensorflow-1.0.0CPU2.7, 3.3-3.6Clang from xcodeBazel 0.4.2N/AN/A
tensorflow_gpu-1.0.0GPU2.7, 3.3-3.6Clang from xcodeBazel 0.4.25.18
- -**Windows** - - - - - - - - - - - - - - - - - - - - - - - - -
Version:CPU/GPU:Python Version:Compiler:Build Tools:cuDNN:CUDA:
tensorflow-1.10.0CPU3.5-3.6MSVC 2015 update 3Cmake v3.6.3N/AN/A
tensorflow_gpu-1.10.0GPU3.5-3.6MSVC 2015 update 3Cmake v3.6.379
tensorflow-1.9.0CPU3.5-3.6MSVC 2015 update 3Cmake v3.6.3N/AN/A
tensorflow_gpu-1.9.0GPU3.5-3.6MSVC 2015 update 3Cmake v3.6.379
tensorflow-1.8.0CPU3.5-3.6MSVC 2015 update 3Cmake v3.6.3N/AN/A
tensorflow_gpu-1.8.0GPU3.5-3.6MSVC 2015 update 3Cmake v3.6.379
tensorflow-1.7.0CPU3.5-3.6MSVC 2015 update 3Cmake v3.6.3N/AN/A
tensorflow_gpu-1.7.0GPU3.5-3.6MSVC 2015 update 3Cmake v3.6.379
tensorflow-1.6.0CPU3.5-3.6MSVC 2015 update 3Cmake v3.6.3N/AN/A
tensorflow_gpu-1.6.0GPU3.5-3.6MSVC 2015 update 3Cmake v3.6.379
tensorflow-1.5.0CPU3.5-3.6MSVC 2015 update 3Cmake v3.6.3N/AN/A
tensorflow_gpu-1.5.0GPU3.5-3.6MSVC 2015 update 3Cmake v3.6.379
tensorflow-1.4.0CPU3.5-3.6MSVC 2015 update 3Cmake v3.6.3N/AN/A
tensorflow_gpu-1.4.0GPU3.5-3.6MSVC 2015 update 3Cmake v3.6.368
tensorflow-1.3.0CPU3.5-3.6MSVC 2015 update 3Cmake v3.6.3N/AN/A
tensorflow_gpu-1.3.0GPU3.5-3.6MSVC 2015 update 3Cmake v3.6.368
tensorflow-1.2.0CPU3.5-3.6MSVC 2015 update 3Cmake v3.6.3N/AN/A
tensorflow_gpu-1.2.0GPU3.5-3.6MSVC 2015 update 3Cmake v3.6.35.18
tensorflow-1.1.0CPU3.5MSVC 2015 update 3Cmake v3.6.3N/AN/A
tensorflow_gpu-1.1.0GPU3.5MSVC 2015 update 3Cmake v3.6.35.18
tensorflow-1.0.0CPU3.5MSVC 2015 update 3Cmake v3.6.3N/AN/A
tensorflow_gpu-1.0.0GPU3.5MSVC 2015 update 3Cmake v3.6.35.18
- - - -## Build the C or Java libraries - -The instructions above are tailored to building the TensorFlow Python packages. - -If you're interested in building the libraries for the TensorFlow C API, do the -following: - -1. Follow the steps up to [Configure the installation](#ConfigureInstallation) -2. Build the C libraries following instructions in the - [README](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/lib_package/README.md). - -If you're interested in building the libraries for the TensorFlow Java API, do -the following: - -1. Follow the steps up to [Configure the installation](#ConfigureInstallation) -2. Build the Java library following instructions in the - [README](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/lib_package/README.md). diff --git a/tensorflow/docs_src/install/install_sources_windows.md b/tensorflow/docs_src/install/install_sources_windows.md deleted file mode 100644 index 40dce106d6..0000000000 --- a/tensorflow/docs_src/install/install_sources_windows.md +++ /dev/null @@ -1,320 +0,0 @@ -# Install TensorFlow from Sources on Windows - -This guide explains how to build TensorFlow sources into a TensorFlow binary and -how to install that TensorFlow binary on Windows. - -## Determine which TensorFlow to install - -You must choose one of the following types of TensorFlow to build and install: - -* **TensorFlow with CPU support only**. If your system does not have a NVIDIA® - GPU, build and install this version. Note that this version of TensorFlow is - typically easier to build and install, so even if you have an NVIDIA GPU, we - recommend building and installing this version first. -* **TensorFlow with GPU support**. TensorFlow programs typically run - significantly faster on a GPU than on a CPU. Therefore, if your system has a - NVIDIA GPU and you need to run performance-critical applications, you should - ultimately build and install this version. 
Beyond the NVIDIA GPU itself, - your system must also fulfill the NVIDIA software requirements described in - the following document: - - * [Installing TensorFlow on Windows](install_windows.md#NVIDIARequirements) - -## Prepare environment for Windows - -Before building TensorFlow on Windows, install the following build tools on your -system: - -* [MSYS2](#InstallMSYS2) -* [Visual C++ build tools](#InstallVCBuildTools) -* [Bazel for Windows](#InstallBazel) -* [TensorFlow Python dependencies](#InstallPython) -* [optionally, NVIDIA packages to support TensorFlow for GPU](#InstallCUDA) - - - -### Install MSYS2 - -Bash bin tools are used in TensorFlow Bazel build, you can install them through [MSYS2](https://www.msys2.org/). - -Assume you installed MSYS2 at `C:\msys64`, add `C:\msys64\usr\bin` to your `%PATH%` environment variable. - -To install necessary bash bin tools, issue the following command under `cmd.exe`: - -
-C:\> pacman -S git patch unzip
-
- - - -### Install Visual C++ Build Tools 2015 - -To build TensorFlow, you need to install Visual C++ build tools 2015. It is a part of Visual Studio 2015. -But you can install it separately as follows: - - * Open the [official download page](https://visualstudio.microsoft.com/vs/older-downloads/). - * Go to the Redistributables and Build Tools section. - * Find Microsoft Build Tools 2015 Update 3 and click download. - * Run the installer. - -It's possible to build TensorFlow with a newer version of Visual C++ build tools, -but we only test against Visual Studio 2015 Update 3. - - - -### Install Bazel - -If bazel is not installed on your system, install it now by following -[these instructions](https://docs.bazel.build/versions/master/install-windows.html). -It is recommended to use a Bazel version >= `0.15.0`. - -Add the directory where you installed Bazel to your `%PATH%` environment variable. - - - -### Install TensorFlow Python dependencies - -If you don't have Python 3.5 or Python 3.6 installed, install it now: - - * [Python 3.5.x 64-bit from python.org](https://www.python.org/downloads/release/python-352/) - * [Python 3.6.x 64-bit from python.org](https://www.python.org/downloads/release/python-362/) - -To build and install TensorFlow, you must install the following python packages: - -* `six`, which provides simple utilities for wrapping over differences between - Python 2 and Python 3. -* `numpy`, which is a numerical processing package that TensorFlow requires. -* `wheel`, which enables you to manage Python compressed packages in the wheel - (.whl) format. -* `keras_applications`, the applications module of the Keras deep learning library. -* `keras_preprocessing`, the data preprocessing and data augmentation module - of the Keras deep learning library. - -Assuming you already have `pip3` in `%PATH%`, issue the following command: - -
-C:\> pip3 install six numpy wheel
-C:\> pip3 install keras_applications==1.0.5 --no-deps
-C:\> pip3 install keras_preprocessing==1.0.3 --no-deps
-
- - - -### Optional: install TensorFlow for GPU prerequisites - -If you are building TensorFlow without GPU support, skip this section. - -The following NVIDIA® _hardware_ must be installed on your system: - -* GPU card with CUDA Compute Capability 3.5 or higher. See - [NVIDIA documentation](https://developer.nvidia.com/cuda-gpus) for a list of - supported GPU cards. - -The following NVIDIA® _software_ must be installed on your system: - -* [GPU drivers](http://nvidia.com/driver). CUDA 9.0 requires 384.x or higher. -* [CUDA Toolkit](http://nvidia.com/cuda) (>= 8.0). We recommend version 9.0. -* [cuDNN SDK](http://developer.nvidia.com/cudnn) (>= 6.0). We recommend - version 7.1.x. -* [CUPTI](http://docs.nvidia.com/cuda/cupti/) ships with the CUDA Toolkit, but - you also need to append its path to `%PATH%` environment - variable. - -Assume you have CUDA Toolkit installed at `C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0` -and cuDNN at `C:\tools\cuda`, issue the following commands. - -
-C:\> SET PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0\bin;%PATH%
-C:\> SET PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0\extras\CUPTI\libx64;%PATH%
-C:\> SET PATH=C:\tools\cuda\bin;%PATH%
-
- -## Clone the TensorFlow repository - -Now you need to clone **the latest** TensorFlow repository. -Thanks to MSYS2, `git` is already available; issue the following command: - -
C:\> git clone https://github.com/tensorflow/tensorflow.git 
- -The preceding git clone command creates a subdirectory named -`tensorflow`. After cloning, you may optionally build a **specific branch** -(such as a release branch) by invoking the following commands: - -
-C:\> cd tensorflow
-C:\> git checkout Branch # where Branch is the desired branch
-
- -For example, to work with the `r1.11` release instead of the master release, -issue the following command: - -
C:\> git checkout r1.11
- -Next, you must now configure the installation. - -## Configure the installation - -The root of the source tree contains a python script named configure.py. -This script asks you to identify the pathname of all relevant TensorFlow -dependencies and specify other build configuration options such as compiler -flags. You must run this script *prior* to creating the pip package and -installing TensorFlow. - -If you wish to build TensorFlow with GPU, `configure.py` will ask you to specify -the version numbers of CUDA and cuDNN. If several versions of CUDA or cuDNN are -installed on your system, explicitly select the desired version instead of -relying on the default. - -One of the questions that `configure.py` will ask is as follows: - -
-Please specify optimization flags to use during compilation when bazel option "--config=opt" is specified [Default is /arch:AVX]:
-
- -Here is an example execution of the `configure.py` script. Note that your own input -will likely differ from our sample input: - -
-C:\> cd tensorflow  # cd to the top-level directory created
-C:\tensorflow> python ./configure.py
-Starting local Bazel server and connecting to it...
-................
-You have bazel 0.15.0 installed.
-Please specify the location of python. [Default is C:\python36\python.exe]: 
-
-Found possible Python library paths:
-  C:\python36\lib\site-packages
-Please input the desired Python library path to use.  Default is [C:\python36\lib\site-packages]
-
-Do you wish to build TensorFlow with CUDA support? [y/N]: Y
-CUDA support will be enabled for TensorFlow.
-
-Please specify the CUDA SDK version you want to use. [Leave empty to default to CUDA 9.0]:
-
-Please specify the location where CUDA 9.0 toolkit is installed. Refer to README.md for more details. [Default is C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.0]:
-
-Please specify the cuDNN version you want to use. [Leave empty to default to cuDNN 7.0]: 7.0
-
-Please specify the location where cuDNN 7 library is installed. Refer to README.md for more details. [Default is C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.0]: C:\tools\cuda
-
-Please specify a list of comma-separated Cuda compute capabilities you want to build with.
-You can find the compute capability of your device at: https://developer.nvidia.com/cuda-gpus.
-Please note that each additional compute capability significantly increases your build time and binary size. [Default is: 3.5,7.0]: 3.7
-
-Please specify optimization flags to use during compilation when bazel option "--config=opt" is specified [Default is /arch:AVX]: 
-
-Would you like to override eigen strong inline for some C++ compilation to reduce the compilation time? [Y/n]:
-Eigen strong inline overridden.
-
-Configuration finished
-
- -## Build the pip package - -### CPU-only support - -To build a pip package for TensorFlow with CPU-only support: - -
-C:\tensorflow> bazel build --config=opt //tensorflow/tools/pip_package:build_pip_package
-
- -### GPU support - -To build a pip package for TensorFlow with GPU support: - -
-C:\tensorflow> bazel build --config=opt --config=cuda //tensorflow/tools/pip_package:build_pip_package
-
- -**NOTE :** When building with GPU support, you might want to add `--copt=-nvcc_options=disable-warnings` -to suppress nvcc warning messages. - -The `bazel build` command builds a binary named `build_pip_package` -(an executable binary to launch bash and run a bash script to create the pip package). -Running this binary as follows will build a `.whl` file within the `C:/tmp/tensorflow_pkg` directory: - -
-C:\tensorflow> bazel-bin\tensorflow\tools\pip_package\build_pip_package C:/tmp/tensorflow_pkg
-
- -## Install the pip package - -Invoke `pip3 install` to install that pip package. The filename of the `.whl` -file depends on the TensorFlow version and your platform. For example, the -following command will install the pip package for TensorFlow 1.11.0rc0: - -
-C:\tensorflow> pip3 install C:/tmp/tensorflow_pkg/tensorflow-1.11.0rc0-cp36-cp36m-win_amd64.whl
-
- -## Validate your installation - -Validate your TensorFlow installation by doing the following: - -Start a terminal. - -Change directory (`cd`) to any directory on your system other than the -`tensorflow` subdirectory from which you invoked the `configure` command. - -Invoke python: - -
$ python
- -Enter the following short program inside the python interactive shell: - -```python -# Python -import tensorflow as tf -hello = tf.constant('Hello, TensorFlow!') -sess = tf.Session() -print(sess.run(hello)) -``` - -If the system outputs the following, then you are ready to begin writing -TensorFlow programs: - -
Hello, TensorFlow!
- -To learn more, see the [TensorFlow tutorials](../tutorials/). - -## Build under MSYS shell -The instructions above assume you are building under the Windows native command line (`cmd.exe`), but you can also -build TensorFlow from the MSYS shell. There are a few things to note: - -* Disable the path conversion heuristic in MSYS. MSYS automatically converts arguments that look - like a Unix path to a Windows path when running a program, which confuses Bazel. - (e.g., a Bazel label `//foo/bar:bin` is treated as a Unix absolute path simply because it starts with a slash) - - ```sh -$ export MSYS_NO_PATHCONV=1 -$ export MSYS2_ARG_CONV_EXCL="*" -``` - -* Add the directory where you installed Bazel to `$PATH`. Assuming you have Bazel - installed at `C:\tools\bazel.exe`, issue the following command: - - ```sh -# `:` is used as path separator, so we have to convert the path to Unix style. -$ export PATH="/c/tools:$PATH" -``` - -* Add the directory where you installed Python to `$PATH`. Assuming you have - Python installed at `C:\Python36\python.exe`, issue the following command: - - ```sh -$ export PATH="/c/Python36:$PATH" -``` - -* If you have Python in `$PATH`, you can run the configure script just by invoking - `./configure`; a shell script will invoke python for you. - -* (For GPU build only) Add the Cuda and cuDNN bin directories to `$PATH` in the following way: - - ```sh -$ export PATH="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.0/bin:$PATH" -$ export PATH="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.0/extras/CUPTI/libx64:$PATH" -$ export PATH="/c/tools/cuda/bin:$PATH" -``` - -The remaining steps should be the same as building under `cmd.exe`. 
diff --git a/tensorflow/docs_src/install/install_windows.md b/tensorflow/docs_src/install/install_windows.md deleted file mode 100644 index 0bb0e5aeb9..0000000000 --- a/tensorflow/docs_src/install/install_windows.md +++ /dev/null @@ -1,227 +0,0 @@ -# Install TensorFlow on Windows - -This guide explains how to install TensorFlow on Windows. Although these -instructions might also work on other Windows variants, we have only -tested (and we only support) these instructions on machines meeting the -following requirements: - - * 64-bit, x86 desktops or laptops - * Windows 7 or later - - -## Determine which TensorFlow to install - -You must choose one of the following types of TensorFlow to install: - - * **TensorFlow with CPU support only**. If your system does not have a - NVIDIA® GPU, you must install this version. Note that this version of - TensorFlow is typically much easier to install (typically, - in 5 or 10 minutes), so even if you have an NVIDIA GPU, we recommend - installing this version first. Prebuilt binaries will use AVX instructions. - * **TensorFlow with GPU support**. TensorFlow programs typically run - significantly faster on a GPU than on a CPU. Therefore, if your - system has a NVIDIA® GPU meeting the prerequisites shown below - and you need to run performance-critical applications, you should - ultimately install this version. - - - -### Requirements to run TensorFlow with GPU support - -If you are installing TensorFlow with GPU support using one of the mechanisms -described in this guide, then the following NVIDIA software must be -installed on your system: - - * CUDA® Toolkit 9.0. For details, see - [NVIDIA's - documentation](http://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/) - Ensure that you append the relevant Cuda pathnames to the `%PATH%` - environment variable as described in the NVIDIA documentation. - * The NVIDIA drivers associated with CUDA Toolkit 9.0. - * cuDNN v7.0. 
For details, see - [NVIDIA's documentation](https://developer.nvidia.com/cudnn). - Note that cuDNN is typically installed in a different location from the - other CUDA DLLs. Ensure that you add the directory where you installed - the cuDNN DLL to your `%PATH%` environment variable. - * GPU card with CUDA Compute Capability 3.0 or higher for building - from source and 3.5 or higher for our binaries. See - [NVIDIA documentation](https://developer.nvidia.com/cuda-gpus) for a - list of supported GPU cards. - -If you have a different version of one of the preceding packages, please -change to the specified versions. In particular, the cuDNN version -must match exactly: TensorFlow will not load if it cannot find `cuDNN64_7.dll`. -To use a different version of cuDNN, you must build from source. - -## Determine how to install TensorFlow - -You must pick the mechanism by which you install TensorFlow. The -supported choices are as follows: - - * "native" pip - * Anaconda - -Native pip installs TensorFlow directly on your system without going -through a virtual environment. Since a native pip installation is not -walled-off in a separate container, the pip installation might interfere -with other Python-based installations on your system. However, if you -understand pip and your Python environment, a native pip installation -often entails only a single command! Furthermore, if you install with -native pip, users can run TensorFlow programs from any directory on -the system. - -In Anaconda, you may use conda to create a virtual environment. -However, within Anaconda, we recommend installing TensorFlow with the -`pip install` command, not with the `conda install` command. - -**NOTE:** The conda package is community supported, not officially supported. -That is, the TensorFlow team neither tests nor maintains this conda package. -Use that package at your own risk. 
- - -## Installing with native pip - -If one of the following versions of Python is not installed on your machine, -install it now: - - * [Python 3.5.x 64-bit from python.org](https://www.python.org/downloads/release/python-352/) - * [Python 3.6.x 64-bit from python.org](https://www.python.org/downloads/release/python-362/) - -TensorFlow supports Python 3.5.x and 3.6.x on Windows. -Note that Python 3 comes with the pip3 package manager, which is the -program you'll use to install TensorFlow. - -To install TensorFlow, start a terminal. Then issue the appropriate -pip3 install command in that terminal. To install the CPU-only -version of TensorFlow, enter the following command: - -
C:\> pip3 install --upgrade tensorflow
- -To install the GPU version of TensorFlow, enter the following command: - -
C:\> pip3 install --upgrade tensorflow-gpu
- -## Installing with Anaconda - -**The Anaconda installation is community supported, not officially supported.** - -Take the following steps to install TensorFlow in an Anaconda environment: - - 1. Follow the instructions on the - [Anaconda download site](https://www.continuum.io/downloads) - to download and install Anaconda. - - 2. Create a conda environment named tensorflow - by invoking the following command: - -
C:\> conda create -n tensorflow pip python=3.5 
- - 3. Activate the conda environment by issuing the following command: - -
C:\> activate tensorflow
-     (tensorflow)C:\>  # Your prompt should change 
- - 4. Issue the appropriate command to install TensorFlow inside your conda - environment. To install the CPU-only version of TensorFlow, enter the - following command: - -
(tensorflow)C:\> pip install --ignore-installed --upgrade tensorflow 
- - To install the GPU version of TensorFlow, enter the following command - (on a single line): - -
(tensorflow)C:\> pip install --ignore-installed --upgrade tensorflow-gpu 
- -## Validate your installation - -Start a terminal. - -If you installed through Anaconda, activate your Anaconda environment. - -Invoke python from your shell as follows: - -
$ python
- -Enter the following short program inside the python interactive shell: - -```python ->>> import tensorflow as tf ->>> hello = tf.constant('Hello, TensorFlow!') ->>> sess = tf.Session() ->>> print(sess.run(hello)) -``` - -If the system outputs the following, then you are ready to begin writing -TensorFlow programs: - -
Hello, TensorFlow!
- -If the system outputs an error message instead of a greeting, see [Common -installation problems](#common_installation_problems). - -To learn more, see the [TensorFlow tutorials](../tutorials/). - -## Common installation problems - -We are relying on Stack Overflow to document TensorFlow installation problems -and their remedies. The following table contains links to Stack Overflow -answers for some common installation problems. -If you encounter an error message or other -installation problem not listed in the following table, search for it -on Stack Overflow. If Stack Overflow doesn't show the error message, -ask a new question about it on Stack Overflow and specify -the `tensorflow` tag. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Stack Overflow Link Error Message
41007279 -
[...\stream_executor\dso_loader.cc] Couldn't open CUDA library nvcuda.dll
-
41007279 -
[...\stream_executor\cuda\cuda_dnn.cc] Unable to load cuDNN DSO
-
42006320
ImportError: Traceback (most recent call last):
-File "...\tensorflow\core\framework\graph_pb2.py", line 6, in 
-from google.protobuf import descriptor as _descriptor
-ImportError: cannot import name 'descriptor'
-
42011070
No module named "pywrap_tensorflow"
42217532 -
OpKernel ('op: "BestSplits" device_type: "CPU"') for unknown op: BestSplits
-
43134753 -
The TensorFlow library wasn't compiled to use SSE instructions
-
38896424 -
Could not find a version that satisfies the requirement tensorflow
-
diff --git a/tensorflow/docs_src/install/leftnav_files b/tensorflow/docs_src/install/leftnav_files deleted file mode 100644 index 59292f7121..0000000000 --- a/tensorflow/docs_src/install/leftnav_files +++ /dev/null @@ -1,18 +0,0 @@ -index.md - -### Python -install_linux.md: Ubuntu -install_mac.md: MacOS -install_windows.md: Windows -install_raspbian.md: Raspbian -install_sources.md: From source -install_sources_windows.md: From source on Windows ->>> -migration.md - -### Other Languages -install_java.md: Java -install_go.md: Go -install_c.md: C - - diff --git a/tensorflow/docs_src/install/migration.md b/tensorflow/docs_src/install/migration.md deleted file mode 100644 index 19315ace2d..0000000000 --- a/tensorflow/docs_src/install/migration.md +++ /dev/null @@ -1,336 +0,0 @@ -# Transition to TensorFlow 1.0 - - -The APIs in TensorFlow 1.0 have changed in ways that are not all backwards -compatible. That is, TensorFlow programs that worked on TensorFlow 0.n won't -necessarily work on TensorFlow 1.0. We have made these API changes to ensure an -internally-consistent API, and do not plan to make backwards-breaking changes -throughout the 1.N lifecycle. - -This guide walks you through the major changes in the API and how to -automatically upgrade your programs for TensorFlow 1.0. This guide not -only steps you through the changes but also explains why we've made them. - -## How to upgrade - -If you would like to automatically port your code to 1.0, you can try our -`tf_upgrade.py` script. While this script handles many cases, manual changes -are sometimes necessary. - Get this script from our -[GitHub tree](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/tools/compatibility). - -To convert a single 0.n TensorFlow source file to 1.0, enter a -command of the following format: - -
-$ python tf_upgrade.py --infile InputFile --outfile OutputFile
-
- -For example, the following command converts a 0.n TensorFlow -program named `test.py` to a 1.0 TensorFlow program named `test_1.0.py`: - -
-$ python tf_upgrade.py --infile test.py --outfile test_1.0.py
-
- -The `tf_upgrade.py` script also generates a file named `report.txt`, which -details all the changes it performed and makes additional suggestions about -changes you might need to make manually. - -To upgrade a whole directory of 0.n TensorFlow programs to 1.0, -enter a command having the following format: - -
-$ python tf_upgrade.py --intree InputDir --outtree OutputDir
-
- -For example, the following command converts all the 0.n TensorFlow programs -in the `/home/user/cool` directory, creating their 1.0 equivalents in -the `/home/user/cool_1.0` directory: - -
-$ python tf_upgrade.py --intree /home/user/cool --outtree /home/user/cool_1.0
-
- -### Limitations - -There are a few things to watch out for. Specifically: - - * You must manually fix any instances of `tf.reverse()`. - The `tf_upgrade.py` script will warn you about `tf.reverse()` in - stdout and in the `report.txt` file. - * On reordered arguments, `tf_upgrade.py` tries to minimally reformat - your code, so it cannot automatically change the actual argument order. - Instead, `tf_upgrade.py` makes your function invocations order-independent - by introducing keyword arguments. - * Constructions like `tf.get_variable_scope().reuse_variables()` - will likely not work. We recommend deleting those lines and replacing - them with lines such as the following: - -
-   with tf.variable_scope(tf.get_variable_scope(), reuse=True):
-     ...
-   
- - * Analogously to `tf.pack` and `tf.unpack`, we've renamed - `TensorArray.pack` and `TensorArray.unpack` to - `TensorArray.stack` and `TensorArray.unstack`. However, `TensorArray.pack` - and `TensorArray.unpack` cannot be detected lexically since they are - indirectly related to the `tf` namespace, e.g. - `foo = tf.TensorArray(); foo.unpack()` - -## Upgrading your code manually - -Instead of running `tf_upgrade.py`, you may manually upgrade your code. -The remainder of this document provides a comprehensive list of -all backward incompatible changes made in TensorFlow 1.0. - - -### Variables - -Variable functions have been made more consistent and less confusing. - -* `tf.VARIABLES` - * should be renamed to `tf.GLOBAL_VARIABLES` -* `tf.all_variables` - * should be renamed to `tf.global_variables` -* `tf.initialize_all_variables` - * should be renamed to `tf.global_variables_initializer` -* `tf.initialize_local_variables` - * should be renamed to `tf.local_variables_initializer` -* `tf.initialize_variables` - * should be renamed to `tf.variables_initializer` - -### Summary functions - -Summary functions have been consolidated under the `tf.summary` namespace. - -* `tf.audio_summary` - * should be renamed to `tf.summary.audio` -* `tf.contrib.deprecated.histogram_summary` - * should be renamed to `tf.summary.histogram` -* `tf.contrib.deprecated.scalar_summary` - * should be renamed to `tf.summary.scalar` -* `tf.histogram_summary` - * should be renamed to `tf.summary.histogram` -* `tf.image_summary` - * should be renamed to `tf.summary.image` -* `tf.merge_all_summaries` - * should be renamed to `tf.summary.merge_all` -* `tf.merge_summary` - * should be renamed to `tf.summary.merge` -* `tf.scalar_summary` - * should be renamed to `tf.summary.scalar` -* `tf.train.SummaryWriter` - * should be renamed to `tf.summary.FileWriter` - -### Numeric differences - - -Integer division and `tf.floordiv` now use flooring semantics. 
This is to -make the results of `np.divide` and `np.mod` consistent with `tf.divide` and -`tf.mod`, respectively. In addition we have changed the rounding algorithm -used by `tf.round` to match NumPy. - - -* `tf.div` - - * The semantics of `tf.divide` division have been changed to match Python -semantics completely. That is, `/` in Python 3 and future division mode in -Python 2 will produce floating point numbers always, `//` will produce floored -division. However, even `tf.div` will produce floored integer division. -To force C-style truncation semantics, you must use `tf.truncatediv`. - - * Consider changing your code to use `tf.divide`, which follows Python semantics for promotion. - -* `tf.mod` - - * The semantics of `tf.mod` have been changed to match Python semantics. In -particular, flooring semantics are used for integers. If you wish to have -C-style truncation mod (remainders), you can use `tf.truncatemod` - - -The old and new behavior of division can be summarized with this table: - -| Expr | TF 0.11 (py2) | TF 0.11 (py3) | TF 1.0 (py2) | TF 1.0 (py3) | -|---------------------|---------------|---------------|--------------|--------------| -| tf.div(3,4) | 0 | 0 | 0 | 0 | -| tf.div(-3,4) | 0 | 0 | -1 | -1 | -| tf.mod(-3,4) | -3 | -3 | 1 | 1 | -| -3/4 | 0 | -0.75 | -1 | -0.75 | -| -3/4tf.divide(-3,4) | N/A | N/A | -0.75 | -1 | - -The old and new behavior of rounding can be summarized with this table: - -| Input | Python | NumPy | C++ round() | TensorFlow 0.11(floor(x+.5)) | TensorFlow 1.0 | -|-------|--------|-------|-------------|------------------------------|----------------| -| -3.5 | -4 | -4 | -4 | -3 | -4 | -| -2.5 | -2 | -2 | -3 | -2 | -2 | -| -1.5 | -2 | -2 | -2 | -1 | -2 | -| -0.5 | 0 | 0 | -1 | 0 | 0 | -| 0.5 | 0 | 0 | 1 | 1 | 0 | -| 1.5 | 2 | 2 | 2 | 2 | 2 | -| 2.5 | 2 | 2 | 3 | 3 | 2 | -| 3.5 | 4 | 4 | 4 | 4 | 4 | - - - -### NumPy matching names - - -Many functions have been renamed to match NumPy. 
This was done to make the -transition between NumPy and TensorFlow as easy as possible. There are still -numerous cases where functions do not match, so this is far from a hard and -fast rule, but we have removed several commonly noticed inconsistencies. - -* `tf.inv` - * should be renamed to `tf.reciprocal` - * This was done to avoid confusion with NumPy's matrix inverse `np.inv` -* `tf.list_diff` - * should be renamed to `tf.setdiff1d` -* `tf.listdiff` - * should be renamed to `tf.setdiff1d` -* `tf.mul` - * should be renamed to `tf.multiply` -* `tf.neg` - * should be renamed to `tf.negative` -* `tf.select` - * should be renamed to `tf.where` - * `tf.where` now takes 3 arguments or 1 argument, just like `np.where` -* `tf.sub` - * should be renamed to `tf.subtract` - -### NumPy matching arguments - -Arguments for certain TensorFlow 1.0 methods now match arguments in certain -NumPy methods. To achieve this, TensorFlow 1.0 has changed keyword arguments -and reordered some arguments. Notably, TensorFlow 1.0 now uses `axis` rather -than `dimension`. TensorFlow 1.0 aims to keep the tensor argument first on -operations that modify Tensors. (see the `tf.concat` change). - - -* `tf.argmax` - * keyword argument `dimension` should be renamed to `axis` -* `tf.argmin` - * keyword argument `dimension` should be renamed to `axis` -* `tf.concat` - * keyword argument `concat_dim` should be renamed to `axis` - * arguments have been reordered to `tf.concat(values, axis, name='concat')`. 
-* `tf.count_nonzero` - * keyword argument `reduction_indices` should be renamed to `axis` -* `tf.expand_dims` - * keyword argument `dim` should be renamed to `axis` -* `tf.reduce_all` - * keyword argument `reduction_indices` should be renamed to `axis` -* `tf.reduce_any` - * keyword argument `reduction_indices` should be renamed to `axis` -* `tf.reduce_join` - * keyword argument `reduction_indices` should be renamed to `axis` -* `tf.reduce_logsumexp` - * keyword argument `reduction_indices` should be renamed to `axis` -* `tf.reduce_max` - * keyword argument `reduction_indices` should be renamed to `axis` -* `tf.reduce_mean` - * keyword argument `reduction_indices` should be renamed to `axis` -* `tf.reduce_min` - * keyword argument `reduction_indices` should be renamed to `axis` -* `tf.reduce_prod` - * keyword argument `reduction_indices` should be renamed to `axis` -* `tf.reduce_sum` - * keyword argument `reduction_indices` should be renamed to `axis` -* `tf.reverse` - * `tf.reverse` used to take a 1D `bool` tensor to control which dimensions were reversed. Now we use a Tensor of axis indices. - * For example `tf.reverse(a, [True, False, True])` now must be `tf.reverse(a, [0, 2])` -* `tf.reverse_sequence` - * keyword argument `batch_dim` should be renamed to `batch_axis` - * keyword argument `seq_dim` should be renamed to `seq_axis` -* `tf.sparse_concat` - * keyword argument `concat_dim` should be renamed to `axis` -* `tf.sparse_reduce_sum` - * keyword argument `reduction_axes` should be renamed to `axis` -* `tf.sparse_reduce_sum_sparse` - * keyword argument `reduction_axes` should be renamed to `axis` -* `tf.sparse_split` - * keyword argument `split_dim` should be renamed to `axis` - * arguments have been reordered to `tf.sparse_split(keyword_required=KeywordRequired(), sp_input=None, num_split=None, axis=None, name=None, split_dim=None)`. 
-* `tf.split` - * keyword argument `split_dim` should be renamed to `axis` - * keyword argument `num_split` should be renamed to `num_or_size_splits` - * arguments have been reordered to `tf.split(value, num_or_size_splits, axis=0, num=None, name='split')`. -* `tf.squeeze` - * keyword argument `squeeze_dims` should be renamed to `axis` -* `tf.svd` - * arguments have been reordered to `tf.svd(tensor, full_matrices=False, compute_uv=True, name=None)`. - -### Simplified math variants - -Batched versions of math operations have been removed. Now the functionality is -contained in the non-batched versions. Similarly, `tf.complex_abs` has had its -functionality moved to `tf.abs`. - -* `tf.batch_band_part` - * should be renamed to `tf.band_part` -* `tf.batch_cholesky` - * should be renamed to `tf.cholesky` -* `tf.batch_cholesky_solve` - * should be renamed to `tf.cholesky_solve` -* `tf.batch_fft` - * should be renamed to `tf.fft` -* `tf.batch_fft3d` - * should be renamed to `tf.fft3d` -* `tf.batch_ifft` - * should be renamed to `tf.ifft` -* `tf.batch_ifft2d` - * should be renamed to `tf.ifft2d` -* `tf.batch_ifft3d` - * should be renamed to `tf.ifft3d` -* `tf.batch_matmul` - * should be renamed to `tf.matmul` -* `tf.batch_matrix_determinant` - * should be renamed to `tf.matrix_determinant` -* `tf.batch_matrix_diag` - * should be renamed to `tf.matrix_diag` -* `tf.batch_matrix_inverse` - * should be renamed to `tf.matrix_inverse` -* `tf.batch_matrix_solve` - * should be renamed to `tf.matrix_solve` -* `tf.batch_matrix_solve_ls` - * should be renamed to `tf.matrix_solve_ls` -* `tf.batch_matrix_transpose` - * should be renamed to `tf.matrix_transpose` -* `tf.batch_matrix_triangular_solve` - * should be renamed to `tf.matrix_triangular_solve` -* `tf.batch_self_adjoint_eig` - * should be renamed to `tf.self_adjoint_eig` -* `tf.batch_self_adjoint_eigvals` - * should be renamed to `tf.self_adjoint_eigvals` -* `tf.batch_set_diag` - * should be renamed to `tf.set_diag` -* 
`tf.batch_svd` - * should be renamed to `tf.svd` -* `tf.complex_abs` - * should be renamed to `tf.abs` - -### Misc Changes - -Several other changes have been made, including the following: - -* `tf.image.per_image_whitening` - * should be renamed to `tf.image.per_image_standardization` -* `tf.nn.sigmoid_cross_entropy_with_logits` - * arguments have been reordered to `tf.nn.sigmoid_cross_entropy_with_logits(_sentinel=None, labels=None, logits=None, name=None)`. -* `tf.nn.softmax_cross_entropy_with_logits` - * arguments have been reordered to `tf.nn.softmax_cross_entropy_with_logits(_sentinel=None, labels=None, logits=None, dim=-1, name=None)`. -* `tf.nn.sparse_softmax_cross_entropy_with_logits` - * arguments have been reordered to `tf.nn.sparse_softmax_cross_entropy_with_logits(_sentinel=None, labels=None, logits=None, name=None)`. -* `tf.ones_initializer` - * should be changed to a function call i.e. `tf.ones_initializer()` -* `tf.pack` - * should be renamed to `tf.stack` -* `tf.round` - * The semantics of `tf.round` now match Banker's rounding. -* `tf.unpack` - * should be renamed to `tf.unstack` -* `tf.zeros_initializer` - * should be changed to a function call i.e. `tf.zeros_initializer()` - diff --git a/tensorflow/docs_src/mobile/README.md b/tensorflow/docs_src/mobile/README.md deleted file mode 100644 index ecf4267265..0000000000 --- a/tensorflow/docs_src/mobile/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# TF Lite subsite - -This subsite directory lives in [tensorflow/contrib/lite/g3doc](../../contrib/lite/g3doc/). diff --git a/tensorflow/docs_src/performance/benchmarks.md b/tensorflow/docs_src/performance/benchmarks.md deleted file mode 100644 index a5fa551dd4..0000000000 --- a/tensorflow/docs_src/performance/benchmarks.md +++ /dev/null @@ -1,412 +0,0 @@ -# Benchmarks - -## Overview - -A selection of image classification models were tested across multiple platforms -to create a point of reference for the TensorFlow community. 
The -[Methodology](#methodology) section details how the tests were executed and has -links to the scripts used. - -## Results for image classification models - -InceptionV3 ([arXiv:1512.00567](https://arxiv.org/abs/1512.00567)), ResNet-50 -([arXiv:1512.03385](https://arxiv.org/abs/1512.03385)), ResNet-152 -([arXiv:1512.03385](https://arxiv.org/abs/1512.03385)), VGG16 -([arXiv:1409.1556](https://arxiv.org/abs/1409.1556)), and -[AlexNet](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf) -were tested using the [ImageNet](http://www.image-net.org/) data set. Tests were -run on Google Compute Engine, Amazon Elastic Compute Cloud (Amazon EC2), and an -NVIDIA® DGX-1™. Most of the tests were run with both synthetic and real data. -Testing with synthetic data was done by using a `tf.Variable` set to the same -shape as the data expected by each model for ImageNet. We believe it is -important to include real data measurements when benchmarking a platform. This -load tests both the underlying hardware and the framework at preparing data for -actual training. We start with synthetic data to remove disk I/O as a variable -and to set a baseline. Real data is then used to verify that the TensorFlow -input pipeline and the underlying disk I/O are saturating the compute units. - -### Training with NVIDIA® DGX-1™ (NVIDIA® Tesla® P100) - -
- -
- -Details and additional results are in the [Details for NVIDIA® DGX-1™ (NVIDIA® -Tesla® P100)](#details_for_nvidia_dgx-1tm_nvidia_tesla_p100) section. - -### Training with NVIDIA® Tesla® K80 - -
- -
- -Details and additional results are in the [Details for Google Compute Engine -(NVIDIA® Tesla® K80)](#details_for_google_compute_engine_nvidia_tesla_k80) and -[Details for Amazon EC2 (NVIDIA® Tesla® -K80)](#details_for_amazon_ec2_nvidia_tesla_k80) sections. - -### Distributed training with NVIDIA® Tesla® K80 - -
- -
- -Details and additional results are in the [Details for Amazon EC2 Distributed -(NVIDIA® Tesla® K80)](#details_for_amazon_ec2_distributed_nvidia_tesla_k80) -section. - -### Compare synthetic with real data training - -**NVIDIA® Tesla® P100** - -
- - -
- -**NVIDIA® Tesla® K80** - -
- - -
- -## Details for NVIDIA® DGX-1™ (NVIDIA® Tesla® P100) - -### Environment - -* **Instance type**: NVIDIA® DGX-1™ -* **GPU:** 8x NVIDIA® Tesla® P100 -* **OS:** Ubuntu 16.04 LTS with tests run via Docker -* **CUDA / cuDNN:** 8.0 / 5.1 -* **TensorFlow GitHub hash:** b1e174e -* **Benchmark GitHub hash:** 9165a70 -* **Build Command:** `bazel build -c opt --copt=-march="haswell" --config=cuda - //tensorflow/tools/pip_package:build_pip_package` -* **Disk:** Local SSD -* **DataSet:** ImageNet -* **Test Date:** May 2017 - -Batch size and optimizer used for each model are listed in the table below. In -addition to the batch sizes listed in the table, InceptionV3, ResNet-50, -ResNet-152, and VGG16 were tested with a batch size of 32. Those results are in -the *other results* section. - -Options | InceptionV3 | ResNet-50 | ResNet-152 | AlexNet | VGG16 ------------------- | ----------- | --------- | ---------- | ------- | ----- -Batch size per GPU | 64 | 64 | 64 | 512 | 64 -Optimizer | sgd | sgd | sgd | sgd | sgd - -Configuration used for each model. - -Model | variable_update | local_parameter_device ------------ | ---------------------- | ---------------------- -InceptionV3 | parameter_server | cpu -ResNet50 | parameter_server | cpu -ResNet152 | parameter_server | cpu -AlexNet | replicated (with NCCL) | n/a -VGG16 | replicated (with NCCL) | n/a - -### Results - -
- -
- -
- - -
- -**Training synthetic data** - -GPUs | InceptionV3 | ResNet-50 | ResNet-152 | AlexNet | VGG16 ----- | ----------- | --------- | ---------- | ------- | ----- -1 | 142 | 219 | 91.8 | 2987 | 154 -2 | 284 | 422 | 181 | 5658 | 295 -4 | 569 | 852 | 356 | 10509 | 584 -8 | 1131 | 1734 | 716 | 17822 | 1081 - -**Training real data** - -GPUs | InceptionV3 | ResNet-50 | ResNet-152 | AlexNet | VGG16 ----- | ----------- | --------- | ---------- | ------- | ----- -1 | 142 | 218 | 91.4 | 2890 | 154 -2 | 278 | 425 | 179 | 4448 | 284 -4 | 551 | 853 | 359 | 7105 | 534 -8 | 1079 | 1630 | 708 | N/A | 898 - -Training AlexNet with real data on 8 GPUs was excluded from the graph and table -above due to it maxing out the input pipeline. - -### Other Results - -The results below are all with a batch size of 32. - -**Training synthetic data** - -GPUs | InceptionV3 | ResNet-50 | ResNet-152 | VGG16 ----- | ----------- | --------- | ---------- | ----- -1 | 128 | 195 | 82.7 | 144 -2 | 259 | 368 | 160 | 281 -4 | 520 | 768 | 317 | 549 -8 | 995 | 1485 | 632 | 820 - -**Training real data** - -GPUs | InceptionV3 | ResNet-50 | ResNet-152 | VGG16 ----- | ----------- | --------- | ---------- | ----- -1 | 130 | 193 | 82.4 | 144 -2 | 257 | 369 | 159 | 253 -4 | 507 | 760 | 317 | 457 -8 | 966 | 1410 | 609 | 690 - -## Details for Google Compute Engine (NVIDIA® Tesla® K80) - -### Environment - -* **Instance type**: n1-standard-32-k80x8 -* **GPU:** 8x NVIDIA® Tesla® K80 -* **OS:** Ubuntu 16.04 LTS -* **CUDA / cuDNN:** 8.0 / 5.1 -* **TensorFlow GitHub hash:** b1e174e -* **Benchmark GitHub hash:** 9165a70 -* **Build Command:** `bazel build -c opt --copt=-march="haswell" --config=cuda - //tensorflow/tools/pip_package:build_pip_package` -* **Disk:** 1.7 TB Shared SSD persistent disk (800 MB/s) -* **DataSet:** ImageNet -* **Test Date:** May 2017 - -Batch size and optimizer used for each model are listed in the table below. 
In -addition to the batch sizes listed in the table, InceptionV3 and ResNet-50 were -tested with a batch size of 32. Those results are in the *other results* -section. - -Options | InceptionV3 | ResNet-50 | ResNet-152 | AlexNet | VGG16 ------------------- | ----------- | --------- | ---------- | ------- | ----- -Batch size per GPU | 64 | 64 | 32 | 512 | 32 -Optimizer | sgd | sgd | sgd | sgd | sgd - -The configuration used for each model was `variable_update` equal to -`parameter_server` and `local_parameter_device` equal to `cpu`. - -### Results - -
- - -
- -**Training synthetic data** - -GPUs | InceptionV3 | ResNet-50 | ResNet-152 | AlexNet | VGG16 ----- | ----------- | --------- | ---------- | ------- | ----- -1 | 30.5 | 51.9 | 20.0 | 656 | 35.4 -2 | 57.8 | 99.0 | 38.2 | 1209 | 64.8 -4 | 116 | 195 | 75.8 | 2328 | 120 -8 | 227 | 387 | 148 | 4640 | 234 - -**Training real data** - -GPUs | InceptionV3 | ResNet-50 | ResNet-152 | AlexNet | VGG16 ----- | ----------- | --------- | ---------- | ------- | ----- -1 | 30.6 | 51.2 | 20.0 | 639 | 34.2 -2 | 58.4 | 98.8 | 38.3 | 1136 | 62.9 -4 | 115 | 194 | 75.4 | 2067 | 118 -8 | 225 | 381 | 148 | 4056 | 230 - -### Other Results - -**Training synthetic data** - -GPUs | InceptionV3 (batch size 32) | ResNet-50 (batch size 32) ----- | --------------------------- | ------------------------- -1 | 29.3 | 49.5 -2 | 55.0 | 95.4 -4 | 109 | 183 -8 | 216 | 362 - -**Training real data** - -GPUs | InceptionV3 (batch size 32) | ResNet-50 (batch size 32) ----- | --------------------------- | ------------------------- -1 | 29.5 | 49.3 -2 | 55.4 | 95.3 -4 | 110 | 186 -8 | 216 | 359 - -## Details for Amazon EC2 (NVIDIA® Tesla® K80) - -### Environment - -* **Instance type**: p2.8xlarge -* **GPU:** 8x NVIDIA® Tesla® K80 -* **OS:** Ubuntu 16.04 LTS -* **CUDA / cuDNN:** 8.0 / 5.1 -* **TensorFlow GitHub hash:** b1e174e -* **Benchmark GitHub hash:** 9165a70 -* **Build Command:** `bazel build -c opt --copt=-march="haswell" --config=cuda - //tensorflow/tools/pip_package:build_pip_package` -* **Disk:** 1TB Amazon EFS (burst 100 MiB/sec for 12 hours, continuous 50 - MiB/sec) -* **DataSet:** ImageNet -* **Test Date:** May 2017 - -Batch size and optimizer used for each model are listed in the table below. In -addition to the batch sizes listed in the table, InceptionV3 and ResNet-50 were -tested with a batch size of 32. Those results are in the *other results* -section. 
- -Options | InceptionV3 | ResNet-50 | ResNet-152 | AlexNet | VGG16 ------------------- | ----------- | --------- | ---------- | ------- | ----- -Batch size per GPU | 64 | 64 | 32 | 512 | 32 -Optimizer | sgd | sgd | sgd | sgd | sgd - -Configuration used for each model. - -Model | variable_update | local_parameter_device ------------ | ------------------------- | ---------------------- -InceptionV3 | parameter_server | cpu -ResNet-50 | replicated (without NCCL) | gpu -ResNet-152 | replicated (without NCCL) | gpu -AlexNet | parameter_server | gpu -VGG16 | parameter_server | gpu - -### Results - -
- - -
- -**Training synthetic data** - -GPUs | InceptionV3 | ResNet-50 | ResNet-152 | AlexNet | VGG16 ----- | ----------- | --------- | ---------- | ------- | ----- -1 | 30.8 | 51.5 | 19.7 | 684 | 36.3 -2 | 58.7 | 98.0 | 37.6 | 1244 | 69.4 -4 | 117 | 195 | 74.9 | 2479 | 141 -8 | 230 | 384 | 149 | 4853 | 260 - -**Training real data** - -GPUs | InceptionV3 | ResNet-50 | ResNet-152 | AlexNet | VGG16 ----- | ----------- | --------- | ---------- | ------- | ----- -1 | 30.5 | 51.3 | 19.7 | 674 | 36.3 -2 | 59.0 | 94.9 | 38.2 | 1227 | 67.5 -4 | 118 | 188 | 75.2 | 2201 | 136 -8 | 228 | 373 | 149 | N/A | 242 - -Training AlexNet with real data on 8 GPUs was excluded from the graph and table -above due to our EFS setup not providing enough throughput. - -### Other Results - -**Training synthetic data** - -GPUs | InceptionV3 (batch size 32) | ResNet-50 (batch size 32) ----- | --------------------------- | ------------------------- -1 | 29.9 | 49.0 -2 | 57.5 | 94.1 -4 | 114 | 184 -8 | 216 | 355 - -**Training real data** - -GPUs | InceptionV3 (batch size 32) | ResNet-50 (batch size 32) ----- | --------------------------- | ------------------------- -1 | 30.0 | 49.1 -2 | 57.5 | 95.1 -4 | 113 | 185 -8 | 212 | 353 - -## Details for Amazon EC2 Distributed (NVIDIA® Tesla® K80) - -### Environment - -* **Instance type**: p2.8xlarge -* **GPU:** 8x NVIDIA® Tesla® K80 -* **OS:** Ubuntu 16.04 LTS -* **CUDA / cuDNN:** 8.0 / 5.1 -* **TensorFlow GitHub hash:** b1e174e -* **Benchmark GitHub hash:** 9165a70 -* **Build Command:** `bazel build -c opt --copt=-march="haswell" --config=cuda - //tensorflow/tools/pip_package:build_pip_package` -* **Disk:** 1.0 TB EFS (burst 100 MB/sec for 12 hours, continuous 50 MB/sec) -* **DataSet:** ImageNet -* **Test Date:** May 2017 - -The batch size and optimizer used for the tests are listed in the table. In -addition to the batch sizes listed in the table, InceptionV3 and ResNet-50 were -tested with a batch size of 32. 
Those results are in the *other results* -section. - -Options | InceptionV3 | ResNet-50 | ResNet-152 ------------------- | ----------- | --------- | ---------- -Batch size per GPU | 64 | 64 | 32 -Optimizer | sgd | sgd | sgd - -Configuration used for each model. - -Model | variable_update | local_parameter_device | cross_replica_sync ------------ | ---------------------- | ---------------------- | ------------------ -InceptionV3 | distributed_replicated | n/a | True -ResNet-50 | distributed_replicated | n/a | True -ResNet-152 | distributed_replicated | n/a | True - -To simplify server setup, EC2 instances (p2.8xlarge) running worker servers also -ran parameter servers. Equal numbers of parameter servers and worker servers were -used with the following exceptions: - -* InceptionV3: 8 instances / 6 parameter servers -* ResNet-50: (batch size 32) 8 instances / 4 parameter servers -* ResNet-152: 8 instances / 4 parameter servers - -### Results - -
- -
- -
- -
- -**Training synthetic data** - -GPUs | InceptionV3 | ResNet-50 | ResNet-152 ----- | ----------- | --------- | ---------- -1 | 29.7 | 52.4 | 19.4 -8 | 229 | 378 | 146 -16 | 459 | 751 | 291 -32 | 902 | 1388 | 565 -64 | 1783 | 2744 | 981 - -### Other Results - -
- -
- -**Training synthetic data** - -GPUs | InceptionV3 (batch size 32) | ResNet-50 (batch size 32) ----- | --------------------------- | ------------------------- -1 | 29.2 | 48.4 -8 | 219 | 333 -16 | 427 | 667 -32 | 820 | 1180 -64 | 1608 | 2315 - -## Methodology - -This -[script](https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks) -was run on the various platforms to generate the above results. - -In order to create results that are as repeatable as possible, each test was run -5 times and then the times were averaged together. GPUs are run in their default -state on the given platform. For NVIDIA® Tesla® K80 this means leaving on [GPU -Boost](https://devblogs.nvidia.com/parallelforall/increase-performance-gpu-boost-k80-autoboost/). -For each test, 10 warmup steps are done and then the next 100 steps are -averaged. diff --git a/tensorflow/docs_src/performance/datasets_performance.md b/tensorflow/docs_src/performance/datasets_performance.md deleted file mode 100644 index 5d9e4ba392..0000000000 --- a/tensorflow/docs_src/performance/datasets_performance.md +++ /dev/null @@ -1,331 +0,0 @@ -# Input Pipeline Performance Guide - -GPUs and TPUs can radically reduce the time required to execute a single -training step. Achieving peak performance requires an efficient input pipeline -that delivers data for the next step before the current step has finished. The -`tf.data` API helps to build flexible and efficient input pipelines. This -document explains the `tf.data` API's features and best practices for building -high performance TensorFlow input pipelines across a variety of models and -accelerators. - -This guide does the following: - -* Illustrates that TensorFlow input pipelines are essentially an - [ETL](https://en.wikipedia.org/wiki/Extract,_transform,_load) process. -* Describes common performance optimizations in the context of the `tf.data` - API. -* Discusses the performance implications of the order in which you apply - transformations. 
-* Summarizes the best practices for designing performant TensorFlow input - pipelines. - - -## Input Pipeline Structure - -A typical TensorFlow training input pipeline can be framed as an ETL process: - -1. **Extract**: Read data from persistent storage -- either local (e.g. HDD or - SSD) or remote (e.g. [GCS](https://cloud.google.com/storage/) or - [HDFS](https://en.wikipedia.org/wiki/Apache_Hadoop#Hadoop_distributed_file_system)). -2. **Transform**: Use CPU cores to parse and perform preprocessing operations - on the data such as image decompression, data augmentation transformations - (such as random crop, flips, and color distortions), shuffling, and batching. -3. **Load**: Load the transformed data onto the accelerator device(s) (for - example, GPU(s) or TPU(s)) that execute the machine learning model. - -This pattern effectively utilizes the CPU, while reserving the accelerator for -the heavy lifting of training your model. In addition, viewing input pipelines -as an ETL process provides structure that facilitates the application of -performance optimizations. - -When using the `tf.estimator.Estimator` API, the first two phases (Extract and -Transform) are captured in the `input_fn` passed to -`tf.estimator.Estimator.train`. In code, this might look like the following -(naive, sequential) implementation: - -``` -def parse_fn(example): - "Parse TFExample records and perform simple data augmentation." 
- example_fmt = { - "image": tf.FixedLenFeature((), tf.string, ""), - "label": tf.FixedLenFeature((), tf.int64, -1) - }
- -Without pipelining, the CPU and the GPU/TPU sit idle much of the time: - -![without pipelining](/images/datasets_without_pipelining.png) - -With pipelining, idle time diminishes significantly: - -![with pipelining](/images/datasets_with_pipelining.png) - -The `tf.data` API provides a software pipelining mechanism through the -`tf.data.Dataset.prefetch` transformation, which can be used to decouple the -time data is produced from the time it is consumed. In particular, the -transformation uses a background thread and an internal buffer to prefetch -elements from the input dataset ahead of the time they are requested. Thus, to -achieve the pipelining effect illustrated above, you can add `prefetch(1)` as -the final transformation to your dataset pipeline (or `prefetch(n)` if a single -training step consumes n elements). - -To apply this change to our running example, change: - -``` -dataset = dataset.batch(batch_size=FLAGS.batch_size) -return dataset -``` - -to: - - -``` -dataset = dataset.batch(batch_size=FLAGS.batch_size) -dataset = dataset.prefetch(buffer_size=FLAGS.prefetch_buffer_size) -return dataset -``` - -Note that the prefetch transformation will yield benefits any time there is an -opportunity to overlap the work of a "producer" with the work of a "consumer." -The preceding recommendation is simply the most common application. - -### Parallelize Data Transformation - -When preparing a batch, input elements may need to be pre-processed. To this -end, the `tf.data` API offers the `tf.data.Dataset.map` transformation, which -applies a user-defined function (for example, `parse_fn` from the running -example) to each element of the input dataset. Because input elements are -independent of one another, the pre-processing can be parallelized across -multiple CPU cores. To make this possible, the `map` transformation provides the -`num_parallel_calls` argument to specify the level of parallelism. 
For example, -the following diagram illustrates the effect of setting `num_parallel_calls=2` -to the `map` transformation: - -![parallel map](/images/datasets_parallel_map.png) - -Choosing the best value for the `num_parallel_calls` argument depends on your -hardware, characteristics of your training data (such as its size and shape), -the cost of your map function, and what other processing is happening on the -CPU at the same time; a simple heuristic is to use the number of available CPU -cores. For instance, if the machine executing the example above had 4 cores, it -would have been more efficient to set `num_parallel_calls=4`. On the other hand, -setting `num_parallel_calls` to a value much greater than the number of -available CPUs can lead to inefficient scheduling, resulting in a slowdown. - -To apply this change to our running example, change: - -``` -dataset = dataset.map(map_func=parse_fn) -``` - -to: - -``` -dataset = dataset.map(map_func=parse_fn, num_parallel_calls=FLAGS.num_parallel_calls) -``` - -Furthermore, if your batch size is in the hundreds or thousands, your pipeline -will likely additionally benefit from parallelizing the batch creation. To this -end, the `tf.data` API provides the `tf.contrib.data.map_and_batch` -transformation, which effectively "fuses" the map and batch transformations. - -To apply this change to our running example, change: - -``` -dataset = dataset.map(map_func=parse_fn, num_parallel_calls=FLAGS.num_parallel_calls) -dataset = dataset.batch(batch_size=FLAGS.batch_size) -``` - -to: - -``` -dataset = dataset.apply(tf.contrib.data.map_and_batch( - map_func=parse_fn, batch_size=FLAGS.batch_size)) -``` - -### Parallelize Data Extraction - -In a real-world setting, the input data may be stored remotely (for example, -GCS or HDFS), either because the input data would not fit locally or because the -training is distributed and it would not make sense to replicate the input data -on every machine. 
A dataset pipeline that works well when reading data locally -might become bottlenecked on I/O when reading data remotely because of the -following differences between local and remote storage: - - -* **Time-to-first-byte:** Reading the first byte of a file from remote storage - can take orders of magnitude longer than from local storage. -* **Read throughput:** While remote storage typically offers large aggregate - bandwidth, reading a single file might only be able to utilize a small - fraction of this bandwidth. - -In addition, once the raw bytes are read into memory, it may also be necessary -to deserialize or decrypt the data -(e.g. [protobuf](https://developers.google.com/protocol-buffers/)), which adds -additional overhead. This overhead is present irrespective of whether the data -is stored locally or remotely, but can be worse in the remote case if data is -not prefetched effectively. - -To mitigate the impact of the various data extraction overheads, the `tf.data` -API offers the `tf.contrib.data.parallel_interleave` transformation. Use this -transformation to parallelize the execution of and interleave the contents of -other datasets (such as data file readers). The -number of datasets to overlap can be specified by the `cycle_length` argument. - -The following diagram illustrates the effect of supplying `cycle_length=2` to -the `parallel_interleave` transformation: - -![parallel io](/images/datasets_parallel_io.png) - -To apply this change to our running example, change: - -``` -dataset = files.interleave(tf.data.TFRecordDataset) -``` - -to: - -``` -dataset = files.apply(tf.contrib.data.parallel_interleave( - tf.data.TFRecordDataset, cycle_length=FLAGS.num_parallel_readers)) -``` - - -The throughput of remote storage systems can vary over time due to load or -network events. To account for this variance, the `parallel_interleave` -transformation can optionally use prefetching. (See -`tf.contrib.data.parallel_interleave` for details). 
- -By default, the `parallel_interleave` transformation provides a deterministic -ordering of elements to aid reproducibility. As an alternative to prefetching -(which may be ineffective in some cases), the `parallel_interleave` -transformation also provides an option that can boost performance at the expense -of ordering guarantees. In particular, if the `sloppy` argument is set to true, -the transformation may depart from its otherwise deterministic ordering, by -temporarily skipping over files whose elements are not available when the next -element is requested. - -## Performance Considerations - -The `tf.data` API is designed around composable transformations to provide its -users with flexibility. Although many of these transformations are commutative, -the ordering of certain transformations has performance implications. - -### Map and Batch - -Invoking the user-defined function passed into the `map` transformation has -overhead related to scheduling and executing the user-defined function. -Normally, this overhead is small compared to the amount of computation performed -by the function. However, if `map` does little work, this overhead can dominate -the total cost. In such cases, we recommend vectorizing the user-defined -function (that is, have it operate over a batch of inputs at once) and apply the -`batch` transformation _before_ the `map` transformation. - -### Map and Cache - -The `tf.data.Dataset.cache` transformation can cache a dataset, either in -memory or on local storage. If the user-defined function passed into the `map` -transformation is expensive, apply the cache transformation after the map -transformation as long as the resulting dataset can still fit into memory or -local storage. If the user-defined function increases the space required to -store the dataset beyond the cache capacity, consider pre-processing your data -before your training job to reduce resource usage. 
- -### Map and Interleave / Prefetch / Shuffle - -A number of transformations, including `interleave`, `prefetch`, and `shuffle`, -maintain an internal buffer of elements. If the user-defined function passed -into the `map` transformation changes the size of the elements, then the -ordering of the map transformation and the transformations that buffer elements -affects the memory usage. In general, we recommend choosing the order that -results in lower memory footprint, unless different ordering is desirable for -performance (for example, to enable fusing of the map and batch transformations). - -### Repeat and Shuffle - -The `tf.data.Dataset.repeat` transformation repeats the input data a finite (or -infinite) number of times; each repetition of the data is typically referred to -as an _epoch_. The `tf.data.Dataset.shuffle` transformation randomizes the -order of the dataset's examples. - -If the `repeat` transformation is applied before the `shuffle` transformation, -then the epoch boundaries are blurred. That is, certain elements can be repeated -before other elements appear even once. On the other hand, if the `shuffle` -transformation is applied before the repeat transformation, then performance -might slow down at the beginning of each epoch related to initialization of the -internal state of the `shuffle` transformation. In other words, the former -(`repeat` before `shuffle`) provides better performance, while the latter -(`shuffle` before `repeat`) provides stronger ordering guarantees. - -When possible, we recommend using the fused -`tf.contrib.data.shuffle_and_repeat` transformation, which combines the best of -both worlds (good performance and strong ordering guarantees). Otherwise, we -recommend shuffling before repeating. - -## Summary of Best Practices - -Here is a summary of the best practices for designing input pipelines: - -* Use the `prefetch` transformation to overlap the work of a producer and - consumer. 
In particular, we recommend adding `prefetch(n)` (where `n` is the - number of elements / batches consumed by a training step) to the end of your - input pipeline to overlap the transformations performed on the CPU with the - training done on the accelerator.
This section provides -details on the high level APIs to use along with best practices to build -and train high performance models, and quantize models for the least latency -and highest throughput for inference. - - * [Performance Guide](../performance/performance_guide.md) contains a collection of best - practices for optimizing your TensorFlow code. - - * [Data input pipeline guide](../performance/datasets_performance.md) describes the tf.data - API for building efficient data input pipelines for TensorFlow. - - * [Benchmarks](../performance/benchmarks.md) contains a collection of - benchmark results for a variety of hardware configurations. - - * For improving inference efficiency on mobile and - embedded hardware, see - [How to Quantize Neural Networks with TensorFlow](../performance/quantization.md), which - explains how to use quantization to reduce model size, both in storage - and at runtime. - - * For optimizing inference on GPUs, refer to [NVIDIA TensorRT™ - integration with TensorFlow.]( - https://medium.com/tensorflow/speed-up-tensorflow-inference-on-gpus-with-tensorrt-13b49f3db3fa) - - -XLA (Accelerated Linear Algebra) is an experimental compiler for linear -algebra that optimizes TensorFlow computations. The following guides explore -XLA: - - * [XLA Overview](../performance/xla/index.md), which introduces XLA. - * [Broadcasting Semantics](../performance/xla/broadcasting.md), which describes XLA's - broadcasting semantics. - * [Developing a new back end for XLA](../performance/xla/developing_new_backend.md), which - explains how to re-target TensorFlow in order to optimize the performance - of the computational graph for particular hardware. - * [Using JIT Compilation](../performance/xla/jit.md), which describes the XLA JIT compiler that - compiles and runs parts of TensorFlow graphs via XLA in order to optimize - performance. 
- * [Operation Semantics](../performance/xla/operation_semantics.md), which is a reference manual - describing the semantics of operations in the `ComputationBuilder` - interface. - * [Shapes and Layout](../performance/xla/shapes.md), which details the `Shape` protocol buffer. - * [Using AOT compilation](../performance/xla/tfcompile.md), which explains `tfcompile`, a - standalone tool that compiles TensorFlow graphs into executable code in - order to optimize performance. - - - diff --git a/tensorflow/docs_src/performance/leftnav_files b/tensorflow/docs_src/performance/leftnav_files deleted file mode 100644 index 12e0dbd48a..0000000000 --- a/tensorflow/docs_src/performance/leftnav_files +++ /dev/null @@ -1,14 +0,0 @@ -index.md -performance_guide.md -datasets_performance.md -benchmarks.md -quantization.md - -### XLA -xla/index.md -xla/broadcasting.md -xla/developing_new_backend.md -xla/jit.md -xla/operation_semantics.md -xla/shapes.md -xla/tfcompile.md diff --git a/tensorflow/docs_src/performance/performance_guide.md b/tensorflow/docs_src/performance/performance_guide.md deleted file mode 100644 index 9ea1d6a705..0000000000 --- a/tensorflow/docs_src/performance/performance_guide.md +++ /dev/null @@ -1,733 +0,0 @@ -# Performance Guide - -This guide contains a collection of best practices for optimizing TensorFlow -code. The guide is divided into a few sections: - -* [General best practices](#general_best_practices) covers topics that are - common across a variety of model types and hardware. -* [Optimizing for GPU](#optimizing_for_gpu) details tips specifically relevant - to GPUs. -* [Optimizing for CPU](#optimizing_for_cpu) details CPU specific information. - -## General best practices - -The sections below cover best practices that are relevant to a variety of -hardware and models. 
The best practices section is broken down into the -following sections: - -* [Input pipeline optimizations](#input-pipeline-optimization) -* [Data formats](#data-formats) -* [Common fused Ops](#common-fused-ops) -* [RNN Performance](#rnn-performance) -* [Building and installing from source](#building-and-installing-from-source) - -### Input pipeline optimization - -Typical models retrieve data from disk and preprocess it before sending the data -through the network. For example, models that process JPEG images will follow -this flow: load image from disk, decode JPEG into a tensor, crop and pad, -possibly flip and distort, and then batch. This flow is referred to as the input -pipeline. As GPUs and other hardware accelerators get faster, preprocessing of -data can be a bottleneck. - -Determining if the input pipeline is the bottleneck can be complicated. One of -the most straightforward methods is to reduce the model to a single operation -(trivial model) after the input pipeline and measure the examples per second. If -the difference in examples per second for the full model and the trivial model -is minimal then the input pipeline is likely a bottleneck. Below are some other -approaches to identifying issues: - -* Check if a GPU is underutilized by running `nvidia-smi -l 2`. If GPU - utilization is not approaching 80-100%, then the input pipeline may be the - bottleneck. -* Generate a timeline and look for large blocks of white space (waiting). An - example of generating a timeline exists as part of the [XLA JIT](../performance/xla/jit.md) - tutorial. -* Check CPU usage. It is possible to have an optimized input pipeline and lack - the CPU cycles to process the pipeline. -* Estimate the throughput needed and verify the disk used is capable of that - level of throughput. Some cloud solutions have network attached disks that - start as low as 50 MB/sec, which is slower than spinning disks (150 MB/sec), - SATA SSDs (500 MB/sec), and PCIe SSDs (2,000+ MB/sec). 
- -#### Preprocessing on the CPU - -Placing input pipeline operations on the CPU can significantly improve -performance. Utilizing the CPU for the input pipeline frees the GPU to focus on -training. To ensure preprocessing is on the CPU, wrap the preprocessing -operations as shown below: - -```python -with tf.device('/cpu:0'): - # function to get and process images or data. - distorted_inputs = load_and_distort_images() -``` - -If using `tf.estimator.Estimator` the input function is automatically placed on -the CPU. - -#### Using the tf.data API - -The [tf.data API](../guide/datasets.md) is replacing `queue_runner` as the recommended API -for building input pipelines. This -[ResNet example](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10_estimator/cifar10_main.py) -([arXiv:1512.03385](https://arxiv.org/abs/1512.03385)) -training CIFAR-10 illustrates the use of the `tf.data` API along with -`tf.estimator.Estimator`. - -The `tf.data` API utilizes C++ multi-threading and has a much lower overhead -than the Python-based `queue_runner` that is limited by Python's multi-threading -performance. A detailed performance guide for the `tf.data` API can be found -[here](../performance/datasets_performance.md). - -While feeding data using a `feed_dict` offers a high level of flexibility, in -general `feed_dict` does not provide a scalable solution. If only a single GPU -is used, the difference between the `tf.data` API and `feed_dict` performance -may be negligible. Our recommendation is to avoid using `feed_dict` for all but -trivial examples. In particular, avoid using `feed_dict` with large inputs: - -```python -# feed_dict often results in suboptimal performance when using large inputs. -sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys}) -``` - -#### Fused decode and crop - -If inputs are JPEG images that also require cropping, use fused -`tf.image.decode_and_crop_jpeg` to speed up preprocessing. 
-`tf.image.decode_and_crop_jpeg` only decodes the part of -the image within the crop window. This significantly speeds up the process if -the crop window is much smaller than the full image. For imagenet data, this -approach could speed up the input pipeline by up to 30%. - -Example Usage: - -```python -def _image_preprocess_fn(image_buffer): - # image_buffer 1-D string Tensor representing the raw JPEG image buffer. - - # Extract image shape from raw JPEG image buffer. - image_shape = tf.image.extract_jpeg_shape(image_buffer) - - # Get a crop window with distorted bounding box. - sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box( - image_shape, ...) - bbox_begin, bbox_size, distort_bbox = sample_distorted_bounding_box - - # Decode and crop image. - offset_y, offset_x, _ = tf.unstack(bbox_begin) - target_height, target_width, _ = tf.unstack(bbox_size) - crop_window = tf.stack([offset_y, offset_x, target_height, target_width]) - cropped_image = tf.image.decode_and_crop_jpeg(image, crop_window) -``` - -`tf.image.decode_and_crop_jpeg` is available on all platforms. There is no speed -up on Windows due to the use of `libjpeg` vs. `libjpeg-turbo` on other -platforms. - -#### Use large files - -Reading large numbers of small files significantly impacts I/O performance. -One approach to get maximum I/O throughput is to preprocess input data into -larger (~100MB) `TFRecord` files. For smaller data sets (200MB-1GB), the best -approach is often to load the entire data set into memory. The document -[Downloading and converting to TFRecord format](https://github.com/tensorflow/models/tree/master/research/slim#downloading-and-converting-to-tfrecord-format) -includes information and scripts for creating `TFRecords` and this -[script](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10_estimator/generate_cifar10_tfrecords.py) -converts the CIFAR-10 data set into `TFRecords`. 
- -### Data formats - -Data formats refers to the structure of the Tensor passed to a given Op. The -discussion below is specifically about 4D Tensors representing images. In -TensorFlow the parts of the 4D tensor are often referred to by the following -letters: - -* N refers to the number of images in a batch. -* H refers to the number of pixels in the vertical (height) dimension. -* W refers to the number of pixels in the horizontal (width) dimension. -* C refers to the channels. For example, 1 for black and white or grayscale - and 3 for RGB. - -Within TensorFlow there are two naming conventions representing the two most -common data formats: - -* `NCHW` or `channels_first` -* `NHWC` or `channels_last` - -`NHWC` is the TensorFlow default and `NCHW` is the optimal format to use when -training on NVIDIA GPUs using [cuDNN](https://developer.nvidia.com/cudnn). - -The best practice is to build models that work with both data formats. This -simplifies training on GPUs and then running inference on CPUs. If TensorFlow is -compiled with the [Intel MKL](#tensorflow_with_intel_mkl-dnn) optimizations, -many operations, especially those related to CNN based models, will be optimized -and support `NCHW`. If not using the MKL, some operations are not supported on -CPU when using `NCHW`. - -The brief history of these two formats is that TensorFlow started by using -`NHWC` because it was a little faster on CPUs. In the long term, we are working -on tools to auto rewrite graphs to make switching between the formats -transparent and take advantages of micro optimizations where a GPU Op may be -faster using `NHWC` than the normally most efficient `NCHW`. - -### Common fused Ops - -Fused Ops combine multiple operations into a single kernel for improved -performance. There are many fused Ops within TensorFlow and [XLA](../performance/xla/index.md) will -create fused Ops when possible to automatically improve performance. 
Collected -below are select fused Ops that can greatly improve performance and may be -overlooked. - -#### Fused batch norm - -Fused batch norm combines the multiple operations needed to do batch -normalization into a single kernel. Batch norm is an expensive process that for -some models makes up a large percentage of the operation time. Using fused batch -norm can result in a 12%-30% speedup. - -There are two commonly used batch norms and both support fusing. The core -`tf.layers.batch_normalization` added fused starting in TensorFlow 1.3. - -```python -bn = tf.layers.batch_normalization( - input_layer, fused=True, data_format='NCHW') -``` - -The contrib `tf.contrib.layers.batch_norm` method has had fused as an option -since before TensorFlow 1.0. - -```python -bn = tf.contrib.layers.batch_norm(input_layer, fused=True, data_format='NCHW') -``` - -### RNN Performance - -There are many ways to specify an RNN computation in TensorFlow and they have -trade-offs with respect to model flexibility and performance. The -`tf.nn.rnn_cell.BasicLSTMCell` should be considered a reference implementation -and used only as a last resort when no other options will work. - -When using one of the cells, rather than the fully fused RNN layers, you have a -choice of whether to use `tf.nn.static_rnn` or `tf.nn.dynamic_rnn`. There -shouldn't generally be a performance difference at runtime, but large unroll -amounts can increase the graph size of the `tf.nn.static_rnn` and cause long -compile times. An additional advantage of `tf.nn.dynamic_rnn` is that it can -optionally swap memory from the GPU to the CPU to enable training of very long -sequences. Depending on the model and hardware configuration, this can come at -a performance cost. It is also possible to run multiple iterations of -`tf.nn.dynamic_rnn` and the underlying `tf.while_loop` construct in parallel, -although this is rarely useful with RNN models as they are inherently -sequential. 
- -On NVIDIA GPUs, the use of `tf.contrib.cudnn_rnn` should always be preferred -unless you want layer normalization, which it doesn't support. It is often at -least an order of magnitude faster than `tf.contrib.rnn.BasicLSTMCell` and -`tf.contrib.rnn.LSTMBlockCell` and uses 3-4x less memory than -`tf.contrib.rnn.BasicLSTMCell`. - -If you need to run one step of the RNN at a time, as might be the case in -reinforcement learning with a recurrent policy, then you should use the -`tf.contrib.rnn.LSTMBlockCell` with your own environment interaction loop -inside a `tf.while_loop` construct. Running one step of the RNN at a time and -returning to Python is possible, but it will be slower. - -On CPUs, mobile devices, and if `tf.contrib.cudnn_rnn` is not available on -your GPU, the fastest and most memory efficient option is -`tf.contrib.rnn.LSTMBlockFusedCell`. - -For all of the less common cell types like `tf.contrib.rnn.NASCell`, -`tf.contrib.rnn.PhasedLSTMCell`, `tf.contrib.rnn.UGRNNCell`, -`tf.contrib.rnn.GLSTMCell`, `tf.contrib.rnn.Conv1DLSTMCell`, -`tf.contrib.rnn.Conv2DLSTMCell`, `tf.contrib.rnn.LayerNormBasicLSTMCell`, -etc., one should be aware that they are implemented in the graph like -`tf.contrib.rnn.BasicLSTMCell` and as such will suffer from the same poor -performance and high memory usage. One should consider whether or not those -trade-offs are worth it before using these cells. For example, while layer -normalization can speed up convergence, because cuDNN is 20x faster the fastest -wall clock time to convergence is usually obtained without it. - - -### Building and installing from source - -The default TensorFlow binaries target the broadest range of hardware to make -TensorFlow accessible to everyone. If using CPUs for training or inference, it -is recommended to compile TensorFlow with all of the optimizations available for -the CPU in use. 
Speedups for training and inference on CPU are documented below -in [Comparing compiler optimizations](#comparing-compiler-optimizations). - -To install the most optimized version of TensorFlow, -[build and install](../install/install_sources.md) from source. If there is a need to build -TensorFlow on a platform that has different hardware than the target, then -cross-compile with the highest optimizations for the target platform. The -following command is an example of using `bazel` to compile for a specific -platform: - -```python -# This command optimizes for Intel’s Broadwell processor -bazel build -c opt --copt=-march="broadwell" --config=cuda //tensorflow/tools/pip_package:build_pip_package - -``` - -#### Environment, build, and install tips - -* `./configure` asks which compute capability to include in the build. This - does not impact overall performance but does impact initial startup. After - running TensorFlow once, the compiled kernels are cached by CUDA. If using - a docker container, the data is not cached and the penalty is paid each time - TensorFlow starts. The best practice is to include the - [compute capabilities](http://developer.nvidia.com/cuda-gpus) - of the GPUs that will be used, e.g. P100: 6.0, Titan X (Pascal): 6.1, Titan - X (Maxwell): 5.2, and K80: 3.7. -* Use a version of gcc that supports all of the optimizations of the target - CPU. The recommended minimum gcc version is 4.8.3. On OS X, upgrade to the - latest Xcode version and use the version of clang that comes with Xcode. -* Install the latest stable CUDA platform and cuDNN libraries supported by - TensorFlow. - -## Optimizing for GPU - -This section contains GPU-specific tips that are not covered in the -[General best practices](#general-best-practices). Obtaining optimal performance -on multi-GPUs is a challenge. A common approach is to use data parallelism. 
-Scaling through the use of data parallelism involves making multiple copies of -the model, which are referred to as "towers", and then placing one tower on each -of the GPUs. Each tower operates on a different mini-batch of data and then -updates variables, also known as parameters, that need to be shared between -each of the towers. How each tower gets the updated variables and how the -gradients are applied has an impact on the performance, scaling, and convergence -of the model. The rest of this section provides an overview of variable -placement and the towering of a model on multiple GPUs. -[High-Performance Models](../performance/performance_models.md) gets into more details regarding -more complex methods that can be used to share and update variables between -towers. - -The best approach to handling variable updates depends on the model, hardware, -and even how the hardware has been configured. An example of this, is that two -systems can be built with NVIDIA Tesla P100s but one may be using PCIe and the -other [NVLink](http://www.nvidia.com/object/nvlink.html). In that scenario, the -optimal solution for each system may be different. For real world examples, read -the [benchmark](../performance/benchmarks.md) page which details the settings that -were optimal for a variety of platforms. Below is a summary of what was learned -from benchmarking various platforms and configurations: - -* **Tesla K80**: If the GPUs are on the same PCI Express root complex and are - able to use [NVIDIA GPUDirect](https://developer.nvidia.com/gpudirect) Peer - to Peer, then placing the variables equally across the GPUs used for - training is the best approach. If the GPUs cannot use GPUDirect, then - placing the variables on the CPU is the best option. 
- -* **Titan X (Maxwell and Pascal), M40, P100, and similar**: For models like - ResNet and InceptionV3, placing variables on the CPU is the optimal setting, - but for models with a lot of variables like AlexNet and VGG, using GPUs with - `NCCL` is better. - -A common approach to managing where variables are placed, is to create a method -to determine where each Op is to be placed and use that method in place of a -specific device name when calling `with tf.device():`. Consider a scenario where -a model is being trained on 2 GPUs and the variables are to be placed on the -CPU. There would be a loop for creating and placing the "towers" on each of the -2 GPUs. A custom device placement method would be created that watches for Ops -of type `Variable`, `VariableV2`, and `VarHandleOp` and indicates that they are -to be placed on the CPU. All other Ops would be placed on the target GPU. -The building of the graph would proceed as follows: - -* On the first loop a "tower" of the model would be created for `gpu:0`. - During the placement of the Ops, the custom device placement method would - indicate that variables are to be placed on `cpu:0` and all other Ops on - `gpu:0`. - -* On the second loop, `reuse` is set to `True` to indicate that variables are - to be reused and then the "tower" is created on `gpu:1`. During the - placement of the Ops associated with the "tower", the variables that were - placed on `cpu:0` are reused and all other Ops are created and placed on - `gpu:1`. - -The final result is all of the variables are placed on the CPU with each GPU -having a copy of all of the computational Ops associated with the model. - -The code snippet below illustrates two different approaches for variable -placement: one is placing variables on the CPU; the other is placing variables -equally across the GPUs. - -```python - -class GpuParamServerDeviceSetter(object): - """Used with tf.device() to place variables on the least loaded GPU. 
- - A common use for this class is to pass a list of GPU devices, e.g. ['gpu:0', - 'gpu:1','gpu:2'], as ps_devices. When each variable is placed, it will be - placed on the least loaded gpu. All other Ops, which will be the computation - Ops, will be placed on the worker_device. - """ - - def __init__(self, worker_device, ps_devices): - """Initializer for GpuParamServerDeviceSetter. - Args: - worker_device: the device to use for computation Ops. - ps_devices: a list of devices to use for Variable Ops. Each variable is - assigned to the least loaded device. - """ - self.ps_devices = ps_devices - self.worker_device = worker_device - self.ps_sizes = [0] * len(self.ps_devices) - - def __call__(self, op): - if op.device: - return op.device - if op.type not in ['Variable', 'VariableV2', 'VarHandleOp']: - return self.worker_device - - # Gets the least loaded ps_device - device_index, _ = min(enumerate(self.ps_sizes), key=operator.itemgetter(1)) - device_name = self.ps_devices[device_index] - var_size = op.outputs[0].get_shape().num_elements() - self.ps_sizes[device_index] += var_size - - return device_name - -def _create_device_setter(is_cpu_ps, worker, num_gpus): - """Create device setter object.""" - if is_cpu_ps: - # tf.train.replica_device_setter supports placing variables on the CPU, all - # on one GPU, or on ps_servers defined in a cluster_spec. - return tf.train.replica_device_setter( - worker_device=worker, ps_device='/cpu:0', ps_tasks=1) - else: - gpus = ['/gpu:%d' % i for i in range(num_gpus)] - return ParamServerDeviceSetter(worker, gpus) - -# The method below is a modified snippet from the full example. -def _resnet_model_fn(): - # When set to False, variables are placed on the least loaded GPU. If set - # to True, the variables will be placed on the CPU. - is_cpu_ps = False - - # Loops over the number of GPUs and creates a copy ("tower") of the model on - # each GPU. 
- for i in range(num_gpus): - worker = '/gpu:%d' % i - # Creates a device setter used to determine where Ops are to be placed. - device_setter = _create_device_setter(is_cpu_ps, worker, FLAGS.num_gpus) - # Creates variables on the first loop. On subsequent loops reuse is set - # to True, which results in the "towers" sharing variables. - with tf.variable_scope('resnet', reuse=bool(i != 0)): - with tf.name_scope('tower_%d' % i) as name_scope: - # tf.device calls the device_setter for each Op that is created. - # device_setter returns the device the Op is to be placed on. - with tf.device(device_setter): - # Creates the "tower". - _tower_fn(is_training, weight_decay, tower_features[i], - tower_labels[i], tower_losses, tower_gradvars, - tower_preds, False) - -``` - -In the near future the above code will be for illustration purposes only as -there will be easy to use high level methods to support a wide range of popular -approaches. This -[example](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10_estimator) -will continue to get updated as the API expands and evolves to address multi-GPU -scenarios. - -## Optimizing for CPU - -CPUs, which includes Intel® Xeon Phi™, achieve optimal performance when -TensorFlow is [built from source](../install/install_sources.md) with all of the instructions -supported by the target CPU. - -Beyond using the latest instruction sets, Intel® has added support for the -Intel® Math Kernel Library for Deep Neural Networks (Intel® MKL-DNN) to -TensorFlow. While the name is not completely accurate, these optimizations are -often simply referred to as 'MKL' or 'TensorFlow with MKL'. [TensorFlow -with Intel® MKL-DNN](#tensorflow_with_intel_mkl_dnn) contains details on the -MKL optimizations. - -The two configurations listed below are used to optimize CPU performance by -adjusting the thread pools. 
- -* `intra_op_parallelism_threads`: Nodes that can use multiple threads to - parallelize their execution will schedule the individual pieces into this - pool. -* `inter_op_parallelism_threads`: All ready nodes are scheduled in this pool. - -These configurations are set via the `tf.ConfigProto` and passed to `tf.Session` -in the `config` attribute as shown in the snippet below. Both configuration -options, if unset or set to 0, default to the number of logical -CPU cores. Testing has shown that the default is effective for systems ranging -from one CPU with 4 cores to multiple CPUs with 70+ combined logical cores. -A common alternative optimization is to set the number of threads in both pools -equal to the number of physical cores rather than logical cores. - -```python - - config = tf.ConfigProto() - config.intra_op_parallelism_threads = 44 - config.inter_op_parallelism_threads = 44 - tf.Session(config=config) - -``` - -The [Comparing compiler optimizations](#comparing-compiler-optimizations) -section contains the results of tests that used different compiler -optimizations. - -### TensorFlow with Intel® MKL DNN - -Intel® has added optimizations to TensorFlow for Intel® Xeon® and Intel® Xeon -Phi™ through the use of the Intel® Math Kernel Library for Deep Neural Networks -(Intel® MKL-DNN) optimized primitives. The optimizations also provide speedups -for the consumer line of processors, e.g. i5 and i7 Intel processors. The Intel -published paper -[TensorFlow* Optimizations on Modern Intel® Architecture](https://software.intel.com/en-us/articles/tensorflow-optimizations-on-modern-intel-architecture) -contains additional details on the implementation. - -> Note: MKL was added as of TensorFlow 1.2 and currently only works on Linux. It -> also does not work when using `--config=cuda`.
- -In addition to providing significant performance improvements for training CNN -based models, compiling with the MKL creates a binary that is optimized for AVX -and AVX2. The result is a single binary that is optimized and compatible with -most modern (post-2011) processors. - -TensorFlow can be compiled with the MKL optimizations using the following -commands, depending on the version of the TensorFlow source used. - -For TensorFlow source versions after 1.3.0: - -```bash -./configure -# Pick the desired options -bazel build --config=mkl --config=opt //tensorflow/tools/pip_package:build_pip_package - -``` - -For TensorFlow versions 1.2.0 through 1.3.0: - -```bash -./configure -Do you wish to build TensorFlow with MKL support? [y/N] Y -Do you wish to download MKL LIB from the web? [Y/n] Y -# Select the defaults for the rest of the options. - -bazel build --config=mkl --copt="-DEIGEN_USE_VML" -c opt //tensorflow/tools/pip_package:build_pip_package - -``` - -#### Tuning MKL for the best performance - -This section details the different configurations and environment variables that -can be used to tune the MKL to get optimal performance. Before tweaking various -environment variables make sure the model is using the `NCHW` (`channels_first`) -[data format](#data-formats). The MKL is optimized for `NCHW` and Intel is -working to get near performance parity when using `NHWC`. - -MKL uses the following environment variables to tune performance: - -* KMP_BLOCKTIME - Sets the time, in milliseconds, that a thread should wait, - after completing the execution of a parallel region, before sleeping. -* KMP_AFFINITY - Enables the run-time library to bind threads to physical - processing units. -* KMP_SETTINGS - Enables (true) or disables (false) the printing of OpenMP* - run-time library environment variables during program execution. -* OMP_NUM_THREADS - Specifies the number of threads to use.
- -More details on the KMP variables are on -[Intel's](https://software.intel.com/en-us/node/522775) site and the OMP -variables on -[gnu.org](https://gcc.gnu.org/onlinedocs/libgomp/Environment-Variables.html) - -While there can be substantial gains from adjusting the environment variables, -which is discussed below, the simplified advice is to set the -`inter_op_parallelism_threads` equal to the number of physical CPUs and to set -the following environment variables: - -* KMP_BLOCKTIME=0 -* KMP_AFFINITY=granularity=fine,verbose,compact,1,0 - -Example setting MKL variables with command-line arguments: - -```bash -KMP_BLOCKTIME=0 KMP_AFFINITY=granularity=fine,verbose,compact,1,0 \ -KMP_SETTINGS=1 python your_python_script.py -``` - -Example setting MKL variables with python `os.environ`: - -```python -os.environ["KMP_BLOCKTIME"] = str(FLAGS.kmp_blocktime) -os.environ["KMP_SETTINGS"] = str(FLAGS.kmp_settings) -os.environ["KMP_AFFINITY"]= FLAGS.kmp_affinity -if FLAGS.num_intra_threads > 0: - os.environ["OMP_NUM_THREADS"]= str(FLAGS.num_intra_threads) - -``` - -There are models and hardware platforms that benefit from different settings. -Each variable that impacts performance is discussed below. - -* **KMP_BLOCKTIME**: The MKL default is 200ms, which was not optimal in our - testing. 0 (0ms) was a good default for CNN based models that were tested. - The best performance for AlexNet was achieved at 30ms and both GoogleNet and - VGG11 performed best when set to 1ms. - -* **KMP_AFFINITY**: The recommended setting is - `granularity=fine,verbose,compact,1,0`. - -* **OMP_NUM_THREADS**: This defaults to the number of physical cores. - Adjusting this parameter beyond matching the number of cores can have an - impact when using Intel® Xeon Phi™ (Knights Landing) for some models. See - [TensorFlow* Optimizations on Modern Intel® Architecture](https://software.intel.com/en-us/articles/tensorflow-optimizations-on-modern-intel-architecture) - for optimal settings.
- -* **intra_op_parallelism_threads**: Setting this equal to the number of - physical cores is recommended. Setting the value to 0, which is the default, - results in the value being set to the number of logical cores - this is an - alternate option to try for some architectures. This value and `OMP_NUM_THREADS` - should be equal. - -* **inter_op_parallelism_threads**: Setting this equal to the number of - sockets is recommended. Setting the value to 0, which is the default, - results in the value being set to the number of logical cores. - -### Comparing compiler optimizations - -Collected below are performance results running training and inference on -different types of CPUs on different platforms with various compiler -optimizations. The models used were ResNet-50 -([arXiv:1512.03385](https://arxiv.org/abs/1512.03385)) and -InceptionV3 ([arXiv:1512.00567](https://arxiv.org/abs/1512.00567)). - -For each test, when the MKL optimization was used the environment variable -KMP_BLOCKTIME was set to 0 (0ms) and KMP_AFFINITY to -`granularity=fine,verbose,compact,1,0`. 
- -#### Inference InceptionV3 - -**Environment** - -* Instance Type: AWS EC2 m4.xlarge -* CPU: Intel(R) Xeon(R) CPU E5-2686 v4 @ 2.30GHz (Broadwell) -* Dataset: ImageNet -* TensorFlow Version: 1.2.0 RC2 -* Test Script: [tf_cnn_benchmarks.py](https://github.com/tensorflow/benchmarks/blob/mkl_experiment/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py) - -**Batch Size: 1** - -Command executed for the MKL test: - -```bash -python tf_cnn_benchmarks.py --forward_only=True --device=cpu --mkl=True \ ---kmp_blocktime=0 --nodistortions --model=inception3 --data_format=NCHW \ ---batch_size=1 --num_inter_threads=1 --num_intra_threads=4 \ ---data_dir= -``` - -| Optimization | Data Format | Images/Sec | Intra threads | Inter Threads | -: : : (step time) : : : -| ------------ | ----------- | ------------ | ------------- | ------------- | -| AVX2 | NHWC | 7.0 (142ms) | 4 | 0 | -| MKL | NCHW | 6.6 (152ms) | 4 | 1 | -| AVX | NHWC | 5.0 (202ms) | 4 | 0 | -| SSE3 | NHWC | 2.8 (361ms) | 4 | 0 | - -**Batch Size: 32** - -Command executed for the MKL test: - -```bash -python tf_cnn_benchmarks.py --forward_only=True --device=cpu --mkl=True \ ---kmp_blocktime=0 --nodistortions --model=inception3 --data_format=NCHW \ ---batch_size=32 --num_inter_threads=1 --num_intra_threads=4 \ ---data_dir= -``` - -| Optimization | Data Format | Images/Sec | Intra threads | Inter Threads | -: : : (step time) : : : -| ------------ | ----------- | ------------- | ------------- | ------------- | -| MKL | NCHW | 10.3 | 4 | 1 | -: : : (3,104ms) : : : -| AVX2 | NHWC | 7.5 (4,255ms) | 4 | 0 | -| AVX | NHWC | 5.1 (6,275ms) | 4 | 0 | -| SSE3 | NHWC | 2.8 (11,428ms)| 4 | 0 | - -#### Inference ResNet-50 - -**Environment** - -* Instance Type: AWS EC2 m4.xlarge -* CPU: Intel(R) Xeon(R) CPU E5-2686 v4 @ 2.30GHz (Broadwell) -* Dataset: ImageNet -* TensorFlow Version: 1.2.0 RC2 -* Test Script: [tf_cnn_benchmarks.py](https://github.com/tensorflow/benchmarks/blob/mkl_experiment/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py) 
- -**Batch Size: 1** - -Command executed for the MKL test: - -```bash -python tf_cnn_benchmarks.py --forward_only=True --device=cpu --mkl=True \ ---kmp_blocktime=0 --nodistortions --model=resnet50 --data_format=NCHW \ ---batch_size=1 --num_inter_threads=1 --num_intra_threads=4 \ ---data_dir= -``` - -| Optimization | Data Format | Images/Sec | Intra threads | Inter Threads | -: : : (step time) : : : -| ------------ | ----------- | ------------ | ------------- | ------------- | -| AVX2 | NHWC | 8.8 (113ms) | 4 | 0 | -| MKL | NCHW | 8.5 (120ms) | 4 | 1 | -| AVX | NHWC | 6.4 (157ms) | 4 | 0 | -| SSE3 | NHWC | 3.7 (270ms) | 4 | 0 | - -**Batch Size: 32** - -Command executed for the MKL test: - -```bash -python tf_cnn_benchmarks.py --forward_only=True --device=cpu --mkl=True \ ---kmp_blocktime=0 --nodistortions --model=resnet50 --data_format=NCHW \ ---batch_size=32 --num_inter_threads=1 --num_intra_threads=4 \ ---data_dir= -``` - -| Optimization | Data Format | Images/Sec | Intra threads | Inter Threads | -: : : (step time) : : : -| ------------ | ----------- | ------------- | ------------- | ------------- | -| MKL | NCHW | 12.4 | 4 | 1 | -: : : (2,590ms) : : : -| AVX2 | NHWC | 10.4 (3,079ms)| 4 | 0 | -| AVX | NHWC | 7.3 (4,4416ms)| 4 | 0 | -| SSE3 | NHWC | 4.0 (8,054ms) | 4 | 0 | - -#### Training InceptionV3 - -**Environment** - -* Instance Type: Dedicated AWS EC2 r4.16xlarge (Broadwell) -* CPU: Intel Xeon E5-2686 v4 (Broadwell) Processors -* Dataset: ImageNet -* TensorFlow Version: 1.2.0 RC2 -* Test Script: [tf_cnn_benchmarks.py](https://github.com/tensorflow/benchmarks/blob/mkl_experiment/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py) - -Command executed for MKL test: - -```bash -python tf_cnn_benchmarks.py --device=cpu --mkl=True --kmp_blocktime=0 \ ---nodistortions --model=resnet50 --data_format=NCHW --batch_size=32 \ ---num_inter_threads=2 --num_intra_threads=36 \ ---data_dir= -``` - -Optimization | Data Format | Images/Sec | Intra threads | Inter Threads 
------------- | ----------- | ---------- | ------------- | ------------- -MKL | NCHW | 20.8 | 36 | 2 -AVX2 | NHWC | 6.2 | 36 | 0 -AVX | NHWC | 5.7 | 36 | 0 -SSE3 | NHWC | 4.3 | 36 | 0 - -ResNet and [AlexNet](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf) -were also run on this configuration but in an ad hoc manner. There were not -enough runs executed to publish a coherent table of results. The incomplete -results strongly indicated the final result would be similar to the table above -with MKL providing significant 3x+ gains over AVX2. diff --git a/tensorflow/docs_src/performance/performance_models.md b/tensorflow/docs_src/performance/performance_models.md deleted file mode 100644 index 151c0b2946..0000000000 --- a/tensorflow/docs_src/performance/performance_models.md +++ /dev/null @@ -1,422 +0,0 @@ -# High-Performance Models - -This document and accompanying -[scripts](https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks) -detail how to build highly scalable models that target a variety of system types -and network topologies. The techniques in this document utilize some low-level -TensorFlow Python primitives. In the future, many of these techniques will be -incorporated into high-level APIs. - -## Input Pipeline - -The [Performance Guide](../performance/performance_guide.md) explains how to identify possible -input pipeline issues and best practices. We found that using `tf.FIFOQueue` -and `tf.train.queue_runner` could not saturate multiple current generation GPUs -when using large inputs and processing with higher samples per second, such -as training ImageNet with [AlexNet](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf). -This is due to the use of Python threads as its underlying implementation. The -overhead of Python threads is too large. 
- -Another approach, which we have implemented in the -[scripts](https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks), -is to build an input pipeline using the native parallelism in TensorFlow. Our -implementation is made up of 3 stages: - -* I/O reads: Choose and read image files from disk. -* Image Processing: Decode image records into images, preprocess, and organize - into mini-batches. -* CPU-to-GPU Data Transfer: Transfer images from CPU to GPU. - -The dominant part of each stage is executed in parallel with the other stages -using `data_flow_ops.StagingArea`. `StagingArea` is a queue-like operator -similar to `tf.FIFOQueue`. The difference is that `StagingArea` does not -guarantee FIFO ordering, but offers simpler functionality and can be executed -on both CPU and GPU in parallel with other stages. Breaking the input pipeline -into 3 stages that operate independently in parallel is scalable and takes full -advantage of large multi-core environments. The rest of this section details -the stages followed by details about using `data_flow_ops.StagingArea`. - -### Parallelize I/O Reads - -`data_flow_ops.RecordInput` is used to parallelize reading from disk. Given a -list of input files representing TFRecords, `RecordInput` continuously reads -records using background threads. The records are placed into its own large -internal pool and when it has loaded at least half of its capacity, it produces -output tensors. - -This op has its own internal threads that are dominated by I/O time that consume -minimal CPU, which allows it to run smoothly in parallel with the rest of the -model. - -### Parallelize Image Processing - -After images are read from `RecordInput` they are passed as tensors to the image -processing pipeline. To make the image processing pipeline easier to explain, -assume that the input pipeline is targeting 8 GPUs with a batch size of 256 (32 -per GPU). - -256 records are read and processed individually in parallel. 
This starts with -256 independent `RecordInput` read ops in the graph. Each read op is followed by -an identical set of ops for image preprocessing that are considered independent -and executed in parallel. The image preprocessing ops include operations such as -image decoding, distortion, and resizing. - -Once the images are through preprocessing, they are concatenated together into 8 -tensors each with a batch-size of 32. Rather than using `tf.concat` for this -purpose, which is implemented as a single op that waits for all the inputs to be -ready before concatenating them together, `tf.parallel_stack` is used. -`tf.parallel_stack` allocates an uninitialized tensor as an output, and each -input tensor is written to its designated portion of the output tensor as soon -as the input is available. - -When all the input tensors are finished, the output tensor is passed along in -the graph. This effectively hides all the memory latency with the long tail of -producing all the input tensors. - -### Parallelize CPU-to-GPU Data Transfer - -Continuing with the assumption that the target is 8 GPUs with a batch size of -256 (32 per GPU). Once the input images are processed and concatenated together -by the CPU, we have 8 tensors each with a batch-size of 32. - -TensorFlow enables tensors from one device to be used on any other device -directly. TensorFlow inserts implicit copies to make the tensors available on -any devices where they are used. The runtime schedules the copy between devices -to run before the tensors are actually used. However, if the copy cannot finish -in time, the computation that needs those tensors will stall and result in -decreased performance. - -In this implementation, `data_flow_ops.StagingArea` is used to explicitly -schedule the copy in parallel. The end result is that when computation starts on -the GPU, all the tensors are already available. 
- -### Software Pipelining - -With all the stages capable of being driven by different processors, -`data_flow_ops.StagingArea` is used between them so they run in parallel. -`StagingArea` is a queue-like operator similar to `tf.FIFOQueue` that offers -simpler functionalities that can be executed on both CPU and GPU. - -Before the model starts running all the stages, the input pipeline stages are -warmed up to prime the staging buffers in between with one set of data. -During each run step, one set of data is read from the staging buffers at -the beginning of each stage, and one set is pushed at the end. - -For example, suppose there are three stages, A, B and C, with two staging areas -in between, S1 and S2. During the warm up, we run: - -``` -Warm up: -Step 1: A0 -Step 2: A1 B0 - -Actual execution: -Step 3: A2 B1 C0 -Step 4: A3 B2 C1 -Step 5: A4 B3 C2 -``` - -After the warm up, S1 and S2 each have one set of data in them. For each step of -the actual execution, one set of data is consumed from each staging area, and -one set is added to each. - -Benefits of using this scheme: - -* All stages are non-blocking, since the staging areas always have one set of - data after the warm up. -* Each stage can run in parallel since they can all start immediately. -* The staging buffers have a fixed memory overhead. They will have at most one - extra set of data. -* Only a single `session.run()` call is needed to run all stages of the step, - which makes profiling and debugging much easier. - -## Best Practices in Building High-Performance Models - -Collected below are a couple of additional best practices that can improve -performance and increase the flexibility of models. - -### Build the model with both NHWC and NCHW - -Most TensorFlow operations used by a CNN support both NHWC and NCHW data formats. -On GPU, NCHW is faster. But on CPU, NHWC is sometimes faster.
- -Building a model to support both data formats keeps the model flexible and -capable of operating optimally regardless of platform. Most TensorFlow -operations used by a CNN support both NHWC and NCHW data formats. The benchmark -script was written to support both NCHW and NHWC. NCHW should always be used -when training with GPUs. NHWC is sometimes faster on CPU. A flexible model can -be trained on GPUs using NCHW with inference done on CPU using NHWC with the -weights obtained from training. - -### Use Fused Batch-Normalization - -The default batch-normalization in TensorFlow is implemented as composite -operations. This is very general, but often leads to suboptimal performance. An -alternative is to use fused batch-normalization, which often has much better -performance on GPU. Below is an example of using `tf.contrib.layers.batch_norm` -to implement fused batch-normalization. - -```python -bn = tf.contrib.layers.batch_norm( - input_layer, fused=True, data_format='NCHW', - scope=scope) -``` - -## Variable Distribution and Gradient Aggregation - -During training, training variable values are updated using aggregated gradients -and deltas. In the benchmark script, we demonstrate that with the flexible and -general-purpose TensorFlow primitives, a diverse range of high-performance -distribution and aggregation schemes can be built. - -Three examples of variable distribution and aggregation were included in the -script: - -* `parameter_server` where each replica of the training model reads the - variables from a parameter server and updates the variable independently. - When each model needs the variables, they are copied over through the - standard implicit copies added by the TensorFlow runtime. The example - [script](https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks) - illustrates using this method for local training, distributed synchronous - training, and distributed asynchronous training.
-* `replicated` places an identical copy of each training variable on each - GPU. The forward and backward computation can start immediately as the - variable data is immediately available. Gradients are accumulated across all - GPUs, and the aggregated total is applied to each GPU's copy of the - variables to keep them in sync. -* `distributed_replicated` places an identical copy of the training parameters - on each GPU along with a master copy on the parameter servers. The forward - and backward computation can start immediately as the variable data is - immediately available. Gradients are accumulated across all GPUs on each - server and then the per-server aggregated gradients are applied to the - master copy. After all workers do this, each worker updates its copy of the - variable from the master copy. - -Below are additional details about each approach. - -### Parameter Server Variables - -The most common way trainable variables are managed in TensorFlow models is -parameter server mode. - -In a distributed system, each worker process runs the same model, and parameter -server processes own the master copies of the variables. When a worker needs a -variable from a parameter server, it refers to it directly. The TensorFlow -runtime adds implicit copies to the graph to make the variable value available -on the computation device that needs it. When a gradient is computed on a -worker, it is sent to the parameter server that owns the particular variable, -and the corresponding optimizer is used to update the variable. - -There are some techniques to improve throughput: - -* The variables are spread among parameter servers based on their size, for - load balancing. -* When each worker has multiple GPUs, gradients are accumulated across the - GPUs and a single aggregated gradient is sent to the parameter server. This - reduces the network bandwidth and the amount of work done by the parameter - servers. 
- -For coordinating between workers, a very common mode is async updates, where -each worker updates the master copy of the variables without synchronizing with -other workers. In our model, we demonstrate that it is fairly easy to introduce -synchronization across workers so updates for all workers are finished in one -step before the next step can start. - -The parameter server method can also be used for local training. In this case, -instead of spreading the master copies of variables across parameter servers, -they are either on the CPU or spread across the available GPUs. - -Due to the simple nature of this setup, this architecture has gained a lot of -popularity within the community. - -This mode can be used in the script by passing -`--variable_update=parameter_server`. - -
- parameter_server mode in distributed training -
- -### Replicated Variables - -In this design, each GPU on the server has its own copy of each variable. The -values are kept in sync across GPUs by applying the fully aggregated gradient to -each GPU's copy of the variable. - -The variables and data are available at the start of training, so the forward -pass of training can start immediately. Gradients are aggregated across the -devices and the fully aggregated gradient is then applied to each local copy. - -Gradient aggregation across the server can be done in different ways: - -* Using standard TensorFlow operations to accumulate the total on a single - device (CPU or GPU) and then copy it back to all GPUs. -* Using NVIDIA® NCCL, described below in the NCCL section. - -This mode can be used in the script by passing `--variable_update=replicated`. - -### Replicated Variables in Distributed Training - -The replicated method for variables can be extended to distributed training. One -way to do this is like the replicated mode: aggregate the gradients fully across -the cluster and apply them to each local copy of the variable. This may be shown -in a future version of these scripts; the scripts do present a different -variation, described here. - -In this mode, in addition to each GPU's copy of the variables, a master copy is -stored on the parameter servers. As with the replicated mode, training can start -immediately using the local copies of the variables. - -As the gradients of the weights become available, they are sent back to the -parameter servers and all local copies are updated: - -1. All the gradients from the GPUs on the same worker are aggregated together. -2. Aggregated gradients from each worker are sent to the parameter server that - owns the variable, where the specified optimizer is used to update the - master copy of the variable. -3. Each worker updates its local copy of the variable from the master.
In the - example model, this is done with a cross-replica barrier that waits for all - the workers to finish updating the variables, and fetches the new variable - only after the barrier has been released by all replicas. Once the copy - finishes for all variables, this marks the end of a training step, and a new - step can start. - -Although this sounds similar to the standard use of parameter servers, the -performance is often better in many cases. This is largely due to the fact the -computation can happen without any delay, and much of the copy latency of early -gradients can be hidden by later computation layers. - -This mode can be used in the script by passing -`--variable_update=distributed_replicated`. - - -
- distributed_replicated mode -
- -#### NCCL - -In order to broadcast variables and aggregate gradients across different GPUs -within the same host machine, we can use the default TensorFlow implicit copy -mechanism. - -However, we can instead use the optional NCCL (`tf.contrib.nccl`) support. NCCL -is an NVIDIA® library that can efficiently broadcast and aggregate data across -different GPUs. It schedules a cooperating kernel on each GPU that knows how to -best utilize the underlying hardware topology; this kernel uses a single SM of -the GPU. - -In our experiment, we demonstrate that although NCCL often leads to much faster -data aggregation by itself, it doesn't necessarily lead to faster training. Our -hypothesis is that the implicit copies are essentially free since they go to the -copy engine on GPU, as long as its latency can be hidden by the main computation -itself. Although NCCL can transfer data faster, it takes one SM away, and adds -more pressure to the underlying L2 cache. Our results show that for 8-GPUs, NCCL -often leads to better performance. However, for fewer GPUs, the implicit copies -often perform better. - -#### Staged Variables - -We further introduce a staged-variable mode where we use staging areas for both -the variable reads, and their updates. Similar to software pipelining of the -input pipeline, this can hide the data copy latency. If the computation time -takes longer than the copy and aggregation, the copy itself becomes essentially -free. - -The downside is that all the weights read are from the previous training step. -So it is a different algorithm from SGD. But it is possible to improve its -convergence by adjusting learning rate and other hyperparameters. - -## Executing the script - -This section lists the core command line arguments and a few basic examples for -executing the main script -([tf_cnn_benchmarks.py](https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py)). 
- -> Note: `tf_cnn_benchmarks.py` uses the config `force_gpu_compatible`, -> which was introduced after TensorFlow 1.1. Until TensorFlow 1.2 is released -> building from source is advised. - -#### Base command line arguments - -* **`model`**: Model to use, e.g. `resnet50`, `inception3`, `vgg16`, and - `alexnet`. -* **`num_gpus`**: Number of GPUs to use. -* **`data_dir`**: Path to data to process. If not set, synthetic data is used. - To use ImageNet data use these - [instructions](https://github.com/tensorflow/models/tree/master/research/inception#getting-started) - as a starting point. -* **`batch_size`**: Batch size for each GPU. -* **`variable_update`**: The method for managing variables: `parameter_server` - ,`replicated`, `distributed_replicated`, `independent` -* **`local_parameter_device`**: Device to use as parameter server: `cpu` or - `gpu`. - -#### Single instance examples - -```bash -# VGG16 training ImageNet with 8 GPUs using arguments that optimize for -# Google Compute Engine. -python tf_cnn_benchmarks.py --local_parameter_device=cpu --num_gpus=8 \ ---batch_size=32 --model=vgg16 --data_dir=/home/ubuntu/imagenet/train \ ---variable_update=parameter_server --nodistortions - -# VGG16 training synthetic ImageNet data with 8 GPUs using arguments that -# optimize for the NVIDIA DGX-1. -python tf_cnn_benchmarks.py --local_parameter_device=gpu --num_gpus=8 \ ---batch_size=64 --model=vgg16 --variable_update=replicated --use_nccl=True - -# VGG16 training ImageNet data with 8 GPUs using arguments that optimize for -# Amazon EC2. -python tf_cnn_benchmarks.py --local_parameter_device=gpu --num_gpus=8 \ ---batch_size=64 --model=vgg16 --variable_update=parameter_server - -# ResNet-50 training ImageNet data with 8 GPUs using arguments that optimize for -# Amazon EC2. 
-python tf_cnn_benchmarks.py --local_parameter_device=gpu --num_gpus=8 \ ---batch_size=64 --model=resnet50 --variable_update=replicated --use_nccl=False - -``` - -#### Distributed command line arguments - -* **`ps_hosts`**: Comma separated list of hosts to use as parameter servers - in the format of ```:port```, e.g. ```10.0.0.2:50000```. -* **`worker_hosts`**: Comma separated list of hosts to use as workers in the - format of ```:port```, e.g. ```10.0.0.2:50001```. -* **`task_index`**: Index of the host in the list of `ps_hosts` or - `worker_hosts` being started. -* **`job_name`**: Type of job, e.g `ps` or `worker` - -#### Distributed examples - -Below is an example of training ResNet-50 on 2 hosts: host_0 (10.0.0.1) and -host_1 (10.0.0.2). The example uses synthetic data. To use real data pass the -`--data_dir` argument. - -```bash -# Run the following commands on host_0 (10.0.0.1): -python tf_cnn_benchmarks.py --local_parameter_device=gpu --num_gpus=8 \ ---batch_size=64 --model=resnet50 --variable_update=distributed_replicated \ ---job_name=worker --ps_hosts=10.0.0.1:50000,10.0.0.2:50000 \ ---worker_hosts=10.0.0.1:50001,10.0.0.2:50001 --task_index=0 - -python tf_cnn_benchmarks.py --local_parameter_device=gpu --num_gpus=8 \ ---batch_size=64 --model=resnet50 --variable_update=distributed_replicated \ ---job_name=ps --ps_hosts=10.0.0.1:50000,10.0.0.2:50000 \ ---worker_hosts=10.0.0.1:50001,10.0.0.2:50001 --task_index=0 - - -# Run the following commands on host_1 (10.0.0.2): -python tf_cnn_benchmarks.py --local_parameter_device=gpu --num_gpus=8 \ ---batch_size=64 --model=resnet50 --variable_update=distributed_replicated \ ---job_name=worker --ps_hosts=10.0.0.1:50000,10.0.0.2:50000 \ ---worker_hosts=10.0.0.1:50001,10.0.0.2:50001 --task_index=1 - -python tf_cnn_benchmarks.py --local_parameter_device=gpu --num_gpus=8 \ ---batch_size=64 --model=resnet50 --variable_update=distributed_replicated \ ---job_name=ps --ps_hosts=10.0.0.1:50000,10.0.0.2:50000 \ 
---worker_hosts=10.0.0.1:50001,10.0.0.2:50001 --task_index=1 - -``` diff --git a/tensorflow/docs_src/performance/quantization.md b/tensorflow/docs_src/performance/quantization.md deleted file mode 100644 index 3326d82964..0000000000 --- a/tensorflow/docs_src/performance/quantization.md +++ /dev/null @@ -1,253 +0,0 @@ -# Fixed Point Quantization - -Quantization techniques store and calculate numbers in more compact formats. -[TensorFlow Lite](/mobile/tflite/) adds quantization that uses an 8-bit fixed -point representation. - -Since a challenge for modern neural networks is optimizing for high accuracy, the -priority has been improving accuracy and speed during training. Using floating -point arithmetic is an easy way to preserve accuracy and GPUs are designed to -accelerate these calculations. - -However, as more machine learning models are deployed to mobile devices, -inference efficiency has become a critical issue. Where the computational demand -for *training* grows with the amount of models trained on different -architectures, the computational demand for *inference* grows in proportion to -the amount of users. - -## Quantization benefits - - -Using 8-bit calculations help your models run faster and use less power. This is -especially important for mobile devices and embedded applications that can't run -floating point code efficiently, for example, Internet of Things (IoT) and -robotics devices. There are additional opportunities to extend this support to -more backends and research lower precision networks. - -### Smaller file sizes {: .hide-from-toc} - -Neural network models require a lot of space on disk. For example, the original -AlexNet requires over 200 MB for the float format—almost all of that for the -model's millions of weights. Because the weights are slightly different -floating point numbers, simple compression formats perform poorly (like zip). - -Weights fall in large layers of numerical values. 
For each layer, weights tend to -be normally distributed within a range. Quantization can shrink file sizes by -storing the minimum and maximum weight for each layer, then compress each -weight's float value to an 8-bit integer representing the closest real number in -a linear set of 256 within the range. - -### Faster inference {: .hide-from-toc} - -Since calculations are run entirely on 8-bit inputs and outputs, quantization -reduces the computational resources needed for inference calculations. This is -more involved, requiring changes to all floating point calculations, but results -in a large speed-up for inference time. - -### Memory efficiency {: .hide-from-toc} - -Since fetching 8-bit values only requires 25% of the memory bandwidth of floats, -more efficient caches avoid bottlenecks for RAM access. In many cases, the power -consumption for running a neural network is dominated by memory access. The -savings from using fixed-point 8-bit weights and activations are significant. - -Typically, SIMD operations are available that run more operations per clock -cycle. In some cases, a DSP chip is available that accelerates 8-bit calculations -resulting in a massive speedup. - -## Fixed point quantization techniques - -The goal is to use the same precision for weights and activations during both -training and inference. But an important difference is that training consists of -a forward pass and a backward pass, while inference only uses a forward pass. -When we train the model with quantization in the loop, we ensure that the forward -pass matches precision for both training and inference. - -To minimize the loss in accuracy for fully fixed point models (weights and -activations), train the model with quantization in the loop. This simulates -quantization in the forward pass of a model so weights tend towards values that -perform better during quantized inference. 
The backward pass uses quantized -weights and activations and models quantization as a straight through estimator. -(See Bengio et al., [2013](https://arxiv.org/abs/1308.3432)) - -Additionally, the minimum and maximum values for activations are determined -during training. This allows a model trained with quantization in the loop to be -converted to a fixed point inference model with little effort, eliminating the -need for a separate calibration step. - -## Quantization training with TensorFlow - -TensorFlow can train models with quantization in the loop. Because training -requires small gradient adjustments, floating point values are still used. To -keep models as floating point while adding the quantization error in the training -loop, [fake quantization](../api_guides/python/array_ops.md#Fake_quantization) nodes simulate the -effect of quantization in the forward and backward passes. - -Since it's difficult to add these fake quantization operations to all the -required locations in the model, there's a function available that rewrites the -training graph. To create a fake quantized training graph: - -``` -# Build forward pass of model. -loss = tf.losses.get_total_loss() - -# Call the training rewrite which rewrites the graph in-place with -# FakeQuantization nodes and folds batchnorm for training. It is -# often needed to fine tune a floating point model for quantization -# with this training tool. When training from scratch, quant_delay -# can be used to activate quantization after training to converge -# with the float graph, effectively fine-tuning the model. -tf.contrib.quantize.create_training_graph(quant_delay=2000000) - -# Call backward pass optimizer as usual. -optimizer = tf.train.GradientDescentOptimizer(learning_rate) -optimizer.minimize(loss) -``` - -The rewritten *eval graph* is non-trivially different from the *training graph* -since the quantization ops affect the batch normalization step. 
Because of this, -we've added a separate rewrite for the *eval graph*: - -``` -# Build eval model -logits = tf.nn.softmax_cross_entropy_with_logits_v2(...) - -# Call the eval rewrite which rewrites the graph in-place with -# FakeQuantization nodes and fold batchnorm for eval. -tf.contrib.quantize.create_eval_graph() - -# Save the checkpoint and eval graph proto to disk for freezing -# and providing to TFLite. -with open(eval_graph_file, ‘w’) as f: - f.write(str(g.as_graph_def())) -saver = tf.train.Saver() -saver.save(sess, checkpoint_name) -``` - -Methods to rewrite the training and eval graphs are an active area of research -and experimentation. Although rewrites and quantized training might not work or -improve performance for all models, we are working to generalize these -techniques. - -## Generating fully quantized models - -The previously demonstrated after-rewrite eval graph only *simulates* -quantization. To generate real fixed point computations from a trained -quantization model, convert it to a fixed point kernel. Tensorflow Lite supports -this conversion from the graph resulting from `create_eval_graph`. 
- -First, create a frozen graph that will be the input for the TensorFlow Lite -toolchain: - -``` -bazel build tensorflow/python/tools:freeze_graph && \ - bazel-bin/tensorflow/python/tools/freeze_graph \ - --input_graph=eval_graph_def.pb \ - --input_checkpoint=checkpoint \ - --output_graph=frozen_eval_graph.pb --output_node_names=outputs -``` - -Provide this to the TensorFlow Lite Optimizing Converter (TOCO) to get a fully -quantized TensorFlow Lite model: - -``` -bazel build tensorflow/contrib/lite/toco:toco && \ - ./bazel-bin/third_party/tensorflow/contrib/lite/toco/toco \ - --input_file=frozen_eval_graph.pb \ - --output_file=tflite_model.tflite \ - --input_format=TENSORFLOW_GRAPHDEF --output_format=TFLITE \ - --inference_type=QUANTIZED_UINT8 \ - --input_shape="1,224,224,3" \ - --input_array=input \ - --output_array=outputs \ - --std_value=127.5 --mean_value=127.5 -``` - -See the documentation for `tf.contrib.quantize` and -[TensorFlow Lite](/mobile/tflite/). - -## Quantized accuracy - -Fixed point [MobileNet](https://arxiv.org/abs/1704.04861) models are released with -8-bit weights and activations. Using the rewriters, these models achieve the -Top-1 accuracies listed in Table 1. For comparison, the floating point accuracies -are listed for the same models. The code used to generate these models -[is available](https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet_v1.md) -along with links to all of the pretrained mobilenet_v1 models. - -
- - - - - - - - - - - - - - - - - - - - - - - -
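| Image Size | Depth | Top-1 Accuracy: Floating point | Top-1 Accuracy: Fixed point: 8 bit weights and activations |
|------------|-------|--------------------------------|------------------------------------------------------------|
| 128 | 0.25 | 0.415 | 0.399 |
| 128 | 0.5 | 0.563 | 0.549 |
| 128 | 0.75 | 0.621 | 0.598 |
| 128 | 1 | 0.652 | 0.64 |
| 160 | 0.25 | 0.455 | 0.435 |
| 160 | 0.5 | 0.591 | 0.577 |
| 160 | 0.75 | 0.653 | 0.639 |
| 160 | 1 | 0.68 | 0.673 |
| 192 | 0.25 | 0.477 | 0.458 |
| 192 | 0.5 | 0.617 | 0.604 |
| 192 | 0.75 | 0.672 | 0.662 |
| 192 | 1 | 0.7 | 0.69 |
| 224 | 0.25 | 0.498 | 0.482 |
| 224 | 0.5 | 0.633 | 0.622 |
| 224 | 0.75 | 0.684 | 0.679 |
| 224 | 1 | 0.709 | 0.697 |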
-
- Table 1: MobileNet Top-1 accuracy on Imagenet Validation dataset. -
-
- -## Representation for quantized tensors - -TensorFlow approaches the conversion of floating-point arrays of numbers into -8-bit representations as a compression problem. Since the weights and activation -tensors in trained neural network models tend to have values that are distributed -across comparatively small ranges (for example, -15 to +15 for weights or -500 to -1000 for image model activations), and since neural nets tend to be robust -when handling noise, the error introduced by quantizing to a small set of values -maintains the precision of the overall results within an acceptable threshold. A -chosen representation must perform fast calculations, especially the large matrix -multiplications that comprise the bulk of the computations while running a model. - -This is represented with two floats that store the overall minimum and maximum -values corresponding to the lowest and highest quantized value. Each entry in the -quantized array represents a float value in that range, distributed linearly -between the minimum and maximum. For example, with a minimum of -10.0 and maximum -of 30.0, and an 8-bit array, the quantized values represent the following: - -
- - - - - -
| Quantized | Float |
|-----------|-------|
| 0 | -10.0 |
| 128 | 10.0 |
| 255 | 30.0 |
-
- Table 2: Example quantized value range -
-
- -The advantages of this representation format are: - -* It efficiently represents an arbitrary magnitude of ranges. -* The values don't have to be symmetrical. -* The format represents both signed and unsigned values. -* The linear spread makes multiplications straightforward. - -Alternative techniques use lower bit depths by non-linearly distributing the -float values across the representation, but currently are more expensive in terms -of computation time. (See Han et al., -[2016](https://arxiv.org/abs/1510.00149).) - -The advantage of having a clear definition of the quantized format is that it's -always possible to convert back and forth from fixed-point to floating-point for -operations that aren't quantization-ready, or to inspect the tensors for -debugging. diff --git a/tensorflow/docs_src/performance/xla/broadcasting.md b/tensorflow/docs_src/performance/xla/broadcasting.md deleted file mode 100644 index 7018ded53f..0000000000 --- a/tensorflow/docs_src/performance/xla/broadcasting.md +++ /dev/null @@ -1,204 +0,0 @@ -# Broadcasting semantics - -This document describes how the broadcasting semantics in XLA work. - -## What is broadcasting? - -Broadcasting is the process of making arrays with different shapes have -compatible shapes for arithmetic operations. The terminology is borrowed from -Numpy -[(broadcasting)](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html). - -Broadcasting may be required for operations between multi-dimensional arrays of -different ranks, or between multi-dimensional arrays with different but -compatible shapes. Consider the addition `X+v` where `X` is a matrix (an array -of rank 2) and `v` is a vector (an array of rank 1). To perform element-wise -addition, XLA needs to "broadcast" the vector `v` to the same rank as the -matrix `X`, by replicating `v` a certain number of times. The vector's length -has to match at least one of the dimensions of the matrix. 
- -For example: - - |1 2 3| + |7 8 9| - |4 5 6| - -The matrix's dimensions are (2,3), the vector's are (3). The vector is broadcast -by replicating it over rows to get: - - |1 2 3| + |7 8 9| = |8 10 12| - |4 5 6| |7 8 9| |11 13 15| - -In Numpy, this is called [broadcasting] -(http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html). - -## Principles - -The XLA language is as strict and explicit as possible, avoiding implicit and -"magical" features. Such features may make some computations slightly easier to -define, at the cost of more assumptions baked into user code that will be -difficult to change in the long term. If necessary, implicit and magical -features can be added in client-level wrappers. - -In regards to broadcasting, explicit broadcasting specifications on operations -between arrays of different ranks is required. This is different from Numpy, -which infers the specification when possible. - -## Broadcasting a lower-rank array onto a higher-rank array - -*Scalars* can always be broadcast over arrays without an explicit specification -of broadcasting dimensions. An element-wise binary operation between a scalar -and an array means applying the operation with the scalar for each element in -the array. For example, adding a scalar to a matrix means producing a matrix -each element of which is a sum of the scalar with the corresponding input -matrix's element. - - |1 2 3| + 7 = |8 9 10| - |4 5 6| |11 12 13| - -Most broadcasting needs can be captured by using a tuple of dimensions on a -binary operation. When the inputs to the operation have different ranks, this -broadcasting tuple specifies which dimension(s) in the **higher-rank** array to -match with the **lower-rank** array. - -Consider the previous example, instead of adding a scalar to a (2,3) matrix, add -a vector of dimension (3) to a matrix of dimensions (2,3). 
*Without specifying -broadcasting, this operation is invalid.* To correctly request matrix-vector -addition, specify the broadcasting dimension to be (1), meaning the vector's -dimension is matched to dimension 1 of the matrix. In 2D, if dimension 0 is -considered as rows and dimension 1 as columns, this means that each element of -the vector becomes a column of a size matching the number of rows in the matrix: - - |7 8 9| ==> |7 8 9| - |7 8 9| - -As a more complex example, consider adding a 3-element vector (dimension (3)) to -a 3x3 matrix (dimensions (3,3)). There are two ways broadcasting can happen for -this example: - -(1) A broadcasting dimension of 1 can be used. Each vector element becomes a -column and the vector is duplicated for each row in the matrix. - - |7 8 9| ==> |7 8 9| - |7 8 9| - |7 8 9| - -(2) A broadcasting dimension of 0 can be used. Each vector element becomes a row -and the vector is duplicated for each column in the matrix. - - |7| ==> |7 7 7| - |8| |8 8 8| - |9| |9 9 9| - -> Note: when adding a 2x3 matrix to a 3-element vector, a broadcasting dimension -> of 0 is invalid. - -The broadcasting dimensions can be a tuple that describes how a smaller rank -shape is broadcast into a larger rank shape. For example, given a 2x3x4 cuboid -and a 3x4 matrix, a broadcasting tuple (1,2) means matching the matrix to -dimensions 1 and 2 of the cuboid. - -This type of broadcast is used in the binary ops in `XlaBuilder`, if the -`broadcast_dimensions` argument is given. For example, see -[XlaBuilder::Add](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.cc). -In the XLA source code, this type of broadcasting is sometimes called "InDim" -broadcasting. - -### Formal definition - -The broadcasting attribute allows matching a lower-rank array to a higher-rank -array, by specifying which dimensions of the higher-rank array to match. 
For -example, for an array with dimensions MxNxPxQ, a vector with dimension T can be -matched as follows: - - MxNxPxQ - - dim 3: T - dim 2: T - dim 1: T - dim 0: T - -In each case, T has to be equal to the matching dimension of the higher-rank -array. The vector's values are then broadcast from the matched dimension to all -the other dimensions. - -To match a TxV matrix onto the MxNxPxQ array, a pair of broadcasting dimensions -are used: - - MxNxPxQ - dim 2,3: T V - dim 1,2: T V - dim 0,3: T V - etc... - -The order of dimensions in the broadcasting tuple has to be the order in which -the lower-rank array's dimensions are expected to match the higher-rank array's -dimensions. The first element in the tuple says which dimension in the -higher-rank array has to match dimension 0 in the lower-rank array. The second -element for dimension 1, and so on. The order of broadcast dimensions has to be -strictly increasing. For example, in the previous example it is illegal to match -V to N and T to P; it is also illegal to match V to both P and N. - -## Broadcasting similar-rank arrays with degenerate dimensions - -A related broadcasting problem is broadcasting two arrays that have the same -rank but different dimension sizes. Similarly to Numpy's rules, this is only -possible when the arrays are *compatible*. Two arrays are compatible when all -their dimensions are compatible. Two dimensions are compatible if: - -* They are equal, or -* One of them is 1 (a "degenerate" dimension) - -When two compatible arrays are encountered, the result shape has the maximum -among the two inputs at every dimension index. - -Examples: - -1. (2,1) and (2,3) broadcast to (2,3). -2. (1,2,5) and (7,2,5) broadcast to (7,2,5) -3. (7,2,5) and (7,1,5) broadcast to (7,2,5) -4. (7,2,5) and (7,2,6) are incompatible and cannot be broadcast. - -A special case arises, and is also supported, where each of the input arrays has -a degenerate dimension at a different index. 
In this case, the result is an -"outer operation": (2,1) and (1,3) broadcast to (2,3). For more examples, -consult the [Numpy documentation on -broadcasting](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html). - -## Broadcast composition - -Broadcasting of a lower-rank array to a higher-rank array **and** broadcasting -using degenerate dimensions can both be performed in the same binary operation. -For example, a vector of size 4 and a matrix of size 1x2 can be added together -using broadcast dimensions value of (0): - - |1 2 3 4| + [5 6] // [5 6] is a 1x2 matrix, not a vector. - -First the vector is broadcast up to rank 2 (matrix) using the broadcast -dimensions. The single value (0) in the broadcast dimensions indicates that -dimension zero of the vector matches to dimension zero of the matrix. This -produces a matrix of size 4xM where the value M is chosen to match the -corresponding dimension size in the 1x2 array. Therefore, a 4x2 matrix is -produced: - - |1 1| + [5 6] - |2 2| - |3 3| - |4 4| - -Then "degenerate dimension broadcasting" broadcasts dimension zero of the 1x2 -matrix to match the corresponding dimension size of the left-hand side: - - |1 1| + |5 6| |6 7| - |2 2| + |5 6| = |7 8| - |3 3| + |5 6| |8 9| - |4 4| + |5 6| |9 10| - -A more complicated example is a matrix of size 1x2 added to an array of size -4x3x1 using broadcast dimensions of (1, 2). First the 1x2 matrix is broadcast up -to rank 3 using the broadcast dimensions to produce an intermediate Mx1x2 array -where the dimension size M is determined by the size of the larger operand (the -4x3x1 array) producing a 4x1x2 intermediate array. The M is at dimension 0 -(left-most dimension) because the dimensions 1 and 2 are mapped to the -dimensions of the original 1x2 matrix as the broadcast dimensions are (1, 2). -This intermediate array can be added to the 4x3x1 matrix using broadcasting of -degenerate dimensions to produce a 4x3x2 array result. 
diff --git a/tensorflow/docs_src/performance/xla/developing_new_backend.md b/tensorflow/docs_src/performance/xla/developing_new_backend.md deleted file mode 100644 index 840f6983c2..0000000000 --- a/tensorflow/docs_src/performance/xla/developing_new_backend.md +++ /dev/null @@ -1,77 +0,0 @@ -# Developing a new backend for XLA - -This preliminary guide is for early adopters that want to easily retarget -TensorFlow to their hardware in an efficient manner. The guide is not -step-by-step and assumes knowledge of [LLVM](http://llvm.org), -[Bazel](https://bazel.build/), and TensorFlow. - -XLA provides an abstract interface that a new architecture or accelerator can -implement to create a backend to run TensorFlow graphs. Retargeting XLA should -be significantly simpler and scalable than implementing every existing -TensorFlow Op for new hardware. - -Most implementations will fall into one of the following scenarios: - -1. Existing CPU architecture not yet officially supported by XLA, with or - without an existing [LLVM](http://llvm.org) backend. -2. Non-CPU-like hardware with an existing LLVM backend. -3. Non-CPU-like hardware without an existing LLVM backend. - -> Note: An LLVM backend can mean either one of the officially released LLVM -> backends or a custom LLVM backend developed in-house. - -## Scenario 1: Existing CPU architecture not yet officially supported by XLA - -In this scenario, start by looking at the existing [XLA CPU backend] -(https://www.tensorflow.org/code/tensorflow/compiler/xla/service/cpu/). -XLA makes it easy to retarget TensorFlow to different CPUs by using LLVM, since -the main difference between XLA backends for CPUs is the code generated by LLVM. -Google tests XLA for x64 and ARM64 architectures. - -If the hardware vendor has an LLVM backend for their hardware, it is simple to -link the backend with the LLVM built with XLA. In JIT mode, the XLA CPU backend -emits code for the host CPU. 
For ahead-of-time compilation, -[`xla::AotCompilationOptions`](https://www.tensorflow.org/code/tensorflow/compiler/xla/service/compiler.h) -can provide an LLVM triple to configure the target architecture. - -If there is no existing LLVM backend but another kind of code generator exists, -it should be possible to reuse most of the existing CPU backend. - -## Scenario 2: Non-CPU-like hardware with an existing LLVM backend - -It is possible to model a new -[`xla::Compiler`](https://www.tensorflow.org/code/tensorflow/compiler/xla/service/compiler.h) -implementation on the existing [`xla::CPUCompiler`] -(https://www.tensorflow.org/code/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc) -and [`xla::GPUCompiler`] -(https://www.tensorflow.org/code/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc) -classes, since these already emit LLVM IR. Depending on the nature of the -hardware, it is possible that many of the LLVM IR generation aspects will have -to be changed, but a lot of code can be shared with the existing backends. - -A good example to follow is the [GPU backend] -(https://www.tensorflow.org/code/tensorflow/compiler/xla/service/gpu/) -of XLA. The GPU backend targets a non-CPU-like ISA, and therefore some aspects -of its code generation are unique to the GPU domain. Other kinds of hardware, -e.g. DSPs like Hexagon (which has an upstream LLVM backend), can reuse parts of -the LLVM IR emission logic, but other parts will be unique. - -## Scenario 3: Non-CPU-like hardware without an existing LLVM backend - -If it is not possible to utilize LLVM, then the best option is to implement a -new backend for XLA for the desired hardware. This option requires the most -effort. The classes that need to be implemented are as follows: - -* [`StreamExecutor`](https://www.tensorflow.org/code/tensorflow/stream_executor/stream_executor.h): - For many devices not all methods of `StreamExecutor` are needed. See - existing `StreamExecutor` implementations for details. 
-* [`xla::Compiler`](https://www.tensorflow.org/code/tensorflow/compiler/xla/service/compiler.h): - This class encapsulates the compilation of an HLO computation into an - `xla::Executable`. -* [`xla::Executable`](https://www.tensorflow.org/code/tensorflow/compiler/xla/service/executable.h): - This class is used to launch a compiled computation on the platform. -* [`xla::TransferManager`](https://www.tensorflow.org/code/tensorflow/compiler/xla/service/transfer_manager.h): - This class enables backends to provide platform-specific mechanisms for - constructing XLA literal data from given device memory handles. In other - words, it helps encapsulate the transfer of data from the host to the device - and back. diff --git a/tensorflow/docs_src/performance/xla/index.md b/tensorflow/docs_src/performance/xla/index.md deleted file mode 100644 index 770737c34c..0000000000 --- a/tensorflow/docs_src/performance/xla/index.md +++ /dev/null @@ -1,98 +0,0 @@ -# XLA Overview - -
- -
- -> Note: XLA is experimental and considered alpha. Most use cases will not -> see improvements in performance (speed or decreased memory usage). We have -> released XLA early so the Open Source Community can contribute to its -> development, as well as create a path for integration with hardware -> accelerators. - -XLA (Accelerated Linear Algebra) is a domain-specific compiler for linear -algebra that optimizes TensorFlow computations. The results are improvements in -speed, memory usage, and portability on server and mobile platforms. Initially, -most users will not see large benefits from XLA, but are welcome to experiment -by using XLA via [just-in-time (JIT) compilation](../../performance/xla/jit.md) or [ahead-of-time (AOT) compilation](../../performance/xla/tfcompile.md). Developers targeting new hardware accelerators are -especially encouraged to try out XLA. - -The XLA framework is experimental and in active development. In particular, -while it is unlikely that the semantics of existing operations will change, it -is expected that more operations will be added to cover important use cases. The -team welcomes feedback from the community about missing functionality and -community contributions via GitHub. - -## Why did we build XLA? - -We had several objectives for XLA to work with TensorFlow: - -* *Improve execution speed.* Compile subgraphs to reduce the execution time of - short-lived Ops to eliminate overhead from the TensorFlow runtime, fuse - pipelined operations to reduce memory overhead, and specialize to known - tensor shapes to allow for more aggressive constant propagation. - -* *Improve memory usage.* Analyze and schedule memory usage, in principle - eliminating many intermediate storage buffers. - -* *Reduce reliance on custom Ops.* Remove the need for many custom Ops by - improving the performance of automatically fused low-level Ops to match the - performance of custom Ops that were fused by hand. 
- -* *Reduce mobile footprint.* Eliminate the TensorFlow runtime by ahead-of-time - compiling the subgraph and emitting an object/header file pair that can be - linked directly into another application. The results can reduce the - footprint for mobile inference by several orders of magnitude. - -* *Improve portability.* Make it relatively easy to write a new backend for - novel hardware, at which point a large fraction of TensorFlow programs will - run unmodified on that hardware. This is in contrast with the approach of - specializing individual monolithic Ops for new hardware, which requires - TensorFlow programs to be rewritten to make use of those Ops. - -## How does XLA work? - -The input language to XLA is called "HLO IR", or just HLO (High Level -Optimizer). The semantics of HLO are described on the -[Operation Semantics](../../performance/xla/operation_semantics.md) page. It -is most convenient to think of HLO as a [compiler -IR](https://en.wikipedia.org/wiki/Intermediate_representation). - -XLA takes graphs ("computations") defined in HLO and compiles them into machine -instructions for various architectures. XLA is modular in the sense that it is -easy to slot in an alternative backend to [target some novel HW architecture](../../performance/xla/developing_new_backend.md). The CPU backend for x64 and ARM64 as -well as the NVIDIA GPU backend are in the TensorFlow source tree. - -The following diagram shows the compilation process in XLA: - -
- -
- -XLA comes with several optimizations and analysis passes that are -target-independent, such as -[CSE](https://en.wikipedia.org/wiki/Common_subexpression_elimination), -target-independent operation fusion, and buffer analysis for allocating runtime -memory for the computation. - -After the target-independent step, XLA sends the HLO computation to a backend. -The backend can perform further HLO-level optimizations, this time with target -specific information and needs in mind. For example, the XLA GPU backend may -perform operation fusion beneficial specifically for the GPU programming model -and determine how to partition the computation into streams. At this stage, -backends may also pattern-match certain operations or combinations thereof to -optimized library calls. - -The next step is target-specific code generation. The CPU and GPU backends -included with XLA use [LLVM](http://llvm.org) for low-level IR, optimization, -and code-generation. These backends emit the LLVM IR necessary to represent the -XLA HLO computation in an efficient manner, and then invoke LLVM to emit native -code from this LLVM IR. - -The GPU backend currently supports NVIDIA GPUs via the LLVM NVPTX backend; the -CPU backend supports multiple CPU ISAs. - -## Supported Platforms - -XLA currently supports [JIT compilation](../../performance/xla/jit.md) on x86-64 and NVIDIA GPUs; and -[AOT compilation](../../performance/xla/tfcompile.md) for x86-64 and ARM. diff --git a/tensorflow/docs_src/performance/xla/jit.md b/tensorflow/docs_src/performance/xla/jit.md deleted file mode 100644 index 83b3e71566..0000000000 --- a/tensorflow/docs_src/performance/xla/jit.md +++ /dev/null @@ -1,169 +0,0 @@ -# Using JIT Compilation - -> Note: TensorFlow must be compiled from source to include XLA. - -## Why use just-in-time (JIT) compilation? - -The TensorFlow/XLA JIT compiler compiles and runs parts of TensorFlow graphs via -XLA. 
The benefit of this over the standard TensorFlow implementation is that XLA -can fuse multiple operators (kernel fusion) into a small number of compiled -kernels. Fusing operators can reduce memory bandwidth requirements and improve -performance compared to executing operators one-at-a-time, as the TensorFlow -executor does. - -## Running TensorFlow graphs via XLA - -There are two ways to run TensorFlow computations via XLA, either by -JIT-compiling operators placed on a CPU or GPU device, or by placing operators -on the `XLA_CPU` or `XLA_GPU` TensorFlow devices. Placing operators directly on -a TensorFlow XLA device forces the operator to run on that device and is mainly -used for testing. - -> Note: The XLA CPU backend supports intra-op parallelism (i.e. it can shard a -> single operation across multiple cores) but it does not support inter-op -> parallelism (i.e. it cannot execute independent operations concurrently across -> multiple cores). The XLA GPU backend is competitive with the standard -> TensorFlow implementation, sometimes faster, sometimes slower. - -### Turning on JIT compilation - -JIT compilation can be turned on at the session level or manually for select -operations. Both of these approaches are zero-copy --- data does not need to be -copied when passing data between a compiled XLA kernel and a TensorFlow operator -placed on the same device. - -#### Session - -Turning on JIT compilation at the session level will result in all possible -operators being greedily compiled into XLA computations. Each XLA computation -will be compiled into one or more kernels for the underlying device. - -Subject to a few constraints, if there are two adjacent operators in the graph -that both have XLA implementations, then they will be compiled into a single XLA -computation. - -JIT compilation is turned on at the session level by setting the -`global_jit_level` config to `tf.OptimizerOptions.ON_1` and passing the config -during session initialization. 
- -```python -# Config to turn on JIT compilation -config = tf.ConfigProto() -config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1 - -sess = tf.Session(config=config) -``` - -> Note: Turning on JIT at the session level will not result in operations being -> compiled for the CPU. JIT compilation for CPU operations must be done via -> the manual method documented below. - -#### Manual - -JIT compilation can also be turned on manually for one or more operators. This -is done by tagging the operators to compile with the attribute -`_XlaCompile=true`. The simplest way to do this is via the -`tf.contrib.compiler.jit.experimental_jit_scope()` scope defined in -[`tensorflow/contrib/compiler/jit.py`](https://www.tensorflow.org/code/tensorflow/contrib/compiler/jit.py). -Example usage: - -```python - jit_scope = tf.contrib.compiler.jit.experimental_jit_scope - - x = tf.placeholder(np.float32) - with jit_scope(): - y = tf.add(x, x) # The "add" will be compiled with XLA. -``` - -The `_XlaCompile` attribute is currently supported on a best-effort basis. If an -operator cannot be compiled, TensorFlow will silently fall back to the normal -implementation. - -### Placing operators on XLA devices - -Another way to run computations via XLA is to place an operator on a specific -XLA device. This method is normally only used for testing. Valid targets are -`XLA_CPU` or `XLA_GPU`. - -```python -with tf.device("/job:localhost/replica:0/task:0/device:XLA_GPU:0"): - output = tf.add(input1, input2) -``` - -Unlike JIT compilation on the standard CPU and GPU devices, these devices make a -copy of data when it is transferred on and off the device. The extra copy makes -it expensive to mix XLA and TensorFlow operators in the same graph. - -## Tutorial - -This tutorial covers training a simple version of MNIST softmax with JIT turned -on. Currently JIT at the session level, which is what is used for the tutorial, -only supports GPU. 
- -Before starting the tutorial, verify that the `LD_LIBRARY_PATH` environment variable or -ldconfig contains `$CUDA_ROOT/extras/CUPTI/lib64`, which contains libraries for -the CUDA Profiling Tools Interface [(CUPTI)](http://docs.nvidia.com/cuda/cupti/index.html). -TensorFlow uses CUPTI to pull tracing information from the GPU. - -### Step #1: Prepare sample script - -Download or move -[mnist_softmax_xla.py](https://www.tensorflow.org/code/tensorflow/examples/tutorials/mnist/mnist_softmax_xla.py) -into a folder outside of the TensorFlow source tree. - -### Step #2: Run without XLA - -Execute the Python script to train the model without XLA. - -```shell -python mnist_softmax_xla.py --xla='' -``` - -Using the Chrome Trace Event Profiler (browse to chrome://tracing), -open the timeline file created when the script finishes: `timeline.ctf.json`. -The rendered timeline should look similar to the picture below with multiple -green boxes labeled `MatMul`, possibly across multiple CPUs. -
- -
- -### Step #3: Run with XLA - -Execute the Python script to train the model with XLA and turn on a debugging -feature of XLA via an environment variable that outputs the XLA graph. - -```shell -TF_XLA_FLAGS="--xla_hlo_graph_path=/tmp --xla_generate_hlo_graph=.*" python mnist_softmax_xla.py -``` - -Open the timeline file created (`timeline.ctf.json`). The rendered timeline -should look similar to the picture below with one long bar labeled `XlaLaunch`. -
- -
- -To understand what is happening in `XlaLaunch`, look at the console output for -statements similar to the following: - -```shell -computation cluster_0[_XlaCompiledKernel=true,_XlaNumConstantArgs=1].v82 [CPU: -pipeline start, before inline]: /tmp/hlo_graph_0.dot - -``` - -The console statements point to the location of `hlo_graph_xx.dot` files that -contain information about the graph created by XLA. The process that XLA takes -to fuse Ops is visible by starting at `hlo_graph_0.dot` and viewing each diagram -in succession. - -To render a .dot file into a PNG, install -[GraphViz](https://www.graphviz.org/download/) and run: - -```shell -dot -Tpng hlo_graph_80.dot -o hlo_graph_80.png -``` - -The result will look like the following: -
- -
diff --git a/tensorflow/docs_src/performance/xla/operation_semantics.md b/tensorflow/docs_src/performance/xla/operation_semantics.md deleted file mode 100644 index 96d269bec4..0000000000 --- a/tensorflow/docs_src/performance/xla/operation_semantics.md +++ /dev/null @@ -1,2426 +0,0 @@ -# Operation Semantics - -The following describes the semantics of operations defined in the -[`XlaBuilder`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h) -interface. Typically, these operations map one-to-one to operations defined in -the RPC interface in -[`xla_data.proto`](https://www.tensorflow.org/code/tensorflow/compiler/xla/xla_data.proto). - -A note on nomenclature: the generalized data type XLA deals with is an -N-dimensional array holding elements of some uniform type (such as 32-bit -float). Throughout the documentation, *array* is used to denote an -arbitrary-dimensional array. For convenience, special cases have more specific -and familiar names; for example a *vector* is a 1-dimensional array and a -*matrix* is a 2-dimensional array. - -## AllToAll - -See also -[`XlaBuilder::AllToAll`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h). - -Alltoall is a collective operation that sends data from all cores to all cores. -It has two phases: - -1. the scatter phase. On each core, the operand is split into `split_count` - number of blocks along the `split_dimensions`, and the blocks are scattered - to all cores, e.g., the ith block is send to the ith core. -2. the gather phase. Each core concatenates the received blocks along the - `concat_dimension`. - -The participating cores can be configured by: - -- `replica_groups`: each ReplicaGroup contains a list of replica id. If empty, - all replicas belong to one group in the order of 0 - (n-1). Alltoall will be - applied within subgroups in the specified order. 
For example, replica - groups = {{1,2,3},{4,5,0}} means, an Alltoall will be applied within replica - 1, 2, 3, and in the gather phase, the received blocks will be concatenated - in the order of 1, 2, 3; another Alltoall will be applied within replica 4, - 5, 0, and the concatenation order is 4, 5, 0. - -Prerequisites: - -- The dimension size of the operand on the split_dimension is divisible by - split_count. -- The operand's shape is not tuple. - - `AllToAll(operand, split_dimension, concat_dimension, split_count, -replica_groups)` - - -| Arguments | Type | Semantics | -| ------------------ | --------------------- | ------------------------------- | -| `operand` | `XlaOp` | n dimensional input array | -| `split_dimension` | `int64` | A value in the interval `[0, | -: : : n)` that names the dimension : -: : : along which the operand is : -: : : split : -| `concat_dimension` | `int64` | a value in the interval `[0, | -: : : n)` that names the dimension : -: : : along which the split blocks : -: : : are concatenated : -| `split_count` | `int64` | the number of cores that | -: : : participate this operation. If : -: : : `replica_groups` is empty, this : -: : : should be the number of : -: : : replicas; otherwise, this : -: : : should be equal to the number : -: : : of replicas in each group. : -| `replica_groups` | `ReplicaGroup` vector | each group contains a list of | -: : : replica id. : - -Below shows an example of Alltoall. - -``` -XlaBuilder b("alltoall"); -auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {4, 16}), "x"); -AllToAll(x, /*split_dimension=*/1, /*concat_dimension=*/0, /*split_count=*/4); -``` - -
- -
- -In this example, there are 4 cores participating in the Alltoall. On each core, the -operand is split into 4 parts along dimension 1, so each part has shape -f32[4,4]. The 4 parts are scattered to all cores. Then each core concatenates -the received parts along dimension 0, in the order of cores 0-3. So the output on -each core has shape f32[16,4]. - -## BatchNormGrad - -See also -[`XlaBuilder::BatchNormGrad`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h) -and [the original batch normalization paper](https://arxiv.org/abs/1502.03167) -for a detailed description of the algorithm. - -Calculates gradients of batch norm. - - `BatchNormGrad(operand, scale, mean, variance, grad_output, epsilon, feature_index)` - -| Arguments | Type | Semantics | -| --------------- | ----------------------- | -------------------------------- | -| `operand` | `XlaOp` | n dimensional array to be | -: : : normalized (x) : -| `scale` | `XlaOp` | 1 dimensional array | -: : : (\\(\gamma\\)) : -| `mean` | `XlaOp` | 1 dimensional array (\\(\mu\\)) | -| `variance` | `XlaOp` | 1 dimensional array | -: : : (\\(\sigma^2\\)) : -| `grad_output` | `XlaOp` | Gradients passed to | -: : : `BatchNormTraining` : -: : : (\\( \nabla y\\)) : -| `epsilon` | `float` | Epsilon value (\\(\epsilon\\)) | -| `feature_index` | `int64` | Index to feature dimension in | -: : : `operand` : - -For each feature in the feature dimension (`feature_index` is the index for the -feature dimension in `operand`), the operation calculates the gradients with -respect to `operand`, `offset` and `scale` across all the other dimensions. The -`feature_index` must be a valid index for the feature dimension in `operand`.
- -The three gradients are defined by the following formulas (assuming a -4-dimensional tensor as `operand` and with feature dimension index \\(l\\), -batch size `m` and spatial sizes `w` and `h`): - -\\[ \begin{split} c_l&= -\frac{1}{mwh}\sum_{i=1}^m\sum_{j=1}^w\sum_{k=1}^h -\left( \nabla y_{ijkl} \frac{x_{ijkl} - \mu_l}{\sigma^2_l+\epsilon} \right) -\\\\ -\nabla x_{ijkl} &= \frac{\gamma_{l}}{\sqrt{\sigma^2_{l}+\epsilon}} -\left( \nabla y_{ijkl} - \mathrm{mean}(\nabla y) - c_l (x_{ijkl} - \mu_{l}) -\right) -\\\\ -\nabla \gamma_l &= \sum_{i=1}^m\sum_{j=1}^w\sum_{k=1}^h \left( \nabla y_{ijkl} -\frac{x_{ijkl} - \mu_l}{\sqrt{\sigma^2_{l}+\epsilon}} \right) -\\\\ -\nabla \beta_l &= \sum_{i=1}^m\sum_{j=1}^w\sum_{k=1}^h \nabla y_{ijkl} -\end{split} \\] - -The inputs `mean` and `variance` represent moment values -across batch and spatial dimensions. - -The output type is a tuple of three handles: - -| Outputs | Type | Semantics | -| ------------- | ----------------------- | --------------------------------- | -| `grad_operand` | `XlaOp` | gradient with respect to input | -: : : `operand` (\\( \nabla x\\)) : -| `grad_scale` | `XlaOp` | gradient with respect to input | -: : : `scale` (\\( \nabla \gamma\\)) : -| `grad_offset` | `XlaOp` | gradient with respect to input | -: : : `offset` (\\( \nabla \beta\\)) : - -## BatchNormInference - -See also -[`XlaBuilder::BatchNormInference`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h) -and [the original batch normalization paper](https://arxiv.org/abs/1502.03167) -for a detailed description of the algorithm. - -Normalizes an array across batch and spatial dimensions.
- - `BatchNormInference(operand, scale, offset, mean, variance, epsilon, feature_index)` - -Arguments | Type | Semantics ---------------- | ------- | --------------------------------------- -`operand` | `XlaOp` | n dimensional array to be normalized -`scale` | `XlaOp` | 1 dimensional array -`offset` | `XlaOp` | 1 dimensional array -`mean` | `XlaOp` | 1 dimensional array -`variance` | `XlaOp` | 1 dimensional array -`epsilon` | `float` | Epsilon value -`feature_index` | `int64` | Index to feature dimension in `operand` - -For each feature in the feature dimension (`feature_index` is the index for the -feature dimension in `operand`), the operation calculates the mean and variance -across all the other dimensions and uses the mean and variance to normalize each -element in `operand`. The `feature_index` must be a valid index for the feature -dimension in `operand`. - -`BatchNormInference` is equivalent to calling `BatchNormTraining` without -computing `mean` and `variance` for each batch. It uses the input `mean` and -`variance` instead as estimated values. The purpose of this op is to reduce -latency in inference, hence the name `BatchNormInference`. - -The output is an n-dimensional, normalized array with the same shape as input -`operand`. - -## BatchNormTraining - -See also -[`XlaBuilder::BatchNormTraining`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h) -and [`the original batch normalization paper`](https://arxiv.org/abs/1502.03167) -for a detailed description of the algorithm. - -Normalizes an array across batch and spatial dimensions. 
- - `BatchNormTraining(operand, scale, offset, epsilon, feature_index)` - -Arguments | Type | Semantics ---------------- | ------- | ---------------------------------------- -`operand` | `XlaOp` | n dimensional array to be normalized (x) -`scale` | `XlaOp` | 1 dimensional array (\\(\gamma\\)) -`offset` | `XlaOp` | 1 dimensional array (\\(\beta\\)) -`epsilon` | `float` | Epsilon value (\\(\epsilon\\)) -`feature_index` | `int64` | Index to feature dimension in `operand` - -For each feature in the feature dimension (`feature_index` is the index for the -feature dimension in `operand`), the operation calculates the mean and variance -across all the other dimensions and uses the mean and variance to normalize each -element in `operand`. The `feature_index` must be a valid index for the feature -dimension in `operand`. - -The algorithm goes as follows for each batch in `operand` \\(x\\) that -contains `m` elements with `w` and `h` as the size of spatial dimensions -(assuming `operand` is a 4-dimensional array): - -- Calculates batch mean \\(\mu_l\\) for each feature `l` in feature dimension: -\\(\mu_l=\frac{1}{mwh}\sum_{i=1}^m\sum_{j=1}^w\sum_{k=1}^h x_{ijkl}\\) - -- Calculates batch variance \\(\sigma^2_l\\): -\\(\sigma^2_l=\frac{1}{mwh}\sum_{i=1}^m\sum_{j=1}^w\sum_{k=1}^h (x_{ijkl} - \mu_l)^2\\) - -- Normalizes, scales and shifts: -\\(y_{ijkl}=\frac{\gamma_l(x_{ijkl}-\mu_l)}{\sqrt[2]{\sigma^2_l+\epsilon}}+\beta_l\\) - -The epsilon value, usually a small number, is added to avoid divide-by-zero errors.
- -The output type is a tuple of three `XlaOp`s: - -| Outputs | Type | Semantics | -| ------------ | ----------------------- | -------------------------------------| -| `output` | `XlaOp` | n dimensional array with the same | -: : : shape as input `operand` (y) : -| `batch_mean` | `XlaOp` | 1 dimensional array (\\(\mu\\)) | -| `batch_var` | `XlaOp` | 1 dimensional array (\\(\sigma^2\\)) | - -The `batch_mean` and `batch_var` are moments calculated across the batch and -spatial dimensions using the formulas above. - -## BitcastConvertType - -See also -[`XlaBuilder::BitcastConvertType`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h). - -Similar to a `tf.bitcast` in TensorFlow, performs an element-wise bitcast -operation from a data shape to a target shape. The dimensions must match, and -the conversion is an element-wise one; e.g. `s32` elements become `f32` elements -via bitcast routine. Bitcast is implemented as a low-level cast, so machines -with different floating-point representations will give different results. - - `BitcastConvertType(operand, new_element_type)` - -Arguments | Type | Semantics ------------------- | --------------- | --------------------------- -`operand` | `XlaOp` | array of type T with dims D -`new_element_type` | `PrimitiveType` | type U - -The dimensions of the operand and the target shape must match. The bit-width of -the source and destination element types must be equal. The source -and destination element types must not be tuples. - -## Broadcast - -See also -[`XlaBuilder::Broadcast`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h). - -Adds dimensions to an array by duplicating the data in the array. 
- - `Broadcast(operand, broadcast_sizes)` - -Arguments | Type | Semantics ------------------ | ------------------- | ------------------------------- -`operand` | `XlaOp` | The array to duplicate -`broadcast_sizes` | `ArraySlice` | The sizes of the new dimensions - -The new dimensions are inserted on the left, i.e. if `broadcast_sizes` has -values `{a0, ..., aN}` and the operand shape has dimensions `{b0, ..., bM}` then -the shape of the output has dimensions `{a0, ..., aN, b0, ..., bM}`. - -The new dimensions index into copies of the operand, i.e. - -``` -output[i0, ..., iN, j0, ..., jM] = operand[j0, ..., jM] -``` - -For example, if `operand` is a scalar `f32` with value `2.0f`, and -`broadcast_sizes` is `{2, 3}`, then the result will be an array with shape -`f32[2, 3]` and all the values in the result will be `2.0f`. - -## Call - -See also -[`XlaBuilder::Call`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h). - -Invokes a computation with the given arguments. - - `Call(computation, args...)` - -| Arguments | Type | Semantics | -| ------------- | ---------------------- | ----------------------------------- | -| `computation` | `XlaComputation` | computation of type `T_0, T_1, ..., | -: : : T_N -> S` with N parameters of : -: : : arbitrary type : -| `args` | sequence of N `XlaOp`s | N arguments of arbitrary type | - -The arity and types of the `args` must match the parameters of the -`computation`. It is allowed to have no `args`. - -## Clamp - -See also -[`XlaBuilder::Clamp`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h). - -Clamps an operand to within the range between a minimum and maximum value. 
- - `Clamp(min, operand, max)` - -Arguments | Type | Semantics ---------- | ------- | --------------- -`min` | `XlaOp` | array of type T -`operand` | `XlaOp` | array of type T -`max` | `XlaOp` | array of type T - -Given an operand and minimum and maximum values, returns the operand if it is in -the range between the minimum and maximum, else returns the minimum value if the -operand is below this range or the maximum value if the operand is above this -range. That is, `clamp(a, x, b) = min(max(a, x), b)`. - -All three arrays must be the same shape. Alternatively, as a restricted form of -[broadcasting](broadcasting.md), `min` and/or `max` can be a scalar of type `T`. - -Example with scalar `min` and `max`: - -``` -let operand: s32[3] = {-1, 5, 9}; -let min: s32 = 0; -let max: s32 = 6; -==> -Clamp(min, operand, max) = s32[3]{0, 5, 6}; -``` - -## Collapse - -See also -[`XlaBuilder::Collapse`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h) -and the `tf.reshape` operation. - -Collapses dimensions of an array into one dimension. - - `Collapse(operand, dimensions)` - -Arguments | Type | Semantics ------------- | -------------- | ----------------------------------------------- -`operand` | `XlaOp` | array of type T -`dimensions` | `int64` vector | in-order, consecutive subset of T's dimensions. - -Collapse replaces the given subset of the operand's dimensions by a single -dimension. The input arguments are an arbitrary array of type T and a -compile-time-constant vector of dimension indices. The dimension indices must be -an in-order (low to high dimension numbers), consecutive subset of T's -dimensions. Thus, {0, 1, 2}, {0, 1}, or {1, 2} are all valid dimension sets, but -{1, 0} or {0, 2} are not. They are replaced by a single new dimension, in the -same position in the dimension sequence as those they replace, with the new -dimension size equal to the product of original dimension sizes. 
The lowest -dimension number in `dimensions` is the slowest varying dimension (most major) -in the loop nest which collapses these dimension, and the highest dimension -number is fastest varying (most minor). See the `tf.reshape` operator -if more general collapse ordering is needed. - -For example, let v be an array of 24 elements: - -``` -let v = f32[4x2x3] {{{10, 11, 12}, {15, 16, 17}}, - {{20, 21, 22}, {25, 26, 27}}, - {{30, 31, 32}, {35, 36, 37}}, - {{40, 41, 42}, {45, 46, 47}}}; - -// Collapse to a single dimension, leaving one dimension. -let v012 = Collapse(v, {0,1,2}); -then v012 == f32[24] {10, 11, 12, 15, 16, 17, - 20, 21, 22, 25, 26, 27, - 30, 31, 32, 35, 36, 37, - 40, 41, 42, 45, 46, 47}; - -// Collapse the two lower dimensions, leaving two dimensions. -let v01 = Collapse(v, {0,1}); -then v01 == f32[4x6] {{10, 11, 12, 15, 16, 17}, - {20, 21, 22, 25, 26, 27}, - {30, 31, 32, 35, 36, 37}, - {40, 41, 42, 45, 46, 47}}; - -// Collapse the two higher dimensions, leaving two dimensions. -let v12 = Collapse(v, {1,2}); -then v12 == f32[8x3] {{10, 11, 12}, - {15, 16, 17}, - {20, 21, 22}, - {25, 26, 27}, - {30, 31, 32}, - {35, 36, 37}, - {40, 41, 42}, - {45, 46, 47}}; - -``` - -## Concatenate - -See also -[`XlaBuilder::ConcatInDim`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h). - -Concatenate composes an array from multiple array operands. The array is of the -same rank as each of the input array operands (which must be of the same rank as -each other) and contains the arguments in the order that they were specified. - - `Concatenate(operands..., dimension)` - -| Arguments | Type | Semantics | -| ----------- | --------------------- | -------------------------------------- | -| `operands` | sequence of N `XlaOp` | N arrays of type T with dimensions | -: : : [L0, L1, ...]. Requires N >= 1. 
: -| `dimension` | `int64` | A value in the interval `[0, N)` that | -: : : names the dimension to be concatenated : -: : : between the `operands`. : - -With the exception of `dimension` all dimensions must be the same. This is -because XLA does not support "ragged" arrays. Also note that rank-0 values -cannot be concatenated (as it's impossible to name the dimension along which the -concatenation occurs). - -1-dimensional example: - -``` -Concat({{2, 3}, {4, 5}, {6, 7}}, 0) ->>> {2, 3, 4, 5, 6, 7} -``` - -2-dimensional example: - -``` -let a = { - {1, 2}, - {3, 4}, - {5, 6}, -}; -let b = { - {7, 8}, -}; -Concat({a, b}, 0) ->>> { - {1, 2}, - {3, 4}, - {5, 6}, - {7, 8}, -} -``` - -Diagram: -
- -
- -## Conditional - -See also -[`XlaBuilder::Conditional`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h). - - `Conditional(pred, true_operand, true_computation, false_operand, -false_computation)` - -Arguments | Type | Semantics -------------------- | ---------------- | --------------------------------- -`pred` | `XlaOp` | Scalar of type `PRED` -`true_operand` | `XlaOp` | Argument of type `T_0` -`true_computation` | `XlaComputation` | XlaComputation of type `T_0 -> S` -`false_operand` | `XlaOp` | Argument of type `T_1` -`false_computation` | `XlaComputation` | XlaComputation of type `T_1 -> S` - -Executes `true_computation` if `pred` is `true`, `false_computation` if `pred` -is `false`, and returns the result. - -The `true_computation` must take in a single argument of type `T_0` and will be -invoked with `true_operand` which must be of the same type. The -`false_computation` must take in a single argument of type `T_1` and will be -invoked with `false_operand` which must be of the same type. The type of the -returned value of `true_computation` and `false_computation` must be the same. - -Note that only one of `true_computation` and `false_computation` will be -executed depending on the value of `pred`. - -## Conv (convolution) - -See also -[`XlaBuilder::Conv`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h). - -As ConvWithGeneralPadding, but the padding is specified in a short-hand way as -either SAME or VALID. SAME padding pads the input (`lhs`) with zeroes so that -the output has the same shape as the input when not taking striding into -account. VALID padding simply means no padding. - -## ConvWithGeneralPadding (convolution) - -See also -[`XlaBuilder::ConvWithGeneralPadding`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h). - -Computes a convolution of the kind used in neural networks. 
Here, a convolution -can be thought of as a n-dimensional window moving across a n-dimensional base -area and a computation is performed for each possible position of the window. - -| Arguments | Type | Semantics | -| --------------------- | -------------------- | ----------------------------- | -| `lhs` | `XlaOp` | rank n+2 array of inputs | -| `rhs` | `XlaOp` | rank n+2 array of kernel | -: : : weights : -| `window_strides` | `ArraySlice` | n-d array of kernel strides | -| `padding` | `ArraySlice< | n-d array of (low, high) | -: : pair>` : padding : -| `lhs_dilation` | `ArraySlice` | n-d lhs dilation factor array | -| `rhs_dilation` | `ArraySlice` | n-d rhs dilation factor array | -| `feature_group_count` | int64 | the number of feature groups | - -Let n be the number of spatial dimensions. The `lhs` argument is a rank n+2 -array describing the base area. This is called the input, even though of course -the rhs is also an input. In a neural network, these are the input activations. -The n+2 dimensions are, in this order: - -* `batch`: Each coordinate in this dimension represents an independent input - for which convolution is carried out. -* `z/depth/features`: Each (y,x) position in the base area has a vector - associated to it, which goes into this dimension. -* `spatial_dims`: Describes the `n` spatial dimensions that define the base - area that the window moves across. - -The `rhs` argument is a rank n+2 array describing the convolutional -filter/kernel/window. The dimensions are, in this order: - -* `output-z`: The `z` dimension of the output. -* `input-z`: The size of this dimension times `feature_group_count` should - equal the size of the `z` dimension in lhs. -* `spatial_dims`: Describes the `n` spatial dimensions that define the n-d - window that moves across the base area. - -The `window_strides` argument specifies the stride of the convolutional window -in the spatial dimensions. 
For example, if the stride in the first spatial -dimension is 3, then the window can only be placed at coordinates where the -first spatial index is divisible by 3. - -The `padding` argument specifies the amount of zero padding to be applied to the -base area. The amount of padding can be negative -- the absolute value of -negative padding indicates the number of elements to remove from the specified -dimension before doing the convolution. `padding[0]` specifies the padding for -dimension `y` and `padding[1]` specifies the padding for dimension `x`. Each -pair has the low padding as the first element and the high padding as the second -element. The low padding is applied in the direction of lower indices while the -high padding is applied in the direction of higher indices. For example, if -`padding[1]` is `(2,3)` then there will be a padding by 2 zeroes on the left and -by 3 zeroes on the right in the second spatial dimension. Using padding is -equivalent to inserting those same zero values into the input (`lhs`) before -doing the convolution. - -The `lhs_dilation` and `rhs_dilation` arguments specify the dilation factor to -be applied to the lhs and rhs, respectively, in each spatial dimension. If the -dilation factor in a spatial dimension is d, then d-1 holes are implicitly -placed between each of the entries in that dimension, increasing the size of the -array. The holes are filled with a no-op value, which for convolution means -zeroes. - -Dilation of the rhs is also called atrous convolution. For more details, see -`tf.nn.atrous_conv2d`. Dilation of the lhs is also called transposed -convolution. For more details, see `tf.nn.conv2d_transpose`. - -The `feature_group_count` argument (default value 1) can be used for grouped -convolutions. `feature_group_count` needs to be a divisor of both the input and -the output feature dimension. 
If `feature_group_count` is greater than 1, it -means that conceptually the input and output feature dimension and the `rhs` -output feature dimension are split evenly into `feature_group_count` many -groups, each group consisting of a consecutive subsequence of features. The -input feature dimension of `rhs` needs to be equal to the `lhs` input feature -dimension divided by `feature_group_count` (so it already has the size of a -group of input features). The i-th groups are used together to compute -`feature_group_count` many separate convolutions. The results of these -convolutions are concatenated together in the output feature dimension. - -For depthwise convolution the `feature_group_count` argument would be set to the -input feature dimension, and the filter would be reshaped from -`[filter_height, filter_width, in_channels, channel_multiplier]` to -`[filter_height, filter_width, 1, in_channels * channel_multiplier]`. For more -details, see `tf.nn.depthwise_conv2d`. - -The output shape has these dimensions, in this order: - -* `batch`: Same size as `batch` on the input (`lhs`). -* `z`: Same size as `output-z` on the kernel (`rhs`). -* `spatial_dims`: One value for each valid placement of the convolutional - window. - -The valid placements of the convolutional window are determined by the strides -and the size of the base area after padding. - -To describe what a convolution does, consider a 2d convolution, and pick some -fixed `batch`, `z`, `y`, `x` coordinates in the output. Then `(y,x)` is a -position of a corner of the window within the base area (e.g. the upper left -corner, depending on how you interpret the spatial dimensions). We now have a 2d -window, taken from the base area, where each 2d point is associated to a 1d -vector, so we get a 3d box. From the convolutional kernel, since we fixed the -output coordinate `z`, we also have a 3d box. 
The two boxes have the same -dimensions, so we can take the sum of the element-wise products between the two -boxes (similar to a dot product). That is the output value. - -Note that if `output-z` is e.g., 5, then each position of the window produces 5 -values in the output into the `z` dimension of the output. These values differ -in what part of the convolutional kernel is used - there is a separate 3d box of -values used for each `output-z` coordinate. So you could think of it as 5 -separate convolutions with a different filter for each of them. - -Here is pseudo-code for a 2d convolution with padding and striding: - -``` -for (b, oz, oy, ox) { // output coordinates - value = 0; - for (iz, ky, kx) { // kernel coordinates and input z - iy = oy*stride_y + ky - pad_low_y; - ix = ox*stride_x + kx - pad_low_x; - if ((iy, ix) inside the base area considered without padding) { - value += input(b, iz, iy, ix) * kernel(oz, iz, ky, kx); - } - } - output(b, oz, oy, ox) = value; -} -``` - -## ConvertElementType - -See also -[`XlaBuilder::ConvertElementType`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h). - -Similar to an element-wise `static_cast` in C++, performs an element-wise -conversion operation from a data shape to a target shape. The dimensions must -match, and the conversion is an element-wise one; e.g. `s32` elements become -`f32` elements via an `s32`-to-`f32` conversion routine. - - `ConvertElementType(operand, new_element_type)` - -Arguments | Type | Semantics ------------------- | --------------- | --------------------------- -`operand` | `XlaOp` | array of type T with dims D -`new_element_type` | `PrimitiveType` | type U - -The dimensions of the operand and the target shape must match. The source and -destination element types must not be tuples. - -A conversion such as `T=s32` to `U=f32` will perform a normalizing int-to-float -conversion routine such as round-to-nearest-even. 
- -> Note: The precise float-to-int and vice versa conversions are currently -> unspecified, but may become additional arguments to the convert operation in -> the future. Not all possible conversions have been implemented for all ->targets. - -``` -let a: s32[3] = {0, 1, 2}; -let b: f32[3] = convert(a, f32); -then b == f32[3]{0.0, 1.0, 2.0} -``` - -## CrossReplicaSum - -See also -[`XlaBuilder::CrossReplicaSum`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h). - -Computes a sum across replicas. - - `CrossReplicaSum(operand)` - -Arguments | Type | Semantics ---------- | ------- | ----------------------------- -`operand` | `XlaOp` | Array to sum across replicas. -| `replica_group_ids` | `int64` vector | Group ID for each replica. | - -The output shape is the same as the input shape. For example, if there are two -replicas and the operand has the value `(1.0, 2.5)` and `(3.0, 5.25)` -respectively on the two replicas, then the output value from this op will be -`(4.0, 7.75)` on both replicas. - -`replica_group_ids` identifies the group ID of each replica. The group ID must -either be empty (all replicas belong to a single group), or contain the same -number of elements as the number of replicas. For example, if -`replica_group_ids` = {0, 1, 2, 3, 0, 1, 2, 3} has eight replicas, there are -four subgroups of replica IDs: {0, 4}, {1, 5}, {2, 6}, and {3, 7}. The size of -each subgroup *must* be identical, so, for example, using: -`replica_group_ids` = {0, 1, 2, 0} for four replicas is invalid. - -Computing the result of CrossReplicaSum requires having one input from each -replica, so if one replica executes a CrossReplicaSum node more times than -another, then the former replica will wait forever. 
Since the replicas are all -running the same program, there are not a lot of ways for that to happen, but it -is possible when a while loop's condition depends on data from infeed and the -data that is infed causes the while loop to iterate more times on one replica -than another. - -## CustomCall - -See also -[`XlaBuilder::CustomCall`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h). - -Call a user-provided function within a computation. - - `CustomCall(target_name, args..., shape)` - -| Arguments | Type | Semantics | -| ------------- | ---------------------- | --------------------------------- | -| `target_name` | `string` | Name of the function. A call | -: : : instruction will be emitted which : -: : : targets this symbol name. : -| `args` | sequence of N `XlaOp`s | N arguments of arbitrary type, | -: : : which will be passed to the : -: : : function. : -| `shape` | `Shape` | Output shape of the function | - -The function signature is the same, regardless of the arity or type of args: - -``` -extern "C" void target_name(void* out, void** in); -``` - -For example, if CustomCall is used as follows: - -``` -let x = f32[2] {1,2}; -let y = f32[2x3] {{10, 20, 30}, {40, 50, 60}}; - -CustomCall("myfunc", {x, y}, f32[3x3]) -``` - -Here is an example of an implementation of `myfunc`: - -``` -extern "C" void myfunc(void* out, void** in) { - float (&x)[2] = *static_cast(in[0]); - float (&y)[2][3] = *static_cast(in[1]); - EXPECT_EQ(1, x[0]); - EXPECT_EQ(2, x[1]); - EXPECT_EQ(10, y[0][0]); - EXPECT_EQ(20, y[0][1]); - EXPECT_EQ(30, y[0][2]); - EXPECT_EQ(40, y[1][0]); - EXPECT_EQ(50, y[1][1]); - EXPECT_EQ(60, y[1][2]); - float (&z)[3][3] = *static_cast(out); - z[0][0] = x[1] + y[1][0]; - // ... -} -``` - -The user-provided function must not have side-effects and its execution must be -idempotent. - -> Note: The opaque nature of the user-provided function restricts optimization -> opportunities for the compiler. 
Try to express your computation in terms of -> native XLA ops whenever possible; only use CustomCall as a last resort. - -## Dot - -See also -[`XlaBuilder::Dot`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h). - - `Dot(lhs, rhs)` - -Arguments | Type | Semantics ---------- | ------- | --------------- -`lhs` | `XlaOp` | array of type T -`rhs` | `XlaOp` | array of type T - -The exact semantics of this operation depend on the ranks of the operands: - -| Input | Output | Semantics | -| ----------------------- | --------------------- | ----------------------- | -| vector [n] `dot` vector | scalar | vector dot product | -: [n] : : : -| matrix [m x k] `dot` | vector [m] | matrix-vector | -: vector [k] : : multiplication : -| matrix [m x k] `dot` | matrix [m x n] | matrix-matrix | -: matrix [k x n] : : multiplication : - -The operation performs sum of products over the last dimension of `lhs` and the -one-before-last dimension of `rhs`. These are the "contracted" dimensions. The -contracted dimensions of `lhs` and `rhs` must be of the same size. In practice, -it can be used to perform dot products between vectors, vector/matrix -multiplications or matrix/matrix multiplications. - -## DotGeneral - -See also -[`XlaBuilder::DotGeneral`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h). - - `DotGeneral(lhs, rhs, dimension_numbers)` - -Arguments | Type | Semantics -------------------- | --------------------- | --------------- -`lhs` | `XlaOp` | array of type T -`rhs` | `XlaOp` | array of type T -`dimension_numbers` | `DotDimensionNumbers` | array of type T - -As Dot, but allows contracting and batch dimension numbers to be specified for -both the 'lhs' and 'rhs'. 
- -| DotDimensionNumbers Fields | Type | Semantics -| --------- | ----------------------- | --------------- -| 'lhs_contracting_dimensions' | repeated int64 | 'lhs' contracting dimension numbers | -| 'rhs_contracting_dimensions' | repeated int64 | 'rhs' contracting dimension numbers | -| 'lhs_batch_dimensions' | repeated int64 | 'lhs' batch dimension numbers | -| 'rhs_batch_dimensions' | repeated int64 | 'rhs' batch dimension numbers | - -DotGeneral performs the sum of products over contracting dimensions specified -in 'dimension_numbers'. - -Associated contracting dimension numbers from the 'lhs' and 'rhs' do not need -to be the same, but must be listed in the same order in both -'lhs/rhs_contracting_dimensions' arrays and have the same dimension sizes. -There must be exactly one contracting dimension on both 'lhs' and 'rhs'. - -Example with contracting dimension numbers: - -``` -lhs = { {1.0, 2.0, 3.0}, - {4.0, 5.0, 6.0} } - -rhs = { {1.0, 1.0, 1.0}, - {2.0, 2.0, 2.0} } - -DotDimensionNumbers dnums; -dnums.add_lhs_contracting_dimensions(1); -dnums.add_rhs_contracting_dimensions(1); - -DotGeneral(lhs, rhs, dnums) -> { {6.0, 12.0}, - {15.0, 30.0} } -``` - -Associated batch dimension numbers from the 'lhs' and 'rhs' must have the same -dimension number, must be listed in the same order in both arrays, must -have the same dimension sizes, and must be ordered before contracting and -non-contracting/non-batch dimension numbers. 
- -Example with batch dimension numbers (batch size 2, 2x2 matrices): - -``` -lhs = { { {1.0, 2.0}, - {3.0, 4.0} }, - { {5.0, 6.0}, - {7.0, 8.0} } } - -rhs = { { {1.0, 0.0}, - {0.0, 1.0} }, - { {1.0, 0.0}, - {0.0, 1.0} } } - -DotDimensionNumbers dnums; -dnums.add_lhs_contracting_dimensions(2); -dnums.add_rhs_contracting_dimensions(1); -dnums.add_lhs_batch_dimensions(0); -dnums.add_rhs_batch_dimensions(0); - -DotGeneral(lhs, rhs, dnums) -> { { {1.0, 2.0}, - {3.0, 4.0} }, - { {5.0, 6.0}, - {7.0, 8.0} } } -``` - -| Input | Output | Semantics | -| ----------------------------------- | ----------------- | ---------------- | -| [b0, m, k] `dot` [b0, k, n] | [b0, m, n] | batch matmul | -| [b0, b1, m, k] `dot` [b0, b1, k, n] | [b0, b1, m, n] | batch matmul | - -It follows that the resulting dimension number starts with the batch dimension, -then the 'lhs' non-contracting/non-batch dimension, and finally the 'rhs' -non-contracting/non-batch dimension. - -## DynamicSlice - -See also -[`XlaBuilder::DynamicSlice`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h). - -DynamicSlice extracts a sub-array from the input array at dynamic -`start_indices`. The size of the slice in each dimension is passed in -`size_indices`, which specify the end point of exclusive slice intervals in each -dimension: [start, start + size). The shape of `start_indices` must be rank == -1, with dimension size equal to the rank of `operand`. - - `DynamicSlice(operand, start_indices, size_indices)` - -| Arguments | Type | Semantics | -| --------------- | ------------------- | ----------------------------------- | -| `operand` | `XlaOp` | N dimensional array of type T | -| `start_indices` | `XlaOp` | Rank 1 array of N integers | -: : : containing the starting indices of : -: : : the slice for each dimension. Value : -: : : must be greater than or equal to : -: : : zero. 
: -| `size_indices` | `ArraySlice` | List of N integers containing the | -: : : slice size for each dimension. Each : -: : : value must be strictly greater than : -: : : zero, and start + size must be less : -: : : than or equal to the size of the : -: : : dimension to avoid wrapping modulo : -: : : dimension size. : - -The effective slice indices are computed by applying the following -transformation for each index `i` in `[1, N)` before performing the slice: - -``` -start_indices[i] = clamp(start_indices[i], 0, operand.dimension_size[i] - size_indices[i]) -``` - -This ensures that the extracted slice is always in-bounds with respect to the -operand array. If the slice is in-bounds before the transformation is applied, -the transformation has no effect. - -1-dimensional example: - -``` -let a = {0.0, 1.0, 2.0, 3.0, 4.0} -let s = {2} - -DynamicSlice(a, s, {2}) produces: - {2.0, 3.0} -``` - -2-dimensional example: - -``` -let b = - { {0.0, 1.0, 2.0}, - {3.0, 4.0, 5.0}, - {6.0, 7.0, 8.0}, - {9.0, 10.0, 11.0} } -let s = {2, 1} - -DynamicSlice(b, s, {2, 2}) produces: - { { 7.0, 8.0}, - {10.0, 11.0} } -``` -## DynamicUpdateSlice - -See also -[`XlaBuilder::DynamicUpdateSlice`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h). - -DynamicUpdateSlice generates a result which is the value of the input array -`operand`, with a slice `update` overwritten at `start_indices`. -The shape of `update` determines the shape of the sub-array of the result which -is updated. -The shape of `start_indices` must be rank == 1, with dimension size equal to -the rank of `operand`. - - `DynamicUpdateSlice(operand, update, start_indices)` - -| Arguments | Type | Semantics | -| --------------- | ------- | ------------------------------------------------ | -| `operand` | `XlaOp` | N dimensional array of type T | -| `update` | `XlaOp` | N dimensional array of type T containing the | -: : : slice update. 
Each dimension of update shape : -: : : must be strictly greater than zero, and start + : -: : : update must be less than or equal to the operand : -: : : size for each dimension to avoid generating : -: : : out-of-bounds update indices. : -| `start_indices` | `XlaOp` | Rank 1 array of N integers containing the | -: : : starting indices of the slice for each : -: : : dimension. Value must be greater than or equal : -: : : to zero. : - -The effective slice indices are computed by applying the following -transformation for each index `i` in `[1, N)` before performing the slice: - -``` -start_indices[i] = clamp(start_indices[i], 0, operand.dimension_size[i] - update.dimension_size[i]) -``` - -This ensures that the updated slice is always in-bounds with respect to the -operand array. If the slice is in-bounds before the transformation is applied, -the transformation has no effect. - -1-dimensional example: - -``` -let a = {0.0, 1.0, 2.0, 3.0, 4.0} -let u = {5.0, 6.0} -let s = {2} - -DynamicUpdateSlice(a, u, s) produces: - {0.0, 1.0, 5.0, 6.0, 4.0} -``` - -2-dimensional example: - -``` -let b = - { {0.0, 1.0, 2.0}, - {3.0, 4.0, 5.0}, - {6.0, 7.0, 8.0}, - {9.0, 10.0, 11.0} } -let u = - { {12.0, 13.0}, - {14.0, 15.0}, - {16.0, 17.0} } - -let s = {1, 1} - -DynamicUpdateSlice(b, u, s) produces: - { {0.0, 1.0, 2.0}, - {3.0, 12.0, 13.0}, - {6.0, 14.0, 15.0}, - {9.0, 16.0, 17.0} } -``` - -## Element-wise binary arithmetic operations - -See also -[`XlaBuilder::Add`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h). - -A set of element-wise binary arithmetic operations is supported. - - `Op(lhs, rhs)` - -Where `Op` is one of `Add` (addition), `Sub` (subtraction), `Mul` -(multiplication), `Div` (division), `Rem` (remainder), `Max` (maximum), `Min` -(minimum), `LogicalAnd` (logical AND), or `LogicalOr` (logical OR). 
- -Arguments | Type | Semantics ---------- | ------- | ---------------------------------------- -`lhs` | `XlaOp` | left-hand-side operand: array of type T -`rhs` | `XlaOp` | right-hand-side operand: array of type T - -The arguments' shapes have to be either similar or compatible. See the -[broadcasting](../../performance/xla/broadcasting.md) documentation about what it means for shapes to -be compatible. The result of an operation has a shape which is the result of -broadcasting the two input arrays. In this variant, operations between arrays of -different ranks are *not* supported, unless one of the operands is a scalar. - -When `Op` is `Rem`, the sign of the result is taken from the dividend, and the -absolute value of the result is always less than the divisor's absolute value. - -Integer division overflow (signed/unsigned division/remainder by zero or signed -division/remainder of `INT_SMIN` with `-1`) produces an implementation-defined -value. - -An alternative variant with different-rank broadcasting support exists for these -operations: - - `Op(lhs, rhs, broadcast_dimensions)` - -Where `Op` is the same as above. This variant of the operation should be used -for arithmetic operations between arrays of different ranks (such as adding a -matrix to a vector). - -The additional `broadcast_dimensions` operand is a slice of integers used to -expand the rank of the lower-rank operand up to the rank of the higher-rank -operand. `broadcast_dimensions` maps the dimensions of the lower-rank shape to -the dimensions of the higher-rank shape. The unmapped dimensions of the expanded -shape are filled with dimensions of size one. Degenerate-dimension broadcasting -then broadcasts the shapes along these degenerate dimensions to equalize the -shapes of both operands. The semantics are described in detail on the -[broadcasting page](../../performance/xla/broadcasting.md). 
- -## Element-wise comparison operations - -See also -[`XlaBuilder::Eq`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h). - -A set of standard element-wise binary comparison operations is supported. Note -that standard IEEE 754 floating-point comparison semantics apply when comparing -floating-point types. - - `Op(lhs, rhs)` - -Where `Op` is one of `Eq` (equal-to), `Ne` (not equal-to), `Ge` -(greater-or-equal-than), `Gt` (greater-than), `Le` (less-or-equal-than), `Lt` -(less-than). - -Arguments | Type | Semantics ---------- | ------- | ---------------------------------------- -`lhs` | `XlaOp` | left-hand-side operand: array of type T -`rhs` | `XlaOp` | right-hand-side operand: array of type T - -The arguments' shapes have to be either similar or compatible. See the -[broadcasting](../../performance/xla/broadcasting.md) documentation about what it means for shapes to -be compatible. The result of an operation has a shape which is the result of -broadcasting the two input arrays with the element type `PRED`. In this variant, -operations between arrays of different ranks are *not* supported, unless one of -the operands is a scalar. - -An alternative variant with different-rank broadcasting support exists for these -operations: - - `Op(lhs, rhs, broadcast_dimensions)` - -Where `Op` is the same as above. This variant of the operation should be used -for comparison operations between arrays of different ranks (such as comparing a -matrix with a vector). - -The additional `broadcast_dimensions` operand is a slice of integers specifying -the dimensions to use for broadcasting the operands. The semantics are described -in detail on the [broadcasting page](../../performance/xla/broadcasting.md). - -## Element-wise unary functions - -XlaBuilder supports these element-wise unary functions: - -`Abs(operand)` Element-wise abs `x -> |x|`. - -`Ceil(operand)` Element-wise ceil `x -> ⌈x⌉`. - -`Cos(operand)` Element-wise cosine `x -> cos(x)`. 
- -`Exp(operand)` Element-wise natural exponential `x -> e^x`. - -`Floor(operand)` Element-wise floor `x -> ⌊x⌋`. - -`IsFinite(operand)` Tests whether each element of `operand` is finite, -i.e., is not positive or negative infinity, and is not `NaN`. Returns an array -of `PRED` values with the same shape as the input, where each element is `true` -if and only if the corresponding input element is finite. - -`Log(operand)` Element-wise natural logarithm `x -> ln(x)`. - -`LogicalNot(operand)` Element-wise logical not `x -> !(x)`. - -`Neg(operand)` Element-wise negation `x -> -x`. - -`Sign(operand)` Element-wise sign operation `x -> sgn(x)` where - -$$\text{sgn}(x) = \begin{cases} -1 & x < 0\\ 0 & x = 0\\ 1 & x > 0 \end{cases}$$ - -using the comparison operator of the element type of `operand`. - -`Tanh(operand)` Element-wise hyperbolic tangent `x -> tanh(x)`. - - -Arguments | Type | Semantics ---------- | ------- | --------------------------- -`operand` | `XlaOp` | The operand to the function - -The function is applied to each element in the `operand` array, resulting in an -array with the same shape. It is allowed for `operand` to be a scalar (rank 0). - -## Gather - -The XLA gather operation stitches together several slices (each slice at a -potentially different runtime offset) of an input array. - -### General Semantics - -See also -[`XlaBuilder::Gather`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h). -For a more intuitive description, see the "Informal Description" section below. - - `gather(operand, start_indices, offset_dims, collapsed_slice_dims, slice_sizes, start_index_map)` - -|Arguments | Type | Semantics | -|----------------- | ----------------------- | --------------------------------| -|`operand` | `XlaOp` | The array we’re gathering | -: : : from. 
: -|`start_indices` | `XlaOp` | Array containing the starting | -: : : indices of the slices we gather.: -|`index_vector_dim` | `int64` | The dimension in | -: : : `start_indices` that "contains" : -: : : the starting indices. See : -: : : below for a detailed : -: : : description. : -|`offset_dims` | `ArraySlice` | The set of dimensions in the : -: : : output shape that offset into an : -: : : array sliced from operand. : -|`slice_sizes` | `ArraySlice` | `slice_sizes[i]` is the bounds | -: : : for the slice on dimension `i`.: -|`collapsed_slice_dims` | `ArraySlice` | The set of dimensions in each : -| : | slice that are collapsed away. : -| : | These dimensions must have size: -| : | 1. | -|`start_index_map` | `ArraySlice` | A map that describes how to map| -: : : indices in `start_indices` to : -: : : legal indices into operand. : - -For convenience, we label dimensions in the output array not in `offset_dims` -as `batch_dims`. - -The output is an array of rank `batch_dims.size` + `operand.rank` - -`collapsed_slice_dims`.size. - -If `index_vector_dim` is equal to `start_indices.rank` we implicitly consider -`start_indices` to have a trailing `1` dimension (i.e. if `start_indices` was of -shape `[6,7]` and `index_vector_dim` is `2` then we implicitly consider the -shape of `start_indices` to be `[6,7,1]`). - -The bounds for the output array along dimension `i` are computed as follows: - - 1. If `i` is present in `batch_dims` (i.e. is equal to `batch_dims[k]` for - some `k`) then we pick the corresponding dimension bounds out of - `start_indices.shape`, skipping `index_vector_dim` (i.e. pick - `start_indices.shape.dims`[`k`] if `k` < `index_vector_dim` and - `start_indices.shape.dims`[`k`+`1`] otherwise). - - 2. If `i` is present in `offset_dims` (i.e. equal to `offset_dims`[`k`] for - some `k`) then we pick the corresponding bound out of `slice_sizes` after - accounting for `collapsed_slice_dims` (i.e. 
we pick - `adjusted_slice_sizes`[`k`] where `adjusted_slice_sizes` is `slice_sizes` - with the bounds at indices `collapsed_slice_dims` removed). - -Formally, the operand index `In` corresponding to an output index `Out` is -computed as follows: - - 1. Let `G` = { `Out`[`k`] for `k` in `batch_dims` }. Use `G` to slice out - vector `S` such that `S`[`i`] = `start_indices`[Combine(`G`, `i`)] where - Combine(A, b) inserts b at position `index_vector_dim` into A. Note that - this is well defined even if `G` is empty -- if `G` is empty then `S` = - `start_indices`. - - 2. Create a starting index, `S``in`, into `operand` using `S` by - scattering `S` using `start_index_map`. More precisely: - 1. `S``in`[`start_index_map`[`k`]] = `S`[`k`] if `k` < - `start_index_map.size`. - 2. `S``in`[`_`] = `0` otherwise. - - 3. Create an index `O``in` into `operand` by scattering the indices - at the offset dimensions in `Out` according to the `collapsed_slice_dims` - set. More precisely: - 1. `O``in`[`expand_offset_dims`(`k`)] = - `Out`[`offset_dims`[`k`]] if `k` < `offset_dims.size` - (`expand_offset_dims` is defined below). - 2. `O``in`[`_`] = `0` otherwise. - 4. `In` is `O``in` + `S``in` where + is element-wise - addition. - -`expand_offset_dims` is the monotonic function with domain [`0`, `offset.size`) -and range [`0`, `operand.rank`) \ `collapsed_slice_dims`. So if, e.g., -`offset.size` is `4`, `operand.rank` is `6` and `collapsed_slice_dims` is {`0`, -`2`} then `expand_offset_dims` is {`0`→`1`, `1`→`3`, `2`→`4`, `3`→`5`}. - -### Informal Description and Examples - -Informally, every index `Out` in the output array corresponds to an element `E` -in the operand array, computed as follows: - - - We use the batch dimensions in `Out` to look up a starting index from - `start_indices`. - - - We use `start_index_map` to map the starting index (which may have size less - than operand.rank) to a "full" starting index into operand. 
- - - We dynamic-slice out a slice with size `slice_sizes` using the full starting - index. - - - We reshape the slice by collapsing the `collapsed_slice_dims` dimensions. - Since all collapsed slice dimensions have to have bound 1 this reshape is - always legal. - - - We use the offset dimensions in `Out` to index into this slice to get the - input element, `E`, corresponding to output index `Out`. - -`index_vector_dim` is set to `start_indices.rank` - `1` in all of the -examples that follow. More interesting values for `index_vector_dim` do not -change the operation fundamentally, but make the visual representation more -cumbersome. - -To get an intuition on how all of the above fits together, let's look at an -example that gathers 5 slices of shape `[8,6]` from a `[16,11]` array. The -position of a slice into the `[16,11]` array can be represented as an index -vector of shape `S64[2]`, so the set of 5 positions can be represented as a -`S64[5,2]` array. - -The behavior of the gather operation can then be depicted as an index -transformation that takes [`G`,`O``0`,`O``1`], an index in -the output shape, and maps it to an element in the input array in the following -way: - -
- -
- -We first select an (`X`,`Y`) vector from the gather indices array using `G`. -The element in the output array at index -[`G`,`O``0`,`O``1`] is then the element in the input -array at index [`X`+`O``0`,`Y`+`O``1`]. - -`slice_sizes` is `[8,6]`, which decides the range of W`0` and -W`1`, and this in turn decides the bounds of the slice. - -This gather operation acts as a batch dynamic slice with `G` as the batch -dimension. - -The gather indices may be multidimensional. For instance, a more general -version of the example above using a "gather indices" array of shape `[4,5,2]` -would translate indices like this: - -
- -
- -Again, this acts as a batch dynamic slice `G``0` and -`G``1` as the batch dimensions. The slice size is still `[8,6]`. - -The gather operation in XLA generalizes the informal semantics outlined above in -the following ways: - - 1. We can configure which dimensions in the output shape are the offset - dimensions (dimensions containing `O``0`, `O``1` in - the last example). The output batch dimensions (dimensions containing - `G``0`, `G``1` in the last example) are defined to be - the output dimensions that are not offset dimensions. - - 2. The number of output offset dimensions explicitly present in the output - shape may be smaller than the input rank. These "missing" dimensions, which - are listed explicitly as `collapsed_slice_dims`, must have a slice size of - `1`. Since they have a slice size of `1` the only valid index for them is - `0` and eliding them does not introduce ambiguity. - - 3. The slice extracted from the "Gather Indices" array ((`X`, `Y`) in the last - example) may have fewer elements than the input array rank, and an explicit - mapping dictates how the index should be expanded to have the same rank as - the input. - -As a final example, we use (2) and (3) to implement `tf.gather_nd`: - -
- -
- -`G``0` and `G``1` are used to slice out a starting index -from the gather indices array as usual, except the starting index has only one -element, `X`. Similarly, there is only one output offset index with the value -`O``0`. However, before being used as indices into the input array, -these are expanded in accordance to "Gather Index Mapping" (`start_index_map` in -the formal description) and "Offset Mapping" (`expand_offset_dims` in the formal -description) into [`0`,`O``0`] and [`X`,`0`] respectively, adding up -to [`X`,`O``0`]. In other words, the output index -[`G``0`,`G``1`,`O``0`] maps to the input index -[`GatherIndices`[`G``0`,`G``1`,`0`],`X`] which gives us -the semantics for `tf.gather_nd`. - -`slice_sizes` for this case is `[1,11]`. Intuitively this means that every -index `X` in the gather indices array picks an entire row and the result is the -concatenation of all these rows. - -## GetTupleElement - -See also -[`XlaBuilder::GetTupleElement`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h). - -Indexes into a tuple with a compile-time-constant value. - -The value must be a compile-time-constant so that shape inference can determine -the type of the resulting value. - -This is analogous to `std::get(t)` in C++. Conceptually: - -``` -let v: f32[10] = f32[10]{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; -let s: s32 = 5; -let t: (f32[10], s32) = tuple(v, s); -let element_1: s32 = gettupleelement(t, 1); // Inferred shape matches s32. -``` - -See also `tf.tuple`. - -## Infeed - -See also -[`XlaBuilder::Infeed`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h). - - `Infeed(shape)` - -| Argument | Type | Semantics | -| -------- | ------- | ----------------------------------------------------- | -| `shape` | `Shape` | Shape of the data read from the Infeed interface. 
The | -: : : layout field of the shape must be set to match the : -: : : layout of the data sent to the device; otherwise its : -: : : behavior is undefined. : - -Reads a single data item from the implicit Infeed streaming interface of the -device, interpreting the data as the given shape and its layout, and returns a -`XlaOp` of the data. Multiple Infeed operations are allowed in a -computation, but there must be a total order among the Infeed operations. For -example, two Infeeds in the code below have a total order since there is a -dependency between the while loops. - -``` -result1 = while (condition, init = init_value) { - Infeed(shape) -} - -result2 = while (condition, init = result1) { - Infeed(shape) -} -``` - -Nested tuple shapes are not supported. For an empty tuple shape, the Infeed -operation is effectively a no-op and proceeds without reading any data from the -Infeed of the device. - -> Note: We plan to allow multiple Infeed operations without a total order, in -> which case the compiler will provide information about how the Infeed -> operations are serialized in the compiled program. - -## Iota - - `Iota()` - -Builds a constant literal on device rather than a potentially large host -transfer. Creates a rank 1 tensor of values starting at zero and incrementing -by one. - -Arguments | Type | Semantics ------------------- | --------------- | --------------------------- -`type` | `PrimitiveType` | type U -`size` | `int64` | The number of elements in the tensor. - -## Map - -See also -[`XlaBuilder::Map`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h). 
- - `Map(operands..., computation)` - -| Arguments | Type | Semantics | -| ----------------- | ---------------------- | ------------------------------ | -| `operands` | sequence of N `XlaOp`s | N arrays of types T_0..T_{N-1} | -| `computation` | `XlaComputation` | computation of type `T_0, T_1, | -: : : ..., T_{N + M -1} -> S` with N : -: : : parameters of type T and M of : -: : : arbitrary type : -| `dimensions` | `int64` array | array of map dimensions | - -Applies a scalar function over the given `operands` arrays, producing an array -of the same dimensions where each element is the result of the mapped function -applied to the corresponding elements in the input arrays. - -The mapped function is an arbitrary computation with the restriction that it has -N inputs of scalar type `T` and a single output with type `S`. The output has -the same dimensions as the operands except that the element type T is replaced -with S. - -For example: `Map(op1, op2, op3, computation, par1)` maps `elem_out <- -computation(elem1, elem2, elem3, par1)` at each (multi-dimensional) index in the -input arrays to produce the output array. - -## Pad - -See also -[`XlaBuilder::Pad`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h). - - `Pad(operand, padding_value, padding_config)` - -| Arguments | Type | Semantics | -| ---------------- | --------------- | --------------------------------------- | -| `operand` | `XlaOp` | array of type `T` | -| `padding_value` | `XlaOp` | scalar of type `T` to fill in the added | -: : : padding : -| `padding_config` | `PaddingConfig` | padding amount on both edges (low, | -: : : high) and between the elements of each : -: : : dimension : - -Expands the given `operand` array by padding around the array as well as between -the elements of the array with the given `padding_value`. `padding_config` -specifies the amount of edge padding and the interior padding for each -dimension. 
- -`PaddingConfig` is a repeated field of `PaddingConfigDimension`, which contains -three fields for each dimension: `edge_padding_low`, `edge_padding_high`, and -`interior_padding`. `edge_padding_low` and `edge_padding_high` specify the -amount of padding added at the low-end (next to index 0) and the high-end (next -to the highest index) of each dimension respectively. The amount of edge padding -can be negative -- the absolute value of negative padding indicates the number -of elements to remove from the specified dimension. `interior_padding` specifies -the amount of padding added between any two elements in each dimension. Interior -padding occurs logically before edge padding, so in the case of negative edge -padding elements are removed from the interior-padded operand. This operation is -a no-op if the edge padding pairs are all (0, 0) and the interior padding values -are all 0. The figure below shows examples of different `edge_padding` and -`interior_padding` values for a two-dimensional array. - -
- -
- -## Recv - -See also -[`XlaBuilder::Recv`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h). - - `Recv(shape, channel_handle)` - -| Arguments | Type | Semantics | -| ---------------- | --------------- | ------------------------------------ | -| `shape` | `Shape` | shape of the data to receive | -| `channel_handle` | `ChannelHandle` | unique identifier for each send/recv pair | - -Receives data of the given shape from a `Send` instruction in another -computation that shares the same channel handle. Returns a -XlaOp for the received data. - -The client API of `Recv` operation represents synchronous communication. -However, the instruction is internally decomposed into 2 HLO instructions -(`Recv` and `RecvDone`) to enable asynchronous data transfers. See also -[`HloInstruction::CreateRecv` and `HloInstruction::CreateRecvDone`](https://www.tensorflow.org/code/tensorflow/compiler/xla/service/hlo_instruction.h). - -`Recv(const Shape& shape, int64 channel_id)` - -Allocates resources required to receive data from a `Send` instruction with the -same channel_id. Returns a context for the allocated resources, which is used -by a following `RecvDone` instruction to wait for the completion of the data -transfer. The context is a tuple of {receive buffer (shape), request identifier -(U32)} and it can only be used by a `RecvDone` instruction. - - `RecvDone(HloInstruction context)` - -Given a context created by a `Recv` instruction, waits for the data transfer to -complete and returns the received data. - -## Reduce - -See also -[`XlaBuilder::Reduce`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h). - -Applies a reduction function to one or more arrays in parallel. - - `Reduce(operands..., init_values..., computation, dimensions)` - -Arguments | Type | Semantics -------------- | --------------------- | --------------------------------------- -`operands` | Sequence of N `XlaOp` | N arrays of types `T_0, ..., T_N`. 
-`init_values` | Sequence of N `XlaOp` | N scalars of types `T_0, ..., T_N`. -`computation` | `XlaComputation` | computation of type - : : `T_0, ..., T_N, T_0, ..., T_N -> Collate(T_0, ..., T_N)` -`dimensions` | `int64` array | unordered array of dimensions to reduce - -Where: -* N is required to be greater than or equal to 1. -* All input arrays must have the same dimensions. -* If `N = 1`, `Collate(T)` is `T`. -* If `N > 1`, `Collate(T_0, ..., T_N)` is a tuple of `N` elements of type `T`. - -The output of the op is `Collate(Q_0, ..., Q_N)` where `Q_i` is an array of type -`T_i`, the dimensions of which are described below. - -This operation reduces one or more dimensions of each input array into scalars. -The rank of each returned array is `rank(operand) - len(dimensions)`. -`init_value` is the initial value used for every reduction and may be inserted -anywhere during computation by the back-end. In most cases, `init_value` is an -identity of the reduction function (for example, 0 for addition). The applied -`computation` is always passed the `init_value` on the left-hand side. - -The evaluation order of the reduction function is arbitrary and may be -non-deterministic. Therefore, the reduction function should not be overly -sensitive to reassociation. - -Some reduction functions like addition are not strictly associative for floats. -However, if the range of the data is limited, floating-point addition is close -enough to being associative for most practical uses. It is possible to conceive -of some completely non-associative reductions, however, and these will produce -incorrect or unpredictable results in XLA reductions. - -As an example, when reducing across one dimension in a single 1D array with -values [10, 11, 12, 13], with reduction function `f` (this is `computation`) -then that could be computed as - -`f(10, f(11, f(12, f(init_value, 13))))` - -but there are also many other possibilities, e.g.
- -`f(init_value, f(f(10, f(init_value, 11)), f(f(init_value, 12), f(init_value, 13))))` - -The following is a rough pseudo-code example of how reduction could be -implemented, using summation as the reduction computation with an initial value -of 0. - -```python -result_shape <- remove all dims in dimensions from operand_shape - -# Iterate over all elements in result_shape. The number of r's here is equal -# to the rank of the result -for r0 in range(result_shape[0]), r1 in range(result_shape[1]), ...: - # Initialize this result element - result[r0, r1...] <- 0 - - # Iterate over all the reduction dimensions - for d0 in range(dimensions[0]), d1 in range(dimensions[1]), ...: - # Increment the result element with the value of the operand's element. - # The index of the operand's element is constructed from all ri's and di's - # in the right order (by construction ri's and di's together index over the - # whole operand shape). - result[r0, r1...] += operand[ri... di] -``` - -Here's an example of reducing a 2D array (matrix). The shape has rank 2, -dimension 0 of size 2 and dimension 1 of size 3: - -
- -
- -Results of reducing dimensions 0 or 1 with an "add" function: - -
- -
- -Note that both reduction results are 1D arrays. The diagram shows one as column -and another as row just for visual convenience. - -For a more complex example, here is a 3D array. Its rank is 3, dimension 0 of -size 4, dimension 1 of size 2 and dimension 2 of size 3. For simplicity, the -values 1 to 6 are replicated across dimension 0. - -
- -
- -Similarly to the 2D example, we can reduce just one dimension. If we reduce -dimension 0, for example, we get a rank-2 array where all values across -dimension 0 were folded into a scalar: - -```text -| 4 8 12 | -| 16 20 24 | -``` - -If we reduce dimension 2, we also get a rank-2 array where all values across -dimension 2 were folded into a scalar: - -```text -| 6 15 | -| 6 15 | -| 6 15 | -| 6 15 | -``` - -Note that the relative order between the remaining dimensions in the input is -preserved in the output, but some dimensions may get assigned new numbers (since -the rank changes). - -We can also reduce multiple dimensions. Add-reducing dimensions 0 and 1 produces -the 1D array `| 20 28 36 |`. - -Reducing the 3D array over all its dimensions produces the scalar `84`. - -When `N > 1`, reduce function application is slightly more complex, as it is -applied simultaneously to all inputs. For example, consider the following -reduction function, which can be used to compute the max and the argmax of a -1-D tensor in parallel: - -``` -f: (Float, Int, Float, Int) -> Float, Int -f(max, argmax, value, index): - if value >= max: - return (value, index) - else: - return (max, argmax) -``` - -For 1-D Input arrays `V = Float[N], K = Int[N]`, and init values -`I_V = Float, I_K = Int`, the result `f_(N-1)` of reducing across the only -input dimension is equivalent to the following recursive application: -``` -f_0 = f(I_V, I_K, V_0, K_0) -f_1 = f(f_0.first, f_0.second, V_1, K_1) -... -f_(N-1) = f(f_(N-2).first, f_(N-2).second, V_(N-1), K_(N-1)) -``` - -Applying this reduction to an array of values, and an array of sequential -indices (i.e. iota), will co-iterate over the arrays, and return a tuple -containing the maximal value and the matching index. - -## ReducePrecision - -See also -[`XlaBuilder::ReducePrecision`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
- -Models the effect of converting floating-point values to a lower-precision -format (such as IEEE-FP16) and back to the original format. The number of -exponent and mantissa bits in the lower-precision format can be specified -arbitrarily, although all bit sizes may not be supported on all hardware -implementations. - - `ReducePrecision(operand, mantissa_bits, exponent_bits)` - -Arguments | Type | Semantics ---------------- | ------- | ------------------------------------------------- -`operand` | `XlaOp` | array of floating-point type `T`. -`exponent_bits` | `int32` | number of exponent bits in lower-precision format -`mantissa_bits` | `int32` | number of mantissa bits in lower-precision format - -The result is an array of type `T`. The input values are rounded to the nearest -value representable with the given number of mantissa bits (using "ties to even" -semantics), and any values that exceed the range specified by the number of -exponent bits are clamped to positive or negative infinity. `NaN` values are -retained, although they may be converted to canonical `NaN` values. - -The lower-precision format must have at least one exponent bit (in order to -distinguish a zero value from an infinity, since both have a zero mantissa), and -must have a non-negative number of mantissa bits. The number of exponent or -mantissa bits may exceed the corresponding value for type `T`; the corresponding -portion of the conversion is then simply a no-op. - -## ReduceWindow - -See also -[`XlaBuilder::ReduceWindow`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h). - -Applies a reduction function to all elements in each window of the input -multi-dimensional array, producing an output multi-dimensional array with the -same number of elements as the number of valid positions of the window. A -pooling layer can be expressed as a `ReduceWindow`. 
Similar to -[`Reduce`](#reduce), the applied `computation` is always passed the `init_value` -on the left-hand side. - - `ReduceWindow(operand, init_value, computation, window_dimensions, -window_strides, padding)` - -| Arguments | Type | Semantics | -| ------------------- | ------------------- | -------------------------------- | -| `operand` | `XlaOp` | N dimensional array containing | -: : : elements of type T. This is the : -: : : base area on which the window is : -: : : placed. : -| `init_value` | `XlaOp` | Starting value for the | -: : : reduction. See [Reduce](#reduce) : -: : : for details. : -| `computation` | `XlaComputation` | Reduction function of type `T, T | -: : : -> T`, to apply to all elements : -: : : in each window : -| `window_dimensions` | `ArraySlice` | array of integers for window | -: : : dimension values : -| `window_strides` | `ArraySlice` | array of integers for window | -: : : stride values : -| `padding` | `Padding` | padding type for window | -: : : (Padding\:\:kSame or : -: : : Padding\:\:kValid) : - -Below code and figure shows an example of using `ReduceWindow`. Input is a -matrix of size [4x6] and both window_dimensions and window_stride_dimensions are -[2x3]. - -``` -// Create a computation for the reduction (maximum). -XlaComputation max; -{ - XlaBuilder builder(client_, "max"); - auto y = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "y"); - auto x = builder.Parameter(1, ShapeUtil::MakeShape(F32, {}), "x"); - builder.Max(y, x); - max = builder.Build().ConsumeValueOrDie(); -} - -// Create a ReduceWindow computation with the max reduction computation. -XlaBuilder builder(client_, "reduce_window_2x3"); -auto shape = ShapeUtil::MakeShape(F32, {4, 6}); -auto input = builder.Parameter(0, shape, "input"); -builder.ReduceWindow( - input, *max, - /*init_val=*/builder.ConstantLiteral(LiteralUtil::MinValue(F32)), - /*window_dimensions=*/{2, 3}, - /*window_stride_dimensions=*/{2, 3}, - Padding::kValid); -``` - -
- -
- -Stride of 1 in a dimension specifies that the position of a window in the -dimension is 1 element away from its adjacent window. In order to specify that -no windows overlap with each other, window_stride_dimensions should be equal to -window_dimensions. The figure below illustrates the use of two different stride -values. Padding is applied to each dimension of the input and the calculations -are the same as though the input came in with the dimensions it has after -padding. - -
- -
- -The evaluation order of the reduction function is arbitrary and may be -non-deterministic. Therefore, the reduction function should not be overly -sensitive to reassociation. See the discussion about associativity in the -context of [`Reduce`](#reduce) for more details. - -## Reshape - -See also -[`XlaBuilder::Reshape`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h) -and the [`Collapse`](#collapse) operation. - -Reshapes the dimensions of an array into a new configuration. - - `Reshape(operand, new_sizes)` - `Reshape(operand, dimensions, new_sizes)` - -Arguments | Type | Semantics ------------- | -------------- | --------------------------------------- -`operand` | `XlaOp` | array of type T -`dimensions` | `int64` vector | order in which dimensions are collapsed -`new_sizes` | `int64` vector | vector of sizes of new dimensions - -Conceptually, reshape first flattens an array into a one-dimensional vector of -data values, and then refines this vector into a new shape. The input arguments -are an arbitrary array of type T, a compile-time-constant vector of dimension -indices, and a compile-time-constant vector of dimension sizes for the result. -The values in the `dimension` vector, if given, must be a permutation of all of -T's dimensions; the default if not given is `{0, ..., rank - 1}`. The order of -the dimensions in `dimensions` is from slowest-varying dimension (most major) to -fastest-varying dimension (most minor) in the loop nest which collapses the -input array into a single dimension. The `new_sizes` vector determines the size -of the output array. The value at index 0 in `new_sizes` is the size of -dimension 0, the value at index 1 is the size of dimension 1, and so on. The -product of the `new_size` dimensions must equal the product of the operand's -dimension sizes. 
When refining the collapsed array into the multidimensional -array defined by `new_sizes`, the dimensions in `new_sizes` are ordered from -slowest varying (most major) to fastest varying (most minor). - -For example, let v be an array of 24 elements: - -``` -let v = f32[4x2x3] {{{10, 11, 12}, {15, 16, 17}}, - {{20, 21, 22}, {25, 26, 27}}, - {{30, 31, 32}, {35, 36, 37}}, - {{40, 41, 42}, {45, 46, 47}}}; - -In-order collapse: -let v012_24 = Reshape(v, {0,1,2}, {24}); -then v012_24 == f32[24] {10, 11, 12, 15, 16, 17, 20, 21, 22, 25, 26, 27, - 30, 31, 32, 35, 36, 37, 40, 41, 42, 45, 46, 47}; - -let v012_83 = Reshape(v, {0,1,2}, {8,3}); -then v012_83 == f32[8x3] {{10, 11, 12}, {15, 16, 17}, - {20, 21, 22}, {25, 26, 27}, - {30, 31, 32}, {35, 36, 37}, - {40, 41, 42}, {45, 46, 47}}; - -Out-of-order collapse: -let v021_24 = Reshape(v, {1,2,0}, {24}); -then v021_24 == f32[24] {10, 20, 30, 40, 11, 21, 31, 41, 12, 22, 32, 42, - 15, 25, 35, 45, 16, 26, 36, 46, 17, 27, 37, 47}; - -let v021_83 = Reshape(v, {1,2,0}, {8,3}); -then v021_83 == f32[8x3] {{10, 20, 30}, {40, 11, 21}, - {31, 41, 12}, {22, 32, 42}, - {15, 25, 35}, {45, 16, 26}, - {36, 46, 17}, {27, 37, 47}}; - - -let v021_262 = Reshape(v, {1,2,0}, {2,6,2}); -then v021_262 == f32[2x6x2] {{{10, 20}, {30, 40}, - {11, 21}, {31, 41}, - {12, 22}, {32, 42}}, - {{15, 25}, {35, 45}, - {16, 26}, {36, 46}, - {17, 27}, {37, 47}}}; -``` - -As a special case, reshape can transform a single-element array to a scalar and -vice versa. For example, - -``` -Reshape(f32[1x1] {{5}}, {0,1}, {}) == 5; -Reshape(5, {}, {1,1}) == f32[1x1] {{5}}; -``` - -## Rev (reverse) - -See also -[`XlaBuilder::Rev`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
- -`Rev(operand, dimensions)` - -Arguments | Type | Semantics ------------- | ------------------- | --------------------- -`operand` | `XlaOp` | array of type T -`dimensions` | `ArraySlice` | dimensions to reverse - -Reverses the order of elements in the `operand` array along the specified -`dimensions`, generating an output array of the same shape. Each element of the -operand array at a multidimensional index is stored into the output array at a -transformed index. The multidimensional index is transformed by reversing the -index in each dimension to be reversed (i.e., if a dimension of size N is one of -the reversing dimensions, its index i is transformed into N - 1 - i). - -One use for the `Rev` operation is to reverse the convolution weight array along -the two window dimensions during the gradient computation in neural networks. - -## RngNormal - -See also -[`XlaBuilder::RngNormal`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h). - -Constructs an output of a given shape with random numbers generated following -the $$N(\mu, \sigma)$$ normal distribution. The parameters $$\mu$$ and -$$\sigma$$, and output shape have to have a floating point elemental type. The -parameters furthermore have to be scalar valued. - -`RngNormal(mu, sigma, shape)` - -| Arguments | Type | Semantics | -| --------- | ------- | --------------------------------------------------- | -| `mu` | `XlaOp` | Scalar of type T specifying mean of generated | -: : : numbers : -| `sigma` | `XlaOp` | Scalar of type T specifying standard deviation of | -: : : generated numbers : -| `shape` | `Shape` | Output shape of type T | - -## RngUniform - -See also -[`XlaBuilder::RngUniform`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h). - -Constructs an output of a given shape with random numbers generated following -the uniform distribution over the interval $$[a,b)$$. 
The parameters and output -element type have to be a boolean type, an integral type or a floating point -type, and the types have to be consistent. The CPU and GPU backends currently -only support F64, F32, F16, BF16, S64, U64, S32 and U32. Furthermore, the -parameters need to be scalar valued. If $$b <= a$$ the result is -implementation-defined. - -`RngUniform(a, b, shape)` - -| Arguments | Type | Semantics | -| --------- | ----------------------- | --------------------------------- | -| `a` | `XlaOp` | Scalar of type T specifying lower | -: : : limit of interval : -| `b` | `XlaOp` | Scalar of type T specifying upper | -: : : limit of interval : -| `shape` | `Shape` | Output shape of type T | - -## Scatter - -The XLA scatter operation generates a result which is the value of the input -tensor `operand`, with several slices (at indices specified by -`scatter_indices`) updated with the values in `updates` using -`update_computation`. - -See also -[`XlaBuilder::Scatter`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h). - - `scatter(operand, scatter_indices, updates, update_computation, index_vector_dim, update_window_dims, inserted_window_dims, scatter_dims_to_operand_dims)` - -|Arguments | Type | Semantics | -|------------------|------------------------|----------------------------------| -|`operand` | `XlaOp` | Tensor to be scattered into. | -|`scatter_indices` | `XlaOp` | Tensor containing the starting | -: : : indices of the slices that must : -: : : be scattered to. : -|`updates` | `XlaOp` | Tensor containing the values that| -: : : must be used for scattering. : -|`update_computation`| `XlaComputation` | Computation to be used for | -: : : combining the existing values in : -: : : the input tensor and the updates : -: : : during scatter. This computation : -: : : should be of type `T, T -> T`. : -|`index_vector_dim`| `int64` | The dimension in | -: : : `scatter_indices` that contains : -: : : the starting indices.
: -|`update_window_dims`| `ArraySlice` | The set of dimensions in | -: : : `updates` shape that are _window : -: : : dimensions_. : -|`inserted_window_dims`| `ArraySlice`| The set of _window dimensions_ | -: : : that must be inserted into : -: : : `updates` shape. : -|`scatter_dims_to_operand_dims`| `ArraySlice` | A dimensions map from | -: : : the scatter indices to the : -: : : operand index space. This array : -: : : is interpreted as mapping `i` to : -: : : `scatter_dims_to_operand_dims[i]`: -: : : . It has to be one-to-one and : -: : : total. : - -If `index_vector_dim` is equal to `scatter_indices.rank` we implicitly consider -`scatter_indices` to have a trailing `1` dimension. - -We define `update_scatter_dims` of type `ArraySlice` as the set of -dimensions in `updates` shape that are not in `update_window_dims`, in ascending -order. - -The arguments of scatter should follow these constraints: - - - `updates` tensor must be of rank `update_window_dims.size + - scatter_indices.rank - 1`. - - - Bounds of dimension `i` in `updates` must conform to the following: - - If `i` is present in `update_window_dims` (i.e. equal to - `update_window_dims`[`k`] for some `k`), then the bound of dimension - `i` in `updates` must not exceed the corresponding bound of `operand` - after accounting for the `inserted_window_dims` (i.e. - `adjusted_window_bounds`[`k`], where `adjusted_window_bounds` contains - the bounds of `operand` with the bounds at indices - `inserted_window_dims` removed). - - If `i` is present in `update_scatter_dims` (i.e. equal to - `update_scatter_dims`[`k`] for some `k`), then the bound of dimension - `i` in `updates` must be equal to the corresponding bound of - `scatter_indices`, skipping `index_vector_dim` (i.e. - `scatter_indices.shape.dims`[`k`], if `k` < `index_vector_dim` and - `scatter_indices.shape.dims`[`k+1`] otherwise). 
- - - `update_window_dims` must be in ascending order, not have any repeating - dimension numbers, and be in the range `[0, updates.rank)`. - - - `inserted_window_dims` must be in ascending order, not have any - repeating dimension numbers, and be in the range `[0, operand.rank)`. - - - `scatter_dims_to_operand_dims.size` must be equal to - `scatter_indices`[`index_vector_dim`], and its values must be in the range - `[0, operand.rank)`. - -For a given index `U` in the `updates` tensor, the corresponding index `I` in -the `operand` tensor into which this update has to be applied is computed as -follows: - - 1. Let `G` = { `U`[`k`] for `k` in `update_scatter_dims` }. Use `G` to look up - an index vector `S` in the `scatter_indices` tensor such that `S`[`i`] = - `scatter_indices`[Combine(`G`, `i`)] where Combine(A, b) inserts b at - positions `index_vector_dim` into A. - 2. Create an index `S``in` into `operand` using `S` by scattering - `S` using the `scatter_dims_to_operand_dims` map. More formally: - 1. `S``in`[`scatter_dims_to_operand_dims`[`k`]] = `S`[`k`] if - `k` < `scatter_dims_to_operand_dims.size`. - 2. `S``in`[`_`] = `0` otherwise. - 3. Create an index `W``in` into `operand` by scattering the indices - at `update_window_dims` in `U` according to `inserted_window_dims`. - More formally: - 1. `W``in`[`window_dims_to_operand_dims`(`k`)] = `U`[`k`] if - `k` < `update_window_dims.size`, where `window_dims_to_operand_dims` - is the monotonic function with domain [`0`, `update_window_dims.size`) - and range [`0`, `operand.rank`) \\ `inserted_window_dims`. (For - example, if `update_window_dims.size` is `4`, `operand.rank` is `6`, - and `inserted_window_dims` is {`0`, `2`} then - `window_dims_to_operand_dims` is {`0`→`1`, `1`→`3`, `2`→`4`, - `3`→`5`}). - 2. `W``in`[`_`] = `0` otherwise. - 4. `I` is `W``in` + `S``in` where + is element-wise - addition. - -In summary, the scatter operation can be defined as follows. - - - Initialize `output` with `operand`, i.e. 
for all indices `O` in the - `operand` tensor:\ - `output`[`O`] = `operand`[`O`] - - For every index `U` in the `updates` tensor and the corresponding index `O` - in the `operand` tensor:\ - `output`[`O`] = `update_computation`(`output`[`O`], `updates`[`U`]) - -The order in which updates are applied is non-deterministic. So, when multiple -indices in `updates` refer to the same index in `operand`, the corresponding -value in `output` will be non-deterministic. - -Note that the first parameter that is passed into the `update_computation` will -always be the current value from the `output` tensor and the second parameter -will always be the value from the `updates` tensor. This is important -specifically for cases when the `update_computation` is _not commutative_. - -Informally, the scatter op can be viewed as an _inverse_ of the gather op, i.e. -the scatter op updates the elements in the input that are extracted by the -corresponding gather op. - -For a detailed informal description and examples, refer to the -"Informal Description" section under `Gather`. - -## Select - -See also -[`XlaBuilder::Select`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h). - -Constructs an output array from elements of two input arrays, based on the -values of a predicate array. - - `Select(pred, on_true, on_false)` - -Arguments | Type | Semantics ----------- | ------- | ------------------ -`pred` | `XlaOp` | array of type PRED -`on_true` | `XlaOp` | array of type T -`on_false` | `XlaOp` | array of type T - -The arrays `on_true` and `on_false` must have the same shape. This is also the -shape of the output array. The array `pred` must have the same dimensionality as -`on_true` and `on_false`, with the `PRED` element type. - -For each element `P` of `pred`, the corresponding element of the output array is -taken from `on_true` if the value of `P` is `true`, and from `on_false` if the -value of `P` is `false`. 
As a restricted form of [broadcasting] -(broadcasting.md), `pred` can be a scalar of type `PRED`. In this case, the -output array is taken wholly from `on_true` if `pred` is `true`, and from -`on_false` if `pred` is `false`. - -Example with non-scalar `pred`: - -``` -let pred: PRED[4] = {true, false, false, true}; -let v1: s32[4] = {1, 2, 3, 4}; -let v2: s32[4] = {100, 200, 300, 400}; -==> -Select(pred, v1, v2) = s32[4]{1, 200, 300, 4}; -``` - -Example with scalar `pred`: - -``` -let pred: PRED = true; -let v1: s32[4] = {1, 2, 3, 4}; -let v2: s32[4] = {100, 200, 300, 400}; -==> -Select(pred, v1, v2) = s32[4]{1, 2, 3, 4}; -``` - -Selections between tuples are supported. Tuples are considered to be scalar -types for this purpose. If `on_true` and `on_false` are tuples (which must have -the same shape!) then `pred` has to be a scalar of type `PRED`. - -## SelectAndScatter - -See also -[`XlaBuilder::SelectAndScatter`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h). - -This operation can be considered as a composite operation that first computes -`ReduceWindow` on the `operand` array to select an element from each window, and -then scatters the `source` array to the indices of the selected elements to -construct an output array with the same shape as the operand array. The binary -`select` function is used to select an element from each window by applying it -across each window, and it is called with the property that the first -parameter's index vector is lexicographically less than the second parameter's -index vector. The `select` function returns `true` if the first parameter is -selected and returns `false` if the second parameter is selected, and the -function must hold transitivity (i.e., if `select(a, b)` and `select(b, c)` are -`true`, then `select(a, c)` is also `true`) so that the selected element does -not depend on the order of the elements traversed for a given window. 
- -The function `scatter` is applied at each selected index in the output array. It -takes two scalar parameters: - -1. Current value at the selected index in the output array -2. The scatter value from `source` that applies to the selected index - -It combines the two parameters and returns a scalar value that's used to update -the value at the selected index in the output array. Initially, all indices of -the output array are set to `init_value`. - -The output array has the same shape as the `operand` array and the `source` -array must have the same shape as the result of applying a `ReduceWindow` -operation on the `operand` array. `SelectAndScatter` can be used to -backpropagate the gradient values for a pooling layer in a neural network. - -`SelectAndScatter(operand, select, window_dimensions, window_strides, -padding, source, init_value, scatter)` - -| Arguments | Type | Semantics | -| ------------------- | ------------------- | -------------------------------- | -| `operand` | `XlaOp` | array of type T over which the | -: : : windows slide : -| `select` | `XlaComputation` | binary computation of type `T, T | -: : : -> PRED`, to apply to all : -: : : elements in each window; returns : -: : : `true` if the first parameter is : -: : : selected and returns `false` if : -: : : the second parameter is selected : -| `window_dimensions` | `ArraySlice` | array of integers for window | -: : : dimension values : -| `window_strides` | `ArraySlice` | array of integers for window | -: : : stride values : -| `padding` | `Padding` | padding type for window | -: : : (Padding\:\:kSame or : -: : : Padding\:\:kValid) : -| `source` | `XlaOp` | array of type T with the values | -: : : to scatter : -| `init_value` | `XlaOp` | scalar value of type T for the | -: : : initial value of the output : -: : : array : -| `scatter` | `XlaComputation` | binary computation of type `T, T | -: : : -> T`, to apply each scatter : -: : : source element with its : -: : : destination element : - -The 
figure below shows examples of using `SelectAndScatter`, with the `select` -function computing the maximal value among its parameters. Note that when the -windows overlap, as in the figure (2) below, an index of the `operand` array may -be selected multiple times by different windows. In the figure, the element of -value 9 is selected by both of the top windows (blue and red) and the binary -addition `scatter` function produces the output element of value 8 (2 + 6). - -
- -
- -The evaluation order of the `scatter` function is arbitrary and may be -non-deterministic. Therefore, the `scatter` function should not be overly -sensitive to reassociation. See the discussion about associativity in the -context of [`Reduce`](#reduce) for more details. - -## Send - -See also -[`XlaBuilder::Send`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h). - - `Send(operand, channel_handle)` - -Arguments | Type | Semantics ----------------- | --------------- | ----------------------------------------- -`operand` | `XlaOp` | data to send (array of type T) -`channel_handle` | `ChannelHandle` | unique identifier for each send/recv pair - -Sends the given operand data to a `Recv` instruction in another computation -that shares the same channel handle. Does not return any data. - -Similar to the `Recv` operation, the client API of `Send` operation represents -synchronous communication, and is internally decomposed into 2 HLO instructions -(`Send` and `SendDone`) to enable asynchronous data transfers. See also -[`HloInstruction::CreateSend` and `HloInstruction::CreateSendDone`](https://www.tensorflow.org/code/tensorflow/compiler/xla/service/hlo_instruction.h). - -`Send(HloInstruction operand, int64 channel_id)` - -Initiates an asynchronous transfer of the operand to the resources allocated by -the `Recv` instruction with the same channel id. Returns a context, which is -used by a following `SendDone` instruction to wait for the completion of the -data transfer. The context is a tuple of {operand (shape), request identifier -(U32)} and it can only be used by a `SendDone` instruction. - - `SendDone(HloInstruction context)` - -Given a context created by a `Send` instruction, waits for the data transfer to -complete. The instruction does not return any data. - - Scheduling of channel instructions - -The execution order of the 4 instructions for each channel (`Recv`, `RecvDone`, -`Send`, `SendDone`) is as below. - -
- -
- -* `Recv` happens before `Send` -* `Send` happens before `RecvDone` -* `Recv` happens before `RecvDone` -* `Send` happens before `SendDone` - -When the backend compilers generate a linear schedule for each computation that -communicates via channel instructions, there must not be cycles across the -computations. For example, below schedules lead to deadlocks. - -
- -
- -## Slice - -See also -[`XlaBuilder::Slice`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h). - -Slicing extracts a sub-array from the input array. The sub-array is of the same -rank as the input and contains the values inside a bounding box within the input -array where the dimensions and indices of the bounding box are given as -arguments to the slice operation. - - `Slice(operand, start_indices, limit_indices)` - -| Arguments | Type | Semantics | -| --------------- | ------------------- | ------------------------------------ | -| `operand` | `XlaOp` | N dimensional array of type T | -| `start_indices` | `ArraySlice` | List of N integers containing the | -: : : starting indices of the slice for : -: : : each dimension. Values must be : -: : : greater than or equal to zero. : -| `limit_indices` | `ArraySlice` | List of N integers containing the | -: : : ending indices (exclusive) for the : -: : : slice for each dimension. Each value : -: : : must be greater than or equal to the : -: : : respective `start_indices` value for : -: : : the dimension and less than or equal : -: : : to the size of the dimension. : - -1-dimensional example: - -``` -let a = {0.0, 1.0, 2.0, 3.0, 4.0} -Slice(a, {2}, {4}) produces: - {2.0, 3.0} -``` - -2-dimensional example: - -``` -let b = - { {0.0, 1.0, 2.0}, - {3.0, 4.0, 5.0}, - {6.0, 7.0, 8.0}, - {9.0, 10.0, 11.0} } - -Slice(b, {2, 1}, {4, 3}) produces: - { { 7.0, 8.0}, - {10.0, 11.0} } -``` - -## Sort - -See also -[`XlaBuilder::Sort`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h). - -There are two versions of the Sort instruction: a single-operand and a -two-operand version. - -`Sort(operand)` - -Arguments | Type | Semantics ------------ | ------- | -------------------- -`operand` | `XlaOp` | The operand to sort. -`dimension` | `int64` | The dimension along which to sort. - -Sorts the elements in the operand in ascending order along the provided -dimension. 
For example, for a rank-2 (matrix) operand, a `dimension` value of 0 -will sort each column independently, and a `dimension` value of 1 will sort each -row independently. If the operand's elements have floating point type, and the -operand contains NaN elements, the order of elements in the output is -implementation-defined. - -`Sort(key, value)` - -Sorts both the key and the value operands. The keys are sorted as in the -single-operand version. The values are sorted according to the order of their -corresponding keys. For example, if the inputs are `keys = [3, 1]` and -`values = [42, 50]`, then the output of the sort is the tuple -`{[1, 3], [50, 42]}`. - -The sort is not guaranteed to be stable, that is, if the keys array contains -duplicates, the order of their corresponding values may not be preserved. - -Arguments | Type | Semantics ------------ | ------- | ------------------- -`keys` | `XlaOp` | The sort keys. -`values` | `XlaOp` | The values to sort. -`dimension` | `int64` | The dimension along which to sort. - -The `keys` and `values` must have the same dimensions, but may have different -element types. - -## Transpose - -See also the `tf.reshape` operation. - -`Transpose(operand)` - -Arguments | Type | Semantics -------------- | ------------------- | ------------------------------ -`operand` | `XlaOp` | The operand to transpose. -`permutation` | `ArraySlice` | How to permute the dimensions. - - -Permutes the operand dimensions with the given permutation, so -`∀ i . 0 ≤ i < rank ⇒ input_dimensions[permutation[i]] = output_dimensions[i]`. - -This is the same as Reshape(operand, permutation, - Permute(permutation, operand.shape.dimensions)). - -## Tuple - -See also -[`XlaBuilder::Tuple`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h). - -A tuple containing a variable number of data handles, each of which has its own -shape. - -This is analogous to `std::tuple` in C++. 
Conceptually: - -``` -let v: f32[10] = f32[10]{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; -let s: s32 = 5; -let t: (f32[10], s32) = tuple(v, s); -``` - -Tuples can be deconstructed (accessed) via the [`GetTupleElement`] -(#gettupleelement) operation. - -## While - -See also -[`XlaBuilder::While`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h). - - `While(condition, body, init)` - -| Arguments | Type | Semantics | -| ----------- | ---------------- | ---------------------------------------- | -| `condition` | `XlaComputation` | XlaComputation of type `T -> PRED` which | -: : : defines the termination condition of the : -: : : loop. : -| `body` | `XlaComputation` | XlaComputation of type `T -> T` which | -: : : defines the body of the loop. : -| `init` | `T` | Initial value for the parameter of | -: : : `condition` and `body`. : - -Sequentially executes the `body` until the `condition` fails. This is similar to -a typical while loop in many other languages except for the differences and -restrictions listed below. - -* A `While` node returns a value of type `T`, which is the result from the - last execution of the `body`. -* The shape of the type `T` is statically determined and must be the same - across all iterations. - -The T parameters of the computations are initialized with the `init` value in -the first iteration and are automatically updated to the new result from `body` -in each subsequent iteration. - -One main use case of the `While` node is to implement the repeated execution of -training in neural networks. Simplified pseudocode is shown below with a graph -that represents the computation. The code can be found in -[`while_test.cc`](https://www.tensorflow.org/code/tensorflow/compiler/xla/tests/while_test.cc). -The type `T` in this example is a `Tuple` consisting of an `int32` for the -iteration count and a `vector[10]` for the accumulator. For 1000 iterations, the -loop keeps adding a constant vector to the accumulator. 
- -``` -// Pseudocode for the computation. -init = {0, zero_vector[10]} // Tuple of int32 and float[10]. -result = init; -while (result(0) < 1000) { - iteration = result(0) + 1; - new_vector = result(1) + constant_vector[10]; - result = {iteration, new_vector}; -} -``` - -
- -
diff --git a/tensorflow/docs_src/performance/xla/shapes.md b/tensorflow/docs_src/performance/xla/shapes.md deleted file mode 100644 index 39e74ff307..0000000000 --- a/tensorflow/docs_src/performance/xla/shapes.md +++ /dev/null @@ -1,150 +0,0 @@ -# Shapes and Layout - -The XLA `Shape` proto -([xla_data.proto](https://www.tensorflow.org/code/tensorflow/compiler/xla/xla_data.proto)) -describes the rank, size, and data type of an N-dimensional array (*array* in -short). - -## Terminology, Notation, and Conventions - -* The rank of an array is equal to the number of dimensions. The *true rank* - of an array is the number of dimensions which have a size greater than 1. - -* Dimensions are numbered from `0` up to `N-1` for an `N` dimensional array. - The dimension numbers are arbitrary labels for convenience. The order of - these dimension numbers does not imply a particular minor/major ordering in - the layout of the shape. The layout is determined by the `Layout` proto. - -* By convention, dimensions are listed in increasing order of dimension - number. For example, for a 3-dimensional array of size `[A x B x C]`, - dimension 0 has size `A`, dimension 1 has size `B` and dimension 2 has size - `C`. - - Some utilities in XLA also support negative indexing, similarly to Python; - dimension -1 is the last dimension (equivalent to `N-1` for an `N` - dimensional array). For example, for the 3-dimensional array described - above, dimension -1 has size `C`, dimension -2 has size `B` and so on. - -* Two, three, and four dimensional arrays often have specific letters - associated with dimensions. For example, for a 2D array: - - * dimension 0: `y` - * dimension 1: `x` - - For a 3D array: - - * dimension 0: `z` - * dimension 1: `y` - * dimension 2: `x` - - For a 4D array: - - * dimension 0: `p` - * dimension 1: `z` - * dimension 2: `y` - * dimension 3: `x` - -* Functions in the XLA API which take dimensions do so in increasing order of - dimension number. 
This matches the ordering used when passing dimensions as - an `initializer_list`; e.g. - - `ShapeUtil::MakeShape(F32, {A, B, C, D})` - - Will create a shape whose dimension size array consists of the sequence - `[A, B, C, D]`. - -## Layout - -The `Layout` proto describes how an array is represented in memory. The `Layout` -proto includes the following fields: - -``` -message Layout { - repeated int64 minor_to_major = 1; - repeated int64 padded_dimensions = 2; - optional PaddingValue padding_value = 3; -} -``` - -### Minor-to-major dimension ordering - -The only required field is `minor_to_major`. This field describes the -minor-to-major ordering of the dimensions within a shape. Values in -`minor_to_major` are an ordering of the dimensions of the array (`0` to `N-1` -for an `N` dimensional array) with the first value being the most-minor -dimension up to the last value which is the most-major dimension. The most-minor -dimension is the dimension which changes most rapidly when stepping through the -elements of the array laid out in linear memory. - -For example, consider the following 2D array of size `[2 x 3]`: - -``` -a b c -d e f -``` - -Here dimension `0` is size 2, and dimension `1` is size 3. If the -`minor_to_major` field in the layout is `[0, 1]` then dimension `0` is the -most-minor dimension and dimension `1` is the most-major dimension. This -corresponds to the following layout in linear memory: - -``` -a d b e c f -``` - -This minor-to-major dimension order of `0` up to `N-1` is akin to *column-major* -(at rank 2). Assuming a monotonic ordering of dimensions, another name we may -use to refer to this layout in the code is simply "dim 0 is minor". - -On the other hand, if the `minor_to_major` field in the layout is `[1, 0]` then -the layout in linear memory is: - -``` -a b c d e f -``` - -A minor-to-major dimension order of `N-1` down to `0` for an `N` dimensional -array is akin to *row-major* (at rank 2). 
Assuming a monotonic ordering of -dimensions, another name we may use to refer to this layout in the code is -simply "dim 0 is major". - -#### Default minor-to-major ordering - -The default layout for newly created Shapes is "dimension order is -major-to-minor" (akin to row-major at rank 2). - -### Padding - -Padding is defined in the optional `padded_dimensions` and `padding_value` -fields. The field `padded_dimensions` describes the sizes (widths) to which each -dimension is padded. If present, the number of elements in `padded_dimensions` -must equal the rank of the shape. - -For example, given the `[2 x 3]` array defined above, if `padded_dimensions` is -`[3, 5]` then dimension 0 is padded to a width of 3 and dimension 1 is padded to -a width of 5. The layout in linear memory (assuming a padding value of 0 and -column-major layout) is: - -``` -a d 0 b e 0 c f 0 0 0 0 0 0 0 -``` - -This is equivalent to the layout of the following array with the same -minor-to-major dimension order: - -``` -a b c 0 0 -d e f 0 0 -0 0 0 0 0 -``` - -### Indexing into arrays - -The class `IndexUtil` in -[index_util.h](https://www.tensorflow.org/code/tensorflow/compiler/xla/index_util.h) -provides utilities for converting between multidimensional indices and linear -indices given a shape and layout. Multidimensional indices include an `int64` -index for each dimension. Linear indices are a single `int64` value which -indexes into the buffer holding the array. See `shape_util.h` and -`layout_util.h` in the same directory for utilities that simplify creation and -manipulation of shapes and layouts. diff --git a/tensorflow/docs_src/performance/xla/tfcompile.md b/tensorflow/docs_src/performance/xla/tfcompile.md deleted file mode 100644 index 2e0f3774c4..0000000000 --- a/tensorflow/docs_src/performance/xla/tfcompile.md +++ /dev/null @@ -1,281 +0,0 @@ -# Using AOT compilation - -## What is tfcompile? 
- -`tfcompile` is a standalone tool that ahead-of-time (AOT) compiles TensorFlow -graphs into executable code. It can reduce total binary size, and also avoid -some runtime overheads. A typical use-case of `tfcompile` is to compile an -inference graph into executable code for mobile devices. - -The TensorFlow graph is normally executed by the TensorFlow runtime. This incurs -some runtime overhead for execution of each node in the graph. This also leads -to a larger total binary size, since the code for the TensorFlow runtime needs -to be available, in addition to the graph itself. The executable code produced -by `tfcompile` does not use the TensorFlow runtime, and only has dependencies on -kernels that are actually used in the computation. - -The compiler is built on top of the XLA framework. The code bridging TensorFlow -to the XLA framework resides under -[tensorflow/compiler](https://www.tensorflow.org/code/tensorflow/compiler/), -which also includes support for [just-in-time (JIT) compilation](../../performance/xla/jit.md) of -TensorFlow graphs. - -## What does tfcompile do? - -`tfcompile` takes a subgraph, identified by the TensorFlow concepts of -feeds and fetches, and generates a function that implements that subgraph. -The `feeds` are the input arguments for the function, and the `fetches` are the -output arguments for the function. All inputs must be fully specified by the -feeds; the resulting pruned subgraph cannot contain Placeholder or Variable -nodes. It is common to specify all Placeholders and Variables as feeds, which -ensures the resulting subgraph no longer contains these nodes. The generated -function is packaged as a `cc_library`, with a header file exporting the -function signature, and an object file containing the implementation. The user -writes code to invoke the generated function as appropriate. 
- -## Using tfcompile - -This section details high level steps for generating an executable binary with -`tfcompile` from a TensorFlow subgraph. The steps are: - -* Step 1: Configure the subgraph to compile -* Step 2: Use the `tf_library` build macro to compile the subgraph -* Step 3: Write code to invoke the subgraph -* Step 4: Create the final binary - -### Step 1: Configure the subgraph to compile - -Identify the feeds and fetches that correspond to the input and output -arguments for the generated function. Then configure the `feeds` and `fetches` -in a [`tensorflow.tf2xla.Config`](https://www.tensorflow.org/code/tensorflow/compiler/tf2xla/tf2xla.proto) -proto. - -```textproto -# Each feed is a positional input argument for the generated function. The order -# of each entry matches the order of each input argument. Here “x_hold” and “y_hold” -# refer to the names of placeholder nodes defined in the graph. -feed { - id { node_name: "x_hold" } - shape { - dim { size: 2 } - dim { size: 3 } - } -} -feed { - id { node_name: "y_hold" } - shape { - dim { size: 3 } - dim { size: 2 } - } -} - -# Each fetch is a positional output argument for the generated function. The order -# of each entry matches the order of each output argument. Here “x_y_prod” -# refers to the name of a matmul node defined in the graph. -fetch { - id { node_name: "x_y_prod" } -} -``` - -### Step 2: Use tf_library build macro to compile the subgraph - -This step converts the graph into a `cc_library` using the `tf_library` build -macro. The `cc_library` consists of an object file containing the code generated -from the graph, along with a header file that gives access to the generated -code. `tf_library` utilizes `tfcompile` to compile the TensorFlow graph into -executable code. - -```build -load("//tensorflow/compiler/aot:tfcompile.bzl", "tf_library") - -# Use the tf_library macro to compile your graph into executable code. 
-tf_library( - # name is used to generate the following underlying build rules: - # <name> : cc_library packaging the generated header and object files - # <name>_test : cc_test containing a simple test and benchmark - # <name>_benchmark : cc_binary containing a stand-alone benchmark with minimal deps; - # can be run on a mobile device - name = "test_graph_tfmatmul", - # cpp_class specifies the name of the generated C++ class, with namespaces allowed. - # The class will be generated in the given namespace(s), or if no namespaces are - # given, within the global namespace. - cpp_class = "foo::bar::MatMulComp", - # graph is the input GraphDef proto, by default expected in binary format. To - # use the text format instead, just use the ‘.pbtxt’ suffix. A subgraph will be - # created from this input graph, with feeds as inputs and fetches as outputs. - # No Placeholder or Variable ops may exist in this subgraph. - graph = "test_graph_tfmatmul.pb", - # config is the input Config proto, by default expected in binary format. To - # use the text format instead, use the ‘.pbtxt’ suffix. This is where the - # feeds and fetches were specified above, in the previous step. - config = "test_graph_tfmatmul.config.pbtxt", -) -``` - -> To generate the GraphDef proto (test_graph_tfmatmul.pb) for this example, run -> [make_test_graphs.py](https://www.tensorflow.org/code/tensorflow/compiler/aot/tests/make_test_graphs.py) -> and specify the output location with the --out_dir flag. - -Typical graphs contain [`Variables`](../../api_guides/python/state_ops.md) -representing the weights that are learned via training, but `tfcompile` cannot -compile a subgraph that contains `Variables`. The -[freeze_graph.py](https://www.tensorflow.org/code/tensorflow/python/tools/freeze_graph.py) -tool converts variables into constants, using values stored in a checkpoint -file. As a convenience, the `tf_library` macro supports the `freeze_checkpoint` -argument, which runs the tool. 
For more examples see -[tensorflow/compiler/aot/tests/BUILD](https://www.tensorflow.org/code/tensorflow/compiler/aot/tests/BUILD). - -> Constants that show up in the compiled subgraph are compiled directly into the -> generated code. To pass the constants into the generated function, rather than -> having them compiled-in, simply pass them in as feeds. - -For details on the `tf_library` build macro, see -[tfcompile.bzl](https://www.tensorflow.org/code/tensorflow/compiler/aot/tfcompile.bzl). - -For details on the underlying `tfcompile` tool, see -[tfcompile_main.cc](https://www.tensorflow.org/code/tensorflow/compiler/aot/tfcompile_main.cc). - -### Step 3: Write code to invoke the subgraph - -This step uses the header file (`test_graph_tfmatmul.h`) generated by the -`tf_library` build macro in the previous step to invoke the generated code. The -header file is located in the `bazel-genfiles` directory corresponding to the -build package, and is named based on the name attribute set on the `tf_library` -build macro. For example, the header generated for `test_graph_tfmatmul` would -be `test_graph_tfmatmul.h`. Below is an abbreviated version of what is -generated. The generated file, in `bazel-genfiles`, contains additional useful -comments. - -```c++ -namespace foo { -namespace bar { - -// MatMulComp represents a computation previously specified in a -// TensorFlow graph, now compiled into executable code. -class MatMulComp { - public: - // AllocMode controls the buffer allocation mode. - enum class AllocMode { - ARGS_RESULTS_AND_TEMPS, // Allocate arg, result and temp buffers - RESULTS_AND_TEMPS_ONLY, // Only allocate result and temp buffers - }; - - MatMulComp(AllocMode mode = AllocMode::ARGS_RESULTS_AND_TEMPS); - ~MatMulComp(); - - // Runs the computation, with inputs read from arg buffers, and outputs - // written to result buffers. Returns true on success and false on failure. - bool Run(); - - // Arg methods for managing input buffers. 
Buffers are in row-major order. - // There is a set of methods for each positional argument. - void** args(); - - void set_arg0_data(float* data); - float* arg0_data(); - float& arg0(size_t dim0, size_t dim1); - - void set_arg1_data(float* data); - float* arg1_data(); - float& arg1(size_t dim0, size_t dim1); - - // Result methods for managing output buffers. Buffers are in row-major order. - // Must only be called after a successful Run call. There is a set of methods - // for each positional result. - void** results(); - - - float* result0_data(); - float& result0(size_t dim0, size_t dim1); -}; - -} // end namespace bar -} // end namespace foo -``` - -The generated C++ class is called `MatMulComp` in the `foo::bar` namespace, -because that was the `cpp_class` specified in the `tf_library` macro. All -generated classes have a similar API, with the only difference being the methods -to handle arg and result buffers. Those methods differ based on the number and -types of the buffers, which were specified by the `feed` and `fetch` arguments -to the `tf_library` macro. - -There are three types of buffers managed within the generated class: `args` -representing the inputs, `results` representing the outputs, and `temps` -representing temporary buffers used internally to perform the computation. By -default, each instance of the generated class allocates and manages all of these -buffers for you. The `AllocMode` constructor argument may be used to change this -behavior. All buffers are aligned to 64-byte boundaries. - -The generated C++ class is just a wrapper around the low-level code generated by -XLA. 
- -Example of invoking the generated function based on -[`tfcompile_test.cc`](https://www.tensorflow.org/code/tensorflow/compiler/aot/tests/tfcompile_test.cc): - -```c++ -#define EIGEN_USE_THREADS -#define EIGEN_USE_CUSTOM_THREAD_POOL - -#include <iostream> -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" -#include "tensorflow/compiler/aot/tests/test_graph_tfmatmul.h" // generated - -int main(int argc, char** argv) { - Eigen::ThreadPool tp(2); // Size the thread pool as appropriate. - Eigen::ThreadPoolDevice device(&tp, tp.NumThreads()); - - - foo::bar::MatMulComp matmul; - matmul.set_thread_pool(&device); - - // Set up args and run the computation. - const float args[12] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; - std::copy(args + 0, args + 6, matmul.arg0_data()); - std::copy(args + 6, args + 12, matmul.arg1_data()); - matmul.Run(); - - // Check result - if (matmul.result0(0, 0) == 58) { - std::cout << "Success" << std::endl; - } else { - std::cout << "Failed. Expected value 58 at 0,0. Got:" - << matmul.result0(0, 0) << std::endl; - } - - return 0; -} -``` - -### Step 4: Create the final binary - -This step combines the library generated by `tf_library` in step 2 and the code -written in step 3 to create a final binary. Below is an example `bazel` BUILD -file. - -```build -# Example of linking your binary -# Also see //tensorflow/compiler/aot/tests/BUILD -load("//tensorflow/compiler/aot:tfcompile.bzl", "tf_library") - -# The same tf_library call from step 2 above. -tf_library( - name = "test_graph_tfmatmul", - ... -) - -# The executable code generated by tf_library can then be linked into your code. 
-cc_binary( - name = "my_binary", - srcs = [ - "my_code.cc", # include test_graph_tfmatmul.h to access the generated header - ], - deps = [ - ":test_graph_tfmatmul", # link in the generated object file - "//third_party/eigen3", - ], - linkopts = [ - "-lpthread", - ] -) -``` diff --git a/tensorflow/docs_src/tutorials/_index.yaml b/tensorflow/docs_src/tutorials/_index.yaml deleted file mode 100644 index 9534114689..0000000000 --- a/tensorflow/docs_src/tutorials/_index.yaml +++ /dev/null @@ -1,202 +0,0 @@ -project_path: /_project.yaml -book_path: /_book.yaml -description: -landing_page: - custom_css_path: /site-assets/css/style.css - show_side_navs: True - rows: - - description: > -

Get Started with TensorFlow

-

- TensorFlow is an open-source machine learning library for research and - production. TensorFlow offers APIs for beginners and experts to develop - for desktop, mobile, web, and cloud. See the sections below to get - started. -

- items: - - custom_html: > -
-

Learn and use ML

-
-

- The high-level Keras API provides building blocks to create and - train deep learning models. Start with these beginner-friendly - notebook examples, then read the - TensorFlow Keras guide. -

-
    -
  1. Basic classification
  2. -
  3. Text classification
  4. -
  5. Regression
  6. -
  7. Overfitting and underfitting
  8. -
  9. Save and load
  10. -
-
- -
- - classname: tfo-landing-row-item-code-block - code_block: | -
-        import tensorflow as tf
-        mnist = tf.keras.datasets.mnist
-
-        (x_train, y_train),(x_test, y_test) = mnist.load_data()
-        x_train, x_test = x_train / 255.0, x_test / 255.0
-
-        model = tf.keras.models.Sequential([
-          tf.keras.layers.Flatten(),
-          tf.keras.layers.Dense(512, activation=tf.nn.relu),
-          tf.keras.layers.Dropout(0.2),
-          tf.keras.layers.Dense(10, activation=tf.nn.softmax)
-        ])
-        model.compile(optimizer='adam',
-                      loss='sparse_categorical_crossentropy',
-                      metrics=['accuracy'])
-
-        model.fit(x_train, y_train, epochs=5)
-        model.evaluate(x_test, y_test)
-        
- {% dynamic if request.tld != 'cn' %} - Run in a Notebook - {% dynamic endif %} - - - items: - - custom_html: > -
-

Research and experimentation

-
-

- Eager execution provides an imperative, define-by-run interface for advanced operations. Write custom layers, forward passes, and training loops with auto‑differentiation. Start with - these notebooks, then read the eager execution guide. -

-
    -
  1. - {% dynamic if request.tld == 'cn' %} - Eager execution basics - {% dynamic else %} - Eager execution basics - {% dynamic endif %} -
  2. -
  3. - {% dynamic if request.tld == 'cn' %} - Automatic differentiation and gradient tape - {% dynamic else %} - Automatic differentiation and gradient tape - {% dynamic endif %} -
  4. -
  5. - {% dynamic if request.tld == 'cn' %} - Custom training: basics - {% dynamic else %} - Custom training: basics - {% dynamic endif %} -
  6. -
  7. - {% dynamic if request.tld == 'cn' %} - Custom layers - {% dynamic else %} - Custom layers - {% dynamic endif %} -
  8. -
  9. Custom training: walkthrough
  10. -
  11. - {% dynamic if request.tld == 'cn' %} - Example: Neural machine translation w/ attention - {% dynamic else %} - Example: Neural machine translation w/ attention - {% dynamic endif %} -
  12. -
-
- -
- - custom_html: > -
-

ML at production scale

-
-

- Estimators can train large models on multiple machines in a - production environment. TensorFlow provides a collection of - pre-made Estimators to implement common ML algorithms. See the - Estimators guide. -

-
    -
  1. Build a linear model with Estimators
  2. -
  3. Wide and deep learning with Estimators
  4. -
  5. Boosted trees
  6. -
  7. How to build a simple text classifier with TF-Hub
  8. -
  9. Build a Convolutional Neural Network using Estimators
  10. -
-
- -
- - - description: > -

Google Colab: An easy way to learn and use TensorFlow

-

- Colaboratory - is a Google research project created to help disseminate machine learning - education and research. It's a Jupyter notebook environment that requires - no setup to use and runs entirely in the cloud. - Read the blog post. -

- - - description: > -

Build your first ML app

-

Create and deploy TensorFlow models on web and mobile.

- background: grey - items: - - custom_html: > -
- -

Web developers

-
-
- TensorFlow.js is a WebGL accelerated, JavaScript library to train and - deploy ML models in the browser and for Node.js. -
-
- - custom_html: > -
- -

Mobile developers

-
-
- TensorFlow Lite is a lightweight solution for mobile and embedded devices. -
-
- - - description: > -

Videos and updates

-

- Subscribe to the TensorFlow - YouTube channel - and blog for - the latest videos and updates. -

- items: - - description: > -

Get started with TensorFlow's High-Level APIs

- youtube_id: tjsHSIG8I08 - buttons: - - label: Watch the video - path: https://www.youtube.com/watch?v=tjsHSIG8I08 - - description: > -

Eager execution

- youtube_id: T8AW0fKP0Hs - background: grey - buttons: - - label: Watch the video - path: https://www.youtube.com/watch?v=T8AW0fKP0Hs - - description: > -

tf.data: Fast, flexible, and easy-to-use input pipelines

- youtube_id: uIcqeP7MFH0 - buttons: - - label: Watch the video - path: https://www.youtube.com/watch?v=uIcqeP7MFH0 diff --git a/tensorflow/docs_src/tutorials/_toc.yaml b/tensorflow/docs_src/tutorials/_toc.yaml deleted file mode 100644 index c0b85497e0..0000000000 --- a/tensorflow/docs_src/tutorials/_toc.yaml +++ /dev/null @@ -1,128 +0,0 @@ -toc: -- title: Get started with TensorFlow - path: /tutorials/ - -- title: Learn and use ML - style: accordion - section: - - title: Overview - path: /tutorials/keras/ - - title: Basic classification - path: /tutorials/keras/basic_classification - - title: Text classification - path: /tutorials/keras/basic_text_classification - - title: Regression - path: /tutorials/keras/basic_regression - - title: Overfitting and underfitting - path: /tutorials/keras/overfit_and_underfit - - title: Save and restore models - path: /tutorials/keras/save_and_restore_models - -- title: Research and experimentation - style: accordion - section: - - title: Overview - path: /tutorials/eager/ - - title: Eager execution - path: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/notebooks/eager_basics.ipynb - status: external - - title: Automatic differentiation - path: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/notebooks/automatic_differentiation.ipynb - status: external - - title: "Custom training: basics" - path: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/notebooks/custom_training.ipynb - status: external - - title: Custom layers - path: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/notebooks/custom_layers.ipynb - status: external - - title: "Custom training: walkthrough" - path: /tutorials/eager/custom_training_walkthrough - -- title: ML at production scale - style: accordion - section: - - title: Linear model with Estimators - path: 
/tutorials/estimators/linear - - title: Wide and deep learning - path: https://github.com/tensorflow/models/tree/master/official/wide_deep - status: external - - title: Boosted trees - path: https://github.com/tensorflow/models/tree/master/official/boosted_trees - status: external - - title: Text classifier with TF-Hub - path: /hub/tutorials/text_classification_with_tf_hub - - title: Build a CNN using Estimators - path: /tutorials/estimators/cnn - -- title: Generative models - style: accordion - section: - - title: Text generation - path: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb - status: external - - title: Translation with attention - path: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb - status: external - - title: Image captioning - path: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb - status: external - - title: DCGAN - path: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/generative_examples/dcgan.ipynb - status: external - - title: VAE - path: https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb - status: external - -- title: Images - style: accordion - section: - - title: Pix2Pix - path: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/pix2pix/pix2pix_eager.ipynb - status: external - - title: Neural Style Transfer - path: https://github.com/tensorflow/models/blob/master/research/nst_blogpost/4_Neural_Style_Transfer_with_Eager_Execution.ipynb - status: external - - title: Image Segmentation - path: 
https://github.com/tensorflow/models/blob/master/samples/outreach/blogs/segmentation_blogpost/image_segmentation.ipynb - status: external - - title: Image recognition - path: /tutorials/images/image_recognition - - title: Image retraining - path: /hub/tutorials/image_retraining - - title: Advanced CNN - path: /tutorials/images/deep_cnn - -- title: Sequences - style: accordion - section: - - title: Recurrent neural network - path: /tutorials/sequences/recurrent - - title: Drawing classification - path: /tutorials/sequences/recurrent_quickdraw - - title: Simple audio recognition - path: /tutorials/sequences/audio_recognition - - title: Neural machine translation - path: https://github.com/tensorflow/nmt - status: external - -- title: Data representation - style: accordion - section: - - title: Vector representations of words - path: /tutorials/representation/word2vec - - title: Kernel methods - path: /tutorials/representation/kernel_methods - - title: Large-scale linear models - path: /tutorials/representation/linear - -- title: Non-ML - style: accordion - section: - - title: Mandelbrot set - path: /tutorials/non-ml/mandelbrot - - title: Partial differential equations - path: /tutorials/non-ml/pdes - -- break: True -- title: Next steps - path: /tutorials/next_steps diff --git a/tensorflow/docs_src/tutorials/eager/custom_training_walkthrough.md b/tensorflow/docs_src/tutorials/eager/custom_training_walkthrough.md deleted file mode 100644 index b564a27ecf..0000000000 --- a/tensorflow/docs_src/tutorials/eager/custom_training_walkthrough.md +++ /dev/null @@ -1,3 +0,0 @@ -# Custom training: walkthrough - -[Colab notebook](https://colab.research.google.com/github/tensorflow/models/blob/master/samples/core/tutorials/eager/custom_training_walkthrough.ipynb) diff --git a/tensorflow/docs_src/tutorials/eager/index.md b/tensorflow/docs_src/tutorials/eager/index.md deleted file mode 100644 index 887c820b85..0000000000 --- a/tensorflow/docs_src/tutorials/eager/index.md +++ 
/dev/null @@ -1,12 +0,0 @@ -# Research and experimentation - -Eager execution provides an imperative, define-by-run interface for advanced -operations. Write custom layers, forward passes, and training loops with -auto differentiation. Start with these notebooks, then read the -[eager execution guide](../../guide/eager). - -1. [Eager execution](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/notebooks/eager_basics.ipynb){:.external} -2. [Automatic differentiation and gradient tape](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/notebooks/automatic_differentiation.ipynb){:.external} -3. [Custom training: basics](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/notebooks/custom_training.ipynb){:.external} -4. [Custom layers](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/notebooks/custom_layers.ipynb){:.external} -5. [Custom training: walkthrough](/tutorials/eager/custom_training_walkthrough) diff --git a/tensorflow/docs_src/tutorials/estimators/cnn.md b/tensorflow/docs_src/tutorials/estimators/cnn.md deleted file mode 100644 index 2fd69f50a0..0000000000 --- a/tensorflow/docs_src/tutorials/estimators/cnn.md +++ /dev/null @@ -1,694 +0,0 @@ -# Build a Convolutional Neural Network using Estimators - -The `tf.layers` module provides a high-level API that makes -it easy to construct a neural network. It provides methods that facilitate the -creation of dense (fully connected) layers and convolutional layers, adding -activation functions, and applying dropout regularization. In this tutorial, -you'll learn how to use `layers` to build a convolutional neural network model -to recognize the handwritten digits in the MNIST data set. 
- -![handwritten digits 0–9 from the MNIST data set](https://www.tensorflow.org/images/mnist_0-9.png) - -**The [MNIST dataset](http://yann.lecun.com/exdb/mnist/) comprises 60,000 -training examples and 10,000 test examples of the handwritten digits 0–9, -formatted as 28x28-pixel monochrome images.** - -## Getting Started - -Let's set up the skeleton for our TensorFlow program. Create a file called -`cnn_mnist.py`, and add the following code: - -```python -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# Imports -import numpy as np -import tensorflow as tf - -tf.logging.set_verbosity(tf.logging.INFO) - -# Our application logic will be added here - -if __name__ == "__main__": - tf.app.run() -``` - -As you work through the tutorial, you'll add code to construct, train, and -evaluate the convolutional neural network. The complete, final code can be -[found here](https://www.tensorflow.org/code/tensorflow/examples/tutorials/layers/cnn_mnist.py). - -## Intro to Convolutional Neural Networks - -Convolutional neural networks (CNNs) are the current state-of-the-art model -architecture for image classification tasks. CNNs apply a series of filters to -the raw pixel data of an image to extract and learn higher-level features, which -the model can then use for classification. CNNs contain three components: - -* **Convolutional layers**, which apply a specified number of convolution - filters to the image. For each subregion, the layer performs a set of - mathematical operations to produce a single value in the output feature map. - Convolutional layers then typically apply a - [ReLU activation function](https://en.wikipedia.org/wiki/Rectifier_\(neural_networks\)) to - the output to introduce nonlinearities into the model.
- -* **Pooling layers**, which - [downsample the image data](https://en.wikipedia.org/wiki/Convolutional_neural_network#Pooling_layer) - extracted by the convolutional layers to reduce the dimensionality of the - feature map in order to decrease processing time. A commonly used pooling - algorithm is max pooling, which extracts subregions of the feature map - (e.g., 2x2-pixel tiles), keeps their maximum value, and discards all other - values. - -* **Dense (fully connected) layers**, which perform classification on the - features extracted by the convolutional layers and downsampled by the - pooling layers. In a dense layer, every node in the layer is connected to - every node in the preceding layer. - -Typically, a CNN is composed of a stack of convolutional modules that perform -feature extraction. Each module consists of a convolutional layer followed by a -pooling layer. The last convolutional module is followed by one or more dense -layers that perform classification. The final dense layer in a CNN contains a -single node for each target class in the model (all the possible classes the -model may predict), with a -[softmax](https://en.wikipedia.org/wiki/Softmax_function) activation function to -generate a value between 0–1 for each node (the sum of all these softmax values -is equal to 1). We can interpret the softmax values for a given image as -relative measurements of how likely it is that the image falls into each target -class. - -> Note: For a more comprehensive walkthrough of CNN architecture, see Stanford -> University's -> Convolutional Neural Networks for Visual Recognition course materials.

- -## Building the CNN MNIST Classifier {#building_the_cnn_mnist_classifier} - -Let's build a model to classify the images in the MNIST dataset using the -following CNN architecture: - -1. **Convolutional Layer #1**: Applies 32 5x5 filters (extracting 5x5-pixel - subregions), with ReLU activation function -2. **Pooling Layer #1**: Performs max pooling with a 2x2 filter and stride of 2 - (which specifies that pooled regions do not overlap) -3. **Convolutional Layer #2**: Applies 64 5x5 filters, with ReLU activation - function -4. **Pooling Layer #2**: Again, performs max pooling with a 2x2 filter and - stride of 2 -5. **Dense Layer #1**: 1,024 neurons, with dropout regularization rate of 0.4 - (probability of 0.4 that any given element will be dropped during training) -6. **Dense Layer #2 (Logits Layer)**: 10 neurons, one for each digit target - class (0–9). - -The `tf.layers` module contains methods to create each of the three layer types -above: - -* `conv2d()`. Constructs a two-dimensional convolutional layer. Takes number - of filters, filter kernel size, padding, and activation function as - arguments. -* `max_pooling2d()`. Constructs a two-dimensional pooling layer using the - max-pooling algorithm. Takes pooling filter size and stride as arguments. -* `dense()`. Constructs a dense layer. Takes number of neurons and activation - function as arguments. - -Each of these methods accepts a tensor as input and returns a transformed tensor -as output. This makes it easy to connect one layer to another: just take the -output from one layer-creation method and supply it as input to another. - -Open `cnn_mnist.py` and add the following `cnn_model_fn` function, which -conforms to the interface expected by TensorFlow's Estimator API (more on this -later in [Create the Estimator](#create-the-estimator)). 
`cnn_model_fn` takes -MNIST feature data, labels, and mode (from -`tf.estimator.ModeKeys`: `TRAIN`, `EVAL`, `PREDICT`) as arguments; -configures the CNN; and returns predictions, loss, and a training operation: - -```python -def cnn_model_fn(features, labels, mode): - """Model function for CNN.""" - # Input Layer - input_layer = tf.reshape(features["x"], [-1, 28, 28, 1]) - - # Convolutional Layer #1 - conv1 = tf.layers.conv2d( - inputs=input_layer, - filters=32, - kernel_size=[5, 5], - padding="same", - activation=tf.nn.relu) - - # Pooling Layer #1 - pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2) - - # Convolutional Layer #2 and Pooling Layer #2 - conv2 = tf.layers.conv2d( - inputs=pool1, - filters=64, - kernel_size=[5, 5], - padding="same", - activation=tf.nn.relu) - pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2) - - # Dense Layer - pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64]) - dense = tf.layers.dense(inputs=pool2_flat, units=1024, activation=tf.nn.relu) - dropout = tf.layers.dropout( - inputs=dense, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN) - - # Logits Layer - logits = tf.layers.dense(inputs=dropout, units=10) - - predictions = { - # Generate predictions (for PREDICT and EVAL mode) - "classes": tf.argmax(input=logits, axis=1), - # Add `softmax_tensor` to the graph. It is used for PREDICT and by the - # `logging_hook`.
- "probabilities": tf.nn.softmax(logits, name="softmax_tensor") - } - - if mode == tf.estimator.ModeKeys.PREDICT: - return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions) - - # Calculate Loss (for both TRAIN and EVAL modes) - loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits) - - # Configure the Training Op (for TRAIN mode) - if mode == tf.estimator.ModeKeys.TRAIN: - optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001) - train_op = optimizer.minimize( - loss=loss, - global_step=tf.train.get_global_step()) - return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op) - - # Add evaluation metrics (for EVAL mode) - eval_metric_ops = { - "accuracy": tf.metrics.accuracy( - labels=labels, predictions=predictions["classes"])} - return tf.estimator.EstimatorSpec( - mode=mode, loss=loss, eval_metric_ops=eval_metric_ops) -``` - -The following sections (with headings corresponding to each code block above) -dive deeper into the `tf.layers` code used to create each layer, as well as how -to calculate loss, configure the training op, and generate predictions. If -you're already experienced with CNNs and [TensorFlow `Estimator`s](../../guide/custom_estimators.md), -and find the above code intuitive, you may want to skim these sections or just -skip ahead to ["Training and Evaluating the CNN MNIST Classifier"](#train_eval_mnist). - -### Input Layer - -The methods in the `layers` module for creating convolutional and pooling layers -for two-dimensional image data expect input tensors to have a shape of -[batch_size, image_height, image_width, -channels] by default. This behavior can be changed using the data_format parameter; defined as follows: - - -* _`batch_size`_. Size of the subset of examples to use when performing - gradient descent during training. -* _`image_height`_. Height of the example images. -* _`image_width`_. Width of the example images. -* _`channels`_. 
Number of color channels in the example images. For color - images, the number of channels is 3 (red, green, blue). For monochrome - images, there is just 1 channel (black). -* _`data_format`_. A string, one of `channels_last` (default) or `channels_first`. - `channels_last` corresponds to inputs with shape - `(batch, ..., channels)` while `channels_first` corresponds to - inputs with shape `(batch, channels, ...)`. - -Here, our MNIST dataset is composed of monochrome 28x28 pixel images, so the -desired shape for our input layer is [batch_size, 28, 28, -1]. - -To convert our input feature map (`features`) to this shape, we can perform the -following `reshape` operation: - -```python -input_layer = tf.reshape(features["x"], [-1, 28, 28, 1]) -``` - -Note that we've indicated `-1` for batch size, which specifies that this -dimension should be dynamically computed based on the number of input values in -`features["x"]`, holding the size of all other dimensions constant. This allows -us to treat `batch_size` as a hyperparameter that we can tune. For example, if -we feed examples into our model in batches of 5, `features["x"]` will contain -3,920 values (one value for each pixel in each image), and `input_layer` will -have a shape of `[5, 28, 28, 1]`. Similarly, if we feed examples in batches of -100, `features["x"]` will contain 78,400 values, and `input_layer` will have a -shape of `[100, 28, 28, 1]`. - -### Convolutional Layer #1 - -In our first convolutional layer, we want to apply 32 5x5 filters to the input -layer, with a ReLU activation function. We can use the `conv2d()` method in the -`layers` module to create this layer as follows: - -```python -conv1 = tf.layers.conv2d( - inputs=input_layer, - filters=32, - kernel_size=[5, 5], - padding="same", - activation=tf.nn.relu) -``` - -The `inputs` argument specifies our input tensor, which must have the shape -[batch_size, image_height, image_width, -channels]. 
Here, we're connecting our first convolutional layer -to `input_layer`, which has the shape [batch_size, 28, 28, -1]. - -> Note: conv2d() will instead accept a shape of -> [batch_size, channels, image_height, image_width] when passed the argument -> data_format=channels_first. - -The `filters` argument specifies the number of filters to apply (here, 32), and -`kernel_size` specifies the dimensions of the filters as [height, -width] (here, [5, 5]). - -

TIP: If filter height and width have the same value, you can instead specify a -single integer for `kernel_size`—e.g., `kernel_size=5`.

- -The `padding` argument specifies one of two enumerated values -(case-insensitive): `valid` (default value) or `same`. To specify that the -output tensor should have the same height and width values as the input tensor, -we set `padding=same` here, which instructs TensorFlow to add 0 values to the -edges of the input tensor to preserve height and width of 28. (Without padding, -a 5x5 convolution over a 28x28 tensor will produce a 24x24 tensor, as there are -24x24 locations to extract a 5x5 tile from a 28x28 grid.) - -The `activation` argument specifies the activation function to apply to the -output of the convolution. Here, we specify ReLU activation with -`tf.nn.relu`. - -Our output tensor produced by `conv2d()` has a shape of -[batch_size, 28, 28, 32]: the same height and width -dimensions as the input, but now with 32 channels holding the output from each -of the filters. - -### Pooling Layer #1 - -Next, we connect our first pooling layer to the convolutional layer we just -created. We can use the `max_pooling2d()` method in `layers` to construct a -layer that performs max pooling with a 2x2 filter and stride of 2: - -```python -pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2) -``` - -Again, `inputs` specifies the input tensor, with a shape of -[batch_size, image_height, image_width, -channels]. Here, our input tensor is `conv1`, the output from -the first convolutional layer, which has a shape of [batch_size, -28, 28, 32]. - -> Note: As with conv2d(), max_pooling2d() will instead -> accept a shape of [batch_size, channels, -> image_height, image_width] when passed the argument -> data_format=channels_first. - -The `pool_size` argument specifies the size of the max pooling filter as -[height, width] (here, `[2, 2]`). If both -dimensions have the same value, you can instead specify a single integer (e.g., -`pool_size=2`). - -The `strides` argument specifies the size of the stride. 
Here, we set a stride -of 2, which indicates that the subregions extracted by the filter should be -separated by 2 pixels in both the height and width dimensions (for a 2x2 filter, -this means that none of the regions extracted will overlap). If you want to set -different stride values for height and width, you can instead specify a tuple or -list (e.g., `strides=[3, 6]`). - -Our output tensor produced by `max_pooling2d()` (`pool1`) has a shape of -[batch_size, 14, 14, 32]: the 2x2 filter reduces height and width by 50% each. - -### Convolutional Layer #2 and Pooling Layer #2 - -We can connect a second convolutional and pooling layer to our CNN using -`conv2d()` and `max_pooling2d()` as before. For convolutional layer #2, we -configure 64 5x5 filters with ReLU activation, and for pooling layer #2, we use -the same specs as pooling layer #1 (a 2x2 max pooling filter with stride of 2): - -```python -conv2 = tf.layers.conv2d( - inputs=pool1, - filters=64, - kernel_size=[5, 5], - padding="same", - activation=tf.nn.relu) - -pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2) -``` - -Note that convolutional layer #2 takes the output tensor of our first pooling -layer (`pool1`) as input, and produces the tensor `conv2` as output. `conv2` -has a shape of [batch_size, 14, 14, 64], the same height and width as `pool1` (due to `padding="same"`), and 64 channels for the 64 -filters applied. - -Pooling layer #2 takes `conv2` as input, producing `pool2` as output. `pool2` -has shape [batch_size, 7, 7, 64] (50% reduction of height and width from `conv2`). - -### Dense Layer - -Next, we want to add a dense layer (with 1,024 neurons and ReLU activation) to -our CNN to perform classification on the features extracted by the -convolution/pooling layers.
Before we connect the layer, however, we'll flatten -our feature map (`pool2`) to shape [batch_size, -features], so that our tensor has only two dimensions: - -```python -pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64]) -``` - -In the `reshape()` operation above, the `-1` signifies that the *`batch_size`* -dimension will be dynamically calculated based on the number of examples in our -input data. Each example has 7 (`pool2` height) * 7 (`pool2` width) * 64 -(`pool2` channels) features, so we want the `features` dimension to have a value -of 7 * 7 * 64 (3136 in total). The output tensor, `pool2_flat`, has shape -[batch_size, 3136]. - -Now, we can use the `dense()` method in `layers` to connect our dense layer as -follows: - -```python -dense = tf.layers.dense(inputs=pool2_flat, units=1024, activation=tf.nn.relu) -``` - -The `inputs` argument specifies the input tensor: our flattened feature map, -`pool2_flat`. The `units` argument specifies the number of neurons in the dense -layer (1,024). The `activation` argument takes the activation function; again, -we'll use `tf.nn.relu` to add ReLU activation. - -To help improve the results of our model, we also apply dropout regularization -to our dense layer, using the `dropout` method in `layers`: - -```python -dropout = tf.layers.dropout( - inputs=dense, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN) -``` - -Again, `inputs` specifies the input tensor, which is the output tensor from our -dense layer (`dense`). - -The `rate` argument specifies the dropout rate; here, we use `0.4`, which means -40% of the elements will be randomly dropped out during training. - -The `training` argument takes a boolean specifying whether or not the model is -currently being run in training mode; dropout will only be performed if -`training` is `True`. Here, we check if the `mode` passed to our model function -`cnn_model_fn` is `TRAIN` mode. - -Our output tensor `dropout` has shape [batch_size, 1024]. 
- -### Logits Layer - -The final layer in our neural network is the logits layer, which will return the -raw values for our predictions. We create a dense layer with 10 neurons (one for -each target class 0–9), with linear activation (the default): - -```python -logits = tf.layers.dense(inputs=dropout, units=10) -``` - -Our final output tensor of the CNN, `logits`, has shape -[batch_size, 10]. - -### Generate Predictions {#generate_predictions} - -The logits layer of our model returns our predictions as raw values in a -[batch_size, 10]-dimensional tensor. Let's convert these -raw values into two different formats that our model function can return: - -* The **predicted class** for each example: a digit from 0–9. -* The **probabilities** for each possible target class for each example: the - probability that the example is a 0, is a 1, is a 2, etc. - -For a given example, our predicted class is the element in the corresponding row -of the logits tensor with the highest raw value. We can find the index of this -element using the `tf.argmax` -function: - -```python -tf.argmax(input=logits, axis=1) -``` - -The `input` argument specifies the tensor from which to extract maximum -values—here `logits`. The `axis` argument specifies the axis of the `input` -tensor along which to find the greatest value. Here, we want to find the largest -value along the dimension with index of 1, which corresponds to our predictions -(recall that our logits tensor has shape [batch_size, -10]). - -We can derive probabilities from our logits layer by applying softmax activation -using `tf.nn.softmax`: - -```python -tf.nn.softmax(logits, name="softmax_tensor") -``` - -> Note: We use the `name` argument to explicitly name this operation -> `softmax_tensor`, so we can reference it later. (We'll set up logging for the -> softmax values in ["Set Up a Logging Hook"](#set-up-a-logging-hook)). 
- -We compile our predictions in a dict, and return an `EstimatorSpec` object: - -```python -predictions = { - "classes": tf.argmax(input=logits, axis=1), - "probabilities": tf.nn.softmax(logits, name="softmax_tensor") -} -if mode == tf.estimator.ModeKeys.PREDICT: - return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions) -``` - -### Calculate Loss {#calculating-loss} - -For both training and evaluation, we need to define a -[loss function](https://en.wikipedia.org/wiki/Loss_function) -that measures how closely the model's predictions match the target classes. For -multiclass classification problems like MNIST, -[cross entropy](https://en.wikipedia.org/wiki/Cross_entropy) is typically used -as the loss metric. The following code calculates cross entropy when the model -runs in either `TRAIN` or `EVAL` mode: - -```python -loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits) -``` - -Let's take a closer look at what's happening above. - -Our `labels` tensor contains a list of prediction indices for our examples, e.g. `[1, -9, ...]`. `logits` contains the linear outputs of our last layer. - -`tf.losses.sparse_softmax_cross_entropy` calculates the softmax cross entropy -(aka: categorical cross entropy, negative log-likelihood) from these two inputs -in an efficient, numerically stable way. - - -### Configure the Training Op - -In the previous section, we defined loss for our CNN as the softmax -cross-entropy of the logits layer and our labels. Let's configure our model to -optimize this loss value during training.
We'll use a learning rate of 0.001 and -[stochastic gradient descent](https://en.wikipedia.org/wiki/Stochastic_gradient_descent) -as the optimization algorithm: - -```python -if mode == tf.estimator.ModeKeys.TRAIN: - optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001) - train_op = optimizer.minimize( - loss=loss, - global_step=tf.train.get_global_step()) - return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op) -``` - -> Note: For a more in-depth look at configuring training ops for Estimator model -> functions, see ["Defining the training op for the model"](../../guide/custom_estimators.md#defining-the-training-op-for-the-model) -> in the ["Creating Estimators in tf.estimator"](../../guide/custom_estimators.md) tutorial. - - -### Add evaluation metrics - -To add an accuracy metric to our model, we define the `eval_metric_ops` dict in EVAL -mode as follows: - -```python -eval_metric_ops = { - "accuracy": tf.metrics.accuracy( - labels=labels, predictions=predictions["classes"])} -return tf.estimator.EstimatorSpec( - mode=mode, loss=loss, eval_metric_ops=eval_metric_ops) -``` - - -## Training and Evaluating the CNN MNIST Classifier - -We've coded our MNIST CNN model function; now we're ready to train and evaluate -it. - -### Load Training and Test Data - -First, let's load our training and test data.
Add a `main()` function to -`cnn_mnist.py` with the following code: - -```python -def main(unused_argv): - # Load training and eval data - mnist = tf.contrib.learn.datasets.load_dataset("mnist") - train_data = mnist.train.images # Returns np.array - train_labels = np.asarray(mnist.train.labels, dtype=np.int32) - eval_data = mnist.test.images # Returns np.array - eval_labels = np.asarray(mnist.test.labels, dtype=np.int32) -``` - -We store the training feature data (the raw pixel values for 55,000 images of -hand-drawn digits) and training labels (the corresponding value from 0–9 for -each image) as [numpy -arrays](https://docs.scipy.org/doc/numpy/reference/generated/numpy.array.html) -in `train_data` and `train_labels`, respectively. Similarly, we store the -evaluation feature data (10,000 images) and evaluation labels in `eval_data` -and `eval_labels`, respectively. - -### Create the Estimator {#create-the-estimator} - -Next, let's create an `Estimator` (a TensorFlow class for performing high-level -model training, evaluation, and inference) for our model. Add the following code -to `main()`: - -```python -# Create the Estimator -mnist_classifier = tf.estimator.Estimator( - model_fn=cnn_model_fn, model_dir="/tmp/mnist_convnet_model") -``` - -The `model_fn` argument specifies the model function to use for training, -evaluation, and prediction; we pass it the `cnn_model_fn` we created in -["Building the CNN MNIST Classifier."](#building-the-cnn-mnist-classifier) The -`model_dir` argument specifies the directory where model data (checkpoints) will -be saved (here, we specify the temp directory `/tmp/mnist_convnet_model`, but -feel free to change to another directory of your choice). 
- -> Note: For an in-depth walkthrough of the TensorFlow `Estimator` API, see the -> tutorial ["Creating Estimators in tf.estimator."](../../guide/custom_estimators.md) - -### Set Up a Logging Hook {#set_up_a_logging_hook} - -Since CNNs can take a while to train, let's set up some logging so we can track -progress during training. We can use TensorFlow's `tf.train.SessionRunHook` to create a -`tf.train.LoggingTensorHook` -that will log the probability values from the softmax layer of our CNN. Add the -following to `main()`: - -```python -# Set up logging for predictions -tensors_to_log = {"probabilities": "softmax_tensor"} -logging_hook = tf.train.LoggingTensorHook( - tensors=tensors_to_log, every_n_iter=50) -``` - -We store a dict of the tensors we want to log in `tensors_to_log`. Each key is a -label of our choice that will be printed in the log output, and the -corresponding value is the name of a `Tensor` in the TensorFlow graph. Here, our -`probabilities` can be found in `softmax_tensor`, the name we gave our softmax -operation earlier when we generated the probabilities in `cnn_model_fn`. - -> Note: If you don't explicitly assign a name to an operation via the `name` -> argument, TensorFlow will assign a default name. A couple of easy ways to -> discover the names applied to operations are to visualize your graph on -> [TensorBoard](../../guide/graph_viz.md) or to enable the -> [TensorFlow Debugger (tfdbg)](../../guide/debugger.md). - -Next, we create the `LoggingTensorHook`, passing `tensors_to_log` to the -`tensors` argument. We set `every_n_iter=50`, which specifies that probabilities -should be logged after every 50 steps of training. - -### Train the Model - -Now we're ready to train our model, which we can do by creating `train_input_fn` -and calling `train()` on `mnist_classifier`.
Add the following to `main()`: - -```python -# Train the model -train_input_fn = tf.estimator.inputs.numpy_input_fn( - x={"x": train_data}, - y=train_labels, - batch_size=100, - num_epochs=None, - shuffle=True) -mnist_classifier.train( - input_fn=train_input_fn, - steps=20000, - hooks=[logging_hook]) -``` - -In the `numpy_input_fn` call, we pass the training feature data and labels to -`x` (as a dict) and `y`, respectively. We set a `batch_size` of `100` (which -means that the model will train on minibatches of 100 examples at each step). -`num_epochs=None` means that the model will train until the specified number of -steps is reached. We also set `shuffle=True` to shuffle the training data. -In the `train` call, we set `steps=20000` -(which means the model will train for 20,000 steps total). We pass our -`logging_hook` to the `hooks` argument, so that it will be triggered during -training. - -### Evaluate the Model - -Once training is complete, we want to evaluate our model to determine its -accuracy on the MNIST test set. We call the `evaluate` method, which evaluates -the metrics we specified in `eval_metric_ops` argument in the `model_fn`. -Add the following to `main()`: - -```python -# Evaluate the model and print results -eval_input_fn = tf.estimator.inputs.numpy_input_fn( - x={"x": eval_data}, - y=eval_labels, - num_epochs=1, - shuffle=False) -eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn) -print(eval_results) -``` - -To create `eval_input_fn`, we set `num_epochs=1`, so that the model evaluates -the metrics over one epoch of data and returns the result. We also set -`shuffle=False` to iterate through the data sequentially. - -### Run the Model - -We've coded the CNN model function, `Estimator`, and the training/evaluation -logic; now let's see the results. Run `cnn_mnist.py`. - -> Note: Training CNNs is quite computationally intensive. 
Estimated completion -> time of `cnn_mnist.py` will vary depending on your processor, but will likely -> be upwards of 1 hour on CPU. To train more quickly, you can decrease the -> number of `steps` passed to `train()`, but note that this will affect accuracy. - -As the model trains, you'll see log output like the following: - -```python -INFO:tensorflow:loss = 2.36026, step = 1 -INFO:tensorflow:probabilities = [[ 0.07722801 0.08618255 0.09256398, ...]] -... -INFO:tensorflow:loss = 2.13119, step = 101 -INFO:tensorflow:global_step/sec: 5.44132 -... -INFO:tensorflow:Loss for final step: 0.553216. - -INFO:tensorflow:Restored model from /tmp/mnist_convnet_model -INFO:tensorflow:Eval steps [0,inf) for training step 20000. -INFO:tensorflow:Input iterator is exhausted. -INFO:tensorflow:Saving evaluation summary for step 20000: accuracy = 0.9733, loss = 0.0902271 -{'loss': 0.090227105, 'global_step': 20000, 'accuracy': 0.97329998} -``` - -Here, we've achieved an accuracy of 97.3% on our test data set. - -## Additional Resources - -To learn more about TensorFlow Estimators and CNNs in TensorFlow, see the -following resources: - -* [Creating Estimators in tf.estimator](../../guide/custom_estimators.md) - provides an introduction to the TensorFlow Estimator API. It walks through - configuring an Estimator, writing a model function, calculating loss, and - defining a training op. -* [Advanced Convolutional Neural Networks](../../tutorials/images/deep_cnn.md) walks through how to build a MNIST CNN classification model - *without estimators* using lower-level TensorFlow operations. 
diff --git a/tensorflow/docs_src/tutorials/estimators/linear.md b/tensorflow/docs_src/tutorials/estimators/linear.md deleted file mode 100644 index 067a33ac03..0000000000 --- a/tensorflow/docs_src/tutorials/estimators/linear.md +++ /dev/null @@ -1,3 +0,0 @@ -# Build a linear model with Estimators - -[Colab notebook](https://colab.research.google.com/github/tensorflow/models/blob/master/samples/core/tutorials/estimators/linear.ipynb) diff --git a/tensorflow/docs_src/tutorials/images/deep_cnn.md b/tensorflow/docs_src/tutorials/images/deep_cnn.md deleted file mode 100644 index 00996b82e6..0000000000 --- a/tensorflow/docs_src/tutorials/images/deep_cnn.md +++ /dev/null @@ -1,446 +0,0 @@ -# Advanced Convolutional Neural Networks - -## Overview - -CIFAR-10 classification is a common benchmark problem in machine learning. The -problem is to classify RGB 32x32 pixel images across 10 categories: -``` -airplane, automobile, bird, cat, deer, dog, frog, horse, ship, and truck. -``` - -For more details refer to the [CIFAR-10 page](https://www.cs.toronto.edu/~kriz/cifar.html) -and a [Tech Report](https://www.cs.toronto.edu/~kriz/learning-features-2009-TR.pdf) -by Alex Krizhevsky. - -### Goals - -The goal of this tutorial is to build a relatively small [convolutional neural -network](https://en.wikipedia.org/wiki/Convolutional_neural_network) (CNN) for -recognizing images. In the process, this tutorial: - -1. Highlights a canonical organization for network architecture, -training and evaluation. -2. Provides a template for constructing larger and more sophisticated models. - -The reason CIFAR-10 was selected was that it is complex enough to exercise -much of TensorFlow's ability to scale to large models. At the same time, -the model is small enough to train fast, which is ideal for trying out -new ideas and experimenting with new techniques. 
- -### Highlights of the Tutorial -The CIFAR-10 tutorial demonstrates several important constructs for -designing larger and more sophisticated models in TensorFlow: - -* Core mathematical components including `tf.nn.conv2d` -([wiki](https://en.wikipedia.org/wiki/Convolution)), -`tf.nn.relu` -([wiki](https://en.wikipedia.org/wiki/Rectifier_(neural_networks))), -`tf.nn.max_pool` -([wiki](https://en.wikipedia.org/wiki/Convolutional_neural_network#Pooling_layer)) -and `tf.nn.local_response_normalization` -(Chapter 3.3 in -[AlexNet paper](https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf)). -* [Visualization](../../guide/summaries_and_tensorboard.md) -of network activities during training, including input images, -losses and distributions of activations and gradients. -* Routines for calculating the -`tf.train.ExponentialMovingAverage` -of learned parameters and using these averages -during evaluation to boost predictive performance. -* Implementation of a -`tf.train.exponential_decay` -that systematically decrements over time. -* Prefetching `tf.train.shuffle_batch` -for input -data to isolate the model from disk latency and expensive image pre-processing. - -We also provide a [multi-GPU version](#training-a-model-using-multiple-gpu-cards) -of the model which demonstrates: - -* Configuring a model to train across multiple GPU cards in parallel. -* Sharing and updating variables among multiple GPUs. - -We hope that this tutorial provides a launch point for building larger CNNs for -vision tasks on TensorFlow. - -### Model Architecture - -The model in this CIFAR-10 tutorial is a multi-layer architecture consisting of -alternating convolutions and nonlinearities. These layers are followed by fully -connected layers leading into a softmax classifier. The model follows the -architecture described by -[Alex Krizhevsky](https://code.google.com/p/cuda-convnet/), with a few -differences in the top few layers. 
- -This model achieves a peak performance of about 86% accuracy within a few hours -of training time on a GPU. Please see [below](#evaluating-a-model) and the code -for details. It consists of 1,068,298 learnable parameters and requires about -19.5M multiply-add operations to compute inference on a single image. - -## Code Organization - -The code for this tutorial resides in -[`models/tutorials/image/cifar10/`](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/). - -File | Purpose ---- | --- -[`cifar10_input.py`](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/cifar10_input.py) | Reads the native CIFAR-10 binary file format. -[`cifar10.py`](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/cifar10.py) | Builds the CIFAR-10 model. -[`cifar10_train.py`](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/cifar10_train.py) | Trains a CIFAR-10 model on a CPU or GPU. -[`cifar10_multi_gpu_train.py`](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/cifar10_multi_gpu_train.py) | Trains a CIFAR-10 model on multiple GPUs. -[`cifar10_eval.py`](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/cifar10_eval.py) | Evaluates the predictive performance of a CIFAR-10 model. - - -## CIFAR-10 Model - -The CIFAR-10 network is largely contained in -[`cifar10.py`](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/cifar10.py). -The complete training -graph contains roughly 765 operations. We find that we can make the code most -reusable by constructing the graph with the following modules: - -1. [**Model inputs:**](#model-inputs) `inputs()` and `distorted_inputs()` add -operations that read and preprocess CIFAR images for evaluation and training, -respectively. -1. [**Model prediction:**](#model-prediction) `inference()` -adds operations that perform inference, i.e. classification, on supplied images. -1. 
[**Model training:**](#model-training) `loss()` and `train()` -add operations that compute the loss, -gradients, variable updates and visualization summaries. - -### Model Inputs - -The input part of the model is built by the functions `inputs()` and -`distorted_inputs()` which read images from the CIFAR-10 binary data files. -These files contain fixed byte length records, so we use -`tf.FixedLengthRecordReader`. -See [Reading Data](../../api_guides/python/reading_data.md#reading-from-files) to -learn more about how the `Reader` class works. - -The images are processed as follows: - -* They are cropped to 24 x 24 pixels, centrally for evaluation or - `tf.random_crop` for training. -* They are `tf.image.per_image_standardization` - to make the model insensitive to dynamic range. - -For training, we additionally apply a series of random distortions to -artificially increase the data set size: - -* `tf.image.random_flip_left_right` the image from left to right. -* Randomly distort the `tf.image.random_brightness`. -* Randomly distort the `tf.image.random_contrast`. - -Please see the [Images](../../api_guides/python/image.md) page for the list of -available distortions. We also attach an -`tf.summary.image` to the images -so that we may visualize them in [TensorBoard](../../guide/summaries_and_tensorboard.md). -This is a good practice to verify that inputs are built correctly. - -
- -
- -Reading images from disk and distorting them can use a non-trivial amount of -processing time. To prevent these operations from slowing down training, we run -them inside 16 separate threads which continuously fill a TensorFlow -`tf.train.shuffle_batch`. - -### Model Prediction - -The prediction part of the model is constructed by the `inference()` function -which adds operations to compute the *logits* of the predictions. That part of -the model is organized as follows: - -Layer Name | Description ---- | --- -`conv1` | `tf.nn.conv2d` and `tf.nn.relu` activation. -`pool1` | `tf.nn.max_pool`. -`norm1` | `tf.nn.local_response_normalization`. -`conv2` | `tf.nn.conv2d` and `tf.nn.relu` activation. -`norm2` | `tf.nn.local_response_normalization`. -`pool2` | `tf.nn.max_pool`. -`local3` | [fully connected layer with rectified linear activation](../../api_guides/python/nn.md). -`local4` | [fully connected layer with rectified linear activation](../../api_guides/python/nn.md). -`softmax_linear` | linear transformation to produce logits. - -Here is a graph generated from TensorBoard describing the inference operation: - -
- -
- -> **EXERCISE**: The output of `inference` are un-normalized logits. Try editing -the network architecture to return normalized predictions using -`tf.nn.softmax`. - -The `inputs()` and `inference()` functions provide all the components -necessary to perform an evaluation of a model. We now shift our focus towards -building operations for training a model. - -> **EXERCISE:** The model architecture in `inference()` differs slightly from -the CIFAR-10 model specified in -[cuda-convnet](https://code.google.com/p/cuda-convnet/). In particular, the top -layers of Alex's original model are locally connected and not fully connected. -Try editing the architecture to exactly reproduce the locally connected -architecture in the top layer. - -### Model Training - -The usual method for training a network to perform N-way classification is -[multinomial logistic regression](https://en.wikipedia.org/wiki/Multinomial_logistic_regression), -aka. *softmax regression*. Softmax regression applies a -`tf.nn.softmax` nonlinearity to the -output of the network and calculates the -`tf.nn.sparse_softmax_cross_entropy_with_logits` -between the normalized predictions and the label index. -For regularization, we also apply the usual -`tf.nn.l2_loss` losses to all learned -variables. The objective function for the model is the sum of the cross entropy -loss and all these weight decay terms, as returned by the `loss()` function. - -We visualize it in TensorBoard with a `tf.summary.scalar`: - -![CIFAR-10 Loss](https://www.tensorflow.org/images/cifar_loss.png "CIFAR-10 Total Loss") - -We train the model using standard -[gradient descent](https://en.wikipedia.org/wiki/Gradient_descent) -algorithm (see [Training](../../api_guides/python/train.md) for other methods) -with a learning rate that -`tf.train.exponential_decay` -over time. 
- -![CIFAR-10 Learning Rate Decay](https://www.tensorflow.org/images/cifar_lr_decay.png "CIFAR-10 Learning Rate Decay") - -The `train()` function adds the operations needed to minimize the objective by -calculating the gradient and updating the learned variables (see -`tf.train.GradientDescentOptimizer` -for details). It returns an operation that executes all the calculations -needed to train and update the model for one batch of images. - -## Launching and Training the Model - -We have built the model, let's now launch it and run the training operation with -the script `cifar10_train.py`. - -```shell -python cifar10_train.py -``` - -> **NOTE:** The first time you run any target in the CIFAR-10 tutorial, -the CIFAR-10 dataset is automatically downloaded. The data set is ~160MB -so you may want to grab a quick cup of coffee for your first run. - -You should see the output: - -```shell -Filling queue with 20000 CIFAR images before starting to train. This will take a few minutes. -2015-11-04 11:45:45.927302: step 0, loss = 4.68 (2.0 examples/sec; 64.221 sec/batch) -2015-11-04 11:45:49.133065: step 10, loss = 4.66 (533.8 examples/sec; 0.240 sec/batch) -2015-11-04 11:45:51.397710: step 20, loss = 4.64 (597.4 examples/sec; 0.214 sec/batch) -2015-11-04 11:45:54.446850: step 30, loss = 4.62 (391.0 examples/sec; 0.327 sec/batch) -2015-11-04 11:45:57.152676: step 40, loss = 4.61 (430.2 examples/sec; 0.298 sec/batch) -2015-11-04 11:46:00.437717: step 50, loss = 4.59 (406.4 examples/sec; 0.315 sec/batch) -... -``` - -The script reports the total loss every 10 steps as well as the speed at which -the last batch of data was processed. A few comments: - -* The first batch of data can be inordinately slow (e.g. several minutes) as the -preprocessing threads fill up the shuffling queue with 20,000 processed CIFAR -images. - -* The reported loss is the average loss of the most recent batch. Remember that -this loss is the sum of the cross entropy and all weight decay terms. 
- -* Keep an eye on the processing speed of a batch. The numbers shown above were -obtained on a Tesla K40c. If you are running on a CPU, expect slower performance. - - -> **EXERCISE:** When experimenting, it is sometimes annoying that the first -training step can take so long. Try decreasing the number of images that -initially fill up the queue. Search for `min_fraction_of_examples_in_queue` -in `cifar10_input.py`. - -`cifar10_train.py` periodically uses a `tf.train.Saver` to save -all model parameters in -[checkpoint files](../../guide/saved_model.md) -but it does *not* evaluate the model. The checkpoint file -will be used by `cifar10_eval.py` to measure the predictive -performance (see [Evaluating a Model](#evaluating-a-model) below). - - -If you followed the previous steps, then you have now started training -a CIFAR-10 model. [Congratulations!](https://www.youtube.com/watch?v=9bZkp7q19f0) - -The terminal text returned from `cifar10_train.py` provides minimal insight into -how the model is training. We want more insight into the model during training: - -* Is the loss *really* decreasing or is that just noise? -* Is the model being provided appropriate images? -* Are the gradients, activations and weights reasonable? -* What is the learning rate currently at? - -[TensorBoard](../../guide/summaries_and_tensorboard.md) provides this -functionality, displaying data exported periodically from `cifar10_train.py` via -a -`tf.summary.FileWriter`. - -For instance, we can watch how the distribution of activations and degree of -sparsity in `local3` features evolve during training: - -
- - -
- -Individual loss functions, as well as the total loss, are particularly -interesting to track over time. However, the loss exhibits a considerable amount -of noise due to the small batch size employed by training. In practice we find -it extremely useful to visualize their moving averages in addition to their raw -values. See how the scripts use -`tf.train.ExponentialMovingAverage` -for this purpose. - -## Evaluating a Model - -Let us now evaluate how well the trained model performs on a hold-out data set. -The model is evaluated by the script `cifar10_eval.py`. It constructs the model -with the `inference()` function and uses all 10,000 images in the evaluation set -of CIFAR-10. It calculates the *precision at 1:* how often the top prediction -matches the true label of the image. - -To monitor how the model improves during training, the evaluation script runs -periodically on the latest checkpoint files created by the `cifar10_train.py`. - -```shell -python cifar10_eval.py -``` - -> Be careful not to run the evaluation and training binary on the same GPU or -else you might run out of memory. Consider running the evaluation on -a separate GPU if available or suspending the training binary while running -the evaluation on the same GPU. - -You should see the output: - -```shell -2015-11-06 08:30:44.391206: precision @ 1 = 0.860 -... -``` - -The script merely returns the precision @ 1 periodically -- in this case -it returned 86% accuracy. `cifar10_eval.py` also -exports summaries that may be visualized in TensorBoard. These summaries -provide additional insight into the model during evaluation. - -The training script calculates the -`tf.train.ExponentialMovingAverage` of all learned variables. -The evaluation script substitutes -all learned model parameters with the moving average version. This -substitution boosts model performance at evaluation time. 
- -> **EXERCISE:** Employing averaged parameters may boost predictive performance -by about 3% as measured by precision @ 1. Edit `cifar10_eval.py` to not employ -the averaged parameters for the model and verify that the predictive performance -drops. - - -## Training a Model Using Multiple GPU Cards - -Modern workstations may contain multiple GPUs for scientific computation. -TensorFlow can leverage this environment to run the training operation -concurrently across multiple cards. - -Training a model in a parallel, distributed fashion requires -coordinating training processes. For what follows we term *model replica* -to be one copy of a model training on a subset of data. - -Naively employing asynchronous updates of model parameters -leads to sub-optimal training performance -because an individual model replica might be trained on a stale -copy of the model parameters. Conversely, employing fully synchronous -updates will be as slow as the slowest model replica. - -In a workstation with multiple GPU cards, each GPU will have similar speed -and contain enough memory to run an entire CIFAR-10 model. Thus, we opt to -design our training system in the following manner: - -* Place an individual model replica on each GPU. -* Update model parameters synchronously by waiting for all GPUs to finish -processing a batch of data. - -Here is a diagram of this model: - -
- -
- -Note that each GPU computes inference as well as the gradients for a unique -batch of data. This setup effectively permits dividing up a larger batch -of data across the GPUs. - -This setup requires that all GPUs share the model parameters. A well-known -fact is that transferring data to and from GPUs is quite slow. For this -reason, we decide to store and update all model parameters on the CPU (see -green box). A fresh set of model parameters is transferred to the GPU -when a new batch of data is processed by all GPUs. - -The GPUs are synchronized in operation. All gradients are accumulated from -the GPUs and averaged (see green box). The model parameters are updated with -the gradients averaged across all model replicas. - -### Placing Variables and Operations on Devices - -Placing operations and variables on devices requires some special -abstractions. - -The first abstraction we require is a function for computing inference and -gradients for a single model replica. In the code we term this abstraction -a "tower". We must set two attributes for each tower: - -* A unique name for all operations within a tower. -`tf.name_scope` provides -this unique name by prepending a scope. For instance, all operations in -the first tower are prepended with `tower_0`, e.g. `tower_0/conv1/Conv2D`. - -* A preferred hardware device to run the operation within a tower. -`tf.device` specifies this. For -instance, all operations in the first tower reside within `device('/device:GPU:0')` -scope indicating that they should be run on the first GPU. - -All variables are pinned to the CPU and accessed via -`tf.get_variable` -in order to share them in a multi-GPU version. -See how-to on [Sharing Variables](../../guide/variables.md). - -### Launching and Training the Model on Multiple GPU cards - -If you have several GPU cards installed on your machine you can use them to -train the model faster with the `cifar10_multi_gpu_train.py` script. 
This -version of the training script parallelizes the model across multiple GPU cards. - -```shell -python cifar10_multi_gpu_train.py --num_gpus=2 -``` - -Note that the number of GPU cards used defaults to 1. Additionally, if only 1 -GPU is available on your machine, all computations will be placed on it, even if -you ask for more. - -> **EXERCISE:** The default settings for `cifar10_train.py` is to -run on a batch size of 128. Try running `cifar10_multi_gpu_train.py` on 2 GPUs -with a batch size of 64 and compare the training speed. - -## Next Steps - -If you are now interested in developing and training your own image -classification system, we recommend forking this tutorial and replacing -components to address your image classification problem. - - -> **EXERCISE:** Download the -[Street View House Numbers (SVHN)](http://ufldl.stanford.edu/housenumbers/) data set. -Fork the CIFAR-10 tutorial and swap in the SVHN as the input data. Try adapting -the network architecture to improve predictive performance. diff --git a/tensorflow/docs_src/tutorials/images/image_recognition.md b/tensorflow/docs_src/tutorials/images/image_recognition.md deleted file mode 100644 index 52913b2082..0000000000 --- a/tensorflow/docs_src/tutorials/images/image_recognition.md +++ /dev/null @@ -1,455 +0,0 @@ -# Image Recognition - -Our brains make vision seem easy. It doesn't take any effort for humans to -tell apart a lion and a jaguar, read a sign, or recognize a human's face. -But these are actually hard problems to solve with a computer: they only -seem easy because our brains are incredibly good at understanding images. - -In the last few years, the field of machine learning has made tremendous -progress on addressing these difficult problems. 
In particular, we've -found that a kind of model called a deep -[convolutional neural network](https://colah.github.io/posts/2014-07-Conv-Nets-Modular/) -can achieve reasonable performance on hard visual recognition tasks -- -matching or exceeding human performance in some domains. - -Researchers have demonstrated steady progress -in computer vision by validating their work against -[ImageNet](http://www.image-net.org) -- an academic benchmark for computer vision. -Successive models continue to show improvements, each time achieving -a new state-of-the-art result: -[QuocNet], [AlexNet], [Inception (GoogLeNet)], [BN-Inception-v2]. -Researchers both internal and external to Google have published papers describing all -these models but the results are still hard to reproduce. -We're now taking the next step by releasing code for running image recognition -on our latest model, [Inception-v3]. - -[QuocNet]: https://static.googleusercontent.com/media/research.google.com/en//archive/unsupervised_icml2012.pdf -[AlexNet]: https://www.cs.toronto.edu/~fritz/absps/imagenet.pdf -[Inception (GoogLeNet)]: https://arxiv.org/abs/1409.4842 -[BN-Inception-v2]: https://arxiv.org/abs/1502.03167 -[Inception-v3]: https://arxiv.org/abs/1512.00567 - -Inception-v3 is trained for the [ImageNet] Large Visual Recognition Challenge -using the data from 2012. This is a standard task in computer vision, -where models try to classify entire -images into [1000 classes], like "Zebra", "Dalmatian", and "Dishwasher". -For example, here are the results from [AlexNet] classifying some images: - -
- -
- -To compare models, we examine how often the model fails to predict the -correct answer as one of their top 5 guesses -- termed "top-5 error rate". -[AlexNet] achieved by setting a top-5 error rate of 15.3% on the 2012 -validation data set; [Inception (GoogLeNet)] achieved 6.67%; -[BN-Inception-v2] achieved 4.9%; [Inception-v3] reaches 3.46%. - -> How well do humans do on ImageNet Challenge? There's a [blog post] by -Andrej Karpathy who attempted to measure his own performance. He reached -5.1% top-5 error rate. - -[ImageNet]: http://image-net.org/ -[1000 classes]: http://image-net.org/challenges/LSVRC/2014/browse-synsets -[blog post]: https://karpathy.github.io/2014/09/02/what-i-learned-from-competing-against-a-convnet-on-imagenet/ - -This tutorial will teach you how to use [Inception-v3]. You'll learn how to -classify images into [1000 classes] in Python or C++. We'll also discuss how to -extract higher level features from this model which may be reused for other -vision tasks. - -We're excited to see what the community will do with this model. - - -##Usage with Python API - -`classify_image.py` downloads the trained model from `tensorflow.org` -when the program is run for the first time. You'll need about 200M of free space -available on your hard disk. - -Start by cloning the [TensorFlow models repo](https://github.com/tensorflow/models) from GitHub. Run the following commands: - - cd models/tutorials/image/imagenet - python classify_image.py - -The above command will classify a supplied image of a panda bear. - -
- -
- -If the model runs correctly, the script will produce the following output: - - giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca (score = 0.88493) - indri, indris, Indri indri, Indri brevicaudatus (score = 0.00878) - lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens (score = 0.00317) - custard apple (score = 0.00149) - earthstar (score = 0.00127) - -If you wish to supply other JPEG images, you may do so by editing -the `--image_file` argument. - -> If you download the model data to a different directory, you -will need to point `--model_dir` to the directory used. - -## Usage with the C++ API - -You can run the same [Inception-v3] model in C++ for use in production -environments. You can download the archive containing the GraphDef that defines -the model like this (running from the root directory of the TensorFlow -repository): - -```bash -curl -L "https://storage.googleapis.com/download.tensorflow.org/models/inception_v3_2016_08_28_frozen.pb.tar.gz" | - tar -C tensorflow/examples/label_image/data -xz -``` - -Next, we need to compile the C++ binary that includes the code to load and run the graph. -If you've followed -[the instructions to download the source installation of TensorFlow](../../install/install_sources.md) -for your platform, you should be able to build the example by -running this command from your shell terminal: - -```bash -bazel build tensorflow/examples/label_image/... 
-``` - -That should create a binary executable that you can then run like this: - -```bash -bazel-bin/tensorflow/examples/label_image/label_image -``` - -This uses the default example image that ships with the framework, and should -output something similar to this: - -``` -I tensorflow/examples/label_image/main.cc:206] military uniform (653): 0.834306 -I tensorflow/examples/label_image/main.cc:206] mortarboard (668): 0.0218692 -I tensorflow/examples/label_image/main.cc:206] academic gown (401): 0.0103579 -I tensorflow/examples/label_image/main.cc:206] pickelhaube (716): 0.00800814 -I tensorflow/examples/label_image/main.cc:206] bulletproof vest (466): 0.00535088 -``` -In this case, we're using the default image of -[Admiral Grace Hopper](https://en.wikipedia.org/wiki/Grace_Hopper), and you can -see the network correctly identifies she's wearing a military uniform, with a high -score of 0.8. - - -
- -
- -Next, try it out on your own images by supplying the --image= argument, e.g. - -```bash -bazel-bin/tensorflow/examples/label_image/label_image --image=my_image.png -``` - -If you look inside the [`tensorflow/examples/label_image/main.cc`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/label_image/main.cc) -file, you can find out -how it works. We hope this code will help you integrate TensorFlow into -your own applications, so we will walk step by step through the main functions: - -The command line flags control where the files are loaded from, and properties of the input images. -The model expects to get square 299x299 RGB images, so those are the `input_width` -and `input_height` flags. We also need to scale the pixel values from integers that -are between 0 and 255 to the floating point values that the graph operates on. -We control the scaling with the `input_mean` and `input_std` flags: we first subtract -`input_mean` from each pixel value, then divide it by `input_std`. - -These values probably look somewhat magical, but they are just defined by the -original model author based on what he/she wanted to use as input images for -training. If you have a graph that you've trained yourself, you'll just need -to adjust the values to match whatever you used during your training process. - -You can see how they're applied to an image in the -[`ReadTensorFromImageFile()`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/label_image/main.cc#L88) -function. - -```C++ -// Given an image file name, read in the data, try to decode it as an image, -// resize it to the requested size, and then scale the values as desired. 
-Status ReadTensorFromImageFile(string file_name, const int input_height, - const int input_width, const float input_mean, - const float input_std, - std::vector* out_tensors) { - tensorflow::GraphDefBuilder b; -``` -We start by creating a `GraphDefBuilder`, which is an object we can use to -specify a model to run or load. - -```C++ - string input_name = "file_reader"; - string output_name = "normalized"; - tensorflow::Node* file_reader = - tensorflow::ops::ReadFile(tensorflow::ops::Const(file_name, b.opts()), - b.opts().WithName(input_name)); -``` -We then start creating nodes for the small model we want to run -to load, resize, and scale the pixel values to get the result the main model -expects as its input. The first node we create is just a `Const` op that holds a -tensor with the file name of the image we want to load. That's then passed as the -first input to the `ReadFile` op. You might notice we're passing `b.opts()` as the last -argument to all the op creation functions. The argument ensures that the node is added to -the model definition held in the `GraphDefBuilder`. We also name the `ReadFile` -operator by making the `WithName()` call to `b.opts()`. This gives a name to the node, -which isn't strictly necessary since an automatic name will be assigned if you don't -do this, but it does make debugging a bit easier. - -```C++ - // Now try to figure out what kind of file it is and decode it. - const int wanted_channels = 3; - tensorflow::Node* image_reader; - if (tensorflow::StringPiece(file_name).ends_with(".png")) { - image_reader = tensorflow::ops::DecodePng( - file_reader, - b.opts().WithAttr("channels", wanted_channels).WithName("png_reader")); - } else { - // Assume if it's not a PNG then it must be a JPEG. - image_reader = tensorflow::ops::DecodeJpeg( - file_reader, - b.opts().WithAttr("channels", wanted_channels).WithName("jpeg_reader")); - } - // Now cast the image data to float so we can do normal math on it. 
- tensorflow::Node* float_caster = tensorflow::ops::Cast( - image_reader, tensorflow::DT_FLOAT, b.opts().WithName("float_caster")); - // The convention for image ops in TensorFlow is that all images are expected - // to be in batches, so that they're four-dimensional arrays with indices of - // [batch, height, width, channel]. Because we only have a single image, we - // have to add a batch dimension of 1 to the start with ExpandDims(). - tensorflow::Node* dims_expander = tensorflow::ops::ExpandDims( - float_caster, tensorflow::ops::Const(0, b.opts()), b.opts()); - // Bilinearly resize the image to fit the required dimensions. - tensorflow::Node* resized = tensorflow::ops::ResizeBilinear( - dims_expander, tensorflow::ops::Const({input_height, input_width}, - b.opts().WithName("size")), - b.opts()); - // Subtract the mean and divide by the scale. - tensorflow::ops::Div( - tensorflow::ops::Sub( - resized, tensorflow::ops::Const({input_mean}, b.opts()), b.opts()), - tensorflow::ops::Const({input_std}, b.opts()), - b.opts().WithName(output_name)); -``` -We then keep adding more nodes, to decode the file data as an image, to cast the -integers into floating point values, to resize it, and then finally to run the -subtraction and division operations on the pixel values. - -```C++ - // This runs the GraphDef network definition that we've just constructed, and - // returns the results in the output tensor. - tensorflow::GraphDef graph; - TF_RETURN_IF_ERROR(b.ToGraphDef(&graph)); -``` -At the end of this we have -a model definition stored in the b variable, which we turn into a full graph -definition with the `ToGraphDef()` function. 
- -```C++ - std::unique_ptr session( - tensorflow::NewSession(tensorflow::SessionOptions())); - TF_RETURN_IF_ERROR(session->Create(graph)); - TF_RETURN_IF_ERROR(session->Run({}, {output_name}, {}, out_tensors)); - return Status::OK(); -``` -Then we create a `tf.Session` -object, which is the interface to actually running the graph, and run it, -specifying which node we want to get the output from, and where to put the -output data. - -This gives us a vector of `Tensor` objects, which in this case we know will only be a -single object long. You can think of a `Tensor` as a multi-dimensional array in this -context, and it holds a 299 pixel high, 299 pixel wide, 3 channel image as float -values. If you have your own image-processing framework in your product already, you -should be able to use that instead, as long as you apply the same transformations -before you feed images into the main graph. - -This is a simple example of creating a small TensorFlow graph dynamically in C++, -but for the pre-trained Inception model we want to load a much larger definition from -a file. You can see how we do that in the `LoadGraph()` function. - -```C++ -// Reads a model graph definition from disk, and creates a session object you -// can use to run it. -Status LoadGraph(string graph_file_name, - std::unique_ptr* session) { - tensorflow::GraphDef graph_def; - Status load_graph_status = - ReadBinaryProto(tensorflow::Env::Default(), graph_file_name, &graph_def); - if (!load_graph_status.ok()) { - return tensorflow::errors::NotFound("Failed to load compute graph at '", - graph_file_name, "'"); - } -``` -If you've looked through the image loading code, a lot of the terms should seem familiar. Rather than -using a `GraphDefBuilder` to produce a `GraphDef` object, we load a protobuf file that -directly contains the `GraphDef`. 
- -```C++ - session->reset(tensorflow::NewSession(tensorflow::SessionOptions())); - Status session_create_status = (*session)->Create(graph_def); - if (!session_create_status.ok()) { - return session_create_status; - } - return Status::OK(); -} -``` -Then we create a Session object from that `GraphDef` and -pass it back to the caller so that they can run it at a later time. - -The `GetTopLabels()` function is a lot like the image loading, except that in this case -we want to take the results of running the main graph, and turn it into a sorted list -of the highest-scoring labels. Just like the image loader, it creates a -`GraphDefBuilder`, adds a couple of nodes to it, and then runs the short graph to get a -pair of output tensors. In this case they represent the sorted scores and index -positions of the highest results. - -```C++ -// Analyzes the output of the Inception graph to retrieve the highest scores and -// their positions in the tensor, which correspond to categories. -Status GetTopLabels(const std::vector& outputs, int how_many_labels, - Tensor* indices, Tensor* scores) { - tensorflow::GraphDefBuilder b; - string output_name = "top_k"; - tensorflow::ops::TopK(tensorflow::ops::Const(outputs[0], b.opts()), - how_many_labels, b.opts().WithName(output_name)); - // This runs the GraphDef network definition that we've just constructed, and - // returns the results in the output tensors. - tensorflow::GraphDef graph; - TF_RETURN_IF_ERROR(b.ToGraphDef(&graph)); - std::unique_ptr session( - tensorflow::NewSession(tensorflow::SessionOptions())); - TF_RETURN_IF_ERROR(session->Create(graph)); - // The TopK node returns two outputs, the scores and their original indices, - // so we have to append :0 and :1 to specify them both. 
- std::vector out_tensors; - TF_RETURN_IF_ERROR(session->Run({}, {output_name + ":0", output_name + ":1"}, - {}, &out_tensors)); - *scores = out_tensors[0]; - *indices = out_tensors[1]; - return Status::OK(); -``` -The `PrintTopLabels()` function takes those sorted results, and prints them out in a -friendly way. The `CheckTopLabel()` function is very similar, but just makes sure that -the top label is the one we expect, for debugging purposes. - -At the end, [`main()`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/label_image/main.cc#L252) -ties together all of these calls. - -```C++ -int main(int argc, char* argv[]) { - // We need to call this to set up global state for TensorFlow. - tensorflow::port::InitMain(argv[0], &argc, &argv); - Status s = tensorflow::ParseCommandLineFlags(&argc, argv); - if (!s.ok()) { - LOG(ERROR) << "Error parsing command line flags: " << s.ToString(); - return -1; - } - - // First we load and initialize the model. - std::unique_ptr session; - string graph_path = tensorflow::io::JoinPath(FLAGS_root_dir, FLAGS_graph); - Status load_graph_status = LoadGraph(graph_path, &session); - if (!load_graph_status.ok()) { - LOG(ERROR) << load_graph_status; - return -1; - } -``` -We load the main graph. - -```C++ - // Get the image from disk as a float array of numbers, resized and normalized - // to the specifications the main graph expects. - std::vector resized_tensors; - string image_path = tensorflow::io::JoinPath(FLAGS_root_dir, FLAGS_image); - Status read_tensor_status = ReadTensorFromImageFile( - image_path, FLAGS_input_height, FLAGS_input_width, FLAGS_input_mean, - FLAGS_input_std, &resized_tensors); - if (!read_tensor_status.ok()) { - LOG(ERROR) << read_tensor_status; - return -1; - } - const Tensor& resized_tensor = resized_tensors[0]; -``` -Load, resize, and process the input image. - -```C++ - // Actually run the image through the model. 
- std::vector outputs; - Status run_status = session->Run({{FLAGS_input_layer, resized_tensor}}, - {FLAGS_output_layer}, {}, &outputs); - if (!run_status.ok()) { - LOG(ERROR) << "Running model failed: " << run_status; - return -1; - } -``` -Here we run the loaded graph with the image as an input. - -```C++ - // This is for automated testing to make sure we get the expected result with - // the default settings. We know that label 866 (military uniform) should be - // the top label for the Admiral Hopper image. - if (FLAGS_self_test) { - bool expected_matches; - Status check_status = CheckTopLabel(outputs, 866, &expected_matches); - if (!check_status.ok()) { - LOG(ERROR) << "Running check failed: " << check_status; - return -1; - } - if (!expected_matches) { - LOG(ERROR) << "Self-test failed!"; - return -1; - } - } -``` -For testing purposes we can check to make sure we get the output we expect here. - -```C++ - // Do something interesting with the results we've generated. - Status print_status = PrintTopLabels(outputs, FLAGS_labels); -``` -Finally we print the labels we found. - -```C++ - if (!print_status.ok()) { - LOG(ERROR) << "Running print failed: " << print_status; - return -1; - } -``` - -The error handling here is using TensorFlow's `Status` -object, which is very convenient because it lets you know whether any error has -occurred with the `ok()` checker, and then can be printed out to give a readable error -message. - -In this case we are demonstrating object recognition, but you should be able to -use very similar code on other models you've found or trained yourself, across -all -sorts of domains. We hope this small example gives you some ideas on how to use -TensorFlow within your own products. - -> **EXERCISE**: Transfer learning is the idea that, if you know how to solve a task well, you -should be able to transfer some of that understanding to solving related -problems. 
One way to perform transfer learning is to remove the final -classification layer of the network and extract -the [next-to-last layer of the CNN](https://arxiv.org/abs/1310.1531), in this case a 2048 dimensional vector. - - -## Resources for Learning More - -To learn about neural networks in general, Michael Nielsen's -[free online book](http://neuralnetworksanddeeplearning.com/chap1.html) -is an excellent resource. For convolutional neural networks in particular, -Chris Olah has some -[nice blog posts](https://colah.github.io/posts/2014-07-Conv-Nets-Modular/), -and Michael Nielsen's book has a -[great chapter](http://neuralnetworksanddeeplearning.com/chap6.html) -covering them. - -To find out more about implementing convolutional neural networks, you can jump -to the TensorFlow [deep convolutional networks tutorial](../../tutorials/images/deep_cnn.md), -or start a bit more gently with our [Estimator MNIST tutorial](../estimators/cnn.md). -Finally, if you want to get up to speed on research in this area, you can -read the recent work of all the papers referenced in this tutorial. 
- diff --git a/tensorflow/docs_src/tutorials/keras/basic_classification.md b/tensorflow/docs_src/tutorials/keras/basic_classification.md deleted file mode 100644 index e028af99b9..0000000000 --- a/tensorflow/docs_src/tutorials/keras/basic_classification.md +++ /dev/null @@ -1,3 +0,0 @@ -# Basic Classification - -[Colab notebook](https://colab.research.google.com/github/tensorflow/models/blob/master/samples/core/tutorials/keras/basic_classification.ipynb) diff --git a/tensorflow/docs_src/tutorials/keras/basic_regression.md b/tensorflow/docs_src/tutorials/keras/basic_regression.md deleted file mode 100644 index 8721b7aca1..0000000000 --- a/tensorflow/docs_src/tutorials/keras/basic_regression.md +++ /dev/null @@ -1,3 +0,0 @@ -# Basic Regression - -[Colab notebook](https://colab.research.google.com/github/tensorflow/models/blob/master/samples/core/tutorials/keras/basic_regression.ipynb) diff --git a/tensorflow/docs_src/tutorials/keras/basic_text_classification.md b/tensorflow/docs_src/tutorials/keras/basic_text_classification.md deleted file mode 100644 index c2a16bdd20..0000000000 --- a/tensorflow/docs_src/tutorials/keras/basic_text_classification.md +++ /dev/null @@ -1,3 +0,0 @@ -# Basic Text Classification - -[Colab notebook](https://colab.research.google.com/github/tensorflow/models/blob/master/samples/core/tutorials/keras/basic_text_classification.ipynb) diff --git a/tensorflow/docs_src/tutorials/keras/index.md b/tensorflow/docs_src/tutorials/keras/index.md deleted file mode 100644 index 9d42281c8f..0000000000 --- a/tensorflow/docs_src/tutorials/keras/index.md +++ /dev/null @@ -1,22 +0,0 @@ -# Learn and use machine learning - -This notebook collection is inspired by the book -*[Deep Learning with Python](https://books.google.com/books?id=Yo3CAQAACAAJ)*. -These tutorials use `tf.keras`, TensorFlow's high-level Python API for building -and training deep learning models. 
To learn more about using Keras with -TensorFlow, see the [TensorFlow Keras Guide](../../guide/keras). - -Publisher's note: *Deep Learning with Python* introduces the field of deep -learning using the Python language and the powerful Keras library. Written by -Keras creator and Google AI researcher François Chollet, this book builds your -understanding through intuitive explanations and practical examples. - -To learn about machine learning fundamentals and concepts, consider taking the -[Machine Learning Crash Course](https://developers.google.com/machine-learning/crash-course/). -Additional TensorFlow and machine learning resources are listed in [next steps](../next_steps). - -1. [Basic classification](./basic_classification) -2. [Text classification](./basic_text_classification) -3. [Regression](./basic_regression) -4. [Overfitting and underfitting](./overfit_and_underfit) -5. [Save and restore models](./save_and_restore_models) diff --git a/tensorflow/docs_src/tutorials/keras/overfit_and_underfit.md b/tensorflow/docs_src/tutorials/keras/overfit_and_underfit.md deleted file mode 100644 index f07f3addd8..0000000000 --- a/tensorflow/docs_src/tutorials/keras/overfit_and_underfit.md +++ /dev/null @@ -1,3 +0,0 @@ -# Overfitting and Underfitting - -[Colab notebook](https://colab.research.google.com/github/tensorflow/models/blob/master/samples/core/tutorials/keras/overfit_and_underfit.ipynb) diff --git a/tensorflow/docs_src/tutorials/keras/save_and_restore_models.md b/tensorflow/docs_src/tutorials/keras/save_and_restore_models.md deleted file mode 100644 index a799b379a0..0000000000 --- a/tensorflow/docs_src/tutorials/keras/save_and_restore_models.md +++ /dev/null @@ -1,3 +0,0 @@ -# Save and restore Models - -[Colab notebook](https://colab.research.google.com/github/tensorflow/models/blob/master/samples/core/tutorials/keras/save_and_restore_models.ipynb) diff --git a/tensorflow/docs_src/tutorials/next_steps.md b/tensorflow/docs_src/tutorials/next_steps.md deleted file 
mode 100644 index 01c9f7204a..0000000000 --- a/tensorflow/docs_src/tutorials/next_steps.md +++ /dev/null @@ -1,36 +0,0 @@ -# Next steps - -## Learn more about TensorFlow - -* The [TensorFlow Guide](/guide) includes usage guides for the - high-level APIs, as well as advanced TensorFlow operations. -* [Premade Estimators](/guide/premade_estimators) are designed to - get results out of the box. Use TensorFlow without building your own models. -* [TensorFlow.js](https://js.tensorflow.org/) allows web developers to train and - deploy ML models in the browser and using Node.js. -* [TFLite](/mobile/tflite) allows mobile developers to do inference efficiently - on mobile devices. -* [TensorFlow Serving](/serving) is an open-source project that can put - TensorFlow models in production quickly. -* The [ecosystem](/ecosystem) contains more projects, including - [Magenta](https://magenta.tensorflow.org/), [TFX](/tfx), - [Swift for TensorFlow](https://github.com/tensorflow/swift), and more. - -## Learn more about machine learning - -Recommended resources include: - -* [Machine Learning Crash Course](https://developers.google.com/machine-learning/crash-course/), - a course from Google that introduces machine learning concepts. -* [CS 20: Tensorflow for Deep Learning Research](http://web.stanford.edu/class/cs20si/), - notes from an intro course from Stanford. -* [CS231n: Convolutional Neural Networks for Visual Recognition](http://cs231n.stanford.edu/), - a course that teaches how convolutional networks work. -* [Machine Learning Recipes](https://www.youtube.com/watch?v=cKxRvEZd3Mw&list=PLOU2XLYxmsIIuiBfYad6rFYQU_jL2ryal), - a video series that introduces basic machine learning concepts with few prerequisites. -* [Deep Learning with Python](https://www.manning.com/books/deep-learning-with-python), - a book by Francois Chollet about the Keras API, as well as an excellent hands on intro to Deep Learning. 
-* [Hands-on Machine Learning with Scikit-Learn and TensorFlow](https://github.com/ageron/handson-ml), - a book by Aurélien Geron that is a clear getting-started guide to data science and deep learning. -* [Deep Learning](https://www.deeplearningbook.org/), a book by Ian Goodfellow et al. - that provides a technical dive into learning machine learning. diff --git a/tensorflow/docs_src/tutorials/non-ml/mandelbrot.md b/tensorflow/docs_src/tutorials/non-ml/mandelbrot.md deleted file mode 100644 index 1c0a548129..0000000000 --- a/tensorflow/docs_src/tutorials/non-ml/mandelbrot.md +++ /dev/null @@ -1,116 +0,0 @@ -# Mandelbrot Set - -Visualizing the [Mandelbrot set](https://en.wikipedia.org/wiki/Mandelbrot_set) -doesn't have anything to do with machine learning, but it makes for a fun -example of how one can use TensorFlow for general mathematics. This is -actually a pretty naive implementation of the visualization, but it makes the -point. (We may end up providing a more elaborate implementation down the line -to produce more truly beautiful images.) - - -## Basic Setup - -We'll need a few imports to get started. - -```python -# Import libraries for simulation -import tensorflow as tf -import numpy as np - -# Imports for visualization -import PIL.Image -from io import BytesIO -from IPython.display import Image, display -``` - -Now we'll define a function to actually display the image once we have -iteration counts.
- -```python -def DisplayFractal(a, fmt='jpeg'): - """Display an array of iteration counts as a - colorful picture of a fractal.""" - a_cyclic = (6.28*a/20.0).reshape(list(a.shape)+[1]) - img = np.concatenate([10+20*np.cos(a_cyclic), - 30+50*np.sin(a_cyclic), - 155-80*np.cos(a_cyclic)], 2) - img[a==a.max()] = 0 - a = img - a = np.uint8(np.clip(a, 0, 255)) - f = BytesIO() - PIL.Image.fromarray(a).save(f, fmt) - display(Image(data=f.getvalue())) -``` - -## Session and Variable Initialization - -For playing around like this, we often use an interactive session, but a regular -session would work as well. - -```python -sess = tf.InteractiveSession() -``` - -It's handy that we can freely mix NumPy and TensorFlow. - -```python -# Use NumPy to create a 2D array of complex numbers - -Y, X = np.mgrid[-1.3:1.3:0.005, -2:1:0.005] -Z = X+1j*Y -``` - -Now we define and initialize TensorFlow tensors. - -```python -xs = tf.constant(Z.astype(np.complex64)) -zs = tf.Variable(xs) -ns = tf.Variable(tf.zeros_like(xs, tf.float32)) -``` - -TensorFlow requires that you explicitly initialize variables before using them. - -```python -tf.global_variables_initializer().run() -``` - -## Defining and Running the Computation - -Now we specify more of the computation... - -```python -# Compute the new values of z: z^2 + x -zs_ = zs*zs + xs - -# Have we diverged with this new value? -not_diverged = tf.abs(zs_) < 4 - -# Operation to update the zs and the iteration count. -# -# Note: We keep computing zs after they diverge! This -# is very wasteful! There are better, if a little -# less simple, ways to do this. -# -step = tf.group( - zs.assign(zs_), - ns.assign_add(tf.cast(not_diverged, tf.float32)) - ) -``` - -... and run it for a couple hundred steps - -```python -for i in range(200): step.run() -``` - -Let's see what we've got. - -```python -DisplayFractal(ns.eval()) -``` - -![jpeg](https://www.tensorflow.org/images/mandelbrot_output.jpg) - -Not bad! 
- - diff --git a/tensorflow/docs_src/tutorials/non-ml/pdes.md b/tensorflow/docs_src/tutorials/non-ml/pdes.md deleted file mode 100644 index b5a0fa834a..0000000000 --- a/tensorflow/docs_src/tutorials/non-ml/pdes.md +++ /dev/null @@ -1,140 +0,0 @@ -# Partial Differential Equations - -TensorFlow isn't just for machine learning. Here we give a (somewhat -pedestrian) example of using TensorFlow for simulating the behavior of a -[partial differential equation]( -https://en.wikipedia.org/wiki/Partial_differential_equation). -We'll simulate the surface of a square pond as a few raindrops land on it. - - -## Basic Setup - -A few imports we'll need. - -```python -#Import libraries for simulation -import tensorflow as tf -import numpy as np - -#Imports for visualization -import PIL.Image -from io import BytesIO -from IPython.display import clear_output, Image, display -``` - -A function for displaying the state of the pond's surface as an image. - -```python -def DisplayArray(a, fmt='jpeg', rng=[0,1]): - """Display an array as a picture.""" - a = (a - rng[0])/float(rng[1] - rng[0])*255 - a = np.uint8(np.clip(a, 0, 255)) - f = BytesIO() - PIL.Image.fromarray(a).save(f, fmt) - clear_output(wait = True) - display(Image(data=f.getvalue())) -``` - -Here we start an interactive TensorFlow session for convenience in playing -around. A regular session would work as well if we were doing this in an -executable .py file.
- -```python -sess = tf.InteractiveSession() -``` - -## Computational Convenience Functions - - -```python -def make_kernel(a): - """Transform a 2D array into a convolution kernel""" - a = np.asarray(a) - a = a.reshape(list(a.shape) + [1,1]) - return tf.constant(a, dtype=1) - -def simple_conv(x, k): - """A simplified 2D convolution operation""" - x = tf.expand_dims(tf.expand_dims(x, 0), -1) - y = tf.nn.depthwise_conv2d(x, k, [1, 1, 1, 1], padding='SAME') - return y[0, :, :, 0] - -def laplace(x): - """Compute the 2D laplacian of an array""" - laplace_k = make_kernel([[0.5, 1.0, 0.5], - [1.0, -6., 1.0], - [0.5, 1.0, 0.5]]) - return simple_conv(x, laplace_k) -``` - -## Define the PDE - -Our pond is a perfect 500 x 500 square, as is the case for most ponds found in -nature. - -```python -N = 500 -``` - -Here we create our pond and hit it with some rain drops. - -```python -# Initial Conditions -- some rain drops hit a pond - -# Set everything to zero -u_init = np.zeros([N, N], dtype=np.float32) -ut_init = np.zeros([N, N], dtype=np.float32) - -# Some rain drops hit a pond at random points -for n in range(40): - a,b = np.random.randint(0, N, 2) - u_init[a,b] = np.random.uniform() - -DisplayArray(u_init, rng=[-0.1, 0.1]) -``` - -![jpeg](https://www.tensorflow.org/images/pde_output_1.jpg) - - -Now let's specify the details of the differential equation. - - -```python -# Parameters: -# eps -- time resolution -# damping -- wave damping -eps = tf.placeholder(tf.float32, shape=()) -damping = tf.placeholder(tf.float32, shape=()) - -# Create variables for simulation state -U = tf.Variable(u_init) -Ut = tf.Variable(ut_init) - -# Discretized PDE update rules -U_ = U + eps * Ut -Ut_ = Ut + eps * (laplace(U) - damping * Ut) - -# Operation to update the state -step = tf.group( - U.assign(U_), - Ut.assign(Ut_)) -``` - -## Run The Simulation - -This is where it gets fun -- running time forward with a simple for loop. 
- -```python -# Initialize state to initial conditions -tf.global_variables_initializer().run() - -# Run 1000 steps of PDE -for i in range(1000): - # Step simulation - step.run({eps: 0.03, damping: 0.04}) - DisplayArray(U.eval(), rng=[-0.1, 0.1]) -``` - -![jpeg](../../images/pde_output_2.jpg) - -Look! Ripples! diff --git a/tensorflow/docs_src/tutorials/representation/kernel_methods.md b/tensorflow/docs_src/tutorials/representation/kernel_methods.md deleted file mode 100644 index 67adc4951c..0000000000 --- a/tensorflow/docs_src/tutorials/representation/kernel_methods.md +++ /dev/null @@ -1,303 +0,0 @@ -# Improving Linear Models Using Explicit Kernel Methods - -Note: This document uses a deprecated version of `tf.estimator`, -`tf.contrib.learn.Estimator`, which has a different interface. It also uses -other `contrib` methods whose [API may not be stable](../../guide/version_compat.md#not_covered). - -In this tutorial, we demonstrate how combining (explicit) kernel methods with -linear models can drastically increase the latter's quality of predictions -without significantly increasing training and inference times. Unlike dual -kernel methods, explicit (primal) kernel methods scale well with the size of the -training dataset both in terms of training/inference times and in terms of -memory requirements. - -**Intended audience:** Even though we provide a high-level overview of concepts -related to explicit kernel methods, this tutorial primarily targets readers who -already have at least basic knowledge of kernel methods and Support Vector -Machines (SVMs).
If you are new to kernel methods, refer to either of the -following sources for an introduction: - -* If you have a strong mathematical background: -[Kernel Methods in Machine Learning](https://arxiv.org/pdf/math/0701907.pdf) -* [Kernel method wikipedia page](https://en.wikipedia.org/wiki/Kernel_method) - -Currently, TensorFlow supports explicit kernel mappings for dense features only; -TensorFlow will provide support for sparse features at a later release. - -This tutorial uses [tf.contrib.learn](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn) -(TensorFlow's high-level Machine Learning API) Estimators for our ML models. -If you are not familiar with this API, the [Estimator guide](../../guide/estimators.md) -is a good place to start. We will use the MNIST dataset. The tutorial consists -of the following steps: - -* Load and prepare MNIST data for classification. -* Construct a simple linear model, train it, and evaluate it on the eval data. -* Replace the linear model with a kernelized linear model, re-train, and -re-evaluate. - -## Load and prepare MNIST data for classification -Run the following utility command to load the MNIST dataset: - -```python -data = tf.contrib.learn.datasets.mnist.load_mnist() -``` -The preceding method loads the entire MNIST dataset (containing 70K samples) and -splits it into train, validation, and test data with 55K, 5K, and 10K samples -respectively. Each split contains one numpy array for images (with shape -[sample_size, 784]) and one for labels (with shape [sample_size, 1]). In this -tutorial, we only use the train and validation splits to train and evaluate our -models respectively. - -In order to feed data to a `tf.contrib.learn Estimator`, it is helpful to convert -it to Tensors. For this, we will use an `input function` which adds Ops to the -TensorFlow graph that, when executed, create mini-batches of Tensors to be used -downstream.
For more background on input functions, check -[this section on input functions](../../guide/premade_estimators.md#create_input_functions). -In this example, we will use the `tf.train.shuffle_batch` Op which, besides -converting numpy arrays to Tensors, allows us to specify the batch_size and -whether to randomize the input every time the input_fn Ops are executed -(randomization typically expedites convergence during training). The full code -for loading and preparing the data is shown in the snippet below. In this -example, we use mini-batches of size 256 for training and the entire sample -(5K entries) for evaluation. Feel free to experiment with different batch sizes. - -```python -import numpy as np -import tensorflow as tf - -def get_input_fn(dataset_split, batch_size, capacity=10000, min_after_dequeue=3000): - - def _input_fn(): - images_batch, labels_batch = tf.train.shuffle_batch( - tensors=[dataset_split.images, dataset_split.labels.astype(np.int32)], - batch_size=batch_size, - capacity=capacity, - min_after_dequeue=min_after_dequeue, - enqueue_many=True, - num_threads=4) - features_map = {'images': images_batch} - return features_map, labels_batch - - return _input_fn - -data = tf.contrib.learn.datasets.mnist.load_mnist() - -train_input_fn = get_input_fn(data.train, batch_size=256) -eval_input_fn = get_input_fn(data.validation, batch_size=5000) - -``` - -## Training a simple linear model -We can now train a linear model over the MNIST dataset. We will use the -`tf.contrib.learn.LinearClassifier` estimator with 10 classes representing the -10 digits. The input features form a 784-dimensional dense vector which can -be specified as follows: - -```python -image_column = tf.contrib.layers.real_valued_column('images', dimension=784) -``` - -The full code for constructing, training and evaluating a LinearClassifier -estimator is as follows: - -```python -import time - -# Specify the feature(s) to be used by the estimator. 
-image_column = tf.contrib.layers.real_valued_column('images', dimension=784) -estimator = tf.contrib.learn.LinearClassifier(feature_columns=[image_column], n_classes=10) - -# Train. -start = time.time() -estimator.fit(input_fn=train_input_fn, steps=2000) -end = time.time() -print('Elapsed time: {} seconds'.format(end - start)) - -# Evaluate and report metrics. -eval_metrics = estimator.evaluate(input_fn=eval_input_fn, steps=1) -print(eval_metrics) -``` -The following table summarizes the results on the eval data. - -metric | value -:------------ | :------------ -loss | 0.25 to 0.30 -accuracy | 92.5% -training time | ~25 seconds on my machine - -Note: Metrics will vary depending on various factors. - -In addition to experimenting with the (training) batch size and the number of -training steps, there are a couple other parameters that can be tuned as well. -For instance, you can change the optimization method used to minimize the loss -by explicitly selecting another optimizer from the collection of -[available optimizers](https://www.tensorflow.org/code/tensorflow/python/training). -As an example, the following code constructs a LinearClassifier estimator that -uses the Follow-The-Regularized-Leader (FTRL) optimization strategy with a -specific learning rate and L2-regularization. - - -```python -optimizer = tf.train.FtrlOptimizer(learning_rate=5.0, l2_regularization_strength=1.0) -estimator = tf.contrib.learn.LinearClassifier( - feature_columns=[image_column], n_classes=10, optimizer=optimizer) -``` - -Regardless of the values of the parameters, the maximum accuracy a linear model -can achieve on this dataset caps at around **93%**. - -## Using explicit kernel mappings with the linear model. -The relatively high error (~7%) of the linear model over MNIST indicates that -the input data is not linearly separable. We will use explicit kernel mappings -to reduce the classification error. 
- -**Intuition:** The high-level idea is to use a non-linear map to transform the -input space to another feature space (of possibly higher dimension) where the -(transformed) features are (almost) linearly separable and then apply a linear -model on the mapped features. This is shown in the following figure: - -
- -
- - -### Technical details -In this example we will use **Random Fourier Features**, introduced in the -["Random Features for Large-Scale Kernel Machines"](https://people.eecs.berkeley.edu/~brecht/papers/07.rah.rec.nips.pdf) -paper by Rahimi and Recht, to map the input data. Random Fourier Features map a -vector \\(\mathbf{x} \in \mathbb{R}^d\\) to \\(\mathbf{x'} \in \mathbb{R}^D\\) -via the following mapping: - -$$ -RFFM(\cdot): \mathbb{R}^d \to \mathbb{R}^D, \quad -RFFM(\mathbf{x}) = \cos(\mathbf{\Omega} \cdot \mathbf{x}+ \mathbf{b}) -$$ - -where \\(\mathbf{\Omega} \in \mathbb{R}^{D \times d}\\), -\\(\mathbf{x} \in \mathbb{R}^d,\\) \\(\mathbf{b} \in \mathbb{R}^D\\) and the -cosine is applied element-wise. - -In this example, the entries of \\(\mathbf{\Omega}\\) and \\(\mathbf{b}\\) are -sampled from distributions such that the mapping satisfies the following -property: - -$$ -RFFM(\mathbf{x})^T \cdot RFFM(\mathbf{y}) \approx -e^{-\frac{\|\mathbf{x} - \mathbf{y}\|^2}{2 \sigma^2}} -$$ - -The right-hand-side quantity of the expression above is known as the RBF (or -Gaussian) kernel function. This function is one of the most-widely used kernel -functions in Machine Learning and implicitly measures similarity in a different, -much higher dimensional space than the original one. See -[Radial basis function kernel](https://en.wikipedia.org/wiki/Radial_basis_function_kernel) -for more details. - -### Kernel classifier -`tf.contrib.kernel_methods.KernelLinearClassifier` is a pre-packaged -`tf.contrib.learn` estimator that combines the power of explicit kernel mappings -with linear models. Its constructor is almost identical to that of the -LinearClassifier estimator with the additional option to specify a list of -explicit kernel mappings to be applied to each feature the classifier uses. The -following code snippet demonstrates how to replace LinearClassifier with -KernelLinearClassifier. - - -```python -# Specify the feature(s) to be used by the estimator. 
This is identical to the -# code used for the LinearClassifier. -image_column = tf.contrib.layers.real_valued_column('images', dimension=784) -optimizer = tf.train.FtrlOptimizer( - learning_rate=50.0, l2_regularization_strength=0.001) - - -kernel_mapper = tf.contrib.kernel_methods.RandomFourierFeatureMapper( - input_dim=784, output_dim=2000, stddev=5.0, name='rffm') -kernel_mappers = {image_column: [kernel_mapper]} -estimator = tf.contrib.kernel_methods.KernelLinearClassifier( - n_classes=10, optimizer=optimizer, kernel_mappers=kernel_mappers) - -# Train. -start = time.time() -estimator.fit(input_fn=train_input_fn, steps=2000) -end = time.time() -print('Elapsed time: {} seconds'.format(end - start)) - -# Evaluate and report metrics. -eval_metrics = estimator.evaluate(input_fn=eval_input_fn, steps=1) -print(eval_metrics) -``` -The only additional parameter passed to `KernelLinearClassifier` is a dictionary -from feature_columns to a list of kernel mappings to be applied to the -corresponding feature column. The following lines instruct the classifier to -first map the initial 784-dimensional images to 2000-dimensional vectors using -random Fourier features and then learn a linear model on the transformed -vectors: - -```python -kernel_mapper = tf.contrib.kernel_methods.RandomFourierFeatureMapper( - input_dim=784, output_dim=2000, stddev=5.0, name='rffm') -kernel_mappers = {image_column: [kernel_mapper]} -estimator = tf.contrib.kernel_methods.KernelLinearClassifier( - n_classes=10, optimizer=optimizer, kernel_mappers=kernel_mappers) -``` -Notice the `stddev` parameter. This is the standard deviation (\\(\sigma\\)) of -the approximated RBF kernel and controls the similarity measure used in -classification. `stddev` is typically determined via hyperparameter tuning. - -The results of running the preceding code are summarized in the following table. 
-We can further increase the accuracy by increasing the output dimension of the -mapping and tuning the standard deviation. - -metric | value -:------------ | :------------ -loss | 0.10 -accuracy | 97% -training time | ~35 seconds on my machine - - -### stddev -The classification quality is very sensitive to the value of stddev. The -following table shows the accuracy of the classifier on the eval data for -different values of stddev. The optimal value is stddev=5.0. Notice how too -small or too high stddev values can dramatically decrease the accuracy of the -classification. - -stddev | eval accuracy -:----- | :------------ -1.0 | 0.1362 -2.0 | 0.4764 -4.0 | 0.9654 -5.0 | 0.9766 -8.0 | 0.9714 -16.0 | 0.8878 - -### Output dimension -Intuitively, the larger the output dimension of the mapping, the closer the -inner product of two mapped vectors approximates the kernel, which typically -translates to better classification accuracy. Another way to think about this is -that the output dimension equals the number of weights of the linear model; the -larger this dimension, the larger the "degrees of freedom" of the model. -However, after a certain threshold, higher output dimensions increase the -accuracy by very little, while making training take more time. This is shown in -the following two Figures which depict the eval accuracy as a function of the -output dimension and the training time, respectively. - -![image](https://www.tensorflow.org/versions/master/images/acc_vs_outdim.png) -![image](https://www.tensorflow.org/versions/master/images/acc-vs-trn_time.png) - - -## Summary -Explicit kernel mappings combine the predictive power of nonlinear models with -the scalability of linear models. Unlike traditional dual kernel methods, -explicit kernel methods can scale to millions or hundreds of millions of -samples. 
When using explicit kernel mappings, consider the following tips: - -* Random Fourier Features can be particularly effective for datasets with dense -features. -* The parameters of the kernel mapping are often data-dependent. Model quality -can be very sensitive to these parameters. Use hyperparameter tuning to find the -optimal values. -* If you have multiple numerical features, concatenate them into a single -multi-dimensional feature and apply the kernel mapping to the concatenated -vector. diff --git a/tensorflow/docs_src/tutorials/representation/linear.md b/tensorflow/docs_src/tutorials/representation/linear.md deleted file mode 100644 index 4f0e67f08e..0000000000 --- a/tensorflow/docs_src/tutorials/representation/linear.md +++ /dev/null @@ -1,239 +0,0 @@ -# Large-scale Linear Models with TensorFlow - -`tf.estimator` provides (among other things) a rich set of tools for -working with linear models in TensorFlow. This document provides an overview of -those tools. It explains: - - * What a linear model is. - * Why you might want to use a linear model. - * How Estimators make it easy to build linear models in TensorFlow. - * How you can use Estimators to combine linear models with - deep learning to get the advantages of both. - -Read this overview to decide whether the Estimator's linear model tools might -be useful to you. Then work through the -[Estimator wide and deep learning tutorial](https://github.com/tensorflow/models/tree/master/official/wide_deep) -to give it a try. This overview uses code samples from the tutorial, but the -tutorial walks through the code in greater detail. - -To understand this overview it will help to have some familiarity -with basic machine learning concepts, and also with -[Estimators](../../guide/premade_estimators.md). - -[TOC] - -## What is a linear model? - -A **linear model** uses a single weighted sum of features to make a prediction.
-For example, if you have [data](https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names) -on age, years of education, and weekly hours of -work for a population, a model can learn weights for each of those numbers so that -their weighted sum estimates a person's salary. You can also use linear models -for classification. - -Some linear models transform the weighted sum into a more convenient form. For -example, [**logistic regression**](https://developers.google.com/machine-learning/glossary/#logistic_regression) plugs the weighted sum into the logistic -function to turn the output into a value between 0 and 1. But you still just -have one weight for each input feature. - -## Why would you want to use a linear model? - -Why would you want to use so simple a model when recent research has -demonstrated the power of more complex neural networks with many layers? - -Linear models: - - * train quickly, compared to deep neural nets. - * can work well on very large feature sets. - * can be trained with algorithms that don't require a lot of fiddling - with learning rates, etc. - * can be interpreted and debugged more easily than neural nets. - You can examine the weights assigned to each feature to figure out what's - having the biggest impact on a prediction. - * provide an excellent starting point for learning about machine learning. - * are widely used in industry. - -## How do Estimators help you build linear models? - -You can build a linear model from scratch in TensorFlow without the help of a -special API. But Estimators provide some tools that make it easier to build -effective large-scale linear models. - -### Feature columns and transformations - -Much of the work of designing a linear model consists of transforming raw data -into suitable input features. TensorFlow uses the `FeatureColumn` abstraction to -enable these transformations. - -A `FeatureColumn` represents a single feature in your data.
A `FeatureColumn` -may represent a quantity like 'height', or it may represent a category like -'eye_color' where the value is drawn from a set of discrete possibilities like -{'blue', 'brown', 'green'}. - -In the case of both *continuous features* like 'height' and *categorical -features* like 'eye_color', a single value in the data might get transformed -into a sequence of numbers before it is input into the model. The -`FeatureColumn` abstraction lets you manipulate the feature as a single -semantic unit in spite of this fact. You can specify transformations and -select features to include without dealing with specific indices in the -tensors you feed into the model. - -#### Sparse columns - -Categorical features in linear models are typically translated into a sparse -vector in which each possible value has a corresponding index or id. For -example, if there are only three possible eye colors you can represent -'eye_color' as a length 3 vector: 'brown' would become [1, 0, 0], 'blue' would -become [0, 1, 0] and 'green' would become [0, 0, 1]. These vectors are called -"sparse" because they may be very long, with many zeros, when the set of -possible values is very large (such as all English words). - -While you don't need to use categorical columns to use the linear model tools -provided by Estimators, one of the strengths of linear models is their ability -to deal with large sparse vectors. Sparse features are a primary use case for -the linear model tools provided by Estimators. - -##### Encoding sparse columns - -`FeatureColumn` handles the conversion of categorical values into vectors -automatically, with code like this: - -```python -eye_color = tf.feature_column.categorical_column_with_vocabulary_list( - "eye_color", vocabulary_list=["blue", "brown", "green"]) -``` - -where `eye_color` is the name of a column in your source data. - -You can also generate `FeatureColumn`s for categorical features for which you -don't know all possible values. 
For this case you would use -`categorical_column_with_hash_bucket()`, which uses a hash function to assign -indices to feature values. - -```python -education = tf.feature_column.categorical_column_with_hash_bucket( - "education", hash_bucket_size=1000) -``` - -##### Feature Crosses - -Because linear models assign independent weights to separate features, they -can't learn the relative importance of specific combinations of feature -values. If you have a feature 'favorite_sport' and a feature 'home_city' and -you're trying to predict whether a person likes to wear red, your linear model -won't be able to learn that baseball fans from St. Louis especially like to -wear red. - -You can get around this limitation by creating a new feature -'favorite_sport_x_home_city'. The value of this feature for a given person is -just the concatenation of the values of the two source features: -'baseball_x_stlouis', for example. This sort of combination feature is called -a *feature cross*. - -The `crossed_column()` method makes it easy to set up feature crosses: - -```python -sport_x_city = tf.feature_column.crossed_column( - ["sport", "city"], hash_bucket_size=int(1e4)) -``` - -#### Continuous columns - -You can specify a continuous feature like so: - -```python -age = tf.feature_column.numeric_column("age") -``` - -Although, as a single real number, a continuous feature can often be input -directly into the model, Tensorflow offers useful transformations for this sort -of column as well. - -##### Bucketization - -*Bucketization* turns a continuous column into a categorical column. This -transformation lets you use continuous features in feature crosses, or learn -cases where specific value ranges have particular importance. 
- -Bucketization divides the range of possible values into subranges called -buckets: - -```python -age_buckets = tf.feature_column.bucketized_column( - age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65]) -``` - -The bucket into which a value falls becomes the categorical label for -that value. - -#### Input function - -`FeatureColumn`s provide a specification for the input data for your model, -indicating how to represent and transform the data. But they do not provide -the data itself. You provide the data through an input function. - -The input function must return a dictionary of tensors. Each key corresponds to -the name of a `FeatureColumn`. Each key's value is a tensor containing the -values of that feature for all data instances. See -[Premade Estimators](../../guide/premade_estimators.md#input_fn) for a -more comprehensive look at input functions, and `input_fn` in the -[wide and deep learning tutorial](https://github.com/tensorflow/models/tree/master/official/wide_deep) -for an example implementation of an input function. - -The input function is passed to the `train()` and `evaluate()` calls that -initiate training and testing, as described in the next section. - -### Linear estimators - -Tensorflow estimator classes provide a unified training and evaluation harness -for regression and classification models. They take care of the details of the -training and evaluation loops and allow the user to focus on model inputs and -architecture. - -To build a linear estimator, you can use either the -`tf.estimator.LinearClassifier` estimator or the -`tf.estimator.LinearRegressor` estimator, for classification and -regression respectively. - -As with all tensorflow estimators, to run the estimator you just: - - 1. Instantiate the estimator class. For the two linear estimator classes, - you pass a list of `FeatureColumn`s to the constructor. - 2. Call the estimator's `train()` method to train it. - 3. 
Call the estimator's `evaluate()` method to see how it does. - -For example: - -```python -e = tf.estimator.LinearClassifier( - feature_columns=[ - native_country, education, occupation, workclass, marital_status, - race, age_buckets, education_x_occupation, - age_buckets_x_race_x_occupation], - model_dir=YOUR_MODEL_DIRECTORY) -e.train(input_fn=input_fn_train, steps=200) -# Evaluate for one step (one pass through the test data). -results = e.evaluate(input_fn=input_fn_test) - -# Print the stats for the evaluation. -for key in sorted(results): - print("%s: %s" % (key, results[key])) -``` - -### Wide and deep learning - -The `tf.estimator` module also provides an estimator class that lets you jointly -train a linear model and a deep neural network. This novel approach combines the -ability of linear models to "memorize" key features with the generalization -ability of neural nets. Use `tf.estimator.DNNLinearCombinedClassifier` to -create this sort of "wide and deep" model: - -```python -e = tf.estimator.DNNLinearCombinedClassifier( - model_dir=YOUR_MODEL_DIR, - linear_feature_columns=wide_columns, - dnn_feature_columns=deep_columns, - dnn_hidden_units=[100, 50]) -``` -For more information, see the -[wide and deep learning tutorial](https://github.com/tensorflow/models/tree/master/official/wide_deep). diff --git a/tensorflow/docs_src/tutorials/representation/word2vec.md b/tensorflow/docs_src/tutorials/representation/word2vec.md deleted file mode 100644 index df0d3176b6..0000000000 --- a/tensorflow/docs_src/tutorials/representation/word2vec.md +++ /dev/null @@ -1,405 +0,0 @@ -# Vector Representations of Words - -In this tutorial we look at the word2vec model by -[Mikolov et al.](https://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf) -This model is used for learning vector representations of words, called "word -embeddings". 
- -## Highlights - -This tutorial is meant to highlight the interesting, substantive parts of -building a word2vec model in TensorFlow. - -* We start by giving the motivation for why we would want to -represent words as vectors. -* We look at the intuition behind the model and how it is trained -(with a splash of math for good measure). -* We also show a simple implementation of the model in TensorFlow. -* Finally, we look at ways to make the naive version scale better. - -We walk through the code later during the tutorial, but if you'd prefer to dive -straight in, feel free to look at the minimalistic implementation in -[tensorflow/examples/tutorials/word2vec/word2vec_basic.py](https://www.tensorflow.org/code/tensorflow/examples/tutorials/word2vec/word2vec_basic.py) -This basic example contains the code needed to download some data, train on it a -bit and visualize the result. Once you get comfortable with reading and running -the basic version, you can graduate to -[models/tutorials/embedding/word2vec.py](https://github.com/tensorflow/models/tree/master/tutorials/embedding/word2vec.py) -which is a more serious implementation that showcases some more advanced -TensorFlow principles about how to efficiently use threads to move data into a -text model, how to checkpoint during training, etc. - -But first, let's look at why we would want to learn word embeddings in the first -place. Feel free to skip this section if you're an Embedding Pro and you'd just -like to get your hands dirty with the details. - -## Motivation: Why Learn Word Embeddings? - -Image and audio processing systems work with rich, high-dimensional datasets -encoded as vectors of the individual raw pixel-intensities for image data, or -e.g. power spectral density coefficients for audio data. For tasks like object -or speech recognition we know that all the information required to successfully -perform the task is encoded in the data (because humans can perform these tasks -from the raw data). 
However, natural language processing systems traditionally -treat words as discrete atomic symbols, and therefore 'cat' may be represented -as `Id537` and 'dog' as `Id143`. These encodings are arbitrary, and provide -no useful information to the system regarding the relationships that may exist -between the individual symbols. This means that the model can leverage -very little of what it has learned about 'cats' when it is processing data about -'dogs' (such that they are both animals, four-legged, pets, etc.). Representing -words as unique, discrete ids furthermore leads to data sparsity, and usually -means that we may need more data in order to successfully train statistical -models. Using vector representations can overcome some of these obstacles. - -
- -
- -[Vector space models](https://en.wikipedia.org/wiki/Vector_space_model) (VSMs) -represent (embed) words in a continuous vector space where semantically -similar words are mapped to nearby points ('are embedded nearby each other'). -VSMs have a long, rich history in NLP, but all methods depend in some way or -another on the -[Distributional Hypothesis](https://en.wikipedia.org/wiki/Distributional_semantics#Distributional_Hypothesis), -which states that words that appear in the same contexts share -semantic meaning. The different approaches that leverage this principle can be -divided into two categories: *count-based methods* (e.g. -[Latent Semantic Analysis](https://en.wikipedia.org/wiki/Latent_semantic_analysis)), -and *predictive methods* (e.g. -[neural probabilistic language models](http://www.scholarpedia.org/article/Neural_net_language_models)). - -This distinction is elaborated in much more detail by -[Baroni et al.](http://clic.cimec.unitn.it/marco/publications/acl2014/baroni-etal-countpredict-acl2014.pdf), -but in a nutshell: Count-based methods compute the statistics of -how often some word co-occurs with its neighbor words in a large text corpus, -and then map these count-statistics down to a small, dense vector for each word. -Predictive models directly try to predict a word from its neighbors in terms of -learned small, dense *embedding vectors* (considered parameters of the -model). - -Word2vec is a particularly computationally-efficient predictive model for -learning word embeddings from raw text. It comes in two flavors, the Continuous -Bag-of-Words model (CBOW) and the Skip-Gram model (Section 3.1 and 3.2 in [Mikolov et al.](https://arxiv.org/pdf/1301.3781.pdf)). Algorithmically, these -models are similar, except that CBOW predicts target words (e.g. 'mat') from -source context words ('the cat sits on the'), while the skip-gram does the -inverse and predicts source context-words from the target words. 
This inversion -might seem like an arbitrary choice, but statistically it has the effect that -CBOW smoothes over a lot of the distributional information (by treating an -entire context as one observation). For the most part, this turns out to be a -useful thing for smaller datasets. However, skip-gram treats each context-target -pair as a new observation, and this tends to do better when we have larger -datasets. We will focus on the skip-gram model in the rest of this tutorial. - - -## Scaling up with Noise-Contrastive Training - -Neural probabilistic language models are traditionally trained using the -[maximum likelihood](https://en.wikipedia.org/wiki/Maximum_likelihood) (ML) -principle to maximize the probability of the next word \\(w_t\\) (for "target") -given the previous words \\(h\\) (for "history") in terms of a -[*softmax* function](https://en.wikipedia.org/wiki/Softmax_function), - -$$ -\begin{align} -P(w_t | h) &= \text{softmax}(\text{score}(w_t, h)) \\ - &= \frac{\exp \{ \text{score}(w_t, h) \} } - {\sum_\text{Word w' in Vocab} \exp \{ \text{score}(w', h) \} } -\end{align} -$$ - -where \\(\text{score}(w_t, h)\\) computes the compatibility of word \\(w_t\\) -with the context \\(h\\) (a dot product is commonly used). We train this model -by maximizing its [log-likelihood](https://en.wikipedia.org/wiki/Likelihood_function) -on the training set, i.e. by maximizing - -$$ -\begin{align} - J_\text{ML} &= \log P(w_t | h) \\ - &= \text{score}(w_t, h) - - \log \left( \sum_\text{Word w' in Vocab} \exp \{ \text{score}(w', h) \} \right). -\end{align} -$$ - -This yields a properly normalized probabilistic model for language modeling. -However this is very expensive, because we need to compute and normalize each -probability using the score for all other \\(V\\) words \\(w'\\) in the current -context \\(h\\), *at every training step*. - -
- -
- -On the other hand, for feature learning in word2vec we do not need a full -probabilistic model. The CBOW and skip-gram models are instead trained using a -binary classification objective ([logistic regression](https://en.wikipedia.org/wiki/Logistic_regression)) -to discriminate the real target words \\(w_t\\) from \\(k\\) imaginary (noise) words \\(\tilde w\\), in the -same context. We illustrate this below for a CBOW model. For skip-gram the -direction is simply inverted. - -
- -
- -Mathematically, the objective (for each example) is to maximize - -$$J_\text{NEG} = \log Q_\theta(D=1 |w_t, h) + - k \mathop{\mathbb{E}}_{\tilde w \sim P_\text{noise}} - \left[ \log Q_\theta(D = 0 |\tilde w, h) \right]$$ - -where \\(Q_\theta(D=1 | w, h)\\) is the binary logistic regression probability -under the model of seeing the word \\(w\\) in the context \\(h\\) in the dataset -\\(D\\), calculated in terms of the learned embedding vectors \\(\theta\\). In -practice we approximate the expectation by drawing \\(k\\) contrastive words -from the noise distribution (i.e. we compute a -[Monte Carlo average](https://en.wikipedia.org/wiki/Monte_Carlo_integration)). - -This objective is maximized when the model assigns high probabilities -to the real words, and low probabilities to noise words. Technically, this is -called -[Negative Sampling](https://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf), -and there is good mathematical motivation for using this loss function: -The updates it proposes approximate the updates of the softmax function in the -limit. But computationally it is especially appealing because computing the -loss function now scales only with the number of *noise words* that we -select (\\(k\\)), and not *all words* in the vocabulary (\\(V\\)). This makes it -much faster to train. We will actually make use of the very similar -[noise-contrastive estimation (NCE)](https://papers.nips.cc/paper/5165-learning-word-embeddings-efficiently-with-noise-contrastive-estimation.pdf) -loss, for which TensorFlow has a handy helper function `tf.nn.nce_loss()`. - -Let's get an intuitive feel for how this would work in practice! - -## The Skip-gram Model - -As an example, let's consider the dataset - -`the quick brown fox jumped over the lazy dog` - -We first form a dataset of words and the contexts in which they appear. 
We -could define 'context' in any way that makes sense, and in fact people have -looked at syntactic contexts (i.e. the syntactic dependents of the current -target word, see e.g. -[Levy et al.](https://levyomer.files.wordpress.com/2014/04/dependency-based-word-embeddings-acl-2014.pdf)), -words-to-the-left of the target, words-to-the-right of the target, etc. For now, -let's stick to the vanilla definition and define 'context' as the window -of words to the left and to the right of a target word. Using a window -size of 1, we then have the dataset - -`([the, brown], quick), ([quick, fox], brown), ([brown, jumped], fox), ...` - -of `(context, target)` pairs. Recall that skip-gram inverts contexts and -targets, and tries to predict each context word from its target word, so the -task becomes to predict 'the' and 'brown' from 'quick', 'quick' and 'fox' from -'brown', etc. Therefore our dataset becomes - -`(quick, the), (quick, brown), (brown, quick), (brown, fox), ...` - -of `(input, output)` pairs. The objective function is defined over the entire -dataset, but we typically optimize this with -[stochastic gradient descent](https://en.wikipedia.org/wiki/Stochastic_gradient_descent) -(SGD) using one example at a time (or a 'minibatch' of `batch_size` examples, -where typically `16 <= batch_size <= 512`). So let's look at one step of -this process. - -Let's imagine at training step \\(t\\) we observe the first training case above, -where the goal is to predict `the` from `quick`. We select `num_noise` number -of noisy (contrastive) examples by drawing from some noise distribution, -typically the unigram distribution, \\(P(w)\\). For simplicity let's say -`num_noise=1` and we select `sheep` as a noisy example. Next we compute the -loss for this pair of observed and noisy examples, i.e. 
the objective at time -step \\(t\\) becomes - -$$J^{(t)}_\text{NEG} = \log Q_\theta(D=1 | \text{the, quick}) + - \log(Q_\theta(D=0 | \text{sheep, quick}))$$ - -The goal is to make an update to the embedding parameters \\(\theta\\) to improve -(in this case, maximize) this objective function. We do this by deriving the -gradient of the loss with respect to the embedding parameters \\(\theta\\), i.e. -\\(\frac{\partial}{\partial \theta} J_\text{NEG}\\) (luckily TensorFlow provides -easy helper functions for doing this!). We then perform an update to the -embeddings by taking a small step in the direction of the gradient. When this -process is repeated over the entire training set, this has the effect of -'moving' the embedding vectors around for each word until the model is -successful at discriminating real words from noise words. - -We can visualize the learned vectors by projecting them down to 2 dimensions -using for instance something like the -[t-SNE dimensionality reduction technique](https://lvdmaaten.github.io/tsne/). -When we inspect these visualizations it becomes apparent that the vectors -capture some general, and in fact quite useful, semantic information about -words and their relationships to one another. It was very interesting when we -first discovered that certain directions in the induced vector space specialize -towards certain semantic relationships, e.g. *male-female*, *verb tense* and -even *country-capital* relationships between words, as illustrated in the figure -below (see also for example -[Mikolov et al., 2013](https://www.aclweb.org/anthology/N13-1090)). - -
- -
- -This explains why these vectors are also useful as features for many canonical -NLP prediction tasks, such as part-of-speech tagging or named entity recognition -(see for example the original work by -[Collobert et al., 2011](https://arxiv.org/abs/1103.0398) -([pdf](https://arxiv.org/pdf/1103.0398.pdf)), or follow-up work by -[Turian et al., 2010](https://www.aclweb.org/anthology/P10-1040)). - -But for now, let's just use them to draw pretty pictures! - -## Building the Graph - -This is all about embeddings, so let's define our embedding matrix. -This is just a big random matrix to start. We'll initialize the values to be -uniform in the unit cube. - -```python -embeddings = tf.Variable( - tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0)) -``` - -The noise-contrastive estimation loss is defined in terms of a logistic regression -model. For this, we need to define the weights and biases for each word in the -vocabulary (also called the `output weights` as opposed to the `input -embeddings`). So let's define that. - -```python -nce_weights = tf.Variable( - tf.truncated_normal([vocabulary_size, embedding_size], - stddev=1.0 / math.sqrt(embedding_size))) -nce_biases = tf.Variable(tf.zeros([vocabulary_size])) -``` - -Now that we have the parameters in place, we can define our skip-gram model -graph. For simplicity, let's suppose we've already integerized our text corpus -with a vocabulary so that each word is represented as an integer (see -[tensorflow/examples/tutorials/word2vec/word2vec_basic.py](https://www.tensorflow.org/code/tensorflow/examples/tutorials/word2vec/word2vec_basic.py) -for the details). The skip-gram model takes two inputs. One is a batch full of -integers representing the source context words, the other is for the target -words. Let's create placeholder nodes for these inputs, so that we can feed in -data later. 
- -```python -# Placeholders for inputs -train_inputs = tf.placeholder(tf.int32, shape=[batch_size]) -train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1]) -``` - -Now what we need to do is look up the vector for each of the source words in -the batch. TensorFlow has handy helpers that make this easy. - -```python -embed = tf.nn.embedding_lookup(embeddings, train_inputs) -``` - -Ok, now that we have the embeddings for each word, we'd like to try to predict -the target word using the noise-contrastive training objective. - -```python -# Compute the NCE loss, using a sample of the negative labels each time. -loss = tf.reduce_mean( - tf.nn.nce_loss(weights=nce_weights, - biases=nce_biases, - labels=train_labels, - inputs=embed, - num_sampled=num_sampled, - num_classes=vocabulary_size)) -``` - -Now that we have a loss node, we need to add the nodes required to compute -gradients and update the parameters, etc. For this we will use stochastic -gradient descent, and TensorFlow has handy helpers to make this easy as well. - -```python -# We use the SGD optimizer. -optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0).minimize(loss) -``` - -## Training the Model - -Training the model is then as simple as using a `feed_dict` to push data into -the placeholders and calling -`tf.Session.run` with this new data -in a loop. - -```python -for inputs, labels in generate_batch(...): - feed_dict = {train_inputs: inputs, train_labels: labels} - _, cur_loss = session.run([optimizer, loss], feed_dict=feed_dict) -``` - -See the full example code in -[tensorflow/examples/tutorials/word2vec/word2vec_basic.py](https://www.tensorflow.org/code/tensorflow/examples/tutorials/word2vec/word2vec_basic.py). - -## Visualizing the Learned Embeddings - -After training has finished we can visualize the learned embeddings using -t-SNE. - -
- -
- -Et voila! As expected, words that are similar end up clustering nearby each -other. For a more heavyweight implementation of word2vec that showcases more of -the advanced features of TensorFlow, see the implementation in -[models/tutorials/embedding/word2vec.py](https://github.com/tensorflow/models/tree/master/tutorials/embedding/word2vec.py). - -## Evaluating Embeddings: Analogical Reasoning - -Embeddings are useful for a wide variety of prediction tasks in NLP. Short of -training a full-blown part-of-speech model or named-entity model, one simple way -to evaluate embeddings is to directly use them to predict syntactic and semantic -relationships like `king is to queen as father is to ?`. This is called -*analogical reasoning* and the task was introduced by -[Mikolov and colleagues -](https://www.aclweb.org/anthology/N13-1090). -Download the dataset for this task from -[download.tensorflow.org](http://download.tensorflow.org/data/questions-words.txt). - -To see how we do this evaluation, have a look at the `build_eval_graph()` and -`eval()` functions in -[models/tutorials/embedding/word2vec.py](https://github.com/tensorflow/models/tree/master/tutorials/embedding/word2vec.py). - -The choice of hyperparameters can strongly influence the accuracy on this task. -To achieve state-of-the-art performance on this task requires training over a -very large dataset, carefully tuning the hyperparameters and making use of -tricks like subsampling the data, which is out of the scope of this tutorial. - - -## Optimizing the Implementation - -Our vanilla implementation showcases the flexibility of TensorFlow. For -example, changing the training objective is as simple as swapping out the call -to `tf.nn.nce_loss()` for an off-the-shelf alternative such as -`tf.nn.sampled_softmax_loss()`. If you have a new idea for a loss function, you -can manually write an expression for the new objective in TensorFlow and let -the optimizer compute its derivatives. 
This flexibility is invaluable in the -exploratory phase of machine learning model development, where we are trying -out several different ideas and iterating quickly. - -Once you have a model structure you're satisfied with, it may be worth -optimizing your implementation to run more efficiently (and cover more data in -less time). For example, the naive code we used in this tutorial would suffer -compromised speed because we use Python for reading and feeding data items -- -each of which requires very little work on the TensorFlow back-end. If you find -your model is seriously bottlenecked on input data, you may want to implement a -custom data reader for your problem, as described in -[New Data Formats](../../extend/new_data_formats.md). For the case of Skip-Gram -modeling, we've actually already done this for you as an example in -[models/tutorials/embedding/word2vec.py](https://github.com/tensorflow/models/tree/master/tutorials/embedding/word2vec.py). - -If your model is no longer I/O bound but you want still more performance, you -can take things further by writing your own TensorFlow Ops, as described in -[Adding a New Op](../../extend/adding_an_op.md). Again we've provided an -example of this for the Skip-Gram case in -[models/tutorials/embedding/word2vec_optimized.py](https://github.com/tensorflow/models/tree/master/tutorials/embedding/word2vec_optimized.py). -Feel free to benchmark these against each other to measure performance -improvements at each stage. - -## Conclusion - -In this tutorial we covered the word2vec model, a computationally efficient -model for learning word embeddings. We motivated why embeddings are useful, -discussed efficient training techniques and showed how to implement all of this -in TensorFlow. Overall, we hope that this has showcased how TensorFlow affords -you the flexibility you need for early experimentation, and the control you -later need for bespoke optimized implementation.
diff --git a/tensorflow/docs_src/tutorials/sequences/audio_recognition.md b/tensorflow/docs_src/tutorials/sequences/audio_recognition.md deleted file mode 100644 index d7a8da6f96..0000000000 --- a/tensorflow/docs_src/tutorials/sequences/audio_recognition.md +++ /dev/null @@ -1,631 +0,0 @@ -# Simple Audio Recognition - -This tutorial will show you how to build a basic speech recognition network that -recognizes ten different words. It's important to know that real speech and -audio recognition systems are much more complex, but like MNIST for images, it -should give you a basic understanding of the techniques involved. Once you've -completed this tutorial, you'll have a model that tries to classify a one second -audio clip as either silence, an unknown word, "yes", "no", "up", "down", -"left", "right", "on", "off", "stop", or "go". You'll also be able to take this -model and run it in an Android application. - -## Preparation - -You should make sure you have TensorFlow installed, and since the script -downloads over 1GB of training data, you'll need a good internet connection and -enough free space on your machine. The training process itself can take several -hours, so make sure you have a machine available for that long. - -## Training - -To begin the training process, go to the TensorFlow source tree and run: - -```bash -python tensorflow/examples/speech_commands/train.py -``` - -The script will start off by downloading the [Speech Commands -dataset](https://storage.cloud.google.com/download.tensorflow.org/data/speech_commands_v0.02.tar.gz), -which consists of over 105,000 WAVE audio files of people saying thirty -different words. This data was collected by Google and released under a CC BY -license, and you can help improve it by [contributing five minutes of your own -voice](https://aiyprojects.withgoogle.com/open_speech_recording). 
The archive is -over 2GB, so this part may take a while, but you should see progress logs, and -once it's been downloaded once you won't need to do this step again. You can -find more information about this dataset in this -[Speech Commands paper](https://arxiv.org/abs/1804.03209). - -Once the downloading has completed, you'll see logging information that looks -like this: - -``` -I0730 16:53:44.766740 55030 train.py:176] Training from step: 1 -I0730 16:53:47.289078 55030 train.py:217] Step #1: rate 0.001000, accuracy 7.0%, cross entropy 2.611571 -``` - -This shows that the initialization process is done and the training loop has -begun. You'll see that it outputs information for every training step. Here's a -breakdown of what it means: - -`Step #1` shows that we're on the first step of the training loop. In this case -there are going to be 18,000 steps in total, so you can look at the step number -to get an idea of how close it is to finishing. - -`rate 0.001000` is the learning rate that's controlling the speed of the -network's weight updates. Early on this is a comparatively high number (0.001), -but for later training cycles it will be reduced 10x, to 0.0001. - -`accuracy 7.0%` is how many classes were correctly predicted on this -training step. This value will often fluctuate a lot, but should increase on -average as training progresses. The model outputs an array of numbers, one for -each label, and each number is the predicted likelihood of the input being that -class. The predicted label is picked by choosing the entry with the highest -score. The scores are always between zero and one, with higher values -representing more confidence in the result. - -`cross entropy 2.611571` is the result of the loss function that we're using to -guide the training process. This is a score that's obtained by comparing the -vector of scores from the current training run to the correct labels, and this -should trend downwards during training.
- -After a hundred steps, you should see a line like this: - -`I0730 16:54:41.813438 55030 train.py:252] Saving to -"/tmp/speech_commands_train/conv.ckpt-100"` - -This is saving out the current trained weights to a checkpoint file. If your -training script gets interrupted, you can look for the last saved checkpoint and -then restart the script with -`--start_checkpoint=/tmp/speech_commands_train/conv.ckpt-100` as a command line -argument to start from that point. - -## Confusion Matrix - -After four hundred steps, this information will be logged: - -``` -I0730 16:57:38.073667 55030 train.py:243] Confusion Matrix: - [[258 0 0 0 0 0 0 0 0 0 0 0] - [ 7 6 26 94 7 49 1 15 40 2 0 11] - [ 10 1 107 80 13 22 0 13 10 1 0 4] - [ 1 3 16 163 6 48 0 5 10 1 0 17] - [ 15 1 17 114 55 13 0 9 22 5 0 9] - [ 1 1 6 97 3 87 1 12 46 0 0 10] - [ 8 6 86 84 13 24 1 9 9 1 0 6] - [ 9 3 32 112 9 26 1 36 19 0 0 9] - [ 8 2 12 94 9 52 0 6 72 0 0 2] - [ 16 1 39 74 29 42 0 6 37 9 0 3] - [ 15 6 17 71 50 37 0 6 32 2 1 9] - [ 11 1 6 151 5 42 0 8 16 0 0 20]] -``` - -The first section is a [confusion -matrix](https://www.tensorflow.org/api_docs/python/tf/confusion_matrix). To -understand what it means, you first need to know the labels being used, which in -this case are "_silence_", "_unknown_", "yes", "no", "up", "down", "left", -"right", "on", "off", "stop", and "go". Each column represents a set of samples -that were predicted to be each label, so the first column represents all the -clips that were predicted to be silence, the second all those that were -predicted to be unknown words, the third "yes", and so on. - -Each row represents clips by their correct, ground truth labels. The first row -is all the clips that were silence, the second clips that were unknown words, -the third "yes", etc. - -This matrix can be more useful than just a single accuracy score because it -gives a good summary of what mistakes the network is making. 
In this example you -can see that all of the entries in the first row are zero, apart from the -initial one. Because the first row is all the clips that are actually silence, -this means that none of them were mistakenly labeled as words, so we have no -false negatives for silence. This shows the network is already getting pretty -good at distinguishing silence from words. - -If we look down the first column though, we see a lot of non-zero values. The -column represents all the clips that were predicted to be silence, so positive -numbers outside of the first cell are errors. This means that some clips of real -spoken words are actually being predicted to be silence, so we do have quite a -few false positives. - -A perfect model would produce a confusion matrix where all of the entries were -zero apart from a diagonal line through the center. Spotting deviations from -that pattern can help you figure out how the model is most easily confused, and -once you've identified the problems you can address them by adding more data or -cleaning up categories. - -## Validation - -After the confusion matrix, you should see a line like this: - -`I0730 16:57:38.073777 55030 train.py:245] Step 400: Validation accuracy = 26.3% -(N=3093)` - -It's good practice to separate your data set into three categories. The largest -(in this case roughly 80% of the data) is used for training the network, a -smaller set (10% here, known as "validation") is reserved for evaluation of the -accuracy during training, and another set (the last 10%, "testing") is used to -evaluate the accuracy once after the training is complete. - -The reason for this split is that there's always a danger that networks will -start memorizing their inputs during training. By keeping the validation set -separate, you can ensure that the model works with data it's never seen before. 
-The testing set is an additional safeguard to make sure that you haven't just -been tweaking your model in a way that happens to work for both the training and -validation sets, but not a broader range of inputs. - -The training script automatically separates the data set into these three -categories, and the logging line above shows the accuracy of the model when run on -the validation set. Ideally, this should stick fairly close to the training -accuracy. If the training accuracy increases but the validation doesn't, that's -a sign that overfitting is occurring, and your model is only learning things -about the training clips, not broader patterns that generalize. - -## Tensorboard - -A good way to visualize how the training is progressing is using Tensorboard. By -default, the script saves out events to /tmp/retrain_logs, and you can load -these by running: - -`tensorboard --logdir /tmp/retrain_logs` - -Then navigate to [http://localhost:6006](http://localhost:6006) in your browser, -and you'll see charts and graphs showing your model's progress. - -
- -
- -## Training Finished - -After a few hours of training (depending on your machine's speed), the script -should have completed all 18,000 steps. It will print out a final confusion -matrix, along with an accuracy score, all run on the testing set. With the -default settings, you should see an accuracy of between 85% and 90%. - -Because audio recognition is particularly useful on mobile devices, next we'll -export it to a compact format that's easy to work with on those platforms. To do -that, run this command line: - -``` -python tensorflow/examples/speech_commands/freeze.py \ ---start_checkpoint=/tmp/speech_commands_train/conv.ckpt-18000 \ ---output_file=/tmp/my_frozen_graph.pb -``` - -Once the frozen model has been created, you can test it with the `label_wav.py` -script, like this: - -``` -python tensorflow/examples/speech_commands/label_wav.py \ ---graph=/tmp/my_frozen_graph.pb \ ---labels=/tmp/speech_commands_train/conv_labels.txt \ ---wav=/tmp/speech_dataset/left/a5d485dc_nohash_0.wav -``` - -This should print out three labels: - -``` -left (score = 0.81477) -right (score = 0.14139) -_unknown_ (score = 0.03808) -``` - -Hopefully "left" is the top score since that's the correct label, but since the -training is random it may not be for the first file you try. Experiment with some -of the other .wav files in that same folder to see how well it does. - -The scores are between zero and one, and higher values mean the model is more -confident in its prediction. - -## Running the Model in an Android App - -The easiest way to see how this model works in a real application is to download -[the prebuilt Android demo -applications](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android#prebuilt-components) -and install them on your phone. You'll see 'TF Speech' appear in your app list, -and opening it will show you the same list of action words we've just trained -our model on, starting with "Yes" and "No".
Once you've given the app permission -to use the microphone, you should be able to try saying those words and see them -highlighted in the UI when the model recognizes one of them. - -You can also build this application yourself, since it's open source and -[available as part of the TensorFlow repository on -github](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android#building-in-android-studio-using-the-tensorflow-aar-from-jcenter). -By default it downloads [a pretrained model from -tensorflow.org](http://download.tensorflow.org/models/speech_commands_v0.02.zip), -but you can easily [replace it with a model you've trained -yourself](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android#install-model-files-optional). -If you do this, you'll need to make sure that the constants in [the main -SpeechActivity Java source -file](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android/src/org/tensorflow/demo/SpeechActivity.java) -like `SAMPLE_RATE` and `SAMPLE_DURATION` match any changes you've made to the -defaults while training. You'll also see that there's a [Java version of the -RecognizeCommands -module](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android/src/org/tensorflow/demo/RecognizeCommands.java) -that's very similar to the C++ version in this tutorial. If you've tweaked -parameters for that, you can also update them in SpeechActivity to get the same -results as in your server testing. - -The demo app updates its UI list of results automatically based on the labels -text file you copy into assets alongside your frozen graph, which means you can -easily try out different models without needing to make any code changes. You -will need to update `LABEL_FILENAME` and `MODEL_FILENAME` to point to the files -you've added if you change the paths though. - -## How does this Model Work? 
- -The architecture used in this tutorial is based on some described in the paper -[Convolutional Neural Networks for Small-footprint Keyword -Spotting](http://www.isca-speech.org/archive/interspeech_2015/papers/i15_1478.pdf). -It was chosen because it's comparatively simple, quick to train, and easy to -understand, rather than being state of the art. There are lots of different -approaches to building neural network models to work with audio, including -[recurrent networks](https://svds.com/tensorflow-rnn-tutorial/) or [dilated -(atrous) -convolutions](https://deepmind.com/blog/wavenet-generative-model-raw-audio/). -This tutorial is based on the kind of convolutional network that will feel very -familiar to anyone who's worked with image recognition. That may seem surprising -at first though, since audio is inherently a one-dimensional continuous signal -across time, not a 2D spatial problem. - -We solve that issue by defining a window of time we believe our spoken words -should fit into, and converting the audio signal in that window into an image. -This is done by grouping the incoming audio samples into short segments, just a -few milliseconds long, and calculating the strength of the frequencies across a -set of bands. Each set of frequency strengths from a segment is treated as a -vector of numbers, and those vectors are arranged in time order to form a -two-dimensional array. This array of values can then be treated like a -single-channel image, and is known as a -[spectrogram](https://en.wikipedia.org/wiki/Spectrogram). If you want to view -what kind of image an audio sample produces, you can run the `wav_to_spectrogram` -tool: - -``` -bazel run tensorflow/examples/wav_to_spectrogram:wav_to_spectrogram -- \ ---input_wav=/tmp/speech_dataset/happy/ab00c4b2_nohash_0.wav \ ---output_image=/tmp/spectrogram.png -``` - -If you open up `/tmp/spectrogram.png` you should see something like this: - -
- -
- -Because of TensorFlow's memory order, time in this image is increasing from top -to bottom, with frequencies going from left to right, unlike the usual -convention for spectrograms where time is left to right. You should be able to -see a couple of distinct parts, with the first syllable "Ha" distinct from -"ppy". - -Because the human ear is more sensitive to some frequencies than others, it's -been traditional in speech recognition to do further processing to this -representation to turn it into a set of [Mel-Frequency Cepstral -Coefficients](https://en.wikipedia.org/wiki/Mel-frequency_cepstrum), or MFCCs -for short. This is also a two-dimensional, one-channel representation so it can -be treated like an image too. If you're targeting general sounds rather than -speech you may find you can skip this step and operate directly on the -spectrograms. - -The image that's produced by these processing steps is then fed into a -multi-layer convolutional neural network, with a fully-connected layer followed -by a softmax at the end. You can see the definition of this portion in -[tensorflow/examples/speech_commands/models.py](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/speech_commands/models.py). - -## Streaming Accuracy - -Most audio recognition applications need to run on a continuous stream of audio, -rather than on individual clips. A typical way to use a model in this -environment is to apply it repeatedly at different offsets in time and average -the results over a short window to produce a smoothed prediction. If you think -of the input as an image, it's continuously scrolling along the time axis. The -words we want to recognize can start at any time, so we need to take a series of -snapshots to have a chance of having an alignment that captures most of the -utterance in the time window we feed into the model. 
If we sample at a high -enough rate, then we have a good chance of capturing the word in multiple -windows, so averaging the results improves the overall confidence of the -prediction. - -For an example of how you can use your model on streaming data, you can look at -[test_streaming_accuracy.cc](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/speech_commands/). -This uses the -[RecognizeCommands](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/speech_commands/recognize_commands.h) -class to run through a long-form input audio, try to spot words, and compare -those predictions against a ground truth list of labels and times. This makes it -a good example of applying a model to a stream of audio signals over time. - -You'll need a long audio file to test it against, along with labels showing -where each word was spoken. If you don't want to record one yourself, you can -generate some synthetic test data using the `generate_streaming_test_wav` -utility. By default this will create a ten minute .wav file with words roughly -every three seconds, and a text file containing the ground truth of when each -word was spoken. These words are pulled from the test portion of your current -dataset, mixed in with background noise. To run it, use: - -``` -bazel run tensorflow/examples/speech_commands:generate_streaming_test_wav -``` - -This will save a .wav file to `/tmp/speech_commands_train/streaming_test.wav`, -and a text file listing the labels to -`/tmp/speech_commands_train/streaming_test_labels.txt`. 
You can then run -accuracy testing with: - -``` -bazel run tensorflow/examples/speech_commands:test_streaming_accuracy -- \ ---graph=/tmp/my_frozen_graph.pb \ ---labels=/tmp/speech_commands_train/conv_labels.txt \ ---wav=/tmp/speech_commands_train/streaming_test.wav \ ---ground_truth=/tmp/speech_commands_train/streaming_test_labels.txt \ ---verbose -``` - -This will output information about the number of words correctly matched, how -many were given the wrong labels, and how many times the model triggered when -there was no real word spoken. There are various parameters that control how the -signal averaging works, including `--average_window_ms` which sets the length of -time to average results over, `--clip_stride_ms` which is the time between -applications of the model, `--suppression_ms` which stops subsequent word -detections from triggering for a certain time after an initial one is found, and -`--detection_threshold`, which controls how high the average score must be -before it's considered a solid result. - -You'll see that the streaming accuracy outputs three numbers, rather than just -the one metric used in training. This is because different applications have -varying requirements, with some being able to tolerate frequent incorrect -results as long as real words are found (high recall), while others are very focused -on ensuring the predicted labels are highly likely to be correct even if some -aren't detected (high precision). The numbers from the tool give you an idea of -how your model will perform in an application, and you can try tweaking the -signal averaging parameters to tune it to give the kind of performance you want. -To understand what the right parameters are for your application, you can look -at generating an [ROC -curve](https://en.wikipedia.org/wiki/Receiver_operating_characteristic) to help -you understand the tradeoffs.
- -## RecognizeCommands - -The streaming accuracy tool uses a simple decoder contained in a small C++ class -called -[RecognizeCommands](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/speech_commands/recognize_commands.h). -This class is fed the output of running the TensorFlow model over time, it -averages the signals, and returns information about a label when it has enough -evidence to think that a recognized word has been found. The implementation is -fairly small, just keeping track of the last few predictions and averaging them, -so it's easy to port to other platforms and languages as needed. For example, -it's convenient to do something similar at the Java level on Android, or Python -on the Raspberry Pi. As long as these implementations share the same logic, you -can tune the parameters that control the averaging using the streaming test -tool, and then transfer them over to your application to get similar results. - -## Advanced Training - -The defaults for the training script are designed to produce good end to end -results in a comparatively small file, but there are a lot of options you can -change to customize the results for your own requirements. - -### Custom Training Data - -By default the script will download the [Speech Commands -dataset](https://download.tensorflow.org/data/speech_commands_v0.01.tgz), but -you can also supply your own training data. To train on your own data, you -should make sure that you have at least several hundred recordings of each sound -you would like to recognize, and arrange them into folders by class. For -example, if you were trying to recognize dog barks from cat miaows, you would -create a root folder called `animal_sounds`, and then within that two -sub-folders called `bark` and `miaow`. You would then organize your audio files -into the appropriate folders. 
- -To point the script to your new audio files, you'll need to set `--data_url=` to -disable downloading of the Speech Commands dataset, and -`--data_dir=/your/data/folder/` to find the files you've just created. - -The files themselves should be 16-bit little-endian PCM-encoded WAVE format. The -sample rate defaults to 16,000, but as long as all your audio is consistently -the same rate (the script doesn't support resampling) you can change this with -the `--sample_rate` argument. The clips should also all be roughly the same -duration. The default expected duration is one second, but you can set this with -the `--clip_duration_ms` flag. If you have clips with variable amounts of -silence at the start, you can look at word alignment tools to standardize them -([here's a quick and dirty approach you can use -too](https://petewarden.com/2017/07/17/a-quick-hack-to-align-single-word-audio-recordings/)). - -One issue to watch out for is that you may have very similar repetitions of the -same sounds in your dataset, and these can give misleading metrics if they're -spread across your training, validation, and test sets. For example, the Speech -Commands set has people repeating the same word multiple times. Each one of -those repetitions is likely to be pretty close to the others, so if training was -overfitting and memorizing one, it could perform unrealistically well when it -saw a very similar copy in the test set. To avoid this danger, Speech Commands -tries to ensure that all clips featuring the same word spoken by a single person -are put into the same partition. Clips are assigned to training, test, or -validation sets based on a hash of their filename, to ensure that the -assignments remain steady even as new clips are added and avoid any training -samples migrating into the other sets.
To make sure that all a given speaker's -words are in the same bucket, [the hashing -function](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/speech_commands/input_data.py) -ignores anything in a filename after '_nohash_' when calculating the -assignments. This means that if you have file names like `pete_nohash_0.wav` and -`pete_nohash_1.wav`, they're guaranteed to be in the same set. - -### Unknown Class - -It's likely that your application will hear sounds that aren't in your training -set, and you'll want the model to indicate that it doesn't recognize the noise -in those cases. To help the network learn what sounds to ignore, you need to -provide some clips of audio that are neither of your classes. To do this, you'd -create `quack`, `oink`, and `moo` subfolders and populate them with noises from -other animals your users might encounter. The `--wanted_words` argument to the -script defines which classes you care about, all the others mentioned in -subfolder names will be used to populate an `_unknown_` class during training. -The Speech Commands dataset has twenty words in its unknown classes, including -the digits zero through nine and random names like "Sheila". - -By default 10% of the training examples are picked from the unknown classes, but -you can control this with the `--unknown_percentage` flag. Increasing this will -make the model less likely to mistake unknown words for wanted ones, but making -it too large can backfire as the model might decide it's safest to categorize -all words as unknown! - -### Background Noise - -Real applications have to recognize audio even when there are other irrelevant -sounds happening in the environment. To build a model that's robust to this kind -of interference, we need to train against recorded audio with similar -properties. 
The files in the Speech Commands dataset were captured on a variety -of devices by users in many different environments, not in a studio, so that -helps add some realism to the training. To add even more, you can mix in random -segments of environmental audio to the training inputs. In the Speech Commands -set there's a special folder called `_background_noise_` which contains -minute-long WAVE files with white noise and recordings of machinery and everyday -household activity. - -Small snippets of these files are chosen at random and mixed at a low volume -into clips during training. The loudness is also chosen randomly, and controlled -by the `--background_volume` argument as a proportion where 0 is silence, and 1 -is full volume. Not all clips have background added, so the -`--background_frequency` flag controls what proportion have them mixed in. - -Your own application might operate in its own environment with different -background noise patterns than these defaults, so you can supply your own audio -clips in the `_background_noise_` folder. These should be the same sample rate -as your main dataset, but much longer in duration so that a good set of random -segments can be selected from them. - -### Silence - -In most cases the sounds you care about will be intermittent and so it's -important to know when there's no matching audio. To support this, there's a -special `_silence_` label that indicates when the model detects nothing -interesting. Because there's never complete silence in real environments, we -actually have to supply examples with quiet and irrelevant audio. For this, we -reuse the `_background_noise_` folder that's also mixed in to real clips, -pulling short sections of the audio data and feeding those in with the ground -truth class of `_silence_`. By default 10% of the training data is supplied like -this, but the `--silence_percentage` can be used to control the proportion. 
As -with unknown words, setting this higher can weight the model results in favor of -true positives for silence, at the expense of false negatives for words, but too -large a proportion can cause it to fall into the trap of always guessing -silence. - -### Time Shifting - -Adding in background noise is one way of distorting the training data in a -realistic way to effectively increase the size of the dataset, and so increase -overall accuracy, and time shifting is another. This involves a random offset in -time of the training sample data, so that a small part of the start or end is -cut off and the opposite section is padded with zeroes. This mimics the natural -variations in starting time in the training data, and is controlled with the -`--time_shift_ms` flag, which defaults to 100ms. Increasing this value will -provide more variation, but at the risk of cutting off important parts of the -audio. A related way of augmenting the data with realistic distortions is by -using [time stretching and pitch -scaling](https://en.wikipedia.org/wiki/Audio_time_stretching_and_pitch_scaling), -but that's outside the scope of this tutorial. - -## Customizing the Model - -The default model used for this script is pretty large, taking over 800 million -FLOPs for each inference and using 940,000 weight parameters. This runs at -usable speeds on desktop machines or modern phones, but it involves too many -calculations to run at interactive speeds on devices with more limited -resources. To support these use cases, there's a couple of alternatives -available: - - -**low_latency_conv** -Based on the 'cnn-one-fstride4' topology described in the [Convolutional -Neural Networks for Small-footprint Keyword Spotting -paper](http://www.isca-speech.org/archive/interspeech_2015/papers/i15_1478.pdf). -The accuracy is slightly lower than 'conv' but the number of weight parameters -is about the same, and it only needs 11 million FLOPs to run one prediction, -making it much faster. 
- -To use this model, you specify `--model_architecture=low_latency_conv` on -the command line. You'll also need to update the training rates and the number -of steps, so the full command will look like: - -``` -python tensorflow/examples/speech_commands/train \ ---model_architecture=low_latency_conv \ ---how_many_training_steps=20000,6000 \ ---learning_rate=0.01,0.001 -``` - -This asks the script to train with a learning rate of 0.01 for 20,000 steps, and -then do a fine-tuning pass of 6,000 steps with a 10x smaller rate. - -**low_latency_svdf** -Based on the topology presented in the [Compressing Deep Neural Networks using a -Rank-Constrained Topology paper](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/43813.pdf). -The accuracy is also lower than 'conv' but it only uses about 750 thousand -parameters, and most significantly, it allows for an optimized execution at -test time (i.e. when you will actually use it in your application), resulting -in 750 thousand FLOPs. - -To use this model, you specify `--model_architecture=low_latency_svdf` on -the command line, and update the training rates and the number -of steps, so the full command will look like: - -``` -python tensorflow/examples/speech_commands/train \ ---model_architecture=low_latency_svdf \ ---how_many_training_steps=100000,35000 \ ---learning_rate=0.01,0.005 -``` - -Note that despite requiring a larger number of steps than the previous two -topologies, the reduced number of computations means that training should take -about the same time, and at the end reach an accuracy of around 85%. -You can also further tune the topology fairly easily for computation and -accuracy by changing these parameters in the SVDF layer: - -* rank - The rank of the approximation (higher typically better, but results in - more computation). -* num_units - Similar to other layer types, specifies the number of nodes in - the layer (more nodes better quality, and more computation). 
- -Regarding runtime, since the layer allows optimizations by caching some of the -internal neural network activations, you need to make sure to use a consistent -stride (e.g. 'clip_stride_ms' flag) both when you freeze the graph, and when -executing the model in streaming mode (e.g. test_streaming_accuracy.cc). - -**Other parameters to customize** -If you want to experiment with customizing models, a good place to start is by -tweaking the spectrogram creation parameters. This has the effect of altering -the size of the input image to the model, and the creation code in -[models.py](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/speech_commands/models.py) -will adjust the number of computations and weights automatically to fit with -different dimensions. If you make the input smaller, the model will need fewer -computations to process it, so it can be a great way to trade off some accuracy -for improved latency. The `--window_stride_ms` controls how far apart each -frequency analysis sample is from the previous. If you increase this value, then -fewer samples will be taken for a given duration, and the time axis of the input -will shrink. The `--dct_coefficient_count` flag controls how many buckets are -used for the frequency counting, so reducing this will shrink the input in the -other dimension. The `--window_size_ms` argument doesn't affect the size, but -does control how wide the area used to calculate the frequencies is for each -sample. Reducing the duration of the training samples, controlled by -`--clip_duration_ms`, can also help if the sounds you're looking for are short, -since that also reduces the time dimension of the input. You'll need to make -sure that all your training data contains the right audio in the initial portion -of the clip though. 
- -If you have an entirely different model in mind for your problem, you may find -that you can plug it into -[models.py](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/speech_commands/models.py) -and have the rest of the script handle all of the preprocessing and training -mechanics. You would add a new clause to `create_model`, looking for the name of -your architecture and then calling a model creation function. This function is -given the size of the spectrogram input, along with other model information, and -is expected to create TensorFlow ops to read that in and produce an output -prediction vector, and a placeholder to control the dropout rate. The rest of -the script will handle integrating this model into a larger graph doing the -input calculations and applying softmax and a loss function to train it. - -One common problem when you're adjusting models and training hyper-parameters is -that not-a-number values can creep in, thanks to numerical precision issues. In -general you can solve these by reducing the magnitude of things like learning -rates and weight initialization functions, but if they're persistent you can -enable the `--check_nans` flag to track down the source of the errors. This will -insert check ops between most regular operations in TensorFlow, and abort the -training process with a useful error message when they're encountered. diff --git a/tensorflow/docs_src/tutorials/sequences/recurrent.md b/tensorflow/docs_src/tutorials/sequences/recurrent.md deleted file mode 100644 index 39ad441381..0000000000 --- a/tensorflow/docs_src/tutorials/sequences/recurrent.md +++ /dev/null @@ -1,230 +0,0 @@ -# Recurrent Neural Networks - -## Introduction - -See [Understanding LSTM Networks](https://colah.github.io/posts/2015-08-Understanding-LSTMs/){:.external} -for an introduction to recurrent neural networks and LSTMs. 
- -## Language Modeling - -In this tutorial we will show how to train a recurrent neural network on -a challenging task of language modeling. The goal of the problem is to fit a -probabilistic model which assigns probabilities to sentences. It does so by -predicting next words in a text given a history of previous words. For this -purpose we will use the [Penn Tree Bank](https://catalog.ldc.upenn.edu/ldc99t42) -(PTB) dataset, which is a popular benchmark for measuring the quality of these -models, whilst being small and relatively fast to train. - -Language modeling is key to many interesting problems such as speech -recognition, machine translation, or image captioning. It is also fun -- -take a look [here](https://karpathy.github.io/2015/05/21/rnn-effectiveness/). - -For the purpose of this tutorial, we will reproduce the results from -[Zaremba et al., 2014](https://arxiv.org/abs/1409.2329) -([pdf](https://arxiv.org/pdf/1409.2329.pdf)), which achieves very good quality -on the PTB dataset. - -## Tutorial Files - -This tutorial references the following files from `models/tutorials/rnn/ptb` in the [TensorFlow models repo](https://github.com/tensorflow/models): - -File | Purpose ---- | --- -`ptb_word_lm.py` | The code to train a language model on the PTB dataset. -`reader.py` | The code to read the dataset. - -## Download and Prepare the Data - -The data required for this tutorial is in the `data/` directory of the -[PTB dataset from Tomas Mikolov's webpage](http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz). - -The dataset is already preprocessed and contains overall 10000 different words, -including the end-of-sentence marker and a special symbol (&lt;unk&gt;) for rare -words. In `reader.py`, we convert each word to a unique integer identifier, -in order to make it easy for the neural network to process the data.
- -## The Model - -### LSTM - -The core of the model consists of an LSTM cell that processes one word at a -time and computes probabilities of the possible values for the next word in the -sentence. The memory state of the network is initialized with a vector of zeros -and gets updated after reading each word. For computational reasons, we will -process data in mini-batches of size `batch_size`. In this example, it is -important to note that `current_batch_of_words` does not correspond to a -"sentence" of words. Every word in a batch should correspond to a time t. -TensorFlow will automatically sum the gradients of each batch for you. - -For example: - -``` - t=0 t=1 t=2 t=3 t=4 -[The, brown, fox, is, quick] -[The, red, fox, jumped, high] - -words_in_dataset[0] = [The, The] -words_in_dataset[1] = [brown, red] -words_in_dataset[2] = [fox, fox] -words_in_dataset[3] = [is, jumped] -words_in_dataset[4] = [quick, high] -batch_size = 2, time_steps = 5 -``` - -The basic pseudocode is as follows: - -```python -words_in_dataset = tf.placeholder(tf.float32, [time_steps, batch_size, num_features]) -lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size) -# Initial state of the LSTM memory. -state = lstm.zero_state(batch_size, dtype=tf.float32) -probabilities = [] -loss = 0.0 -for current_batch_of_words in words_in_dataset: - # The value of state is updated after processing each batch of words. - output, state = lstm(current_batch_of_words, state) - - # The LSTM output can be used to make next word predictions - logits = tf.matmul(output, softmax_w) + softmax_b - probabilities.append(tf.nn.softmax(logits)) - loss += loss_function(probabilities, target_words) -``` - -### Truncated Backpropagation - -By design, the output of a recurrent neural network (RNN) depends on arbitrarily -distant inputs. Unfortunately, this makes backpropagation computation difficult. 
-In order to make the learning process tractable, it is common practice to create -an "unrolled" version of the network, which contains a fixed number -(`num_steps`) of LSTM inputs and outputs. The model is then trained on this -finite approximation of the RNN. This can be implemented by feeding inputs of -length `num_steps` at a time and performing a backward pass after each -such input block. - -Here is a simplified block of code for creating a graph which performs -truncated backpropagation: - -```python -# Placeholder for the inputs in a given iteration. -words = tf.placeholder(tf.int32, [batch_size, num_steps]) - -lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size) -# Initial state of the LSTM memory. -initial_state = state = lstm.zero_state(batch_size, dtype=tf.float32) - -for i in range(num_steps): - # The value of state is updated after processing each batch of words. - output, state = lstm(words[:, i], state) - - # The rest of the code. - # ... - -final_state = state -``` - -And this is how to implement an iteration over the whole dataset: - -```python -# A numpy array holding the state of LSTM after each batch of words. -numpy_state = initial_state.eval() -total_loss = 0.0 -for current_batch_of_words in words_in_dataset: - numpy_state, current_loss = session.run([final_state, loss], - # Initialize the LSTM state from the previous iteration. - feed_dict={initial_state: numpy_state, words: current_batch_of_words}) - total_loss += current_loss -``` - -### Inputs - -The word IDs will be embedded into a dense representation (see the -[Vector Representations Tutorial](../../tutorials/representation/word2vec.md)) before feeding to -the LSTM. This allows the model to efficiently represent the knowledge about -particular words. 
It is also easy to write: - -```python -# embedding_matrix is a tensor of shape [vocabulary_size, embedding size] -word_embeddings = tf.nn.embedding_lookup(embedding_matrix, word_ids) -``` - -The embedding matrix will be initialized randomly and the model will learn to -differentiate the meaning of words just by looking at the data. - -### Loss Function - -We want to minimize the average negative log probability of the target words: - -$$ \text{loss} = -\frac{1}{N}\sum_{i=1}^{N} \ln p_{\text{target}_i} $$ - -It is not very difficult to implement but the function -`sequence_loss_by_example` is already available, so we can just use it here. - -The typical measure reported in the papers is average per-word perplexity (often -just called perplexity), which is equal to - -$$e^{-\frac{1}{N}\sum_{i=1}^{N} \ln p_{\text{target}_i}} = e^{\text{loss}} $$ - -and we will monitor its value throughout the training process. - -### Stacking multiple LSTMs - -To give the model more expressive power, we can add multiple layers of LSTMs -to process the data. The output of the first layer will become the input of -the second and so on. - -We have a class called `MultiRNNCell` that makes the implementation seamless: - -```python -def lstm_cell(): - return tf.contrib.rnn.BasicLSTMCell(lstm_size) -stacked_lstm = tf.contrib.rnn.MultiRNNCell( - [lstm_cell() for _ in range(number_of_layers)]) - -initial_state = state = stacked_lstm.zero_state(batch_size, tf.float32) -for i in range(num_steps): - # The value of state is updated after processing each batch of words. - output, state = stacked_lstm(words[:, i], state) - - # The rest of the code. - # ... - -final_state = state -``` - -## Run the Code - -Before running the code, download the PTB dataset, as discussed at the beginning -of this tutorial. 
Then, extract the PTB dataset underneath your home directory -as follows: - -```bsh -tar xvfz simple-examples.tgz -C $HOME -``` -_(Note: On Windows, you may need to use -[other tools](https://wiki.haskell.org/How_to_unpack_a_tar_file_in_Windows).)_ - -Now, clone the [TensorFlow models repo](https://github.com/tensorflow/models) -from GitHub. Run the following commands: - -```bsh -cd models/tutorials/rnn/ptb -python ptb_word_lm.py --data_path=$HOME/simple-examples/data/ --model=small -``` - -There are 3 supported model configurations in the tutorial code: "small", -"medium" and "large". The difference between them is in size of the LSTMs and -the set of hyperparameters used for training. - -The larger the model, the better results it should get. The `small` model should -be able to reach perplexity below 120 on the test set and the `large` one below -80, though it might take several hours to train. - -## What Next? - -There are several tricks that we haven't mentioned that make the model better, -including: - -* decreasing learning rate schedule, -* dropout between the LSTM layers. - -Study the code and modify it to improve the model even further. diff --git a/tensorflow/docs_src/tutorials/sequences/recurrent_quickdraw.md b/tensorflow/docs_src/tutorials/sequences/recurrent_quickdraw.md deleted file mode 100644 index 657fab8a53..0000000000 --- a/tensorflow/docs_src/tutorials/sequences/recurrent_quickdraw.md +++ /dev/null @@ -1,410 +0,0 @@ -# Recurrent Neural Networks for Drawing Classification - -[Quick, Draw!]: http://quickdraw.withgoogle.com - -[Quick, Draw!] is a game where a player is challenged to draw a number of -objects and see if a computer can recognize the drawing. - -The recognition in [Quick, Draw!] is performed by a classifier that takes the -user input, given as a sequence of strokes of points in x and y, and recognizes -the object category that the user tried to draw. 
- -In this tutorial we'll show how to build an RNN-based recognizer for this -problem. The model will use a combination of convolutional layers, LSTM layers, -and a softmax output layer to classify the drawings: - -
![RNN model structure](../../images/quickdraw_model.png)
- -The figure above shows the structure of the model that we will build in this -tutorial. The input is a drawing that is encoded as a sequence of strokes of -points in x, y, and n, where n indicates whether the point is the first point -in a new stroke. - -Then, a series of 1-dimensional convolutions is applied. Then LSTM layers are -applied and the sum of the outputs of all LSTM steps is fed into a softmax layer -to make a classification decision among the classes of drawings that we know. - -This tutorial uses the data from actual [Quick, Draw!] games [that is publicly -available](https://quickdraw.withgoogle.com/data). This dataset contains 50M -drawings in 345 categories. - -## Run the tutorial code - -To try the code for this tutorial: - -1. [Install TensorFlow](../../install/index.md) if you haven't already. -1. Download the [tutorial code] -(https://github.com/tensorflow/models/tree/master/tutorials/rnn/quickdraw/train_model.py). -1. [Download the data](#download-the-data) in `TFRecord` format from - [here](http://download.tensorflow.org/data/quickdraw_tutorial_dataset_v1.tar.gz) and unzip it. More details about [how to - obtain the original Quick, Draw! - data](#optional_download_the_full_quick_draw_data) and [how to convert that - to `TFRecord` files](#optional_converting_the_data) are available below. - -1. Execute the tutorial code with the following command to train the RNN-based - model described in this tutorial. Make sure to adjust the paths to point to - the unzipped data from the download in step 3. - -```shell - python train_model.py \ - --training_data=rnn_tutorial_data/training.tfrecord-?????-of-????? \ - --eval_data=rnn_tutorial_data/eval.tfrecord-?????-of-????? \ - --classes_file=rnn_tutorial_data/training.tfrecord.classes -``` - -## Tutorial details - -### Download the data - -We make the data that we use in this tutorial available as `TFRecord` files -containing `TFExamples`.
You can download the data from here: -http://download.tensorflow.org/data/quickdraw_tutorial_dataset_v1.tar.gz (~1GB). - -Alternatively you can download the original data in `ndjson` format from the -Google cloud and convert it to the `TFRecord` files containing `TFExamples` -yourself as described in the next section. - -### Optional: Download the full Quick Draw Data - -The full [Quick, Draw!](https://quickdraw.withgoogle.com) -[dataset](https://quickdraw.withgoogle.com/data) is available on Google Cloud -Storage as [ndjson](http://ndjson.org/) files separated by category. You can -[browse the list of files in Cloud -Console](https://console.cloud.google.com/storage/quickdraw_dataset). - -To download the data we recommend using -[gsutil](https://cloud.google.com/storage/docs/gsutil_install#install) to -download the entire dataset. Note that the original .ndjson files require -downloading ~22GB. - -Then use the following command to check that your gsutil installation works and -that you can access the data bucket: - -```shell -gsutil ls -r "gs://quickdraw_dataset/full/simplified/*" -``` - -which will output a long list of files like the following: - -```shell -gs://quickdraw_dataset/full/simplified/The Eiffel Tower.ndjson -gs://quickdraw_dataset/full/simplified/The Great Wall of China.ndjson -gs://quickdraw_dataset/full/simplified/The Mona Lisa.ndjson -gs://quickdraw_dataset/full/simplified/aircraft carrier.ndjson -... -``` - -Then create a folder and download the dataset there. - -```shell -mkdir rnn_tutorial_data -cd rnn_tutorial_data -gsutil -m cp "gs://quickdraw_dataset/full/simplified/*" . -``` - -This download will take a while and download a bit more than 23GB of data. - -### Optional: Converting the data - -To convert the `ndjson` files to -[TFRecord](../../api_guides/python/python_io.md#TFRecords_Format_Details) files containing -[`tf.train.Example`](https://www.tensorflow.org/code/tensorflow/core/example/example.proto) -protos run the following command. 
- -```shell - python create_dataset.py --ndjson_path rnn_tutorial_data \ - --output_path rnn_tutorial_data -``` - -This will store the data in 10 shards of -[TFRecord](../../api_guides/python/python_io.md#TFRecords_Format_Details) files with 10000 items -per class for the training data and 1000 items per class as eval data. - -This conversion process is described in more detail in the following. - -The original QuickDraw data is formatted as `ndjson` files where each line -contains a JSON object like the following: - -```json -{"word":"cat", - "countrycode":"VE", - "timestamp":"2017-03-02 23:25:10.07453 UTC", - "recognized":true, - "key_id":"5201136883597312", - "drawing":[ - [ - [130,113,99,109,76,64,55,48,48,51,59,86,133,154,170,203,214,217,215,208,186,176,162,157,132], - [72,40,27,79,82,88,100,120,134,152,165,184,189,186,179,152,131,114,100,89,76,0,31,65,70] - ],[ - [76,28,7], - [136,128,128] - ],[ - [76,23,0], - [160,164,175] - ],[ - [87,52,37], - [175,191,204] - ],[ - [174,220,246,251], - [134,132,136,139] - ],[ - [175,255], - [147,168] - ],[ - [171,208,215], - [164,198,210] - ],[ - [130,110,108,111,130,139,139,119], - [129,134,137,144,148,144,136,130] - ],[ - [107,106], - [96,113] - ] - ] -} -``` - -For our purpose of building a classifier we only care about the fields "`word`" -and "`drawing`". While parsing the ndjson files, we process them line by line -using a function that converts the strokes from the `drawing` field into a -tensor of size `[number of points, 3]` containing the differences of consecutive -points. This function also returns the class name as a string. 
- -```python -def parse_line(ndjson_line): - """Parse an ndjson line and return ink (as np array) and classname.""" - sample = json.loads(ndjson_line) - class_name = sample["word"] - inkarray = sample["drawing"] - stroke_lengths = [len(stroke[0]) for stroke in inkarray] - total_points = sum(stroke_lengths) - np_ink = np.zeros((total_points, 3), dtype=np.float32) - current_t = 0 - for stroke in inkarray: - for i in [0, 1]: - np_ink[current_t:(current_t + len(stroke[0])), i] = stroke[i] - current_t += len(stroke[0]) - np_ink[current_t - 1, 2] = 1 # stroke_end - # Preprocessing. - # 1. Size normalization. - lower = np.min(np_ink[:, 0:2], axis=0) - upper = np.max(np_ink[:, 0:2], axis=0) - scale = upper - lower - scale[scale == 0] = 1 - np_ink[:, 0:2] = (np_ink[:, 0:2] - lower) / scale - # 2. Compute deltas. - np_ink = np_ink[1:, 0:2] - np_ink[0:-1, 0:2] - return np_ink, class_name -``` - -Since we want the data to be shuffled for writing we read from each of the -category files in random order and write to a random shard. - -For the training data we read the first 10000 items for each class and for the -eval data we read the next 1000 items for each class. - -This data is then reformatted into a tensor of shape `[num_training_samples, -max_length, 3]`. Then we determine the bounding box of the original drawing in -screen coordinates and normalize the size such that the drawing has unit height. - -
![Size normalization](../../images/quickdraw_sizenormalization.png)
- -Finally, we compute the differences between consecutive points and store these -as a `VarLenFeature` in a -[tensorflow.Example](https://www.tensorflow.org/code/tensorflow/core/example/example.proto) -under the key `ink`. In addition we store the `class_index` as a single entry -`FixedLengthFeature` and the `shape` of the `ink` as a `FixedLengthFeature` of -length 2. - -### Defining the model - -To define the model we create a new `Estimator`. If you want to read more about -estimators, we recommend [this tutorial](../../guide/custom_estimators.md). - -To build the model, we: - -1. reshape the input back into the original shape - where the mini batch is - padded to the maximal length of its contents. In addition to the ink data we - also have the lengths for each example and the target class. This happens in - the function [`_get_input_tensors`](#-get-input-tensors). - -1. pass the input through to a series of convolution layers in - [`_add_conv_layers`](#-add-conv-layers). - -1. pass the output of the convolutions into a series of bidirectional LSTM - layers in [`_add_rnn_layers`](#-add-rnn-layers). At the end of that, the - outputs for each time step are summed up to have a compact, fixed length - embedding of the input. - -1. classify this embedding using a softmax layer in - [`_add_fc_layers`](#-add-fc-layers). - -In code this looks like: - -```python -inks, lengths, targets = _get_input_tensors(features, targets) -convolved = _add_conv_layers(inks) -final_state = _add_rnn_layers(convolved, lengths) -logits =_add_fc_layers(final_state) -``` - -### _get_input_tensors - -To obtain the input features we first obtain the shape from the features dict -and then create a 1D tensor of size `[batch_size]` containing the lengths of the -input sequences. The ink is stored as a SparseTensor in the features dict which -we convert into a dense tensor and then reshape to be `[batch_size, ?, 3]`. 
And -finally, if targets were passed in we make sure they are stored as a 1D tensor -of size `[batch_size]` - -In code this looks like this: - -```python -shapes = features["shape"] -lengths = tf.squeeze( - tf.slice(shapes, begin=[0, 0], size=[params["batch_size"], 1])) -inks = tf.reshape( - tf.sparse_tensor_to_dense(features["ink"]), - [params["batch_size"], -1, 3]) -if targets is not None: - targets = tf.squeeze(targets) -``` - -### _add_conv_layers - -The desired number of convolution layers and the lengths of the filters is -configured through the parameters `num_conv` and `conv_len` in the `params` -dict. - -The input is a sequence where each point has dimensionality 3. We are going to -use 1D convolutions where we treat the 3 input features as channels. That means -that the input is a `[batch_size, length, 3]` tensor and the output will be a -`[batch_size, length, number_of_filters]` tensor. - -```python -convolved = inks -for i in range(len(params.num_conv)): - convolved_input = convolved - if params.batch_norm: - convolved_input = tf.layers.batch_normalization( - convolved_input, - training=(mode == tf.estimator.ModeKeys.TRAIN)) - # Add dropout layer if enabled and not first convolution layer. - if i > 0 and params.dropout: - convolved_input = tf.layers.dropout( - convolved_input, - rate=params.dropout, - training=(mode == tf.estimator.ModeKeys.TRAIN)) - convolved = tf.layers.conv1d( - convolved_input, - filters=params.num_conv[i], - kernel_size=params.conv_len[i], - activation=None, - strides=1, - padding="same", - name="conv1d_%d" % i) -return convolved, lengths -``` - -### _add_rnn_layers - -We pass the output from the convolutions into bidirectional LSTM layers for -which we use a helper function from contrib. 
- -```python -outputs, _, _ = contrib_rnn.stack_bidirectional_dynamic_rnn( - cells_fw=[cell(params.num_nodes) for _ in range(params.num_layers)], - cells_bw=[cell(params.num_nodes) for _ in range(params.num_layers)], - inputs=convolved, - sequence_length=lengths, - dtype=tf.float32, - scope="rnn_classification") -``` - -see the code for more details and how to use `CUDA` accelerated implementations. - -To create a compact, fixed-length embedding, we sum up the output of the LSTMs. -We first zero out the regions of the batch where the sequences have no data. - -```python -mask = tf.tile( - tf.expand_dims(tf.sequence_mask(lengths, tf.shape(outputs)[1]), 2), - [1, 1, tf.shape(outputs)[2]]) -zero_outside = tf.where(mask, outputs, tf.zeros_like(outputs)) -outputs = tf.reduce_sum(zero_outside, axis=1) -``` - -### _add_fc_layers - -The embedding of the input is passed into a fully connected layer which we then -use as a softmax layer. - -```python -tf.layers.dense(final_state, params.num_classes) -``` - -### Loss, predictions, and optimizer - -Finally, we need to add a loss, a training op, and predictions to create the -`ModelFn`: - -```python -cross_entropy = tf.reduce_mean( - tf.nn.sparse_softmax_cross_entropy_with_logits( - labels=targets, logits=logits)) -# Add the optimizer. -train_op = tf.contrib.layers.optimize_loss( - loss=cross_entropy, - global_step=tf.train.get_global_step(), - learning_rate=params.learning_rate, - optimizer="Adam", - # some gradient clipping stabilizes training in the beginning. 
- clip_gradients=params.gradient_clipping_norm, - summaries=["learning_rate", "loss", "gradients", "gradient_norm"]) -predictions = tf.argmax(logits, axis=1) -return model_fn_lib.ModelFnOps( - mode=mode, - predictions={"logits": logits, - "predictions": predictions}, - loss=cross_entropy, - train_op=train_op, - eval_metric_ops={"accuracy": tf.metrics.accuracy(targets, predictions)}) -``` - -### Training and evaluating the model - -To train and evaluate the model we can rely on the functionalities of the -`Estimator` APIs and easily run training and evaluation with the `Experiment` -APIs: - -```python - estimator = tf.estimator.Estimator( - model_fn=model_fn, - model_dir=output_dir, - config=config, - params=model_params) - # Train the model. - tf.contrib.learn.Experiment( - estimator=estimator, - train_input_fn=get_input_fn( - mode=tf.contrib.learn.ModeKeys.TRAIN, - tfrecord_pattern=FLAGS.training_data, - batch_size=FLAGS.batch_size), - train_steps=FLAGS.steps, - eval_input_fn=get_input_fn( - mode=tf.contrib.learn.ModeKeys.EVAL, - tfrecord_pattern=FLAGS.eval_data, - batch_size=FLAGS.batch_size), - min_eval_frequency=1000) -``` - -Note that this tutorial is just a quick example on a relatively small dataset to -get you familiar with the APIs of recurrent neural networks and estimators. Such -models can be even more powerful if you try them on a large dataset. - -When training the model for 1M steps you can expect to get an accuracy of -approximately 70% on the top-1 candidate. Note that this -accuracy is sufficient to build the quickdraw game because of the game dynamics -the user will be able to adjust their drawing until it is ready. Also, the game -does not use the top-1 candidate only but accepts a drawing as correct if the -target category shows up with a score better than a fixed threshold.
-- GitLab From dbdf0ec5b3c5f5aca4b17d0f7db339c33fbf12f8 Mon Sep 17 00:00:00 2001 From: Sanjoy Das Date: Mon, 27 Aug 2018 12:03:48 -0700 Subject: [PATCH 163/598] [TF:XLA] Bump open source llvm revision to r340723 PiperOrigin-RevId: 210406811 --- tensorflow/workspace.bzl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 34b4a66c41..132f42000a 100644 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -493,11 +493,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): tf_http_archive( name = "llvm", urls = [ - "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/97d7bcd5c024ee6aec4eecbc723bb6d4f4c3dc3d.tar.gz", - "https://github.com/llvm-mirror/llvm/archive/97d7bcd5c024ee6aec4eecbc723bb6d4f4c3dc3d.tar.gz", + "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/deac5c28e00179be248aaf03abd329a848e8fac8.tar.gz", + "https://github.com/llvm-mirror/llvm/archive/deac5c28e00179be248aaf03abd329a848e8fac8.tar.gz", ], - sha256 = "2889b79ab979e676e344974cfeefbaf2c21c7c69a015bd584e8ae67b87b136bc", - strip_prefix = "llvm-97d7bcd5c024ee6aec4eecbc723bb6d4f4c3dc3d", + sha256 = "bb55a553facff0408574a7bbd0d93c7371dbf527c7020fc6f4b9adeb0d83f780", + strip_prefix = "llvm-deac5c28e00179be248aaf03abd329a848e8fac8", build_file = clean_dep("//third_party/llvm:llvm.autogenerated.BUILD"), ) -- GitLab From 918ec0d12ffa666dc9ae79366dbdc29127931370 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Mon, 27 Aug 2018 12:09:26 -0700 Subject: [PATCH 164/598] Fix "estimator" spelling in contrib/timeseries/examples/known_anomaly.py PiperOrigin-RevId: 210407945 --- tensorflow/contrib/timeseries/examples/known_anomaly.py | 8 ++++---- .../contrib/timeseries/examples/known_anomaly_test.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tensorflow/contrib/timeseries/examples/known_anomaly.py b/tensorflow/contrib/timeseries/examples/known_anomaly.py index 71621abc71..1226433625 100644 --- a/tensorflow/contrib/timeseries/examples/known_anomaly.py +++ b/tensorflow/contrib/timeseries/examples/known_anomaly.py @@ -41,7 +41,7 @@ _MODULE_PATH = path.dirname(__file__) _DATA_FILE = path.join(_MODULE_PATH, "data/changepoints.csv") -def state_space_esitmator(exogenous_feature_columns): +def state_space_estimator(exogenous_feature_columns): """Constructs a StructuralEnsembleRegressor.""" def _exogenous_update_condition(times, features): @@ -68,7 +68,7 @@ def state_space_esitmator(exogenous_feature_columns): 4, 64) -def autoregressive_esitmator(exogenous_feature_columns): +def autoregressive_estimator(exogenous_feature_columns): input_window_size = 8 output_window_size = 2 return ( @@ -169,10 +169,10 @@ def main(unused_argv): "Please install matplotlib to generate a plot from this example.") make_plot("Ignoring a known anomaly (state space)", *train_and_evaluate_exogenous( - estimator_fn=state_space_esitmator)) + estimator_fn=state_space_estimator)) make_plot("Ignoring a known anomaly (autoregressive)", *train_and_evaluate_exogenous( - estimator_fn=autoregressive_esitmator, train_steps=3000)) + estimator_fn=autoregressive_estimator, train_steps=3000)) pyplot.show() diff --git a/tensorflow/contrib/timeseries/examples/known_anomaly_test.py b/tensorflow/contrib/timeseries/examples/known_anomaly_test.py index 8c64f2e186..57ccf8f260 100644 --- a/tensorflow/contrib/timeseries/examples/known_anomaly_test.py +++ 
b/tensorflow/contrib/timeseries/examples/known_anomaly_test.py @@ -28,7 +28,7 @@ class KnownAnomalyExampleTest(test.TestCase): def test_shapes_and_variance_structural_ar(self): (times, observed, all_times, mean, upper_limit, lower_limit, anomaly_locations) = known_anomaly.train_and_evaluate_exogenous( - train_steps=1, estimator_fn=known_anomaly.autoregressive_esitmator) + train_steps=1, estimator_fn=known_anomaly.autoregressive_estimator) self.assertAllEqual( anomaly_locations, [25, 50, 75, 100, 125, 150, 175, 249]) @@ -40,7 +40,7 @@ class KnownAnomalyExampleTest(test.TestCase): def test_shapes_and_variance_structural_ssm(self): (times, observed, all_times, mean, upper_limit, lower_limit, anomaly_locations) = known_anomaly.train_and_evaluate_exogenous( - train_steps=50, estimator_fn=known_anomaly.state_space_esitmator) + train_steps=50, estimator_fn=known_anomaly.state_space_estimator) self.assertAllEqual( anomaly_locations, [25, 50, 75, 100, 125, 150, 175, 249]) -- GitLab From 65bbbebb031cea56e9e7dc49ae8ac93745a714c1 Mon Sep 17 00:00:00 2001 From: Jian Li Date: Mon, 27 Aug 2018 12:11:15 -0700 Subject: [PATCH 165/598] Internal change. PiperOrigin-RevId: 210408262 --- tensorflow/contrib/lite/kernels/internal/kernel_utils.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tensorflow/contrib/lite/kernels/internal/kernel_utils.h b/tensorflow/contrib/lite/kernels/internal/kernel_utils.h index 1824126828..599850db60 100644 --- a/tensorflow/contrib/lite/kernels/internal/kernel_utils.h +++ b/tensorflow/contrib/lite/kernels/internal/kernel_utils.h @@ -66,8 +66,7 @@ void RnnBatchStep(const float* input_ptr_batch, const int8_t* input_weights_ptr, // - n_input: the input size, // - n_output: the output size. // -// The pointers to the cell and output state and the output are updated. Unless -// projection is specified output and output state contain the same data. +// The pointers to the cell and output state and the output are updated. 
// // The pointers with the suffix "_batch" point to data aligned in batch_major // order, and each step processes batch_size many inputs from input_ptr_batch, -- GitLab From 05285015795b374e4f71b24d21e31f6c59ebfc8e Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 27 Aug 2018 12:11:36 -0700 Subject: [PATCH 166/598] Update NNAPI delegate to correctly handle input state tensors for LSTM and SVDF. PiperOrigin-RevId: 210408327 --- .../lite/delegates/nnapi/nnapi_delegate.cc | 93 ++++++++++--------- .../delegates/nnapi/nnapi_delegate_test.cc | 51 +++------- 2 files changed, 62 insertions(+), 82 deletions(-) diff --git a/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.cc index e6cc3dd99c..eb61b2266e 100644 --- a/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.cc +++ b/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.cc @@ -238,7 +238,7 @@ class NNAPIOpBuilder { tensor->params.zero_point}; CHECK_NN(context_, ANeuralNetworksModel_addOperand(nn_model_, &operand_type)); - augmented_inputs_.push_back(ann_index); + augmented_outputs_.push_back(ann_index); *ann_tensor_index_out = ann_index; return kTfLiteOk; @@ -370,8 +370,8 @@ struct NNAPIOpMappingArgs { TfLiteContext* context; NNAPIOpBuilder* builder; TfLiteNode* node; - std::vector* model_state_inputs; - std::vector* model_state_tfl_outputs; + std::vector* model_state_outputs; + std::vector* model_state_tfl_inputs; }; // The kernel that represents the subgraph of TF Lite being run on NN API. @@ -779,6 +779,7 @@ class NNAPIDelegateKernel { return nullptr; } break; +#if 0 case kTfLiteBuiltinRnn: // NNAPI only support float32 weights. // TODO(miaowang): check the number of inputs before accessing it. 
@@ -792,8 +793,8 @@ class NNAPIDelegateKernel { mapping_args.builder->AddStateFloat32Tensor( mapping_args.node->outputs->data[/*kHiddenStateTensor*/ 0], &ann_index); - mapping_args.model_state_inputs->push_back(ann_index); - mapping_args.model_state_tfl_outputs->push_back( + mapping_args.model_state_outputs->push_back(ann_index); + mapping_args.model_state_tfl_inputs->push_back( mapping_args.node->outputs->data[/*kHiddenStateTensor*/ 0]); auto builtin = reinterpret_cast( mapping_args.node->builtin_data); @@ -804,6 +805,7 @@ class NNAPIDelegateKernel { return nullptr; } break; +#endif case kTfLiteBuiltinSvdf: // NNAPI only support float32 weights. if (version == 1 && @@ -814,11 +816,13 @@ class NNAPIDelegateKernel { // NNAPI need both state_in and state_out. int ann_index; mapping_args.builder->AddStateFloat32Tensor( - mapping_args.node->outputs->data[/*kStateTensor*/ 0], + mapping_args.node->inputs + ->data[/*kInputActivationStateTensor*/ 4], &ann_index); - mapping_args.model_state_inputs->push_back(ann_index); - mapping_args.model_state_tfl_outputs->push_back( - mapping_args.node->outputs->data[/*kStateTensor*/ 0]); + mapping_args.model_state_outputs->push_back(ann_index); + mapping_args.model_state_tfl_inputs->push_back( + mapping_args.node->inputs + ->data[/*kInputActivationStateTensor*/ 4]); auto builtin = reinterpret_cast( mapping_args.node->builtin_data); @@ -833,28 +837,12 @@ class NNAPIDelegateKernel { case kTfLiteBuiltinLstm: // NNAPI only support float32 weights. // TODO(miaowang): add loggings to indicate why the op is rejected. - if (version == 1 && node->inputs->size == 18 && + if (version == 1 && node->inputs->size == 20 && context->tensors[node->inputs ->data[/*kInputToOutputWeightsTensor*/ 4]] .type == kTfLiteFloat32) { return [](const NNAPIOpMappingArgs& mapping_args) -> ANeuralNetworksOperationType { - // NNAPI need both state_in and state_out for cell_state and - // output_state. 
- int ann_index; - mapping_args.builder->AddStateFloat32Tensor( - mapping_args.node->outputs->data[/*kOutputStateTensor*/ 0], - &ann_index); - mapping_args.model_state_inputs->push_back(ann_index); - mapping_args.model_state_tfl_outputs->push_back( - mapping_args.node->outputs->data[/*kOutputStateTensor*/ 0]); - mapping_args.builder->AddStateFloat32Tensor( - mapping_args.node->outputs->data[/*kCellStateTensor*/ 1], - &ann_index); - mapping_args.model_state_inputs->push_back(ann_index); - mapping_args.model_state_tfl_outputs->push_back( - mapping_args.node->outputs->data[/*kCellStateTensor*/ 1]); - auto builtin = reinterpret_cast( mapping_args.node->builtin_data); mapping_args.builder->AddScalarInt32Operand(builtin->activation); @@ -864,6 +852,25 @@ class NNAPIDelegateKernel { // Current NNAPI implementation requires the sratch_buffer as // output. mapping_args.builder->AddAdditionalFloat32OutputTensor(2); + + // NNAPI need both state_in and state_out for cell_state and + // output_state. + int ann_index; + mapping_args.builder->AddStateFloat32Tensor( + mapping_args.node->inputs + ->data[/*kInputActivationStateTensor*/ 18], + &ann_index); + mapping_args.model_state_outputs->push_back(ann_index); + mapping_args.model_state_tfl_inputs->push_back( + mapping_args.node->inputs + ->data[/*kInputActivationStateTensor*/ 18]); + mapping_args.builder->AddStateFloat32Tensor( + mapping_args.node->inputs->data[/*kInputCellStateTensor*/ 19], + &ann_index); + mapping_args.model_state_outputs->push_back(ann_index); + mapping_args.model_state_tfl_inputs->push_back( + mapping_args.node->inputs->data[/*kInputCellStateTensor*/ 19]); + return ANEURALNETWORKS_LSTM; }; } else { @@ -950,12 +957,10 @@ class NNAPIDelegateKernel { // Set the input tensor buffers. Note: we access tflite tensors using // absolute indices but NN api indices inputs by relative indices. 
int relative_input_index = 0; - int num_optional_tensors = 0; size_t input_offset = 0; for (auto absolute_input_index : TfLiteIntArrayView(node->inputs)) { if (absolute_input_index == kOptionalTensor) { - num_optional_tensors++; continue; } TfLiteTensor* tensor = &context->tensors[absolute_input_index]; @@ -989,16 +994,16 @@ class NNAPIDelegateKernel { // The state_out of previous invocation need to be mapped to state_in of // current invocation. - for (size_t i = 0; i < model_state_tfl_outputs_.size(); i++) { - int state_tensor_idx = model_state_tfl_outputs_[i]; + for (size_t i = 0; i < model_state_tfl_inputs_.size(); i++) { + int state_tensor_idx = model_state_tfl_inputs_[i]; TfLiteTensor* tensor = &context->tensors[state_tensor_idx]; // Here we are using a deep copy for state_in tensors so that we are not // reading and writing into the same buffer during a invocation. // TODO(110369471): using double shared buffer to minimize the copies. - CHECK_NN(context, - ANeuralNetworksExecution_setInput( - execution, i + node->inputs->size - num_optional_tensors, - nullptr, tensor->data.raw, tensor->bytes)); + CHECK_NN(context, ANeuralNetworksExecution_setOutput( + execution, relative_output_index, nullptr, + tensor->data.raw, tensor->bytes)); + relative_output_index++; } // Invoke ANN in blocking fashion. 
ANeuralNetworksEvent* event = nullptr; @@ -1030,8 +1035,8 @@ class NNAPIDelegateKernel { // Track indices we use OperandMapping operand_mapping_; - std::vector model_state_inputs_; - std::vector model_state_tfl_outputs_; + std::vector model_state_outputs_; + std::vector model_state_tfl_inputs_; std::unique_ptr nn_input_memory_; std::unique_ptr nn_output_memory_; @@ -1063,9 +1068,9 @@ class NNAPIDelegateKernel { } } // Get op type and operands - int nn_op_type = Map(context, reg->builtin_code, reg->version, - node)({context, &builder, node, &model_state_inputs_, - &model_state_tfl_outputs_}); + int nn_op_type = Map(context, reg->builtin_code, reg->version, node)( + {context, &builder, node, &model_state_outputs_, + &model_state_tfl_inputs_}); // Map outputs to NN API tensor indices. for (auto output_index : TfLiteIntArrayView(node->outputs)) { TF_LITE_ENSURE_STATUS(builder.AddTensorOutput(output_index)); @@ -1098,17 +1103,17 @@ class NNAPIDelegateKernel { } } - // Add state input tensors as model inputs - for (int i : model_state_inputs_) { - inputs.push_back(i); - } - size_t total_output_byte_size = 0; for (int i : TfLiteIntArrayView(output_tensors)) { outputs.push_back(operand_mapping_.lite_index_to_ann(i)); total_output_byte_size += context->tensors[i].bytes; } + // Add state output tensors as model outputs + for (int i : model_state_outputs_) { + outputs.push_back(i); + } + // Tell ANN to declare inputs/outputs CHECK_NN(context, ANeuralNetworksModel_identifyInputsAndOutputs( nn_model_.get(), inputs.size(), inputs.data(), diff --git a/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate_test.cc b/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate_test.cc index 720d6b741e..73c27fb3a0 100644 --- a/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate_test.cc +++ b/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate_test.cc @@ -1970,7 +1970,8 @@ class BaseSVDFOpModel : public SingleOpModelWithNNAPI { bias_ = AddNullInput(); const int num_filters = units * 
rank; activation_state_ = AddInput( - TensorData{TensorType_FLOAT32, {batches, memory_size * num_filters}}); + TensorData{TensorType_FLOAT32, {batches, memory_size * num_filters}}, + /*is_variable=*/true); output_ = AddOutput(TensorType_FLOAT32); SetBuiltinOp( BuiltinOperator_SVDF, BuiltinOptions_SVDFOptions, @@ -2055,7 +2056,7 @@ class SVDFOpModel : public BaseSVDFOpModel { } }; -TEST(NNAPIDelegate, DISABLED_SVDFBlackBoxTestRank1) { +TEST(NNAPIDelegate, SVDFBlackBoxTestRank1) { SVDFOpModel svdf(/*batches=*/2, /*units=*/4, /*input_size=*/3, /*memory_size=*/10, /*rank=*/1); svdf.SetWeightsFeature({-0.31930989, -0.36118156, 0.0079667, 0.37613347, @@ -2078,7 +2079,7 @@ TEST(NNAPIDelegate, DISABLED_SVDFBlackBoxTestRank1) { svdf.VerifyGoldens(svdf_input, svdf_golden_output_rank_1, sizeof(svdf_input)); } -TEST(NNAPIDelegate, DISABLED_SVDFBlackBoxTestRank2) { +TEST(NNAPIDelegate, SVDFBlackBoxTestRank2) { SVDFOpModel svdf(/*batches=*/2, /*units=*/4, /*input_size=*/3, /*memory_size=*/10, /*rank=*/2); svdf.SetWeightsFeature({-0.31930989, 0.0079667, 0.39296314, 0.37613347, @@ -2184,8 +2185,12 @@ class LSTMOpModel : public SingleOpModelWithNNAPI { projection_bias_ = AddNullInput(); } - output_state_ = AddOutput(TensorType_FLOAT32); - cell_state_ = AddOutput(TensorType_FLOAT32); + // Adding the 2 input state tensors. 
+ input_activation_state_ = + AddInput(TensorData{TensorType_FLOAT32, {n_batch_, n_output_}}, true); + input_cell_state_ = + AddInput(TensorData{TensorType_FLOAT32, {n_batch_, n_cell_}}, true); + output_ = AddOutput(TensorType_FLOAT32); SetBuiltinOp(BuiltinOperator_LSTM, BuiltinOptions_LSTMOptions, @@ -2263,22 +2268,6 @@ class LSTMOpModel : public SingleOpModelWithNNAPI { PopulateTensor(projection_bias_, f); } - void ResetOutputState() { - const int zero_buffer_size = n_cell_ * n_batch_; - std::unique_ptr zero_buffer(new float[zero_buffer_size]); - memset(zero_buffer.get(), 0, zero_buffer_size * sizeof(float)); - PopulateTensor(output_state_, 0, zero_buffer.get(), - zero_buffer.get() + zero_buffer_size); - } - - void ResetCellState() { - const int zero_buffer_size = n_cell_ * n_batch_; - std::unique_ptr zero_buffer(new float[zero_buffer_size]); - memset(zero_buffer.get(), 0, zero_buffer_size * sizeof(float)); - PopulateTensor(cell_state_, 0, zero_buffer.get(), - zero_buffer.get() + zero_buffer_size); - } - void SetInput(int offset, const float* begin, const float* end) { PopulateTensor(input_, offset, const_cast(begin), const_cast(end)); @@ -2434,8 +2423,7 @@ class NoCifgNoPeepholeNoProjectionNoClippingLstmTest : public BaseLstmTest { } }; -TEST_F(NoCifgNoPeepholeNoProjectionNoClippingLstmTest, - DISABLED_LstmBlackBoxTest) { +TEST_F(NoCifgNoPeepholeNoProjectionNoClippingLstmTest, LstmBlackBoxTest) { const int n_batch = 1; const int n_input = 2; // n_cell and n_output have the same size when there is no projection. 
@@ -2488,10 +2476,6 @@ TEST_F(NoCifgNoPeepholeNoProjectionNoClippingLstmTest, lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_); lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_); - // Resetting cell_state and output_state - lstm.ResetCellState(); - lstm.ResetOutputState(); - VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm); } @@ -2542,8 +2526,7 @@ class CifgNoPeepholeNoProjectionNoClippingLstmTest : public BaseLstmTest { } }; -TEST_F(CifgNoPeepholeNoProjectionNoClippingLstmTest, - DISABLED_LstmBlackBoxTest) { +TEST_F(CifgNoPeepholeNoProjectionNoClippingLstmTest, LstmBlackBoxTest) { const int n_batch = 1; const int n_input = 2; // n_cell and n_output have the same size when there is no projection. @@ -2596,10 +2579,6 @@ TEST_F(CifgNoPeepholeNoProjectionNoClippingLstmTest, lstm.SetCellToForgetWeights(cell_to_forget_weights_); lstm.SetCellToOutputWeights(cell_to_output_weights_); - // Resetting cell_state and output_state - lstm.ResetCellState(); - lstm.ResetOutputState(); - VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm); } @@ -3202,7 +3181,7 @@ class NoCifgPeepholeProjectionClippingLstmTest : public BaseLstmTest { } }; -TEST_F(NoCifgPeepholeProjectionClippingLstmTest, DISABLED_LstmBlackBoxTest) { +TEST_F(NoCifgPeepholeProjectionClippingLstmTest, LstmBlackBoxTest) { const int n_batch = 2; const int n_input = 5; const int n_cell = 20; @@ -3260,10 +3239,6 @@ TEST_F(NoCifgPeepholeProjectionClippingLstmTest, DISABLED_LstmBlackBoxTest) { lstm.SetProjectionWeights(projection_weights_); - // Resetting cell_state and output_state - lstm.ResetCellState(); - lstm.ResetOutputState(); - VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm); } -- GitLab From 9e27c8f01c4548e4cc7fe1a5015af1ec8e32e5d1 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 27 Aug 2018 12:36:16 -0700 Subject: [PATCH 167/598] Fixed a bug in the dense split handler ops. 
PiperOrigin-RevId: 210412659 --- .../kernels/split_handler_ops.cc | 17 ++++++-- .../batch/ordinal_split_handler_test.py | 43 +++++++++++-------- 2 files changed, 37 insertions(+), 23 deletions(-) diff --git a/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc b/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc index 3a48635319..d0fd39fa30 100644 --- a/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc +++ b/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. // ============================================================================= +#include #include #include #include @@ -325,13 +326,21 @@ class BuildDenseInequalitySplitsOp : public OpKernel { } float best_gain = std::numeric_limits::lowest(); - int64 best_bucket_idx = 0; + int64 best_bucket_id = 0; std::vector best_right_node_stats(num_elements, NodeStats(0)); std::vector best_left_node_stats(num_elements, NodeStats(0)); std::vector current_left_node_stats(num_elements, NodeStats(0)); std::vector current_right_node_stats(num_elements, NodeStats(0)); - int64 current_bucket_id = 0; + int64 current_bucket_id = std::numeric_limits::max(); int64 last_bucket_id = -1; + // Find the lowest bucket id, this is going to be the first bucket id to + // try. + for (int root_idx = 0; root_idx < num_elements; root_idx++) { + const int start_index = partition_boundaries[root_idx]; + if (bucket_ids(start_index, 0) < current_bucket_id) { + current_bucket_id = bucket_ids(start_index, 0); + } + } // Indexes offsets for each of the partitions that can be used to access // gradients of a partition for a current bucket we consider. 
std::vector current_layer_offsets(num_elements, 0); @@ -373,6 +382,7 @@ class BuildDenseInequalitySplitsOp : public OpKernel { best_gain = gain_of_split; best_left_node_stats = current_left_node_stats; best_right_node_stats = current_right_node_stats; + best_bucket_id = current_bucket_id; } current_bucket_id = next_bucket_id; } @@ -387,8 +397,7 @@ class BuildDenseInequalitySplitsOp : public OpKernel { oblivious_split_info.mutable_split_node() ->mutable_oblivious_dense_float_binary_split(); oblivious_dense_split->set_feature_column(state->feature_column_group_id()); - oblivious_dense_split->set_threshold( - bucket_boundaries(bucket_ids(best_bucket_idx, 0))); + oblivious_dense_split->set_threshold(bucket_boundaries(best_bucket_id)); (*gains)(0) = best_gain; for (int root_idx = 0; root_idx < num_elements; root_idx++) { diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py index 31043264a1..5532bd026a 100644 --- a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py +++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py @@ -186,11 +186,12 @@ class DenseSplitHandlerTest(test_util.TensorFlowTestCase): with self.test_session() as sess: # The data looks like the following: # Example | Gradients | Partition | Dense Quantile | - # i0 | (0.2, 0.12) | 1 | 2 | - # i1 | (-0.5, 0.07) | 1 | 2 | - # i2 | (1.2, 0.2) | 1 | 0 | - # i3 | (4.0, 0.13) | 2 | 1 | - dense_column = array_ops.constant([0.62, 0.62, 0.3, 0.52]) + # i0 | (0.2, 0.12) | 1 | 3 | + # i1 | (-0.5, 0.07) | 1 | 3 | + # i2 | (1.2, 0.2) | 1 | 1 | + # i3 | (4.0, 0.13) | 2 | 2 | + dense_column = array_ops.placeholder( + dtypes.float32, shape=(4, 1), name="dense_column") gradients = array_ops.constant([0.2, -0.5, 1.2, 4.0]) hessians = array_ops.constant([0.12, 0.07, 0.2, 0.13]) partition_ids = array_ops.constant([1, 1, 1, 2], dtype=dtypes.int32) 
@@ -230,24 +231,28 @@ class DenseSplitHandlerTest(test_util.TensorFlowTestCase): with ops.control_dependencies([update_1]): are_splits_ready = split_handler.make_splits( np.int64(0), np.int64(1), class_id)[0] + # Forcing the creation of four buckets. + are_splits_ready = sess.run( + [are_splits_ready], + feed_dict={dense_column: [[0.2], [0.62], [0.3], [0.52]]})[0] - with ops.control_dependencies([are_splits_ready]): - update_2 = split_handler.update_stats_sync( - 1, - partition_ids, - gradients, - hessians, - empty_gradients, - empty_hessians, - example_weights, - is_active=array_ops.constant([True, True])) + update_2 = split_handler.update_stats_sync( + 1, + partition_ids, + gradients, + hessians, + empty_gradients, + empty_hessians, + example_weights, + is_active=array_ops.constant([True, True])) with ops.control_dependencies([update_2]): are_splits_ready2, partitions, gains, splits = ( split_handler.make_splits(np.int64(1), np.int64(2), class_id)) - are_splits_ready, are_splits_ready2, partitions, gains, splits = ( - sess.run([ - are_splits_ready, are_splits_ready2, partitions, gains, splits - ])) + # Only using the last three buckets. + are_splits_ready2, partitions, gains, splits = ( + sess.run( + [are_splits_ready2, partitions, gains, splits], + feed_dict={dense_column: [[0.62], [0.62], [0.3], [0.52]]})) # During the first iteration, inequality split handlers are not going to # have any splits. Make sure that we return not_ready in that case. -- GitLab From 28e2d44be5baf928d002e73fc8a2272db952b4d8 Mon Sep 17 00:00:00 2001 From: Mark Heffernan Date: Mon, 27 Aug 2018 12:36:19 -0700 Subject: [PATCH 168/598] Add regression test for b/112550242 related to conditionals and copy insertion. 
PiperOrigin-RevId: 210412671 --- .../compiler/xla/tests/conditional_test.cc | 52 +++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/tensorflow/compiler/xla/tests/conditional_test.cc b/tensorflow/compiler/xla/tests/conditional_test.cc index b27c1044ba..25d10ab00a 100644 --- a/tensorflow/compiler/xla/tests/conditional_test.cc +++ b/tensorflow/compiler/xla/tests/conditional_test.cc @@ -642,5 +642,57 @@ XLA_TEST_F(ConditionalOpTest, SwappedInputsInSequentialConditionals) { test_swap(11.24f, 5.55f); } +// Test conditional that duplicates tuple elements in the then and else +// computations. This is a regression test for b/112550242. +XLA_TEST_F(ConditionalOpTest, DuplicateElementsConditional) { + const Shape scalar = ShapeUtil::MakeShape(S32, {}); + const Shape tuple2 = ShapeUtil::MakeTupleShape({scalar, scalar}); + XlaComputation then_comp; + { + XlaBuilder builder(TestName() + ".then"); + auto p = Parameter(&builder, 0, tuple2, "then.p"); + auto e0 = GetTupleElement(p, 0); + auto e1 = GetTupleElement(p, 1); + Tuple(&builder, {e0, e1, e0}); + then_comp = builder.Build().ConsumeValueOrDie(); + } + XlaComputation else_comp; + { + XlaBuilder builder(TestName() + ".else"); + auto p = Parameter(&builder, 0, tuple2, "else.p"); + auto e0 = GetTupleElement(p, 0); + auto e1 = GetTupleElement(p, 1); + Tuple(&builder, {e0, e1, e1}); + else_comp = builder.Build().ConsumeValueOrDie(); + } + + { + // Pred is true case. + std::vector args; + args.push_back(std::move( + *LiteralUtil::MakeTuple({LiteralUtil::CreateR0(123).get(), + LiteralUtil::CreateR0(-42).get()}))); + args.push_back(std::move(*LiteralUtil::CreateR0(true))); + XlaBuilder builder(TestName() + ".main"); + auto p = Parameter(&builder, 0, tuple2, "p0"); + auto p_pred = Parameter(&builder, 1, ShapeUtil::MakeShape(PRED, {}), "p1"); + Conditional(p_pred, p, then_comp, p, else_comp); + ComputeAndCompare(&builder, args); + } + { + // Pred is false case. 
+ std::vector args; + args.push_back(std::move( + *LiteralUtil::MakeTuple({LiteralUtil::CreateR0(123).get(), + LiteralUtil::CreateR0(-42).get()}))); + args.push_back(std::move(*LiteralUtil::CreateR0(false))); + XlaBuilder builder(TestName() + ".main"); + auto p = Parameter(&builder, 0, tuple2, "p0"); + auto p_pred = Parameter(&builder, 1, ShapeUtil::MakeShape(PRED, {}), "p1"); + Conditional(p_pred, p, then_comp, p, else_comp); + ComputeAndCompare(&builder, args); + } +} + } // namespace } // namespace xla -- GitLab From f468d9a4d9a551601a74edf0564bd6274513be9f Mon Sep 17 00:00:00 2001 From: Gunhan Gulsoy Date: Mon, 27 Aug 2018 12:56:00 -0700 Subject: [PATCH 169/598] Disable flaky boosted_trees_test. PiperOrigin-RevId: 210415730 --- tensorflow/python/estimator/BUILD | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tensorflow/python/estimator/BUILD b/tensorflow/python/estimator/BUILD index 817c8e6848..9fce172bee 100644 --- a/tensorflow/python/estimator/BUILD +++ b/tensorflow/python/estimator/BUILD @@ -211,6 +211,9 @@ py_test( shard_count = 2, srcs_version = "PY2AND3", tags = [ + "manual", + "no_oss", + "notap", "optonly", ], deps = [ -- GitLab From c09e01232ace0a5657828c9c47de942751c94949 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 27 Aug 2018 13:09:39 -0700 Subject: [PATCH 170/598] Simplify logic that defaults to the default graph when no graph is passed into saved_models.utils.get_tensor_from_tensor_info PiperOrigin-RevId: 210417762 --- tensorflow/python/saved_model/utils_impl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/saved_model/utils_impl.py b/tensorflow/python/saved_model/utils_impl.py index 20ff34fd8e..06d09325c8 100644 --- a/tensorflow/python/saved_model/utils_impl.py +++ b/tensorflow/python/saved_model/utils_impl.py @@ -75,7 +75,7 @@ def get_tensor_from_tensor_info(tensor_info, graph=None, import_scope=None): KeyError: If `tensor_info` does not correspond to a tensor in `graph`. 
ValueError: If `tensor_info` is malformed. """ - graph = graph if graph is not None else ops.get_default_graph() + graph = graph or ops.get_default_graph() def _get_tensor(name): return graph.get_tensor_by_name( ops.prepend_name_scope(name, import_scope=import_scope)) -- GitLab From fd470b7a73091e4e67417af913ffdbd0e97bac92 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 27 Aug 2018 13:35:34 -0700 Subject: [PATCH 171/598] Fix broken links to colorbot data files PiperOrigin-RevId: 210422151 --- .../eager/python/examples/rnn_colorbot/rnn_colorbot.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot.py b/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot.py index 5ee2176154..74ebb1ec77 100644 --- a/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot.py +++ b/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot.py @@ -243,8 +243,8 @@ def train_one_epoch(model, optimizer, train_data, log_interval=10): print("train/batch #%d\tloss: %.6f" % (batch, batch_model_loss())) -SOURCE_TRAIN_URL = "https://raw.githubusercontent.com/random-forests/tensorflow-workshop/master/extras/colorbot/data/train.csv" -SOURCE_TEST_URL = "https://raw.githubusercontent.com/random-forests/tensorflow-workshop/master/extras/colorbot/data/test.csv" +SOURCE_TRAIN_URL = "https://raw.githubusercontent.com/random-forests/tensorflow-workshop/master/archive/extras/colorbot/data/train.csv" +SOURCE_TEST_URL = "https://raw.githubusercontent.com/random-forests/tensorflow-workshop/master/archive/extras/colorbot/data/test.csv" def main(_): -- GitLab From b02f00174d366172b2ff2a93fb591f24c617a962 Mon Sep 17 00:00:00 2001 From: Gunhan Gulsoy Date: Mon, 27 Aug 2018 13:39:01 -0700 Subject: [PATCH 172/598] Add HostConst op dependency for the gather_op_test to fix broken tests and benchmarks. 
PiperOrigin-RevId: 210422767 --- tensorflow/core/kernels/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 3690fd4362..633fe9ab77 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -1280,6 +1280,7 @@ tf_cuda_cc_test( srcs = ["gather_op_test.cc"], deps = [ ":gather_op", + ":host_constant_op", ":ops_testutil", ":ops_util", "//tensorflow/core:core_cpu", -- GitLab From 1059de1db9e927a85c483e353bbb9dc90e670b47 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 27 Aug 2018 13:44:19 -0700 Subject: [PATCH 173/598] Adding hybrid implementation of the bidi-RNN Op. PiperOrigin-RevId: 210423591 --- .../kernels/bidirectional_sequence_rnn.cc | 256 +++++++++++++++--- 1 file changed, 212 insertions(+), 44 deletions(-) diff --git a/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc b/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc index 517309a226..4162d9bb88 100644 --- a/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc +++ b/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc @@ -23,6 +23,7 @@ limitations under the License. 
#include "tensorflow/contrib/lite/context.h" #include "tensorflow/contrib/lite/kernels/activation_functor.h" #include "tensorflow/contrib/lite/kernels/internal/kernel_utils.h" +#include "tensorflow/contrib/lite/kernels/kernel_util.h" #include "tensorflow/contrib/lite/kernels/op_macros.h" namespace tflite { @@ -44,25 +45,37 @@ constexpr int kFwOutputTensor = 1; constexpr int kBwHiddenStateTensor = 2; constexpr int kBwOutputTensor = 3; +void* Init(TfLiteContext* context, const char* buffer, size_t length) { + auto* scratch_tensor_index = new int; + context->AddTensors(context, /*tensors_to_add=*/3, scratch_tensor_index); + return scratch_tensor_index; +} + +void Free(TfLiteContext* context, void* buffer) { + delete reinterpret_cast(buffer); +} + TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { // Check we have all the inputs and outputs we need. TF_LITE_ENSURE_EQ(context, node->inputs->size, 7); TF_LITE_ENSURE_EQ(context, node->outputs->size, 4); - TfLiteTensor* input = &context->tensors[node->inputs->data[kInputTensor]]; - TfLiteTensor* fw_input_weights = - &context->tensors[node->inputs->data[kFwWeightsTensor]]; - TfLiteTensor* fw_recurrent_weights = - &context->tensors[node->inputs->data[kFwRecurrentWeightsTensor]]; - TfLiteTensor* fw_bias = &context->tensors[node->inputs->data[kFwBiasTensor]]; - TfLiteTensor* bw_input_weights = - &context->tensors[node->inputs->data[kBwWeightsTensor]]; - TfLiteTensor* bw_recurrent_weights = - &context->tensors[node->inputs->data[kBwRecurrentWeightsTensor]]; - TfLiteTensor* bw_bias = &context->tensors[node->inputs->data[kBwBiasTensor]]; + const TfLiteTensor* input = GetInput(context, node, kInputTensor); + const TfLiteTensor* fw_input_weights = + GetInput(context, node, kFwWeightsTensor); + const TfLiteTensor* fw_recurrent_weights = + GetInput(context, node, kFwRecurrentWeightsTensor); + const TfLiteTensor* fw_bias = GetInput(context, node, kFwBiasTensor); + const TfLiteTensor* bw_input_weights = + 
GetInput(context, node, kBwWeightsTensor); + const TfLiteTensor* bw_recurrent_weights = + GetInput(context, node, kBwRecurrentWeightsTensor); + const TfLiteTensor* bw_bias = GetInput(context, node, kBwBiasTensor); // Check all the parameters of tensor match within themselves and match the // input configuration. + TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32); + const int batch_size = input->dims->data[0]; const int max_time = input->dims->data[1]; const int fw_num_units = fw_input_weights->dims->data[0]; @@ -76,17 +89,15 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ASSERT_EQ(bw_recurrent_weights->dims->data[1], bw_bias->dims->data[0]); - TfLiteTensor* fw_output = - &context->tensors[node->outputs->data[kFwOutputTensor]]; - TfLiteTensor* bw_output = - &context->tensors[node->outputs->data[kBwOutputTensor]]; + TfLiteTensor* fw_output = GetOutput(context, node, kFwOutputTensor); + TfLiteTensor* bw_output = GetOutput(context, node, kBwOutputTensor); // Resize hidden states. 
TfLiteIntArray* fw_hidden_state_size_array = TfLiteIntArrayCreate(2); fw_hidden_state_size_array->data[0] = batch_size; fw_hidden_state_size_array->data[1] = fw_num_units; TfLiteTensor* fw_hidden_state = - &context->tensors[node->outputs->data[kFwHiddenStateTensor]]; + GetOutput(context, node, kFwHiddenStateTensor); TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, fw_hidden_state, fw_hidden_state_size_array)); @@ -94,7 +105,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { bw_hidden_state_size_array->data[0] = batch_size; bw_hidden_state_size_array->data[1] = fw_num_units; TfLiteTensor* bw_hidden_state = - &context->tensors[node->outputs->data[kBwHiddenStateTensor]]; + GetOutput(context, node, kBwHiddenStateTensor); TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, bw_hidden_state, bw_hidden_state_size_array)); @@ -102,6 +113,50 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { fw_hidden_state->allocation_type = kTfLiteArenaRwPersistent; bw_hidden_state->allocation_type = kTfLiteArenaRwPersistent; + const bool is_hybrid_op = + (fw_input_weights->type == kTfLiteUInt8 && input->type == kTfLiteFloat32); + + if (is_hybrid_op) { + int* scratch_tensor_index = reinterpret_cast(node->user_data); + TfLiteIntArrayFree(node->temporaries); + node->temporaries = TfLiteIntArrayCreate(2); + node->temporaries->data[0] = *scratch_tensor_index; + TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/0); + input_quantized->type = kTfLiteUInt8; + input_quantized->allocation_type = kTfLiteArenaRw; + if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) { + TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims); + TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, input_quantized, + input_quantized_size)); + } + node->temporaries->data[1] = *scratch_tensor_index + 1; + TfLiteTensor* fw_hidden_state_quantized = + GetTemporary(context, node, /*index=*/1); + fw_hidden_state_quantized->type 
= kTfLiteUInt8; + fw_hidden_state_quantized->allocation_type = kTfLiteArenaRw; + if (!TfLiteIntArrayEqual(fw_hidden_state_quantized->dims, + fw_hidden_state->dims)) { + TfLiteIntArray* fw_hidden_state_quantized_size = + TfLiteIntArrayCopy(fw_hidden_state->dims); + TF_LITE_ENSURE_OK( + context, context->ResizeTensor(context, fw_hidden_state_quantized, + fw_hidden_state_quantized_size)); + } + node->temporaries->data[2] = *scratch_tensor_index + 2; + TfLiteTensor* bw_hidden_state_quantized = + GetTemporary(context, node, /*index=*/2); + bw_hidden_state_quantized->type = kTfLiteUInt8; + bw_hidden_state_quantized->allocation_type = kTfLiteArenaRw; + if (!TfLiteIntArrayEqual(bw_hidden_state_quantized->dims, + bw_hidden_state->dims)) { + TfLiteIntArray* bw_hidden_state_quantized_size = + TfLiteIntArrayCopy(bw_hidden_state->dims); + TF_LITE_ENSURE_OK( + context, context->ResizeTensor(context, bw_hidden_state_quantized, + bw_hidden_state_quantized_size)); + } + } + // Resize outputs. TfLiteIntArray* fw_output_size_array = TfLiteIntArrayCreate(3); fw_output_size_array->data[0] = batch_size; @@ -119,30 +174,16 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { return kTfLiteOk; } -TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { - auto* params = reinterpret_cast(node->builtin_data); - - TfLiteTensor* input = &context->tensors[node->inputs->data[kInputTensor]]; - TfLiteTensor* fw_input_weights = - &context->tensors[node->inputs->data[kFwWeightsTensor]]; - TfLiteTensor* fw_recurrent_weights = - &context->tensors[node->inputs->data[kFwRecurrentWeightsTensor]]; - TfLiteTensor* fw_bias = &context->tensors[node->inputs->data[kFwBiasTensor]]; - TfLiteTensor* fw_hidden_state = - &context->tensors[node->outputs->data[kFwHiddenStateTensor]]; - TfLiteTensor* fw_output = - &context->tensors[node->outputs->data[kFwOutputTensor]]; - - TfLiteTensor* bw_input_weights = - &context->tensors[node->inputs->data[kBwWeightsTensor]]; - TfLiteTensor* 
bw_recurrent_weights = - &context->tensors[node->inputs->data[kBwRecurrentWeightsTensor]]; - TfLiteTensor* bw_bias = &context->tensors[node->inputs->data[kBwBiasTensor]]; - TfLiteTensor* bw_hidden_state = - &context->tensors[node->outputs->data[kBwHiddenStateTensor]]; - TfLiteTensor* bw_output = - &context->tensors[node->outputs->data[kBwOutputTensor]]; - +TfLiteStatus EvalFloat(const TfLiteTensor* input, + const TfLiteTensor* fw_input_weights, + const TfLiteTensor* fw_recurrent_weights, + const TfLiteTensor* fw_bias, + const TfLiteTensor* bw_input_weights, + const TfLiteTensor* bw_recurrent_weights, + const TfLiteTensor* bw_bias, + const TfLiteSequenceRNNParams* params, + TfLiteTensor* fw_hidden_state, TfLiteTensor* fw_output, + TfLiteTensor* bw_hidden_state, TfLiteTensor* bw_output) { const int batch_size = input->dims->data[0]; const int max_time = input->dims->data[1]; const int input_size = input->dims->data[2]; @@ -190,12 +231,139 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { return kTfLiteOk; } +TfLiteStatus EvalHybrid( + const TfLiteTensor* input, const TfLiteTensor* fw_input_weights, + const TfLiteTensor* fw_recurrent_weights, const TfLiteTensor* fw_bias, + const TfLiteTensor* bw_input_weights, + const TfLiteTensor* bw_recurrent_weights, const TfLiteTensor* bw_bias, + const TfLiteSequenceRNNParams* params, TfLiteTensor* input_quantized, + TfLiteTensor* fw_hidden_state_quantized, TfLiteTensor* fw_scaling_factors, + TfLiteTensor* fw_hidden_state, TfLiteTensor* fw_output, + TfLiteTensor* bw_hidden_state_quantized, TfLiteTensor* bw_scaling_factors, + TfLiteTensor* bw_hidden_state, TfLiteTensor* bw_output) { + const int batch_size = input->dims->data[0]; + const int max_time = input->dims->data[1]; + const int input_size = input->dims->data[2]; + + const int fw_num_units = fw_input_weights->dims->data[0]; + const float* fw_bias_ptr = fw_bias->data.f; + const int8_t* fw_input_weights_ptr = + reinterpret_cast(fw_input_weights->data.uint8); + 
float fw_input_weights_scale = fw_input_weights->params.scale; + const int8_t* fw_recurrent_weights_ptr = + reinterpret_cast(fw_recurrent_weights->data.uint8); + float fw_recurrent_weights_scale = fw_recurrent_weights->params.scale; + + const int bw_num_units = bw_input_weights->dims->data[0]; + const float* bw_bias_ptr = bw_bias->data.f; + const int8_t* bw_input_weights_ptr = + reinterpret_cast(bw_input_weights->data.uint8); + float bw_input_weights_scale = bw_input_weights->params.scale; + const int8_t* bw_recurrent_weights_ptr = + reinterpret_cast(bw_recurrent_weights->data.uint8); + float bw_recurrent_weights_scale = bw_recurrent_weights->params.scale; + + // Initialize temporary storage for quantized values. + int8_t* quantized_input_ptr = + reinterpret_cast(input_quantized->data.uint8); + int8_t* fw_quantized_hidden_state_ptr = + reinterpret_cast(fw_hidden_state_quantized->data.uint8); + int8_t* bw_quantized_hidden_state_ptr = + reinterpret_cast(bw_hidden_state_quantized->data.uint8); + float* fw_scaling_factors_ptr = fw_scaling_factors->data.f; + float* bw_scaling_factors_ptr = bw_scaling_factors->data.f; + + for (int b = 0; b < batch_size; b++) { + // Forward cell. + float* fw_hidden_state_ptr_batch = + fw_hidden_state->data.f + b * fw_num_units; + for (int s = 0; s < max_time; s++) { + const float* input_ptr_batch = + input->data.f + b * input_size * max_time + s * input_size; + float* output_ptr_batch = + fw_output->data.f + b * fw_num_units * max_time + s * fw_num_units; + + kernel_utils::RnnBatchStep( + input_ptr_batch, fw_input_weights_ptr, fw_input_weights_scale, + fw_recurrent_weights_ptr, fw_recurrent_weights_scale, fw_bias_ptr, + input_size, fw_num_units, /*batch_size=*/1, params->activation, + quantized_input_ptr, fw_quantized_hidden_state_ptr, + fw_scaling_factors_ptr, fw_hidden_state_ptr_batch, output_ptr_batch); + } + // Backward cell. 
+ float* bw_hidden_state_ptr_batch = + bw_hidden_state->data.f + b * bw_num_units; + for (int s = max_time - 1; s >= 0; s--) { + const float* input_ptr_batch = + input->data.f + b * input_size * max_time + s * input_size; + float* output_ptr_batch = + bw_output->data.f + b * bw_num_units * max_time + s * bw_num_units; + + kernel_utils::RnnBatchStep( + input_ptr_batch, bw_input_weights_ptr, bw_input_weights_scale, + bw_recurrent_weights_ptr, bw_recurrent_weights_scale, bw_bias_ptr, + input_size, bw_num_units, /*batch_size=*/1, params->activation, + quantized_input_ptr, bw_quantized_hidden_state_ptr, + bw_scaling_factors_ptr, bw_hidden_state_ptr_batch, output_ptr_batch); + } + } + return kTfLiteOk; +} + +TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + const auto* params = + reinterpret_cast(node->builtin_data); + + const TfLiteTensor* input = GetInput(context, node, kInputTensor); + const TfLiteTensor* fw_input_weights = + GetInput(context, node, kFwWeightsTensor); + const TfLiteTensor* fw_recurrent_weights = + GetInput(context, node, kFwRecurrentWeightsTensor); + const TfLiteTensor* fw_bias = GetInput(context, node, kFwBiasTensor); + const TfLiteTensor* bw_input_weights = + GetInput(context, node, kBwWeightsTensor); + const TfLiteTensor* bw_recurrent_weights = + GetInput(context, node, kBwRecurrentWeightsTensor); + const TfLiteTensor* bw_bias = GetInput(context, node, kBwBiasTensor); + + TfLiteTensor* fw_output = GetOutput(context, node, kFwOutputTensor); + TfLiteTensor* fw_hidden_state = + GetOutput(context, node, kFwHiddenStateTensor); + TfLiteTensor* bw_output = GetOutput(context, node, kBwOutputTensor); + TfLiteTensor* bw_hidden_state = + GetOutput(context, node, kBwHiddenStateTensor); + + switch (fw_input_weights->type) { + case kTfLiteFloat32: + return EvalFloat(input, fw_input_weights, fw_recurrent_weights, fw_bias, + bw_input_weights, bw_recurrent_weights, bw_bias, params, + fw_hidden_state, fw_output, bw_hidden_state, bw_output); + case 
kTfLiteUInt8: { + TfLiteTensor* input_quantized = GetTemporary(context, node, 0); + TfLiteTensor* fw_hidden_state_quantized = GetTemporary(context, node, 1); + TfLiteTensor* bw_hidden_state_quantized = GetTemporary(context, node, 2); + TfLiteTensor* fw_scaling_factors = GetTemporary(context, node, 3); + TfLiteTensor* bw_scaling_factors = GetTemporary(context, node, 4); + return EvalHybrid(input, fw_input_weights, fw_recurrent_weights, fw_bias, + bw_input_weights, bw_recurrent_weights, bw_bias, params, + input_quantized, fw_hidden_state_quantized, + fw_scaling_factors, fw_hidden_state, fw_output, + bw_hidden_state_quantized, bw_scaling_factors, + bw_hidden_state, bw_output); + } + default: + context->ReportError(context, "Type not currently supported."); + return kTfLiteError; + } + return kTfLiteOk; +} + } // namespace bidirectional_sequence_rnn TfLiteRegistration* Register_BIDIRECTIONAL_SEQUENCE_RNN() { - static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr, - bidirectional_sequence_rnn::Prepare, - bidirectional_sequence_rnn::Eval}; + static TfLiteRegistration r = { + bidirectional_sequence_rnn::Init, bidirectional_sequence_rnn::Free, + bidirectional_sequence_rnn::Prepare, bidirectional_sequence_rnn::Eval}; return &r; } -- GitLab From 0f1b3bcf48eaaca4dccdf2d3208b0305b1c6056b Mon Sep 17 00:00:00 2001 From: Yanan Cao Date: Mon, 27 Aug 2018 13:46:16 -0700 Subject: [PATCH 174/598] Misc clean up tpu.py PiperOrigin-RevId: 210423935 --- tensorflow/contrib/tpu/python/tpu/tpu.py | 30 ++++++++++++------------ 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/tensorflow/contrib/tpu/python/tpu/tpu.py b/tensorflow/contrib/tpu/python/tpu/tpu.py index 7fa06d6d56..3c735a0b85 100644 --- a/tensorflow/contrib/tpu/python/tpu/tpu.py +++ b/tensorflow/contrib/tpu/python/tpu/tpu.py @@ -42,9 +42,9 @@ _BLACKLISTED_OPS = set([ "Placeholder", ]) -# These operations will currently fail to compile, but we should be able to -# support them eventually via CPU offload or 
extending our operation set. -_NOT_IMPLEMENTED_OPS = set([ +# XLA doesn't currently support reading of intermediate tensors, thus some ops +# are not supported. +_UNSUPPORTED_OPS = set([ "AudioSummary", "AudioSummaryV2", "HistogramSummary", @@ -149,6 +149,7 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext): self._gradient_colocation_stack = [] self._host_compute_core = [] self._name = name + self._name_as_bytes = compat.as_bytes(name) self._unsupported_ops = [] self._pivot = pivot self._replicated_vars = {} @@ -323,16 +324,13 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext): return self._host_compute_core def AddOp(self, op): - self._AddOpInternal(op) - - def _AddOpInternal(self, op): # pylint: disable=protected-access if op.type in _BLACKLISTED_OPS: logging.error("Operation of type %s (%s) is not supported on the TPU. " "Execution will fail if this op is used in the graph. " % (op.type, op.name)) - if op.type in _NOT_IMPLEMENTED_OPS: + if op.type in _UNSUPPORTED_OPS: self._unsupported_ops.append(op) if any(x.dtype._is_ref_dtype for x in op.inputs): @@ -342,7 +340,7 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext): if _TPU_REPLICATE_ATTR in op.node_def.attr: raise ValueError("TPU computations cannot be nested") op._set_attr(_TPU_REPLICATE_ATTR, - attr_value_pb2.AttrValue(s=compat.as_bytes(self._name))) + attr_value_pb2.AttrValue(s=self._name_as_bytes)) if self._outside_compilation_cluster: op._set_attr( _OUTSIDE_COMPILATION_ATTR, @@ -356,11 +354,12 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext): # Remove any control edges from outer control flow contexts. These may cause # mismatched frame errors. - control_inputs, external_inputs = self._RemoveExternalControlEdges(op) + (internal_control_inputs, + external_control_inputs) = self._RemoveExternalControlEdges(op) if not op.inputs: # Add a control edge from the control pivot to this op. 
- if not control_inputs: + if not internal_control_inputs: # pylint: disable=protected-access op._add_control_input(self.GetControlPivot()) # pylint: enable=protected-access @@ -371,19 +370,19 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext): if real_x != x: op._update_input(index, real_x) # pylint: disable=protected-access - if external_inputs: + if external_control_inputs: # Use an identity to pull control inputs as data inputs. Note that we # ignore ops which don't have outputs. TODO(phawkins): fix that. with ops.control_dependencies(None): self.Enter() - external_inputs = [ + external_control_inputs = [ array_ops.identity(x.outputs[0]).op - for x in external_inputs + for x in external_control_inputs if x.outputs ] self.Exit() # pylint: disable=protected-access - op._add_control_inputs(external_inputs) + op._add_control_inputs(external_control_inputs) # pylint: enable=protected-access # Mark op's outputs as seen by this context and any outer contexts. @@ -399,6 +398,7 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext): self._outer_context.AddInnerOp(op) def AddValue(self, val): + """Add `val` to the current context and its outer context recursively.""" if val.name in self._values: # Use the real value if it comes from outer context. result = self._external_values.get(val.name) @@ -415,7 +415,7 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext): return result def AddInnerOp(self, op): - self._AddOpInternal(op) + self.AddOp(op) if self._outer_context: self._outer_context.AddInnerOp(op) -- GitLab From 59f3c57182fac4d745bb01f3976bb9832c06333d Mon Sep 17 00:00:00 2001 From: Yanan Cao Date: Mon, 27 Aug 2018 13:48:19 -0700 Subject: [PATCH 175/598] [TF/XLA] Add XLACompileContext that marks ops inside for XLA compilation. 
PiperOrigin-RevId: 210424333 --- tensorflow/contrib/compiler/BUILD | 34 ++++ tensorflow/contrib/compiler/xla.py | 208 ++++++++++++++++++++++++ tensorflow/contrib/compiler/xla_test.py | 180 ++++++++++++++++++++ 3 files changed, 422 insertions(+) create mode 100644 tensorflow/contrib/compiler/xla.py create mode 100644 tensorflow/contrib/compiler/xla_test.py diff --git a/tensorflow/contrib/compiler/BUILD b/tensorflow/contrib/compiler/BUILD index bcee0b04c8..d7583be6d8 100644 --- a/tensorflow/contrib/compiler/BUILD +++ b/tensorflow/contrib/compiler/BUILD @@ -8,6 +8,7 @@ package_group( packages = ["//tensorflow/..."], ) +load("//tensorflow:tensorflow.bzl", "tf_py_test") load("//tensorflow:tensorflow.bzl", "cuda_py_test") py_library( @@ -46,3 +47,36 @@ cuda_py_test( ], xla_enabled = True, ) + +py_library( + name = "xla", + srcs = ["xla.py"], + srcs_version = "PY2AND3", + deps = [ + "//tensorflow/python:array_ops", + "//tensorflow/python:control_flow_ops", + "//tensorflow/python:framework_ops", + "//tensorflow/python:platform", + "//tensorflow/python:util", + "//tensorflow/python/estimator:model_fn", + ], +) + +tf_py_test( + name = "xla_test", + srcs = ["xla_test.py"], + additional_deps = [ + ":xla", + "@six_archive//:six", + "//tensorflow/python:constant_op", + "//tensorflow/python:control_flow_ops", + "//tensorflow/python:control_flow_util", + "//tensorflow/python:math_ops", + "//tensorflow/python:platform", + "//tensorflow/python:state_ops", + "//tensorflow/python:summary", + "//tensorflow/python:training", + "//tensorflow/python:variable_scope", + ], + tags = ["no_pip"], +) diff --git a/tensorflow/contrib/compiler/xla.py b/tensorflow/contrib/compiler/xla.py new file mode 100644 index 0000000000..60f5af1662 --- /dev/null +++ b/tensorflow/contrib/compiler/xla.py @@ -0,0 +1,208 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= +"""xla provides experimental xla support API.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from six.moves import xrange # pylint: disable=redefined-builtin + +from tensorflow.core.framework import attr_value_pb2 +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.util import compat + +_XLA_COMPILE_ATTR = '_xla_compile_id' +_MAX_WARNING_LINES = 5 + +# Operations that indicate some error in the user's graph. For example, XLA +# computation should not have any Placeholder op. +_BLACKLISTED_OPS = set([ + 'Placeholder', +]) + +# XLA doesn't currently support reading of intermediate tensors, thus some ops +# are not supported. +_UNSUPPORTED_OPS = set([ + 'AudioSummary', + 'AudioSummaryV2', + 'HistogramSummary', + 'ImageSummary', + 'MergeSummary', + 'Print', + 'ScalarSummary', + 'TensorSummary', + 'TensorSummaryV2', +]) + + +class XLACompileContext(control_flow_ops.XLAControlFlowContext): + """A `ControlFlowContext` for nodes inside an XLA computation cluster. + + THIS IS ONLY FOR TENSORFLOW INTERNAL IMPLEMENTATION, DO NOT USE DIRECTLY.
+ + The primary role of `XLACompileContext` is to mark operators inside a + xla.compile() computation with attribute "_xla_compile_id=XYZ", where XYZ is + a unique name. + + `ControlFlowContext` is used to perform the annotation since it integrates + with Tensorflow constructs like ResourceVariables. For example, if a + `ResourceVariable` is constructed inside a xla.compile() block, the + `ResourceVariable` implementation can use + `with ops.control_dependencies(None)` to build the variable's definition + outside the compiled computation. + """ + + def __init__(self, name, pivot): + """Builds a new XLACompileContext. + + Args: + name: a unique name for the context, used to populate the + `_xla_compile_id` attribute. + pivot: a pivot node. Nodes in the XLACompileContext that do not have any + inputs will have a control dependency on the pivot node. This ensures + that nodes are correctly included in any enclosing control flow + contexts. + """ + super(XLACompileContext, self).__init__() + self._name = name + self._name_as_bytes = compat.as_bytes(name) + self._unsupported_ops = [] + self._pivot = pivot + + def report_unsupported_operations(self): + if self._unsupported_ops: + op_str = '\n'.join([ + ' %s (%s)' % (op.type, op.name) + for op in self._unsupported_ops[:_MAX_WARNING_LINES] + ]) + logging.warning('%d unsupported operations found: \n%s', + len(self._unsupported_ops), op_str) + if len(self._unsupported_ops) > _MAX_WARNING_LINES: + logging.warning('... and %d more', + len(self._unsupported_ops) - _MAX_WARNING_LINES) + + def AddOp(self, op): + """Create op in XLACompileContext and notifies outer context recursively.""" + # pylint: disable=protected-access + if op.type in _BLACKLISTED_OPS: + logging.error( + 'Operation of type %s (%s) is not supported in XLA. Execution will ' + 'fail if this op is used in the graph. ', op.type, op.name) + + # TODO(ycao): Automatically disable summaries instead of reporting them. 
+ if op.type in _UNSUPPORTED_OPS: + self._unsupported_ops.append(op) + + if any(x.dtype._is_ref_dtype for x in op.inputs): + raise NotImplementedError( + 'Non-resource Variables are not supported inside XLA computations ' + '(operator name: %s)' % op.name) + + if _XLA_COMPILE_ATTR in op.node_def.attr: + raise ValueError('XLA compiled computations cannot be nested, (operator ' + 'name: %s)' % op.name) + + op._set_attr( + _XLA_COMPILE_ATTR, attr_value_pb2.AttrValue(s=self._name_as_bytes)) + + op.graph.prevent_feeding(op) + op.graph.prevent_fetching(op) + + # Remove any control edges from outer control flow contexts. These may cause + # mismatched frame errors. An example is when one of op's inputs is + # generated in a different While control flow context. + (internal_control_inputs, + external_control_inputs) = self._RemoveExternalControlEdges(op) + + if not op.inputs: + # Add a control edge from the control pivot to this op. + if not internal_control_inputs: + # pylint: disable=protected-access + op._add_control_input(self._pivot) + # pylint: enable=protected-access + else: + for index in xrange(len(op.inputs)): + x = op.inputs[index] + real_x = self.AddValue(x) + if real_x != x: + op._update_input(index, real_x) # pylint: disable=protected-access + + if external_control_inputs: + # Use an identity to pull control inputs as data inputs. Note that we + # ignore ops which don't have outputs. TODO(phawkins): fix that. + with ops.control_dependencies(None): + self.Enter() + external_control_inputs = [ + array_ops.identity(x.outputs[0]).op + for x in external_control_inputs + if x.outputs + ] + self.Exit() + # pylint: disable=protected-access + op._add_control_inputs(external_control_inputs) + # pylint: enable=protected-access + + # Mark op's outputs as seen by this context and any outer contexts. 
+ output_names = [x.name for x in op.outputs] + context = self + while context is not None: + # pylint: disable=protected-access + context._values.update(output_names) + context = context._outer_context + # pylint: enable=protected-access + + if self._outer_context: + self._outer_context.AddInnerOp(op) + + def AddValue(self, val): + """Add `val` to the current context and its outer context recursively.""" + if val.name in self._values: + # Use the real value if it comes from outer context. + result = self._external_values.get(val.name) + return val if result is None else result + + result = val + self._values.add(val.name) + if self._outer_context: + result = self._outer_context.AddValue(val) + self._values.add(result.name) + + self._external_values[val.name] = result + + return result + + def AddInnerOp(self, op): + self.AddOp(op) + if self._outer_context: + self._outer_context.AddInnerOp(op) + + @property + def grad_state(self): + # Define the gradient loop state associated with the XLACompileContext to + # be None as the XLACompileContext does not get nested nor does the + # grad_state outside the XLACompileContext affect the graph inside so the + # grad_state should be as if this is the top-level gradient state. + return None + + @property + def back_prop(self): + """Forwards to the enclosing while context, if any.""" + if self.GetWhileContext(): + return self.GetWhileContext().back_prop + return False diff --git a/tensorflow/contrib/compiler/xla_test.py b/tensorflow/contrib/compiler/xla_test.py new file mode 100644 index 0000000000..a306b56f63 --- /dev/null +++ b/tensorflow/contrib/compiler/xla_test.py @@ -0,0 +1,180 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= +"""Tests for contrib.compiler.xla.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.compiler import xla +from tensorflow.python import summary +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import control_flow_util +from tensorflow.python.ops import logging_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import state_ops +from tensorflow.python.ops import summary_ops +from tensorflow.python.ops import variable_scope +from tensorflow.python.platform import test + + +class XLACompileContextTest(test.TestCase): + + def create_test_xla_compile_context(self): + computation_name = ops.get_default_graph().unique_name('computation') + pivot = control_flow_ops.no_op(name=computation_name + '/pivot') + return xla.XLACompileContext(name=computation_name, pivot=pivot) + + def test_report_unsupported_operations(self): + """Tests that unsupported operations are detected.""" + context = self.create_test_xla_compile_context() + context.Enter() + dummy_tensor = constant_op.constant(1.1) + audio_summary = summary.audio('audio_summary', dummy_tensor, 0.5) + histogram_summary = summary.histogram('histogram_summary', dummy_tensor) + image_summary = summary.image('image_summary', dummy_tensor) + scalar_summary = summary.scalar('scalar_summary', 
dummy_tensor) + tensor_summary = summary_ops.tensor_summary('tensor_summary', dummy_tensor) + summary.merge( + [ + audio_summary, histogram_summary, image_summary, scalar_summary, + tensor_summary + ], + name='merge_summary') + logging_ops.Print(dummy_tensor, [dummy_tensor], name='print_op') + context.Exit() + + unsupported_ops_names = [op.name for op in context._unsupported_ops] + self.assertEqual(unsupported_ops_names, [ + u'audio_summary', u'histogram_summary', u'image_summary', + u'scalar_summary', u'tensor_summary', u'merge_summary/merge_summary', + u'print_op' + ]) + + def test_resource_variable(self): + """Tests that resource variable usage is allowed.""" + a = variable_scope.get_variable( + name='variable_a', shape=(1), use_resource=True) + + context = self.create_test_xla_compile_context() + context.Enter() + state_ops.assign(a, a + 1) + context.Exit() + + def test_non_resource_variable_error(self): + """Tests that non-resource variable usage is disallowed.""" + a = variable_scope.get_variable( + name='variable_a', shape=(1), use_resource=False) + + context = self.create_test_xla_compile_context() + context.Enter() + with self.assertRaisesRegexp( + NotImplementedError, 'Non-resource Variables are not supported inside ' + r'XLA computations \(operator name: Assign\)'): + state_ops.assign(a, a + 1) + context.Exit() + + def test_nested_xla_compile_error(self): + """Tests that nested XLA computation leads to fatal error.""" + context1 = self.create_test_xla_compile_context() + context1.Enter() + + context2 = self.create_test_xla_compile_context() + context2.Enter() + with self.assertRaisesRegexp(ValueError, + 'XLA compiled computations cannot be nested'): + constant_op.constant(1) + context2.Exit() + context1.Exit() + + def test_xla_compile_attr(self): + """Tests that ops are tagged with XLA compile ID attribute.""" + context = self.create_test_xla_compile_context() + context.Enter() + op = constant_op.constant(1) + context.Exit() + 
self.assertIn('_xla_compile_id', op.op.node_def.attr) + + def test_op_without_input(self): + """Tests that ops without inputs depend on pivot correctly.""" + context = self.create_test_xla_compile_context() + context.Enter() + op = constant_op.constant(1) + context.Exit() + + self.assertIn(context._pivot, op.op.control_inputs) + + def test_external_control_edges(self): + """Tests that external control edges are handled correctly.""" + i = constant_op.constant(1) + op1 = constant_op.constant(1) + + with ops.control_dependencies([op1]): + op2 = constant_op.constant(1) + self.assertIn(op1.op, op2.op.control_inputs) + + def while_body(i): + del i # unused + context = self.create_test_xla_compile_context() + context.Enter() + with ops.control_dependencies([op1]): + op3 = constant_op.constant(1) + context.Exit() + self.assertNotIn(op1.op, op3.op.control_inputs) + return op3 + + control_flow_ops.while_loop( + cond=lambda i: math_ops.less(i, 10), body=while_body, loop_vars=[i]) + + def test_op_output_marked_as_seen(self): + """Tests that any op output is marked as seen in context.""" + context = self.create_test_xla_compile_context() + context.Enter() + op = constant_op.constant(1) + context.Exit() + + self.assertIn(op.name, context._values) + + def testOpIsInContext(self): + """Tests that XLACompileContext is recognized as an XLA context.""" + op1 = constant_op.constant(1) + context = self.create_test_xla_compile_context() + context.Enter() + op2 = constant_op.constant(2) + context.Exit() + self.assertFalse(control_flow_util.IsInXLAContext(op1.op)) + self.assertTrue(control_flow_util.IsInXLAContext(op2.op)) + + def testOpPreventFeeding(self): + """Tests that ops created inside XLACompileContext can not be fed.""" + context = self.create_test_xla_compile_context() + context.Enter() + op = constant_op.constant(1) + context.Exit() + self.assertFalse(op.graph.is_feedable(op.op)) + + def testOpPreventFetching(self): + """Tests that ops created inside XLACompileContext can not 
be fetched.""" + context = self.create_test_xla_compile_context() + context.Enter() + op = constant_op.constant(1) + context.Exit() + self.assertFalse(op.graph.is_fetchable(op.op)) + + +if __name__ == '__main__': + test.main() -- GitLab From 85a6164912e21bc398b930943da7ea90ffe3bc20 Mon Sep 17 00:00:00 2001 From: Ayush Dubey Date: Mon, 27 Aug 2018 14:19:20 -0700 Subject: [PATCH 176/598] Refactor collectives to colocate implementation-specific code. Before this change, introducing a new collective algorithm required touching multiple files. CollectiveParams setup was in common_runtime/collective_param_resolver_local, and the data movement was in common_runtime/reducer and common_runtime/broadcaster. This change introduces CollectiveImplementationInterface. CollectiveImplementationInterface brings together param initialization and data movement for a collective algorithm. Every collective implementation will implement this interface and override the virtual methods. This should hopefully reduce obscurity and lead to code with fewer dependencies. PiperOrigin-RevId: 210430157 --- tensorflow/core/BUILD | 10 +- .../base_collective_executor.cc | 148 +++--- .../common_runtime/base_collective_executor.h | 20 +- tensorflow/core/common_runtime/broadcaster.cc | 300 ------------ .../collective_param_resolver_local.cc | 237 ++-------- .../collective_param_resolver_local.h | 17 +- .../collective_param_resolver_local_test.cc | 204 -------- .../core/common_runtime/collective_util.cc | 83 ++++ .../core/common_runtime/collective_util.h | 38 ++ .../hierarchical_tree_broadcaster.cc | 440 ++++++++++++++++++ ...ster.h => hierarchical_tree_broadcaster.h} | 58 ++- ... 
=> hierarchical_tree_broadcaster_test.cc} | 239 ++++++++-- .../core/common_runtime/ring_reducer.cc | 320 ++++++++----- tensorflow/core/common_runtime/ring_reducer.h | 55 ++- .../core/common_runtime/ring_reducer_test.cc | 112 ++++- tensorflow/core/framework/collective.cc | 102 +++- tensorflow/core/framework/collective.h | 113 ++++- 17 files changed, 1427 insertions(+), 1069 deletions(-) delete mode 100644 tensorflow/core/common_runtime/broadcaster.cc create mode 100644 tensorflow/core/common_runtime/collective_util.cc create mode 100644 tensorflow/core/common_runtime/collective_util.h create mode 100644 tensorflow/core/common_runtime/hierarchical_tree_broadcaster.cc rename tensorflow/core/common_runtime/{broadcaster.h => hierarchical_tree_broadcaster.h} (53%) rename tensorflow/core/common_runtime/{broadcaster_test.cc => hierarchical_tree_broadcaster_test.cc} (80%) diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 44662ea79e..51225f34bc 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -2707,12 +2707,13 @@ CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [ "common_runtime/allocator_retry.h", "common_runtime/base_collective_executor.h", "common_runtime/bfc_allocator.h", - "common_runtime/broadcaster.h", + "common_runtime/hierarchical_tree_broadcaster.h", "common_runtime/buf_rendezvous.h", "common_runtime/build_graph_options.h", "common_runtime/collective_executor_mgr.h", "common_runtime/collective_param_resolver_local.h", "common_runtime/collective_rma_local.h", + "common_runtime/collective_util.h", "common_runtime/constant_folding.h", "common_runtime/copy_tensor.h", "common_runtime/costmodel_manager.h", @@ -2758,12 +2759,12 @@ tf_cuda_library( "common_runtime/allocator_retry.cc", "common_runtime/base_collective_executor.cc", "common_runtime/bfc_allocator.cc", - "common_runtime/broadcaster.cc", "common_runtime/buf_rendezvous.cc", "common_runtime/build_graph_options.cc", "common_runtime/collective_executor_mgr.cc", 
"common_runtime/collective_param_resolver_local.cc", "common_runtime/collective_rma_local.cc", + "common_runtime/collective_util.cc", "common_runtime/constant_folding.cc", "common_runtime/copy_tensor.cc", "common_runtime/costmodel_manager.cc", @@ -2778,6 +2779,7 @@ tf_cuda_library( "common_runtime/function.cc", "common_runtime/graph_optimizer.cc", "common_runtime/graph_runner.cc", + "common_runtime/hierarchical_tree_broadcaster.cc", "common_runtime/local_device.cc", "common_runtime/lower_if_op.cc", "common_runtime/memory_types.cc", @@ -3664,10 +3666,10 @@ tf_cc_tests_gpu( ) tf_cc_tests_gpu( - name = "broadcaster_test", + name = "hierarchical_tree_broadcaster_test", size = "small", srcs = [ - "common_runtime/broadcaster_test.cc", + "common_runtime/hierarchical_tree_broadcaster_test.cc", ], linkstatic = tf_kernel_tests_linkstatic(), tags = tf_cuda_tests_tags(), diff --git a/tensorflow/core/common_runtime/base_collective_executor.cc b/tensorflow/core/common_runtime/base_collective_executor.cc index 425a628a49..5b01f7fa03 100644 --- a/tensorflow/core/common_runtime/base_collective_executor.cc +++ b/tensorflow/core/common_runtime/base_collective_executor.cc @@ -14,13 +14,28 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/core/common_runtime/base_collective_executor.h" -#include "tensorflow/core/common_runtime/broadcaster.h" +#include +#include +#include + #include "tensorflow/core/common_runtime/copy_tensor.h" #include "tensorflow/core/common_runtime/device_mgr.h" #include "tensorflow/core/common_runtime/dma_helper.h" +#include "tensorflow/core/common_runtime/hierarchical_tree_broadcaster.h" #include "tensorflow/core/common_runtime/process_util.h" #include "tensorflow/core/common_runtime/ring_reducer.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/notification.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" #define VALUE_IN_DEBUG_STRING false @@ -211,104 +226,67 @@ void BaseCollectiveExecutor::ExecuteAsync(OpKernelContext* ctx, }; Tensor* output = ctx->mutable_output(0); - string error; - switch (col_params.instance.type) { - case REDUCTION_COLLECTIVE: { - // TODO(tucker): support other reduction algorithms, - // e.g. tree-reduce, hybrid tree/ring, delegate-to-NCCL, etc. - const Tensor* input = &ctx->input(0); - RingReducer* reducer = - CreateReducer(ctx, CtxParams(ctx), col_params, exec_key, step_id_, - input, output, &error); - if (!reducer) { - done_safe(errors::Internal(error)); - return; - } - // Run in an I/O thread, so as not to starve the executor threads. 
- // TODO(tucker): Instead of forking every per-device Collective - // Op off into its own thread, consider queuing them on a - // fixed-size thread-pool dedicated to running CollectiveOps. - SchedClosure([reducer, done_safe]() { - reducer->Run([reducer, done_safe](const Status& s) { - done_safe(s); - delete reducer; - }); - }); - } break; - - case BROADCAST_COLLECTIVE: { - Broadcaster* broadcaster = CreateBroadcaster( - ctx, CtxParams(ctx), col_params, exec_key, step_id_, output, &error); - if (!broadcaster) { - done_safe(errors::Internal(error)); - return; - } - // Run in an I/O thread, so as not to starve the executor threads. - SchedClosure([broadcaster, done_safe]() { - broadcaster->Run([broadcaster, done_safe](const Status& s) { - done_safe(s); - delete broadcaster; - }); - }); - } break; - - default: - done_safe(errors::Internal("Unimplemented CollectiveType ", - col_params.instance.type)); + const Tensor* input = (col_params.instance.type == REDUCTION_COLLECTIVE || + (col_params.instance.type == BROADCAST_COLLECTIVE && + col_params.is_source)) + ? 
&ctx->input(0) + : nullptr; + CollectiveImplementationInterface* col_impl = nullptr; + Status status = CreateCollective(col_params, &col_impl); + if (!status.ok()) { + done_safe(status); + DCHECK_EQ(nullptr, col_impl); + return; } -} - -RingReducer* BaseCollectiveExecutor::CreateReducer( - OpKernelContext* ctx, OpKernelContext::Params* params, - const CollectiveParams& col_params, const string& exec_key, int64 step_id, - const Tensor* input, Tensor* output, string* error) { - switch (col_params.instance.data_type) { - case DT_INT32: - if (col_params.group.device_type == DEVICE_GPU) { - *error = - "Collective Reduce does not support datatype DT_INT32 on " - "DEVICE_GPU"; - return nullptr; - } - TF_FALLTHROUGH_INTENDED; - case DT_FLOAT: - case DT_DOUBLE: - case DT_INT64: - return new RingReducer(this, dev_mgr_, ctx, params, col_params, exec_key, - step_id, input, output); - break; - default: - *error = strings::StrCat("Collective Reduce does not support datatype ", - col_params.instance.data_type); - return nullptr; + CollectiveContext* col_ctx = + new CollectiveContext(this, dev_mgr_, ctx, CtxParams(ctx), col_params, + exec_key, step_id_, input, output); + status = col_impl->InitializeCollectiveContext(col_ctx); + if (!status.ok()) { + done_safe(status); + delete col_ctx; + delete col_impl; + return; } + // Run in an I/O thread, so as not to starve the executor threads. + // TODO(b/80529858): Instead of forking every per-device Collective + // Op off into its own thread, consider queuing them on a + // fixed-size thread-pool dedicated to running CollectiveOps. 
+ SchedClosure([col_impl, col_ctx, done_safe]() { + col_impl->Run([col_impl, col_ctx, done_safe](const Status& s) { + done_safe(s); + delete col_ctx; + delete col_impl; + }); + }); } -Broadcaster* BaseCollectiveExecutor::CreateBroadcaster( - OpKernelContext* ctx, OpKernelContext::Params* params, - const CollectiveParams& col_params, const string& exec_key, int64 step_id, - Tensor* output, string* error) { +Status BaseCollectiveExecutor::CreateCollective( + const CollectiveParams& col_params, + CollectiveImplementationInterface** col_impl) { + *col_impl = nullptr; + Status status; switch (col_params.instance.data_type) { case DT_INT32: if (col_params.group.device_type == DEVICE_GPU) { - *error = - "Collective Broadcast does not support datatype DT_INT32 on " - "DEVICE_GPU"; - return nullptr; + status = errors::Internal( + "CollectiveImplementation does not support datatype DT_INT32 on " + "DEVICE_GPU"); } TF_FALLTHROUGH_INTENDED; case DT_FLOAT: case DT_DOUBLE: case DT_INT64: { - return new Broadcaster(this, dev_mgr_, ctx, params, col_params, exec_key, - step_id, output); - } break; + status = CollectiveRegistry::Lookup( + col_params.instance.impl_details.collective_name, col_impl); + break; + } default: - *error = - strings::StrCat("Collective Broadcast does not support datatype ", - DataTypeString(col_params.instance.data_type)); - return nullptr; + status = errors::Internal( + "CollectiveImplementation does not support datatype ", + col_params.instance.data_type); } + return status; } } // namespace tensorflow diff --git a/tensorflow/core/common_runtime/base_collective_executor.h b/tensorflow/core/common_runtime/base_collective_executor.h index 3af9286264..360ce4db7b 100644 --- a/tensorflow/core/common_runtime/base_collective_executor.h +++ b/tensorflow/core/common_runtime/base_collective_executor.h @@ -15,15 +15,17 @@ limitations under the License. 
#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_BASE_COLLECTIVE_EXECUTOR_H_ #define TENSORFLOW_CORE_COMMON_RUNTIME_BASE_COLLECTIVE_EXECUTOR_H_ +#include #include + #include "tensorflow/core/common_runtime/buf_rendezvous.h" #include "tensorflow/core/framework/collective.h" #include "tensorflow/core/framework/device_attributes.pb.h" namespace tensorflow { -class Broadcaster; +class CollectiveImplementation; class DeviceMgr; -class RingReducer; +class Device; // Helper interface that aliases regular subfields of a Tensor as separate // Tensors for in-place update. @@ -133,18 +135,8 @@ class BaseCollectiveExecutor : public CollectiveExecutor { std::unique_ptr remote_access_; private: - RingReducer* CreateReducer(OpKernelContext* ctx, - OpKernelContext::Params* params, - const CollectiveParams& col_params, - const string& exec_key, int64 step_id, - const Tensor* input, Tensor* output, - string* error); - - Broadcaster* CreateBroadcaster(OpKernelContext* ctx, - OpKernelContext::Params* params, - const CollectiveParams& col_params, - const string& exec_key, int64 step_id, - Tensor* output, string* error); + Status CreateCollective(const CollectiveParams& col_params, + CollectiveImplementationInterface** col_impl); }; } // namespace tensorflow diff --git a/tensorflow/core/common_runtime/broadcaster.cc b/tensorflow/core/common_runtime/broadcaster.cc deleted file mode 100644 index e1c6b21939..0000000000 --- a/tensorflow/core/common_runtime/broadcaster.cc +++ /dev/null @@ -1,300 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#include "tensorflow/core/common_runtime/broadcaster.h" - -#include "tensorflow/core/common_runtime/collective_rma_local.h" -#include "tensorflow/core/common_runtime/device_mgr.h" -#include "tensorflow/core/common_runtime/dma_helper.h" -#include "tensorflow/core/lib/core/notification.h" -#include "tensorflow/core/platform/env.h" - -// Set true for greater intelligibility of debug mode log messages. -#define READABLE_KEYS false - -namespace tensorflow { - -namespace { -// Key to be used for BufRendezvous by Broadcaster. -string BroadcastBufKey(const string& exec_key, int subdiv, int src_rank, - int dst_rank) { - if (READABLE_KEYS) { - return strings::StrCat("broadcast(", exec_key, "):subdiv(", subdiv, - "):src(", src_rank, "):dst(", dst_rank, ")"); - } else { - // TODO(tucker): Try a denser format, e.g. a 64 or 128 bit hash. - return strings::StrCat(exec_key, ":", subdiv, ":", src_rank, ":", dst_rank); - } -} -} // namespace - -Broadcaster::Broadcaster(CollectiveExecutor* col_exec, const DeviceMgr* dev_mgr, - OpKernelContext* ctx, OpKernelContext::Params* params, - const CollectiveParams& col_params, - const string& exec_key, int64 step_id, Tensor* output) - : col_exec_(col_exec), - dev_mgr_(dev_mgr), - ctx_(ctx), - col_params_(col_params), - exec_key_(exec_key), - rank_(col_params.subdiv_rank[0]), - is_source_(col_params.is_source), - output_(output), - done_(nullptr), - device_(nullptr) {} - -void Broadcaster::Run(StatusCallback done) { - // The optimal data transfer choreography is going to very platform dependent. - // That will be addressed by later improvements here or by platform-specific - // overrides of collective broadcast. The initial version is simply - // a binary tree that completely ignores DeviceLocality. 
- done_ = std::move(done); - - // Get the device for which we're executing and look up its locality. - status_ = dev_mgr_->LookupDevice( - col_params_.instance.device_names[col_params_.default_rank], &device_); - if (!status_.ok()) { - done_(status_); - return; - } - CHECK(device_); - device_locality_ = device_->attributes().locality(); - - RunTree(); -} - -// Binary tree parent/child relations are trivial to calculate, i.e. -// device at rank r is the parent of 2r+1 and 2r+2. The one exception -// is if the source is not rank 0. We treat that case as though the -// source is appended to the front of the rank ordering as well as -// continuing to occupy its current position. Hence we calculate as -// though each device's rank is actually r+1, then subtract 1 again to -// get the descendent ranks. If the source is not rank 0 then its -// descendants include both {0,1} and the descendents of its current -// position. Where a non-0-rank source is a descendent of another -// device, no send to it is necessary. - -/* static*/ -int Broadcaster::TreeRecvFrom(const CollectiveParams& cp, int subdiv) { - DCHECK_LT(subdiv, static_cast(cp.subdiv_rank.size())); - int my_rank = cp.subdiv_rank[subdiv]; - if (-1 == my_rank) return -1; - - const auto& impl = cp.instance.impl_details; - DCHECK_LT(subdiv, static_cast(impl.subdiv_source_rank.size())); - int source_rank = impl.subdiv_source_rank[subdiv]; - if (my_rank == source_rank) return -1; - if (source_rank == 0) { - return (my_rank - 1) / 2; - } else { - int predecessor_rank = (my_rank / 2) - 1; - return (predecessor_rank < 0) ? 
source_rank : predecessor_rank; - } -} - -/* static */ -void Broadcaster::TreeSendTo(const CollectiveParams& cp, int subdiv, - std::vector* targets) { - DCHECK_LT(subdiv, static_cast(cp.subdiv_rank.size())); - int my_rank = cp.subdiv_rank[subdiv]; - if (-1 == my_rank) return; - - const auto& impl = cp.instance.impl_details; - DCHECK_LT(subdiv, static_cast(impl.subdiv_source_rank.size())); - int source_rank = impl.subdiv_source_rank[subdiv]; - - int group_size = 0; - for (int i = 0; i < impl.subdiv_permutations[subdiv].size(); i++) { - if (impl.subdiv_permutations[subdiv][i] >= 0) { - group_size++; - } - } - - targets->clear(); - int successor_rank = 0; - if (source_rank == 0) { - successor_rank = (2 * my_rank) + 1; - } else { - successor_rank = (2 * (my_rank + 1)); - } - DCHECK_NE(successor_rank, my_rank); - if (cp.is_source && source_rank != 0) { - // The source sends to rank 0,1 in addition to its positional - // descendants. - if (group_size > 1) { - targets->push_back(0); - } - if (group_size > 2 && source_rank != 1) { - targets->push_back(1); - } - } - for (int i = 0; i < 2; ++i) { - if (successor_rank < group_size && successor_rank != source_rank) { - targets->push_back(successor_rank); - } - ++successor_rank; - } -} - -// Executes a hierarchical tree broadcast. -// Each subdiv is a broadcast between a subset of the devices. -// If there is only one task, there is one subdiv comprising a broadcast between -// all devices belonging to the task. -// If there are n tasks, n>1, then there are n+1 subdivs. In the first (global) -// subdiv, one device from each task participates in a binary tree broadcast. -// Each task receives a copy of the tensor on one device via this broadcast. -// Subsequent subdivs correspond to intra-task broadcasts. Subdiv i+1 -// corresponds to broadcast between all devices on task i. Thus, each task -// participates in at most 2 subdivs. 
-void Broadcaster::RunTree() { - int num_subdivs = static_cast(col_params_.subdiv_rank.size()); - // TODO(ayushd): this is easily improved when a node participates in both - // first and second subdivision. It would first send to its descendents in - // the first subdiv, then wait until all pending ops are finished before - // sending to descendents in second subdiv. A better implementation would - // collapse the two send blocks. - for (int si = 0; si < num_subdivs; si++) { - int my_rank = col_params_.subdiv_rank[si]; - // If rank is -1, this device does not participate in this subdiv. - if (-1 == my_rank) continue; - int source_rank = col_params_.instance.impl_details.subdiv_source_rank[si]; - if (VLOG_IS_ON(1)) { - string subdiv_buf; - for (int r : col_params_.instance.impl_details.subdiv_permutations[si]) { - strings::StrAppend(&subdiv_buf, r, ","); - } - VLOG(1) << "Running Broadcast tree device=" << device_->name() - << " subdiv=" << si << " perm=" << subdiv_buf - << " my_rank=" << my_rank << " source_rank=" << source_rank; - } - - mutex mu; // also guards status_ while callbacks are pending - int pending_count = 0; // GUARDED_BY(mu) - condition_variable all_done; - - if (my_rank >= 0 && my_rank != source_rank) { - // Begin by receiving the value. - int recv_from_rank = TreeRecvFrom(col_params_, si); - Notification note; - DispatchRecv(si, recv_from_rank, my_rank, output_, - [this, &mu, &note](const Status& s) { - mutex_lock l(mu); - status_.Update(s); - note.Notify(); - }); - note.WaitForNotification(); - } - - // Then forward value to all descendent devices. - if (my_rank >= 0 && status_.ok()) { - std::vector send_to_ranks; - TreeSendTo(col_params_, si, &send_to_ranks); - for (int i = 0; i < send_to_ranks.size(); ++i) { - int target_rank = send_to_ranks[i]; - { - mutex_lock l(mu); - ++pending_count; - } - DispatchSend(si, target_rank, my_rank, - (is_source_ ? 
&ctx_->input(0) : output_), - [this, &mu, &pending_count, &all_done](const Status& s) { - mutex_lock l(mu); - status_.Update(s); - --pending_count; - if (pending_count == 0) { - all_done.notify_all(); - } - }); - } - } - - // For the original source device, we copy input to output if they are - // different. - // If there is only 1 subdiv, we do this in that subdiv. If there is more - // than 1 subdiv, then the original source device will participate in 2 - // subdivs - the global inter-task broadcast and one local intra-task - // broadcast. In this case, we perform the copy in the second subdiv for - // this device. - if (status_.ok() && is_source_ && (1 == num_subdivs || 0 != si)) { - VLOG(2) << "copying input to output for device=" << device_->name() - << " subdiv=" << si; - const Tensor* input = &ctx_->input(0); - if (input != output_ && - (DMAHelper::base(input) != DMAHelper::base(output_))) { - { - mutex_lock l(mu); - ++pending_count; - } - DeviceContext* op_dev_ctx = ctx_->op_device_context(); - CollectiveRemoteAccessLocal::MemCpyAsync( - op_dev_ctx, op_dev_ctx, device_, device_, ctx_->input_alloc_attr(0), - ctx_->output_alloc_attr(0), input, output_, 0, /*stream_index*/ - [this, &mu, &pending_count, &all_done](const Status& s) { - mutex_lock l(mu); - status_.Update(s); - --pending_count; - if (0 == pending_count) { - all_done.notify_all(); - } - }); - } - } - - // Then wait for all pending actions to complete. 
- { - mutex_lock l(mu); - if (pending_count > 0) { - all_done.wait(l); - } - } - } - VLOG(2) << "device=" << device_->name() << " return status " << status_; - done_(status_); -} - -void Broadcaster::DispatchSend(int subdiv, int dst_rank, int src_rank, - const Tensor* src_tensor, - const StatusCallback& done) { - string send_buf_key = BroadcastBufKey(exec_key_, subdiv, src_rank, dst_rank); - int dst_idx = - col_params_.instance.impl_details.subdiv_permutations[subdiv][dst_rank]; - VLOG(1) << "DispatchSend " << send_buf_key << " from_device " - << device_->name() << " to_device " - << col_params_.instance.device_names[dst_idx] << " subdiv=" << subdiv - << " dst_rank=" << dst_rank << " dst_idx=" << dst_idx; - col_exec_->PostToPeer(col_params_.instance.device_names[dst_idx], - col_params_.instance.task_names[dst_idx], send_buf_key, - device_, ctx_->op_device_context(), - ctx_->output_alloc_attr(0), src_tensor, - device_locality_, done); -} - -void Broadcaster::DispatchRecv(int subdiv, int src_rank, int dst_rank, - Tensor* dst_tensor, const StatusCallback& done) { - string recv_buf_key = BroadcastBufKey(exec_key_, subdiv, src_rank, dst_rank); - int src_idx = - col_params_.instance.impl_details.subdiv_permutations[subdiv][src_rank]; - VLOG(1) << "DispatchRecv " << recv_buf_key << " from_device " - << col_params_.instance.device_names[src_idx] << " to_device " - << device_->name() << " subdiv=" << subdiv << " src_rank=" << src_rank - << " src_idx=" << src_idx; - col_exec_->RecvFromPeer(col_params_.instance.device_names[src_idx], - col_params_.instance.task_names[src_idx], - col_params_.task.is_local[src_idx], recv_buf_key, - device_, ctx_->op_device_context(), - ctx_->output_alloc_attr(0), dst_tensor, - device_locality_, 0 /*stream_index*/, done); -} - -} // namespace tensorflow diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local.cc b/tensorflow/core/common_runtime/collective_param_resolver_local.cc index 2a14493a67..52eedae9b7 100644 --- 
a/tensorflow/core/common_runtime/collective_param_resolver_local.cc +++ b/tensorflow/core/common_runtime/collective_param_resolver_local.cc @@ -14,7 +14,20 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/core/common_runtime/collective_param_resolver_local.h" +#include +#include +#include +#include + #include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/framework/cancellation.h" +#include "tensorflow/core/framework/device_attributes.pb.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/device_name_utils.h" namespace tensorflow { @@ -319,206 +332,6 @@ void SortDevicesAndTasks(CollectiveParams* cp) { } } // namespace -int GetDeviceTask(int device_rank, const std::vector& dev_per_task) { - int num_tasks = static_cast(dev_per_task.size()); - int task_lo = 0; - int task_hi; - for (int ti = 0; ti < num_tasks; ti++) { - task_hi = task_lo + dev_per_task[ti]; - if (task_lo <= device_rank && device_rank < task_hi) return ti; - task_lo += dev_per_task[ti]; - } - LOG(FATAL) << "Unexpected device rank " << device_rank << " for " << task_hi - << " devices"; - return -1; -} - -void CollectiveParamResolverLocal::GenerateBcastSubdivPerms( - const string& device, int source_rank, const std::vector& dev_per_task, - CollectiveParams* cp) { - if (VLOG_IS_ON(1)) { - string dpt_buf; - for (int dpt : dev_per_task) strings::StrAppend(&dpt_buf, dpt, ";"); - VLOG(1) << "GenerateBcastSubdivPerms device=" << device - << " source_rank=" << source_rank << " dev_per_task=" << dpt_buf; - } - int num_tasks = cp->group.num_tasks; - // If there is just 1 task, then execute binary tree broadcast over all - // devices. 
Otherwise, the first subdiv is inter-task broadcast, and then - // there are N more subdivs, where N is #task. - int num_subdivs = num_tasks + (num_tasks > 1 ? 1 : 0); - int total_num_devices = 0; - for (int num_dev : dev_per_task) total_num_devices += num_dev; - - cp->instance.impl_details.subdiv_permutations.resize(num_subdivs); - cp->subdiv_rank.reserve(num_subdivs); - cp->instance.impl_details.subdiv_source_rank.reserve(num_subdivs); - - // Inter-task subdiv. Pick one device from each task - this is the source - // device if it belongs to that task, or device 0 for that task. If a device - // does not participate in the subdiv, set subdiv_rank to -1. - if (num_tasks > 1) { - const int sdi = 0; - std::vector& perm = cp->instance.impl_details.subdiv_permutations[sdi]; - CHECK_EQ(perm.size(), 0); - int device_count = 0; - int source_task = GetDeviceTask(source_rank, dev_per_task); - for (int ti = 0; ti < cp->group.num_tasks; ti++) { - bool participate = false; - if (source_task == ti) { - // Source device belongs to this task. - perm.push_back(source_rank); - participate = cp->instance.device_names[source_rank] == device; - } else { - // Source does not belong to this task, choose dev 0. - perm.push_back(device_count); - participate = cp->instance.device_names[device_count] == device; - } - if (participate) cp->subdiv_rank.push_back(ti); - device_count += dev_per_task[ti]; - } - if (cp->subdiv_rank.empty()) cp->subdiv_rank.push_back(-1); - cp->instance.impl_details.subdiv_source_rank.push_back(source_task); - } - - // Intra-task subdivs. Pick all devices in task ti for subdiv sdi. Set - // source to dev 0 for that task if it does not contain original source, else - // set to rank of original source. If a device does not participate in the - // subdiv, set subdiv_rank to -1; - int abs_di = 0; - for (int ti = 0; ti < cp->group.num_tasks; ti++) { - const int sdi = ti + (num_tasks > 1 ? 
1 : 0); - std::vector& perm = cp->instance.impl_details.subdiv_permutations[sdi]; - CHECK_EQ(perm.size(), 0); - bool participate = false; - int subdiv_source = 0; - for (int di = 0; di < dev_per_task[ti]; di++) { - perm.push_back(abs_di); - if (cp->instance.device_names[abs_di] == device) { - participate = true; - cp->subdiv_rank.push_back(di); - } - if (abs_di == source_rank) subdiv_source = di; - abs_di++; - } - if (!participate) cp->subdiv_rank.push_back(-1); - cp->instance.impl_details.subdiv_source_rank.push_back(subdiv_source); - } - - for (int sri = 0; sri < num_subdivs; sri++) { - CHECK_GE(cp->instance.impl_details.subdiv_source_rank[sri], 0); - } -} - -// Establish the requested number of subdivision permutations based on the -// ring order implicit in the device order. -/*static*/ -void CollectiveParamResolverLocal::GenerateSubdivPerms(const string& device, - int source_rank, - CollectiveParams* cp) { - // Each subdiv permutation is a ring formed by rotating each - // single-task subsequence of devices by an offset. This makes most - // sense when each task has the same number of devices but we can't - // depend on that being the case so we'll compute something that - // works in any case. - - // Start by counting the devices in each task. - // Precondition: device_names must be sorted so that all devices in - // the same task are adjacent. 
- VLOG(2) << "Sorted task names: " - << str_util::Join(cp->instance.task_names, ", "); - std::vector dev_per_task; - const string* prior_task_name = &cp->instance.task_names[0]; - int dev_count = 1; - for (int di = 1; di < cp->group.group_size; ++di) { - if (cp->instance.task_names[di] != *prior_task_name) { - dev_per_task.push_back(dev_count); - dev_count = 1; - prior_task_name = &cp->instance.task_names[di]; - } else { - ++dev_count; - } - } - dev_per_task.push_back(dev_count); - CHECK_EQ(cp->group.num_tasks, dev_per_task.size()); - - CHECK(cp->instance.type == REDUCTION_COLLECTIVE || - cp->instance.type == BROADCAST_COLLECTIVE); - if (cp->instance.type == REDUCTION_COLLECTIVE) { - // Generate a ring permutation for each requested offset. - CHECK_GT(cp->instance.impl_details.subdiv_offsets.size(), 0); - VLOG(2) << "Setting up perms for cp " << cp << " subdiv_permutations " - << &cp->instance.impl_details.subdiv_permutations; - cp->instance.impl_details.subdiv_permutations.resize( - cp->instance.impl_details.subdiv_offsets.size()); - cp->subdiv_rank.resize(cp->instance.impl_details.subdiv_offsets.size(), -1); - for (int sdi = 0; sdi < cp->instance.impl_details.subdiv_offsets.size(); - ++sdi) { - std::vector& perm = - cp->instance.impl_details.subdiv_permutations[sdi]; - CHECK_EQ(perm.size(), 0); - int offset = cp->instance.impl_details.subdiv_offsets[sdi]; - // A negative subdivision offset is interpreted as follows: - // 1. Reverse the local device ordering. - // 2. Begin the subdivision at abs(offset) in the reversed ordering. - bool reverse = false; - if (offset < 0) { - offset = abs(offset); - reverse = true; - } - int prior_dev_count = 0; // sum over prior worker device counts - for (int ti = 0; ti < cp->group.num_tasks; ++ti) { - for (int di = 0; di < dev_per_task[ti]; ++di) { - int di_offset = (di + offset) % dev_per_task[ti]; - int offset_di = - reverse ? (dev_per_task[ti] - (di_offset + 1)) : di_offset; - // Device index in global subdivision permutation. 
- int permuted_di = prior_dev_count + offset_di; - int rank = static_cast(perm.size()); - perm.push_back(permuted_di); - if (cp->instance.device_names[permuted_di] == device) { - CHECK_EQ(permuted_di, cp->default_rank); - cp->subdiv_rank[sdi] = rank; - } - } - prior_dev_count += dev_per_task[ti]; - } - CHECK_EQ(cp->group.group_size, perm.size()); - } - } else if (cp->instance.type == BROADCAST_COLLECTIVE) { - GenerateBcastSubdivPerms(device, source_rank, dev_per_task, cp); - } - - if (VLOG_IS_ON(1)) { - // Log the computed ring order for each subdiv. - string buf; - for (int sdi = 0; - sdi < cp->instance.impl_details.subdiv_permutations.size(); ++sdi) { - buf = strings::StrCat("Subdiv ", sdi, " device order:\n"); - for (int di = 0; - di < cp->instance.impl_details.subdiv_permutations[sdi].size(); - ++di) { - int idx = cp->instance.impl_details.subdiv_permutations[sdi][di]; - if (idx >= 0) { - CHECK_GT(cp->instance.device_names.size(), idx); - strings::StrAppend(&buf, cp->instance.device_names[idx], "\n"); - } - } - strings::StrAppend(&buf, " subdiv_offsets: "); - for (auto o : cp->instance.impl_details.subdiv_offsets) - strings::StrAppend(&buf, o, " "); - strings::StrAppend(&buf, " SubdivRank: "); - for (auto d : cp->subdiv_rank) strings::StrAppend(&buf, d, " "); - if (cp->instance.type == BROADCAST_COLLECTIVE) { - strings::StrAppend(&buf, " subdiv_source_rank: "); - for (auto src : cp->instance.impl_details.subdiv_source_rank) - strings::StrAppend(&buf, src, " "); - } - VLOG(1) << buf; - } - } -} - void CollectiveParamResolverLocal::CompleteTaskIsLocal(const string& task_name, CollectiveParams* cp) { cp->task.is_local.resize(cp->group.group_size, false); @@ -785,29 +598,39 @@ void CollectiveParamResolverLocal::CompleteInstanceFromInitializedIRec( // Populate the fields common across task, also default_rank. SetDefaultRank(device, cp); CompleteTaskIsLocal(task_name_, cp); + // TODO(b/113171733): we need a better way to pick the collective + // implementation. 
The ideal way would depend upon the topology and link + // strength before picking a particular implementation. + cp->instance.impl_details.collective_name = + (cp->instance.type == BROADCAST_COLLECTIVE) ? "HierarchicalTreeBroadcast" + : "RingReduce"; + CollectiveImplementationInterface* col_impl; + Status lookup_status = CollectiveRegistry::LookupParamResolverInstance( + cp->instance.impl_details.collective_name, &col_impl); + if (!lookup_status.ok()) { + done(lookup_status); + return; + } // If broadcast, may need to wait for source discovery. if (cp->instance.type == BROADCAST_COLLECTIVE) { CompleteInstanceSource(ir, cp, is_source, - [this, ir, device, cp, done](InstanceRec* irec) { + [col_impl, ir, device, cp, done](InstanceRec* irec) { CHECK_EQ(ir, irec); Status s; - int source_rank; { mutex_lock l(irec->out_mu); irec->WaitForOutMu(l); s = irec->status; - source_rank = irec->source_rank; + cp->source_rank = irec->source_rank; } if (s.ok()) { - GenerateSubdivPerms(device, source_rank, cp); + s = col_impl->InitializeCollectiveParams(cp); } done(s); }); - return; } else { - GenerateSubdivPerms(device, 0, cp); + done(col_impl->InitializeCollectiveParams(cp)); } - done(Status::OK()); } void CollectiveParamResolverLocal::CompleteInstanceSource(InstanceRec* ir, diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local.h b/tensorflow/core/common_runtime/collective_param_resolver_local.h index 9372fd6272..c5c3497e28 100644 --- a/tensorflow/core/common_runtime/collective_param_resolver_local.h +++ b/tensorflow/core/common_runtime/collective_param_resolver_local.h @@ -15,7 +15,11 @@ limitations under the License. 
#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_COLLECTIVE_PARAM_RESOLVER_LOCAL_H_ #define TENSORFLOW_CORE_COMMON_RUNTIME_COLLECTIVE_PARAM_RESOLVER_LOCAL_H_ +#include +#include +#include #include +#include #include "tensorflow/core/framework/collective.h" #include "tensorflow/core/lib/gtl/flatmap.h" @@ -79,6 +83,7 @@ class CollectiveParamResolverLocal : public ParamResolverInterface { // Used to complete/verify CollInstance. struct InstanceRec; + typedef std::function IRConsumer; struct InstanceRec { // This structure has two mutexes so that a possibly long @@ -212,18 +217,6 @@ class CollectiveParamResolverLocal : public ParamResolverInterface { void CallbackWithStatus(const InstanceRecCallback& done, InstanceRec* irec) LOCKS_EXCLUDED(irec->out_mu); - friend class CollectiveParamResolverLocalTest; - // Establishes the requested number of subdivision permutations based on the - // ring order implicit in the device order. - static void GenerateSubdivPerms(const string& device, int source_rank, - CollectiveParams* cp); - // Establishes the subdivisions for broadcast op. The first subdiv executes - // binary tree bcast with one device per task. Each subsequent subdiv - // executes intra-task binary tree broadcast. - static void GenerateBcastSubdivPerms(const string& device, int source_rank, - const std::vector& dev_per_task, - CollectiveParams* cp); - const DeviceMgr* dev_mgr_; DeviceResolverInterface* dev_resolver_; // Not owned. 
string task_name_; diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc b/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc index 9ea23b72d2..9e1e2e8d5b 100644 --- a/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc +++ b/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc @@ -44,31 +44,6 @@ class CollectiveParamResolverLocalTest : public ::testing::Test { task_name)); } - void GenSubdivPerms(const string& device, int source_rank, - CollectiveParams* cp) { - CollectiveParamResolverLocal::GenerateSubdivPerms(device, source_rank, cp); - } - - // Calls GenerateBcastSubdivPerms for device at `device_rank`. Checks if the - // generated subdiv perms, ranks, and source ranks match the expected values. - void BcastSubdivPerms( - CollectiveParams* cp, const std::vector& dev_per_task, - int device_rank, int source_rank, - const std::vector>& expected_subdiv_perms, - const std::vector& expected_subdiv_rank, - const std::vector& expected_subdiv_source_rank) { - cp->subdiv_rank.clear(); - cp->instance.impl_details.subdiv_permutations.clear(); - cp->instance.impl_details.subdiv_source_rank.clear(); - CollectiveParamResolverLocal::GenerateBcastSubdivPerms( - cp->instance.device_names[device_rank], source_rank, dev_per_task, cp); - EXPECT_EQ(expected_subdiv_perms, - cp->instance.impl_details.subdiv_permutations); - EXPECT_EQ(expected_subdiv_rank, cp->subdiv_rank); - EXPECT_EQ(expected_subdiv_source_rank, - cp->instance.impl_details.subdiv_source_rank); - } - std::vector devices_; std::unique_ptr device_mgr_; std::unique_ptr drl_; @@ -114,7 +89,6 @@ TEST_F(CollectiveParamResolverLocalTest, CompleteParamsReduction1Task) { cps[i].instance.device_names[j]); EXPECT_TRUE(cps[i].task.is_local[j]); } - EXPECT_EQ(cps[i].subdiv_rank[0], i); EXPECT_EQ(cps[i].instance.impl_details.subdiv_source_rank.size(), 0); EXPECT_FALSE(cps[i].is_source); EXPECT_EQ(cps[i].default_rank, i); @@ -161,188 +135,10 @@ 
TEST_F(CollectiveParamResolverLocalTest, CompleteParamsBroadcast1Task) { cps[i].instance.device_names[j]); EXPECT_TRUE(cps[i].task.is_local[j]); } - ASSERT_GT(cps[i].subdiv_rank.size(), 0); - EXPECT_EQ(cps[i].subdiv_rank[0], i); - ASSERT_GT(cps[i].instance.impl_details.subdiv_source_rank.size(), 0); - EXPECT_EQ(cps[i].instance.impl_details.subdiv_source_rank[0], 1); EXPECT_EQ(cps[i].is_source, (i == 1)); EXPECT_EQ(cps[i].default_rank, i); EXPECT_TRUE(cps[i].instance.same_num_devices_per_task); } } -TEST_F(CollectiveParamResolverLocalTest, GenerateSubdivPerms) { - static const int kNumDevsPerTask = 8; - static const int kNumTasks = 3; - static const int kNumDevs = kNumDevsPerTask * kNumTasks; - CollectiveParams cp; - std::vector device_names; - std::vector task_names; - cp.group.group_key = 1; - cp.group.group_size = kNumDevs; - cp.group.device_type = DeviceType("GPU"); - cp.group.num_tasks = kNumTasks; - cp.instance.instance_key = 3; - cp.instance.type = REDUCTION_COLLECTIVE; - cp.instance.data_type = DataType(DT_FLOAT); - cp.instance.shape = TensorShape({5}); - cp.instance.impl_details.subdiv_offsets.push_back(0); - cp.is_source = false; - for (int i = 0; i < kNumDevs; ++i) { - int task_id = i / kNumDevsPerTask; - int dev_id = i % kNumDevsPerTask; - string task_name = strings::StrCat("/job:worker/replica:0/task:", task_id); - task_names.push_back(task_name); - string device_name = strings::StrCat(task_name, "/device:GPU:", dev_id); - device_names.push_back(device_name); - cp.instance.task_names.push_back(task_name); - cp.instance.device_names.push_back(device_name); - } - - int test_rank = 0; - cp.default_rank = test_rank; - cp.instance.impl_details.subdiv_offsets = {0, 4}; - GenSubdivPerms(cp.instance.device_names[test_rank], 0, &cp); - std::vector expected_0 = {0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23}; - std::vector expected_1 = {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, - 8, 9, 10, 11, 20, 21, 22, 23, 16, 17, 18, 
19}; - for (int i = 0; i < kNumDevs; ++i) { - EXPECT_EQ(expected_0[i], - cp.instance.impl_details.subdiv_permutations[0][i]); - EXPECT_EQ(expected_1[i], - cp.instance.impl_details.subdiv_permutations[1][i]); - } - EXPECT_EQ(0, cp.subdiv_rank[0]); - EXPECT_EQ(4, cp.subdiv_rank[1]); - - test_rank = 3; - cp.default_rank = test_rank; - cp.instance.impl_details.subdiv_offsets = {3, -3}; - cp.instance.impl_details.subdiv_permutations.clear(); - GenSubdivPerms(cp.instance.device_names[test_rank], 0, &cp); - expected_0 = {3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, - 15, 8, 9, 10, 19, 20, 21, 22, 23, 16, 17, 18}; - expected_1 = {4, 3, 2, 1, 0, 7, 6, 5, 12, 11, 10, 9, - 8, 15, 14, 13, 20, 19, 18, 17, 16, 23, 22, 21}; - for (int i = 0; i < kNumDevs; ++i) { - EXPECT_EQ(expected_0[i], - cp.instance.impl_details.subdiv_permutations[0][i]); - EXPECT_EQ(expected_1[i], - cp.instance.impl_details.subdiv_permutations[1][i]); - } - EXPECT_EQ(0, cp.subdiv_rank[0]); - EXPECT_EQ(1, cp.subdiv_rank[1]); -} - -TEST_F(CollectiveParamResolverLocalTest, GenerateBcastSubdivPerms1Task8GPU) { - CollectiveParams cp; - cp.group.device_type = DeviceType("GPU"); - cp.group.num_tasks = 1; - cp.instance.type = BROADCAST_COLLECTIVE; - for (int i = 0; i < 8; i++) { - string dev_name = - strings::StrCat("/job:worker/replica:0/task:0/device:GPU:", i); - cp.instance.device_names.push_back(dev_name); - } - std::vector dev_per_task = {8}; - - // source 0 device 0 - BcastSubdivPerms(&cp, dev_per_task, 0, 0, {{0, 1, 2, 3, 4, 5, 6, 7}}, {0}, - {0}); - - // source 2 device 2 - BcastSubdivPerms(&cp, dev_per_task, 2, 2, {{0, 1, 2, 3, 4, 5, 6, 7}}, {2}, - {2}); - - // source 2 device 0 - BcastSubdivPerms(&cp, dev_per_task, 0, 2, {{0, 1, 2, 3, 4, 5, 6, 7}}, {0}, - {2}); -} - -TEST_F(CollectiveParamResolverLocalTest, GenerateBcastSubdivPerms4Tasks8GPU) { - CollectiveParams cp; - cp.group.device_type = DeviceType("GPU"); - cp.group.num_tasks = 4; - cp.instance.type = BROADCAST_COLLECTIVE; - for (int ti = 0; ti < 
cp.group.num_tasks; ti++) { - for (int di = 0; di < 8; di++) { - string dev_name = strings::StrCat("/job:worker/replica:0/task:", ti, - "/device:GPU:", di); - cp.instance.device_names.push_back(dev_name); - } - } - std::vector dev_per_task = {8, 8, 8, 8}; - - // source 0 device 0 - BcastSubdivPerms(&cp, dev_per_task, 0, 0, - {{0, 8, 16, 24}, - {0, 1, 2, 3, 4, 5, 6, 7}, - {8, 9, 10, 11, 12, 13, 14, 15}, - {16, 17, 18, 19, 20, 21, 22, 23}, - {24, 25, 26, 27, 28, 29, 30, 31}}, - {0, 0, -1, -1, -1}, {0, 0, 0, 0, 0}); - - // source 2 device 0 - BcastSubdivPerms(&cp, dev_per_task, 0, 2, - {{2, 8, 16, 24}, - {0, 1, 2, 3, 4, 5, 6, 7}, - {8, 9, 10, 11, 12, 13, 14, 15}, - {16, 17, 18, 19, 20, 21, 22, 23}, - {24, 25, 26, 27, 28, 29, 30, 31}}, - {-1, 0, -1, -1, -1}, {0, 2, 0, 0, 0}); - - // source 9 device 9 - BcastSubdivPerms(&cp, dev_per_task, 9, 9, - {{0, 9, 16, 24}, - {0, 1, 2, 3, 4, 5, 6, 7}, - {8, 9, 10, 11, 12, 13, 14, 15}, - {16, 17, 18, 19, 20, 21, 22, 23}, - {24, 25, 26, 27, 28, 29, 30, 31}}, - {1, -1, 1, -1, -1}, {1, 0, 1, 0, 0}); -} - -TEST_F(CollectiveParamResolverLocalTest, - GenerateBcastSubdivPerms4TasksVariableGPU) { - CollectiveParams cp; - cp.group.device_type = DeviceType("GPU"); - cp.group.num_tasks = 4; - std::vector dev_per_task = {4, 4, 6, 8}; - for (int ti = 0; ti < cp.group.num_tasks; ti++) { - for (int di = 0; di < dev_per_task[ti]; di++) { - string dev_name = strings::StrCat("/job:worker/replica:0/task:", ti, - "/device:GPU:", di); - cp.instance.device_names.push_back(dev_name); - } - } - - // source 0 device 0 - BcastSubdivPerms(&cp, dev_per_task, 0, 0, - {{0, 4, 8, 14}, - {0, 1, 2, 3}, - {4, 5, 6, 7}, - {8, 9, 10, 11, 12, 13}, - {14, 15, 16, 17, 18, 19, 20, 21}}, - {0, 0, -1, -1, -1}, {0, 0, 0, 0, 0}); - - // source 2 device 0 - BcastSubdivPerms(&cp, dev_per_task, 0, 2, - {{2, 4, 8, 14}, - {0, 1, 2, 3}, - {4, 5, 6, 7}, - {8, 9, 10, 11, 12, 13}, - {14, 15, 16, 17, 18, 19, 20, 21}}, - {-1, 0, -1, -1, -1}, {0, 2, 0, 0, 0}); - - // source 9 device 5 - 
BcastSubdivPerms(&cp, dev_per_task, 5, 9, - {{0, 4, 9, 14}, - {0, 1, 2, 3}, - {4, 5, 6, 7}, - {8, 9, 10, 11, 12, 13}, - {14, 15, 16, 17, 18, 19, 20, 21}}, - {-1, -1, 1, -1, -1}, {2, 0, 0, 1, 0}); -} - } // namespace tensorflow diff --git a/tensorflow/core/common_runtime/collective_util.cc b/tensorflow/core/common_runtime/collective_util.cc new file mode 100644 index 0000000000..195521a078 --- /dev/null +++ b/tensorflow/core/common_runtime/collective_util.cc @@ -0,0 +1,83 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/core/common_runtime/collective_util.h" + +#include +#include + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/framework/collective.h" +#include "tensorflow/core/framework/device_attributes.pb.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace collective_util { + +/*static*/ +Status InitializeDeviceAndLocality(const DeviceMgr* dev_mgr, + const string& device_name, Device** device, + DeviceLocality* device_locality) { + if (!dev_mgr) { + return errors::Internal("Required non-null dev_mgr ", dev_mgr, + " for InitializeDeviceAndLocality"); + } + + Status status = dev_mgr->LookupDevice(device_name, device); + if (status.ok()) { + CHECK(*device); + *device_locality = (*device)->attributes().locality(); + } else { + LOG(ERROR) << "Failed to find device " << device_name; + for (auto d : dev_mgr->ListDevices()) { + LOG(ERROR) << "Available devices " << d->name(); + } + } + return status; +} + +/*static*/ +string SubdivPermDebugString(const CollectiveParams& col_params) { + const auto& subdiv_perms = + col_params.instance.impl_details.subdiv_permutations; + string buf; + for (int sdi = 0; sdi < subdiv_perms.size(); ++sdi) { + strings::StrAppend(&buf, "Subdiv ", sdi, " device order:\n"); + for (int di = 0; di < subdiv_perms[sdi].size(); ++di) { + int idx = subdiv_perms[sdi][di]; + if (idx >= 0) { + CHECK_GT(col_params.instance.device_names.size(), idx); + strings::StrAppend(&buf, col_params.instance.device_names[idx], "\n"); + } + } + strings::StrAppend(&buf, " subdiv_offsets: "); + for (auto o : col_params.instance.impl_details.subdiv_offsets) + strings::StrAppend(&buf, o, " "); + strings::StrAppend(&buf, " SubdivRank: "); + for (auto d : 
col_params.subdiv_rank) strings::StrAppend(&buf, d, " "); + if (col_params.instance.type == BROADCAST_COLLECTIVE) { + strings::StrAppend(&buf, " subdiv_source_rank: "); + for (auto src : col_params.instance.impl_details.subdiv_source_rank) + strings::StrAppend(&buf, src, " "); + } + strings::StrAppend(&buf, "\n"); + } + return buf; +} + +} // namespace collective_util +} // namespace tensorflow diff --git a/tensorflow/core/common_runtime/collective_util.h b/tensorflow/core/common_runtime/collective_util.h new file mode 100644 index 0000000000..ebb5731bec --- /dev/null +++ b/tensorflow/core/common_runtime/collective_util.h @@ -0,0 +1,38 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_COLLECTIVE_UTIL_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_COLLECTIVE_UTIL_H_ + +#include + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/framework/collective.h" +#include "tensorflow/core/framework/device_attributes.pb.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace collective_util { + +Status InitializeDeviceAndLocality(const DeviceMgr* dev_mgr, + const string& device_name, Device** device, + DeviceLocality* device_locality); +string SubdivPermDebugString(const CollectiveParams& col_params); + +} // namespace collective_util +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_COLLECTIVE_UTIL_H_ diff --git a/tensorflow/core/common_runtime/hierarchical_tree_broadcaster.cc b/tensorflow/core/common_runtime/hierarchical_tree_broadcaster.cc new file mode 100644 index 0000000000..eae34997d9 --- /dev/null +++ b/tensorflow/core/common_runtime/hierarchical_tree_broadcaster.cc @@ -0,0 +1,440 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/core/common_runtime/hierarchical_tree_broadcaster.h" + +#include +#include +#include +#include + +#include "tensorflow/core/common_runtime/collective_rma_local.h" +#include "tensorflow/core/common_runtime/collective_util.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/common_runtime/dma_helper.h" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/notification.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/types.h" + +// Set true for greater intelligibility of debug mode log messages. +#define READABLE_KEYS false + +namespace tensorflow { + +namespace { +// Key to be used for BufRendezvous by Broadcaster. +string BroadcastBufKey(const string& exec_key, int subdiv, int src_rank, + int dst_rank) { + if (READABLE_KEYS) { + return strings::StrCat("broadcast(", exec_key, "):subdiv(", subdiv, + "):src(", src_rank, "):dst(", dst_rank, ")"); + } else { + // TODO(b/78352018): Try a denser format, e.g. a 64 or 128 bit hash. 
+ return strings::StrCat(exec_key, ":", subdiv, ":", src_rank, ":", dst_rank); + } +} +} // namespace + +HierarchicalTreeBroadcaster::HierarchicalTreeBroadcaster() + : col_ctx_(nullptr), + col_params_(nullptr), + done_(nullptr), + is_source_(false) {} + +int HierarchicalTreeBroadcaster::GetDeviceTask( + int device_rank, const std::vector& dev_per_task) { + int num_tasks = static_cast(dev_per_task.size()); + int task_lo = 0; + int task_hi; + for (int ti = 0; ti < num_tasks; ti++) { + task_hi = task_lo + dev_per_task[ti]; + if (task_lo <= device_rank && device_rank < task_hi) return ti; + task_lo = task_hi; + } + LOG(FATAL) << "Unexpected device rank " << device_rank << " for " << task_hi + << " devices"; + return -1; +} + +Status HierarchicalTreeBroadcaster::InitializeCollectiveParams( + CollectiveParams* col_params) { + CHECK_EQ(col_params->instance.type, BROADCAST_COLLECTIVE); + CHECK_EQ(col_params->instance.impl_details.collective_name, + "HierarchicalTreeBroadcast"); + const string& device_name = + col_params->instance.device_names[col_params->default_rank]; + // Start by counting the devices in each task. + // Precondition: device_names must be sorted so that all devices in + // the same task are adjacent. 
+ VLOG(2) << "Sorted task names: " + << str_util::Join(col_params->instance.task_names, ", "); + std::vector dev_per_task; + const string* prior_task_name = &col_params->instance.task_names[0]; + int dev_count = 1; + for (int di = 1; di < col_params->group.group_size; ++di) { + if (col_params->instance.task_names[di] != *prior_task_name) { + dev_per_task.push_back(dev_count); + dev_count = 1; + prior_task_name = &col_params->instance.task_names[di]; + } else { + ++dev_count; + } + } + dev_per_task.push_back(dev_count); + CHECK_EQ(col_params->group.num_tasks, dev_per_task.size()); + + if (VLOG_IS_ON(2)) { + string dpt_buf; + for (int dpt : dev_per_task) strings::StrAppend(&dpt_buf, dpt, ";"); + VLOG(2) << "HierarchicalTreeBroadcaster::InitializeCollectiveParams device=" + << device_name << " source_rank=" << col_params->source_rank + << " dev_per_task=" << dpt_buf; + } + int num_tasks = col_params->group.num_tasks; + // If there is just 1 task, then execute binary tree broadcast over all + // devices. Otherwise, the first subdiv is inter-task broadcast, and then + // there are N more subdivs, where N is #task. + int num_subdivs = num_tasks + (num_tasks > 1 ? 1 : 0); + int total_num_devices = 0; + for (int num_dev : dev_per_task) total_num_devices += num_dev; + + col_params->instance.impl_details.subdiv_permutations.resize(num_subdivs); + col_params->subdiv_rank.reserve(num_subdivs); + col_params->instance.impl_details.subdiv_source_rank.reserve(num_subdivs); + + // Inter-task subdiv. Pick one device from each task - this is the source + // device if it belongs to that task, or device 0 for that task. If a device + // does not participate in the subdiv, set subdiv_rank to -1. 
+ if (num_tasks > 1) { + const int sdi = 0; + std::vector& perm = + col_params->instance.impl_details.subdiv_permutations[sdi]; + CHECK_EQ(perm.size(), 0); + int device_count = 0; + int source_task = GetDeviceTask(col_params->source_rank, dev_per_task); + for (int ti = 0; ti < col_params->group.num_tasks; ti++) { + bool participate = false; + if (source_task == ti) { + // Source device belongs to this task. + perm.push_back(col_params->source_rank); + participate = + col_params->instance.device_names[col_params->source_rank] == + device_name; + } else { + // Source does not belong to this task, choose dev 0. + perm.push_back(device_count); + participate = + col_params->instance.device_names[device_count] == device_name; + } + if (participate) col_params->subdiv_rank.push_back(ti); + device_count += dev_per_task[ti]; + } + if (col_params->subdiv_rank.empty()) col_params->subdiv_rank.push_back(-1); + col_params->instance.impl_details.subdiv_source_rank.push_back(source_task); + } + + // Intra-task subdivs. Pick all devices in task ti for subdiv sdi. Set + // source to dev 0 for that task if it does not contain original source, else + // set to rank of original source. If a device does not participate in + // the subdiv, set subdiv_rank to -1; + int abs_di = 0; + for (int ti = 0; ti < col_params->group.num_tasks; ti++) { + const int sdi = ti + (num_tasks > 1 ? 
1 : 0); + std::vector& perm = + col_params->instance.impl_details.subdiv_permutations[sdi]; + CHECK_EQ(perm.size(), 0); + bool participate = false; + int subdiv_source = 0; + for (int di = 0; di < dev_per_task[ti]; di++) { + perm.push_back(abs_di); + if (col_params->instance.device_names[abs_di] == device_name) { + participate = true; + col_params->subdiv_rank.push_back(di); + } + if (abs_di == col_params->source_rank) subdiv_source = di; + abs_di++; + } + if (!participate) col_params->subdiv_rank.push_back(-1); + col_params->instance.impl_details.subdiv_source_rank.push_back( + subdiv_source); + } + + for (int sri = 0; sri < num_subdivs; sri++) { + CHECK_GE(col_params->instance.impl_details.subdiv_source_rank[sri], 0); + } + + VLOG(2) << collective_util::SubdivPermDebugString(*col_params); + return Status::OK(); +} + +Status HierarchicalTreeBroadcaster::InitializeCollectiveContext( + CollectiveContext* col_ctx) { + CHECK(col_ctx->dev_mgr); + col_ctx_ = col_ctx; + col_params_ = &col_ctx->col_params; + return collective_util::InitializeDeviceAndLocality( + col_ctx->dev_mgr, col_ctx->device_name, &col_ctx->device, + &col_ctx->device_locality); +} + +void HierarchicalTreeBroadcaster::Run(StatusCallback done) { + CHECK(col_ctx_); + CHECK(col_params_); + done_ = std::move(done); + is_source_ = col_params_->is_source; + RunTree(); +} + +// Binary tree parent/child relations are trivial to calculate, i.e. +// device at rank r is the parent of 2r+1 and 2r+2. The one exception +// is if the source is not rank 0. We treat that case as though the +// source is appended to the front of the rank ordering as well as +// continuing to occupy its current position. Hence we calculate as +// though each device's rank is actually r+1, then subtract 1 again to +// get the descendent ranks. If the source is not rank 0 then its +// descendants include both {0,1} and the descendents of its current +// position. 
Where a non-0-rank source is a descendent of another +// device, no send to it is necessary. + +/* static*/ +int HierarchicalTreeBroadcaster::TreeRecvFrom(const CollectiveParams& cp, + int subdiv) { + DCHECK_LT(subdiv, static_cast(cp.subdiv_rank.size())); + int my_rank = cp.subdiv_rank[subdiv]; + if (-1 == my_rank) return -1; + + const auto& impl = cp.instance.impl_details; + DCHECK_LT(subdiv, static_cast(impl.subdiv_source_rank.size())); + int source_rank = impl.subdiv_source_rank[subdiv]; + if (my_rank == source_rank) return -1; + if (source_rank == 0) { + return (my_rank - 1) / 2; + } else { + int predecessor_rank = (my_rank / 2) - 1; + return (predecessor_rank < 0) ? source_rank : predecessor_rank; + } +} + +/* static */ +void HierarchicalTreeBroadcaster::TreeSendTo(const CollectiveParams& cp, + int subdiv, + std::vector* targets) { + DCHECK_LT(subdiv, static_cast(cp.subdiv_rank.size())); + int my_rank = cp.subdiv_rank[subdiv]; + if (-1 == my_rank) return; + + const auto& impl = cp.instance.impl_details; + DCHECK_LT(subdiv, static_cast(impl.subdiv_source_rank.size())); + int source_rank = impl.subdiv_source_rank[subdiv]; + + int group_size = 0; + for (int i = 0; i < impl.subdiv_permutations[subdiv].size(); i++) { + if (impl.subdiv_permutations[subdiv][i] >= 0) { + group_size++; + } + } + + targets->clear(); + int successor_rank = 0; + if (source_rank == 0) { + successor_rank = (2 * my_rank) + 1; + } else { + successor_rank = (2 * (my_rank + 1)); + } + DCHECK_NE(successor_rank, my_rank); + if (cp.is_source && source_rank != 0) { + // The source sends to rank 0,1 in addition to its positional + // descendants. + if (group_size > 1) { + targets->push_back(0); + } + if (group_size > 2 && source_rank != 1) { + targets->push_back(1); + } + } + for (int i = 0; i < 2; ++i) { + if (successor_rank < group_size && successor_rank != source_rank) { + targets->push_back(successor_rank); + } + ++successor_rank; + } +} + +// Executes a hierarchical tree broadcast. 
+// Each subdiv is a broadcast between a subset of the devices. +// If there is only one task, there is one subdiv comprising a broadcast between +// all devices belonging to the task. +// If there are n tasks, n>1, then there are n+1 subdivs. In the first (global) +// subdiv, one device from each task participates in a binary tree broadcast. +// Each task receives a copy of the tensor on one device via this broadcast. +// Subsequent subdivs correspond to intra-task broadcasts. Subdiv i+1 +// corresponds to broadcast between all devices on task i. Thus, each task +// participates in at most 2 subdivs. +void HierarchicalTreeBroadcaster::RunTree() { + int num_subdivs = static_cast(col_params_->subdiv_rank.size()); + // TODO(b/78352018): this is easily improved when a node participates in both + // first and second subdivision. It would first send to its descendents in + // the first subdiv, then wait until all pending ops are finished before + // sending to descendents in second subdiv. A better implementation would + // collapse the two send blocks. + for (int si = 0; si < num_subdivs; si++) { + int my_rank = col_params_->subdiv_rank[si]; + // If rank is -1, this device does not participate in this subdiv. + if (-1 == my_rank) continue; + int source_rank = col_params_->instance.impl_details.subdiv_source_rank[si]; + if (VLOG_IS_ON(1)) { + string subdiv_buf; + for (int r : col_params_->instance.impl_details.subdiv_permutations[si]) { + strings::StrAppend(&subdiv_buf, r, ","); + } + VLOG(1) << "Running Broadcast tree device=" << col_ctx_->device_name + << " subdiv=" << si << " perm=" << subdiv_buf + << " my_rank=" << my_rank << " source_rank=" << source_rank; + } + + mutex mu; // also guards status_ while callbacks are pending + int pending_count = 0; // GUARDED_BY(mu) + condition_variable all_done; + + if (my_rank >= 0 && my_rank != source_rank) { + // Begin by receiving the value. 
+ int recv_from_rank = TreeRecvFrom(*col_params_, si); + Notification note; + DispatchRecv(si, recv_from_rank, my_rank, col_ctx_->output, + [this, &mu, ¬e](const Status& s) { + mutex_lock l(mu); + status_.Update(s); + note.Notify(); + }); + note.WaitForNotification(); + } + + // Then forward value to all descendent devices. + if (my_rank >= 0 && status_.ok()) { + std::vector send_to_ranks; + TreeSendTo(*col_params_, si, &send_to_ranks); + for (int i = 0; i < send_to_ranks.size(); ++i) { + int target_rank = send_to_ranks[i]; + { + mutex_lock l(mu); + ++pending_count; + } + DispatchSend(si, target_rank, my_rank, + (is_source_ ? col_ctx_->input : col_ctx_->output), + [this, &mu, &pending_count, &all_done](const Status& s) { + mutex_lock l(mu); + status_.Update(s); + --pending_count; + if (pending_count == 0) { + all_done.notify_all(); + } + }); + } + } + + // For the original source device, we copy input to output if they are + // different. + // If there is only 1 subdiv, we do this in that subdiv. If there is more + // than 1 subdiv, then the original source device will participate in 2 + // subdivs - the global inter-task broadcast and one local intra-task + // broadcast. In this case, we perform the copy in the second subdiv for + // this device. 
+ if (status_.ok() && is_source_ && (1 == num_subdivs || 0 != si)) { + VLOG(2) << "copying input to output for device=" << col_ctx_->device_name + << " subdiv=" << si; + if (col_ctx_->input != col_ctx_->output && + (DMAHelper::base(col_ctx_->input) != + DMAHelper::base(col_ctx_->output))) { + { + mutex_lock l(mu); + ++pending_count; + } + DeviceContext* op_dev_ctx = col_ctx_->op_ctx->op_device_context(); + CollectiveRemoteAccessLocal::MemCpyAsync( + op_dev_ctx, op_dev_ctx, col_ctx_->device, col_ctx_->device, + col_ctx_->op_ctx->input_alloc_attr(0), + col_ctx_->op_ctx->output_alloc_attr(0), col_ctx_->input, + col_ctx_->output, 0, /*stream_index*/ + [this, &mu, &pending_count, &all_done](const Status& s) { + mutex_lock l(mu); + status_.Update(s); + --pending_count; + if (0 == pending_count) { + all_done.notify_all(); + } + }); + } + } + + // Then wait for all pending actions to complete. + { + mutex_lock l(mu); + if (pending_count > 0) { + all_done.wait(l); + } + } + } + VLOG(2) << "device=" << col_ctx_->device_name << " return status " << status_; + done_(status_); +} + +void HierarchicalTreeBroadcaster::DispatchSend(int subdiv, int dst_rank, + int src_rank, + const Tensor* src_tensor, + const StatusCallback& done) { + string send_buf_key = + BroadcastBufKey(col_ctx_->exec_key, subdiv, src_rank, dst_rank); + int dst_idx = + col_params_->instance.impl_details.subdiv_permutations[subdiv][dst_rank]; + VLOG(3) << "DispatchSend " << send_buf_key << " from_device " + << col_ctx_->device_name << " to_device " + << col_params_->instance.device_names[dst_idx] << " subdiv=" << subdiv + << " dst_rank=" << dst_rank << " dst_idx=" << dst_idx; + col_ctx_->col_exec->PostToPeer(col_params_->instance.device_names[dst_idx], + col_params_->instance.task_names[dst_idx], + send_buf_key, col_ctx_->device, + col_ctx_->op_ctx->op_device_context(), + col_ctx_->op_ctx->output_alloc_attr(0), + src_tensor, col_ctx_->device_locality, done); +} + +void 
HierarchicalTreeBroadcaster::DispatchRecv(int subdiv, int src_rank, + int dst_rank, Tensor* dst_tensor, + const StatusCallback& done) { + string recv_buf_key = + BroadcastBufKey(col_ctx_->exec_key, subdiv, src_rank, dst_rank); + int src_idx = + col_params_->instance.impl_details.subdiv_permutations[subdiv][src_rank]; + VLOG(3) << "DispatchRecv " << recv_buf_key << " from_device " + << col_params_->instance.device_names[src_idx] << " to_device " + << col_ctx_->device_name << " subdiv=" << subdiv + << " src_rank=" << src_rank << " src_idx=" << src_idx; + col_ctx_->col_exec->RecvFromPeer( + col_params_->instance.device_names[src_idx], + col_params_->instance.task_names[src_idx], + col_params_->task.is_local[src_idx], recv_buf_key, col_ctx_->device, + col_ctx_->op_ctx->op_device_context(), + col_ctx_->op_ctx->output_alloc_attr(0), dst_tensor, + col_ctx_->device_locality, 0 /*stream_index*/, done); +} + +REGISTER_COLLECTIVE(HierarchicalTreeBroadcast, HierarchicalTreeBroadcaster); + +} // namespace tensorflow diff --git a/tensorflow/core/common_runtime/broadcaster.h b/tensorflow/core/common_runtime/hierarchical_tree_broadcaster.h similarity index 53% rename from tensorflow/core/common_runtime/broadcaster.h rename to tensorflow/core/common_runtime/hierarchical_tree_broadcaster.h index 799228b161..ceb9baad30 100644 --- a/tensorflow/core/common_runtime/broadcaster.h +++ b/tensorflow/core/common_runtime/hierarchical_tree_broadcaster.h @@ -12,25 +12,40 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_BROADCASTER_H_ -#define TENSORFLOW_CORE_COMMON_RUNTIME_BROADCASTER_H_ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_HIERARCHICAL_TREE_BROADCASTER_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_HIERARCHICAL_TREE_BROADCASTER_H_ #include + #include "tensorflow/core/common_runtime/base_collective_executor.h" #include "tensorflow/core/framework/collective.h" -#include "tensorflow/core/framework/device_attributes.pb.h" namespace tensorflow { -// Tree-algorithm implementation of collective broadcast. -class Broadcaster { +// Hierarchical tree-algorithm implementation of collective broadcast. +class HierarchicalTreeBroadcaster : public CollectiveImplementationInterface { public: - Broadcaster(CollectiveExecutor* col_exec, const DeviceMgr* dev_mgr, - OpKernelContext* ctx, OpKernelContext::Params* params, - const CollectiveParams& col_params, const string& exec_key, - int64 step_id, Tensor* output); + HierarchicalTreeBroadcaster(); + ~HierarchicalTreeBroadcaster() override = default; + + // Establishes the subdiv permutations needed for a hierarchical broadcast. + // If all devices are local, establishes a single subdiv comprising all + // devices. If any devices are on a different task, establishes n+1 subdivs + // for n tasks. + // The first subdiv comprises one device per task which gets the tensor on + // each task. Subdiv i+1 corresponds to a task-local tree-broadcast for task + // i. + Status InitializeCollectiveParams(CollectiveParams* col_params) override; - void Run(StatusCallback done); + // Initializes members of CollectiveContext not yet initialized, i.e. device + // and device_locality. Also saves the CollectiveContext in this object. + Status InitializeCollectiveContext(CollectiveContext* col_ctx) override; + + // Begins async execution of the hierarchical tree broadcast. + // Must be called in a blockable thread. 
+ // TODO(b/80529858): remove the previous warning when we have a dedicated + // collective threadpool. + void Run(StatusCallback done) override; // Returns the rank of the device from which this device should receive // its value, -1 if no value should be received. @@ -42,32 +57,29 @@ class Broadcaster { std::vector* targets); private: + // Get the task to which the device at `device_rank` belongs. + int GetDeviceTask(int device_rank, const std::vector& dev_per_task); + // Sends `src_tensor` asynchronously from this device to device at `dst_rank` // in `subdiv`. Calls `done` upon completion. void DispatchSend(int subdiv, int dst_rank, int src_rank, const Tensor* src_tensor, const StatusCallback& done); + // Receives a tensor into the memory buffer owned by `dst_tensor` at this // device from device at `src_rank` in `subdiv`. Calls `done` upon // completion. void DispatchRecv(int subdiv, int src_rank, int dst_rank, Tensor* dst_tensor, const StatusCallback& done); + // Executes the hierarchical broadcast defined by this op. 
void RunTree(); - Status status_; - CollectiveExecutor* col_exec_; // Not owned - const DeviceMgr* dev_mgr_; // Not owned - OpKernelContext* ctx_; // Not owned - const CollectiveParams& col_params_; - const string exec_key_; - const int rank_; - const bool is_source_; - Tensor* output_; // Not owned - std::unique_ptr ca_; + CollectiveContext* col_ctx_; // Not owned + const CollectiveParams* col_params_; // Not owned StatusCallback done_; - Device* device_; // The device for which this instance labors - DeviceLocality device_locality_; + Status status_; + bool is_source_; }; } // namespace tensorflow -#endif // TENSORFLOW_CORE_COMMON_RUNTIME_BROADCASTER_H_ +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_HIERARCHICAL_TREE_BROADCASTER_H_ diff --git a/tensorflow/core/common_runtime/broadcaster_test.cc b/tensorflow/core/common_runtime/hierarchical_tree_broadcaster_test.cc similarity index 80% rename from tensorflow/core/common_runtime/broadcaster_test.cc rename to tensorflow/core/common_runtime/hierarchical_tree_broadcaster_test.cc index 3960fc6c97..da0e359cf8 100644 --- a/tensorflow/core/common_runtime/broadcaster_test.cc +++ b/tensorflow/core/common_runtime/hierarchical_tree_broadcaster_test.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/core/common_runtime/broadcaster.h" +#include "tensorflow/core/common_runtime/hierarchical_tree_broadcaster.h" #include #include "tensorflow/core/common_runtime/base_collective_executor.h" @@ -41,7 +41,7 @@ static int64 kStepId = 123; // The test harness won't allow a mixture of fixture and non-fixture // tests in one file, so this is a trival fixture for tests that don't -// need the heavy-weight BroadcasterTest fixture. +// need the heavy-weight HierarchicalTreeBroadcasterTest fixture. 
class TrivialTest : public ::testing::Test { protected: TrivialTest() {} @@ -53,23 +53,23 @@ class TrivialTest : public ::testing::Test { // R = tested rank // RF = receive-from rank // ST = send_to rank vector -#define DEF_TL_TEST(D, S, R, RF, ST) \ - TEST_F(TrivialTest, TreeLinks_##D##Devs_##S##Source_##R##Rank) { \ - CollectiveParams cp; \ - cp.group.group_size = D; \ - cp.instance.impl_details.subdiv_source_rank = {S}; \ - cp.instance.impl_details.subdiv_permutations.push_back( \ - std::vector(D, 0)); \ - cp.subdiv_rank = {R}; \ - cp.is_source = (S == R); \ - EXPECT_EQ(RF, Broadcaster::TreeRecvFrom(cp, 0)); \ - std::vector expected = ST; \ - std::vector send_to; \ - Broadcaster::TreeSendTo(cp, 0, &send_to); \ - ASSERT_EQ(expected.size(), send_to.size()); \ - for (int i = 0; i < expected.size(); ++i) { \ - EXPECT_EQ(expected[i], send_to[i]); \ - } \ +#define DEF_TL_TEST(D, S, R, RF, ST) \ + TEST_F(TrivialTest, TreeLinks_##D##Devs_##S##Source_##R##Rank) { \ + CollectiveParams cp; \ + cp.group.group_size = D; \ + cp.instance.impl_details.subdiv_source_rank = {S}; \ + cp.instance.impl_details.subdiv_permutations.push_back( \ + std::vector(D, 0)); \ + cp.subdiv_rank = {R}; \ + cp.is_source = (S == R); \ + EXPECT_EQ(RF, HierarchicalTreeBroadcaster::TreeRecvFrom(cp, 0)); \ + std::vector expected = ST; \ + std::vector send_to; \ + HierarchicalTreeBroadcaster::TreeSendTo(cp, 0, &send_to); \ + ASSERT_EQ(expected.size(), send_to.size()); \ + for (int i = 0; i < expected.size(); ++i) { \ + EXPECT_EQ(expected[i], send_to[i]); \ + } \ } #define V(...) std::vector({__VA_ARGS__}) @@ -130,7 +130,7 @@ DEF_TL_TEST(8, 7, 7, -1, V(0, 1)) // Wraps CollectiveRemoteAccessLocal with the ability to return an // error status to the N'th action. -// TODO(tucker): factor out of this file and ring_reducer_test.cc +// TODO(b/113171733): factor out of this file and ring_reducer_test.cc // into a single common source. 
class FailTestRMA : public CollectiveRemoteAccessLocal { public: @@ -187,31 +187,32 @@ class FailTestRMA : public CollectiveRemoteAccessLocal { int fail_after_ GUARDED_BY(mu_); }; -class BroadcasterTest : public ::testing::Test { +class HierarchicalTreeBroadcasterTest : public ::testing::Test { protected: - BroadcasterTest() : device_type_(DEVICE_CPU) {} + HierarchicalTreeBroadcasterTest() : device_type_(DEVICE_CPU) {} - ~BroadcasterTest() override { + ~HierarchicalTreeBroadcasterTest() override { stop_ = true; - for (auto i : instances_) { - delete i; - } + for (auto i : instances_) delete i; if (col_exec_) col_exec_->Unref(); } - void SetUp() override { -#if GOOGLE_CUDA +#ifdef GOOGLE_CUDA + void InitGPUDevices() { auto device_factory = DeviceFactory::GetFactory("GPU"); CHECK(device_factory); SessionOptions options; Status s = device_factory->CreateDevices( options, "/job:worker/replica:0/task:0", &gpu_devices_); CHECK(s.ok()); -#endif } +#endif void Init(int num_workers, int num_devices_per_worker, DataType dtype, const DeviceType& device_type, int fail_after) { +#ifdef GOOGLE_CUDA + InitGPUDevices(); +#endif VLOG(2) << "num_workers=" << num_workers << " num_devices_per_worker=" << num_devices_per_worker; int total_num_devices = num_workers * num_devices_per_worker; @@ -400,8 +401,6 @@ class BroadcasterTest : public ::testing::Test { return GetKernel(node_def, device_type, device); } - void BuildColParams() {} - template void RunTest(DataType dtype, const DeviceType& device_type, int num_workers, int num_devices, int tensor_len, int fail_after, @@ -511,10 +510,47 @@ class BroadcasterTest : public ::testing::Test { } } + void RunSubdivPermsTest( + CollectiveParams* cp, + const std::vector>& expected_subdiv_perms, + const std::vector& expected_subdiv_rank, + const std::vector& expected_subdiv_source_rank) { + col_exec_ = nullptr; + cp->instance.impl_details.subdiv_permutations.clear(); + cp->subdiv_rank.clear(); + 
cp->instance.impl_details.subdiv_source_rank.clear(); + // Create a stub broadcaster only for testing param initialization. + HierarchicalTreeBroadcaster broadcaster; + TF_CHECK_OK(broadcaster.InitializeCollectiveParams(cp)); + EXPECT_EQ(expected_subdiv_perms, + cp->instance.impl_details.subdiv_permutations); + EXPECT_EQ(expected_subdiv_rank, cp->subdiv_rank); + EXPECT_EQ(expected_subdiv_source_rank, + cp->instance.impl_details.subdiv_source_rank); + } + + void PrepColParamsForSubdivPermsTest(CollectiveParams* cp, int num_tasks, + int num_gpus) { + cp->group.device_type = DeviceType("GPU"); + cp->group.num_tasks = num_tasks; + cp->group.group_size = num_tasks * num_gpus; + cp->instance.type = BROADCAST_COLLECTIVE; + cp->instance.impl_details.collective_name = "HierarchicalTreeBroadcast"; + for (int ti = 0; ti < num_tasks; ti++) { + string task_name = strings::StrCat("/job:worker/replica:0/task:", ti); + for (int di = 0; di < num_gpus; di++) { + string dev_name = strings::StrCat(task_name, "/device:GPU:", di); + cp->instance.task_names.push_back(task_name); + cp->instance.device_names.push_back(dev_name); + } + } + } + class DeviceInstance { public: DeviceInstance(int rank, const string& dev_name, - const DeviceType& device_type, BroadcasterTest* parent) + const DeviceType& device_type, + HierarchicalTreeBroadcasterTest* parent) : parent_(parent), dev_name_(dev_name), device_type_(device_type), @@ -636,21 +672,20 @@ class BroadcasterTest : public ::testing::Test { ctx.allocate_output(0, tensor_.shape(), &output_tensor_ptr)); } CHECK_EQ(output_tensor_ptr, ctx.mutable_output(0)); + const Tensor* input_tensor_ptr = + col_params_.is_source ? &tensor_ : nullptr; // Prepare a Broadcaster instance. 
string exec_key = strings::StrCat(col_params_.instance.instance_key, ":0:0"); - Broadcaster broadcaster(parent_->col_exec_, parent_->dev_mgr_.get(), &ctx, - &op_params, col_params_, exec_key, kStepId, - output_tensor_ptr); - - // Start execution in a threadpool then wait for completion. - Notification notification; - broadcaster.Run([this, ¬ification](Status s) { - status_ = s; - notification.Notify(); - }); - notification.WaitForNotification(); + HierarchicalTreeBroadcaster broadcaster; + CollectiveContext col_ctx(parent_->col_exec_, parent_->dev_mgr_.get(), + &ctx, &op_params, col_params_, exec_key, + kStepId, input_tensor_ptr, output_tensor_ptr); + TF_CHECK_OK(broadcaster.InitializeCollectiveContext(&col_ctx)); + + // Run the broadcast. + broadcaster.Run([this](Status s) { status_ = s; }); if (status_.ok()) { CHECK(tensor_.CopyFrom(*ctx.mutable_output(0), tensor_.shape())); } @@ -658,15 +693,13 @@ class BroadcasterTest : public ::testing::Test { dev_ctx->Unref(); } - BroadcasterTest* parent_; + HierarchicalTreeBroadcasterTest* parent_; string dev_name_; DeviceType device_type_ = DEVICE_CPU; int rank_; Tensor tensor_; Device* device_; CollectiveParams col_params_; - std::unique_ptr ca_; - std::unique_ptr ctx_; Status status_; }; // class DeviceInstance @@ -688,6 +721,118 @@ class BroadcasterTest : public ::testing::Test { int failure_count_ GUARDED_BY(mu_) = 0; }; +TEST_F(HierarchicalTreeBroadcasterTest, InitializeParams1Task8GPU) { + CollectiveParams cp; + PrepColParamsForSubdivPermsTest(&cp, 1, 8); + + // source 0 device 0 + cp.source_rank = 0; + cp.default_rank = 0; + RunSubdivPermsTest(&cp, {{0, 1, 2, 3, 4, 5, 6, 7}}, {0}, {0}); + + // source 2 device 2 + cp.source_rank = 2; + cp.default_rank = 2; + RunSubdivPermsTest(&cp, {{0, 1, 2, 3, 4, 5, 6, 7}}, {2}, {2}); + + // source 2 device 0 + cp.source_rank = 2; + cp.default_rank = 0; + RunSubdivPermsTest(&cp, {{0, 1, 2, 3, 4, 5, 6, 7}}, {0}, {2}); +} + +TEST_F(HierarchicalTreeBroadcasterTest, 
InitializeParams4Tasks8GPU) { + CollectiveParams cp; + PrepColParamsForSubdivPermsTest(&cp, 4, 8); + + // source 0 device 0 + cp.source_rank = 0; + cp.default_rank = 0; + RunSubdivPermsTest(&cp, + {{0, 8, 16, 24}, + {0, 1, 2, 3, 4, 5, 6, 7}, + {8, 9, 10, 11, 12, 13, 14, 15}, + {16, 17, 18, 19, 20, 21, 22, 23}, + {24, 25, 26, 27, 28, 29, 30, 31}}, + {0, 0, -1, -1, -1}, {0, 0, 0, 0, 0}); + + // source 2 device 0 + cp.source_rank = 2; + cp.default_rank = 0; + RunSubdivPermsTest(&cp, + {{2, 8, 16, 24}, + {0, 1, 2, 3, 4, 5, 6, 7}, + {8, 9, 10, 11, 12, 13, 14, 15}, + {16, 17, 18, 19, 20, 21, 22, 23}, + {24, 25, 26, 27, 28, 29, 30, 31}}, + {-1, 0, -1, -1, -1}, {0, 2, 0, 0, 0}); + + // source 9 device 9 + cp.source_rank = 9; + cp.default_rank = 9; + RunSubdivPermsTest(&cp, + {{0, 9, 16, 24}, + {0, 1, 2, 3, 4, 5, 6, 7}, + {8, 9, 10, 11, 12, 13, 14, 15}, + {16, 17, 18, 19, 20, 21, 22, 23}, + {24, 25, 26, 27, 28, 29, 30, 31}}, + {1, -1, 1, -1, -1}, {1, 0, 1, 0, 0}); +} + +TEST_F(HierarchicalTreeBroadcasterTest, InitializeParams4TasksVariableGPU) { + CollectiveParams cp; + int num_tasks = 4; + cp.group.device_type = DeviceType("GPU"); + cp.group.num_tasks = num_tasks; + cp.group.group_size = 0; + cp.instance.type = BROADCAST_COLLECTIVE; + cp.instance.impl_details.collective_name = "HierarchicalTreeBroadcast"; + std::vector dev_per_task = {4, 4, 6, 8}; + for (int ti = 0; ti < cp.group.num_tasks; ti++) { + string task_name = strings::StrCat("/job:worker/replica:0/task:", ti); + for (int di = 0; di < dev_per_task[ti]; di++) { + string dev_name = strings::StrCat(task_name, "/device:GPU:", di); + cp.instance.task_names.push_back(task_name); + cp.instance.device_names.push_back(dev_name); + cp.group.group_size++; + } + } + + // source 0 device 0 + cp.source_rank = 0; + cp.default_rank = 0; + RunSubdivPermsTest(&cp, + {{0, 4, 8, 14}, + {0, 1, 2, 3}, + {4, 5, 6, 7}, + {8, 9, 10, 11, 12, 13}, + {14, 15, 16, 17, 18, 19, 20, 21}}, + {0, 0, -1, -1, -1}, {0, 0, 0, 0, 0}); + + // source 2 
device 0 + cp.source_rank = 2; + cp.default_rank = 0; + RunSubdivPermsTest(&cp, + {{2, 4, 8, 14}, + {0, 1, 2, 3}, + {4, 5, 6, 7}, + {8, 9, 10, 11, 12, 13}, + {14, 15, 16, 17, 18, 19, 20, 21}}, + {-1, 0, -1, -1, -1}, {0, 2, 0, 0, 0}); + + // source 9 device 5 + cp.source_rank = 9; + cp.default_rank = 5; + RunSubdivPermsTest(&cp, + {{0, 4, 9, 14}, + {0, 1, 2, 3}, + {4, 5, 6, 7}, + {8, 9, 10, 11, 12, 13}, + {14, 15, 16, 17, 18, 19, 20, 21}}, + {-1, -1, 1, -1, -1}, {2, 0, 0, 1, 0}); +} + +// TODO(b/113171733): change to use TEST_P. // Tests of full broadcast algorithm, with different device and // data types. // B = data element type @@ -697,7 +842,7 @@ class BroadcasterTest : public ::testing::Test { // L = tensor length // A = abort after count #define DEF_TEST(B, T, W, D, L, A, F) \ - TEST_F(BroadcasterTest, \ + TEST_F(HierarchicalTreeBroadcasterTest, \ DaTy##B##_DevTy##T##_Wkr##W##_Dev##D##_Len##L##_Abt##A##_Fw##F) { \ DataType dtype = DT_##B; \ switch (dtype) { \ diff --git a/tensorflow/core/common_runtime/ring_reducer.cc b/tensorflow/core/common_runtime/ring_reducer.cc index e26761703b..bb8eeb141a 100644 --- a/tensorflow/core/common_runtime/ring_reducer.cc +++ b/tensorflow/core/common_runtime/ring_reducer.cc @@ -14,13 +14,29 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/core/common_runtime/ring_reducer.h" +#include +#include +#include +#include + #include "tensorflow/core/common_runtime/collective_rma_local.h" +#include "tensorflow/core/common_runtime/collective_util.h" #include "tensorflow/core/common_runtime/copy_tensor.h" +#include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/device_mgr.h" #include "tensorflow/core/common_runtime/dma_helper.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/notification.h" +#include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/types.h" // Set true for greater intelligibility of debug mode log messages. #define READABLE_KEYS false @@ -36,7 +52,8 @@ string RingReduceBufKey(const string& exec_key, int pass, int section, return strings::StrCat("rred(", exec_key, "):pass(", pass, "):section(", section, "):srcrank(", source_rank, ")"); } else { - // TODO(tucker): Try out some kind of denser encoding, e.g. 128 bit hash. + // TODO(b/78352018): Try out some kind of denser encoding, e.g. 128 bit + // hash. 
return strings::StrCat(exec_key, ":", pass, ":", section, ":", source_rank); } } @@ -65,105 +82,149 @@ RingReducer::RingField* RingReducer::PCQueue::Dequeue() { return rf; } -RingReducer::RingReducer(CollectiveExecutor* col_exec, const DeviceMgr* dev_mgr, - OpKernelContext* ctx, - OpKernelContext::Params* op_params, - const CollectiveParams& col_params, - const string& exec_key, int64 step_id, - const Tensor* input, Tensor* output) - : col_exec_(col_exec), - dev_mgr_(dev_mgr), - ctx_(ctx), - op_params_(op_params), - col_params_(col_params), - exec_key_(exec_key), - input_(input), - output_(output), - rank_(col_params.subdiv_rank[0]), - step_id_(step_id), - group_size_(col_params.group.group_size), - num_subdivs_(static_cast( - col_params.instance.impl_details.subdiv_permutations.size())), +RingReducer::RingReducer() + : col_ctx_(nullptr), + col_params_(nullptr), done_(nullptr), - device_(nullptr), - device_name_( - col_params_.instance.device_names[col_params_.default_rank]) { - CHECK_GT(group_size_, 0); - CHECK_GT(num_subdivs_, 0); -} + group_size_(-1), + num_subdivs_(-1) {} RingReducer::~RingReducer() { group_size_tensor_ready_.WaitForNotification(); } -string RingReducer::TensorDebugString(Tensor tensor) { - const DeviceBase::GpuDeviceInfo* gpu_device_info = - ctx_->device()->tensorflow_gpu_device_info(); - if (gpu_device_info) { - Tensor cpu_tensor(tensor.dtype(), tensor.shape()); - Notification note; - gpu_device_info->default_context->CopyDeviceTensorToCPU( - &tensor, "" /*tensor_name*/, device_, &cpu_tensor, - [¬e](const Status& s) { - CHECK(s.ok()); - note.Notify(); - }); - note.WaitForNotification(); - return cpu_tensor.SummarizeValue(64); - } else { - return tensor.SummarizeValue(64); +Status RingReducer::InitializeCollectiveParams(CollectiveParams* col_params) { + CHECK_EQ(col_params->instance.type, REDUCTION_COLLECTIVE); + CHECK_EQ(col_params->instance.impl_details.collective_name, "RingReduce"); + const string& device_name = + 
col_params->instance.device_names[col_params->default_rank]; + // Each subdiv permutation is a ring formed by rotating each + // single-task subsequence of devices by an offset. This makes most + // sense when each task has the same number of devices but we can't + // depend on that being the case so we'll compute something that + // works in any case. + + // Start by counting the devices in each task. + // Precondition: device_names must be sorted so that all devices in + // the same task are adjacent. + VLOG(2) << "Sorted task names: " + << str_util::Join(col_params->instance.task_names, ", "); + std::vector dev_per_task; + const string* prior_task_name = &col_params->instance.task_names[0]; + int dev_count = 1; + for (int di = 1; di < col_params->group.group_size; ++di) { + if (col_params->instance.task_names[di] != *prior_task_name) { + dev_per_task.push_back(dev_count); + dev_count = 1; + prior_task_name = &col_params->instance.task_names[di]; + } else { + ++dev_count; + } + } + dev_per_task.push_back(dev_count); + CHECK_EQ(col_params->group.num_tasks, dev_per_task.size()); + + // Generate a ring permutation for each requested offset. 
+ if (col_params->instance.impl_details.subdiv_offsets.empty()) { + return errors::Internal( + "Subdiv offsets should be non-empty for ring reducer, size=", + col_params->instance.impl_details.subdiv_offsets.size()); + } + VLOG(2) << "Setting up perms for col_params " << col_params + << " subdiv_permutations " + << &col_params->instance.impl_details.subdiv_permutations; + col_params->instance.impl_details.subdiv_permutations.resize( + col_params->instance.impl_details.subdiv_offsets.size()); + col_params->subdiv_rank.resize( + col_params->instance.impl_details.subdiv_offsets.size(), -1); + for (int sdi = 0; + sdi < col_params->instance.impl_details.subdiv_offsets.size(); ++sdi) { + std::vector& perm = + col_params->instance.impl_details.subdiv_permutations[sdi]; + CHECK_EQ(perm.size(), 0); + int offset = col_params->instance.impl_details.subdiv_offsets[sdi]; + // A negative subdivision offset is interpreted as follows: + // 1. Reverse the local device ordering. + // 2. Begin the subdivision at abs(offset) in the reversed ordering. + bool reverse = false; + if (offset < 0) { + offset = abs(offset); + reverse = true; + } + int prior_dev_count = 0; // sum over prior worker device counts + for (int ti = 0; ti < col_params->group.num_tasks; ++ti) { + for (int di = 0; di < dev_per_task[ti]; ++di) { + int di_offset = (di + offset) % dev_per_task[ti]; + int offset_di = + reverse ? (dev_per_task[ti] - (di_offset + 1)) : di_offset; + // Device index in global subdivision permutation. 
+ int permuted_di = prior_dev_count + offset_di; + int rank = static_cast(perm.size()); + perm.push_back(permuted_di); + if (col_params->instance.device_names[permuted_di] == device_name) { + CHECK_EQ(permuted_di, col_params->default_rank); + col_params->subdiv_rank[sdi] = rank; + } + } + prior_dev_count += dev_per_task[ti]; + } + CHECK_EQ(col_params->group.group_size, perm.size()); } + + VLOG(2) << collective_util::SubdivPermDebugString(*col_params); + return Status::OK(); +} + +Status RingReducer::InitializeCollectiveContext(CollectiveContext* col_ctx) { + CHECK(col_ctx->dev_mgr); + col_ctx_ = col_ctx; + col_params_ = &col_ctx->col_params; + return collective_util::InitializeDeviceAndLocality( + col_ctx->dev_mgr, col_ctx->device_name, &col_ctx->device, + &col_ctx->device_locality); } void RingReducer::Run(StatusCallback done) { + CHECK(col_ctx_); + CHECK(col_params_); done_ = std::move(done); + group_size_ = col_params_->group.group_size; + num_subdivs_ = static_cast( + col_params_->instance.impl_details.subdiv_permutations.size()); + CHECK_GT(num_subdivs_, 0); - // Get local execution device. 
if (VLOG_IS_ON(1)) { string buf; - for (int r = 0; r < col_params_.instance.device_names.size(); ++r) { + for (int r = 0; r < col_params_->instance.device_names.size(); ++r) { strings::StrAppend(&buf, "dev ", r, " : ", - col_params_.instance.device_names[r], "\n"); + col_params_->instance.device_names[r], "\n"); } for (int sd = 0; - sd < col_params_.instance.impl_details.subdiv_permutations.size(); + sd < col_params_->instance.impl_details.subdiv_permutations.size(); ++sd) { strings::StrAppend(&buf, "\nsubdiv ", sd, " perm: "); - for (auto x : col_params_.instance.impl_details.subdiv_permutations[sd]) { + for (auto x : + col_params_->instance.impl_details.subdiv_permutations[sd]) { strings::StrAppend(&buf, x, ", "); } } - VLOG(1) << "RingReducer::Run for device " << device_name_ - << " default_rank " << col_params_.default_rank << "\n" + VLOG(1) << "RingReducer::Run for device " << col_ctx_->device_name + << " default_rank " << col_params_->default_rank << "\n" << buf; } - CHECK(dev_mgr_); - Status status = dev_mgr_->LookupDevice( - col_params_.instance.device_names[col_params_.default_rank], &device_); - if (!status.ok()) { - LOG(ERROR) << "Failed to find device " - << col_params_.instance.device_names[col_params_.default_rank]; - for (auto d : dev_mgr_->ListDevices()) { - LOG(ERROR) << "Available device " << d->name(); - } - done_(status); - return; - } - CHECK(device_); - device_locality_ = device_->attributes().locality(); - - VLOG(1) << this << " default_rank " << col_params_.default_rank << " cp " - << &col_params_ << ": " << col_params_.ToString(); // Start by copying input to output if they're not already the same, i.e. if // we're not computing in-place on the input tensor. 
- if ((input_ != output_) && - (DMAHelper::base(input_) != DMAHelper::base(output_))) { + if ((col_ctx_->input != col_ctx_->output) && + (DMAHelper::base(col_ctx_->input) != DMAHelper::base(col_ctx_->output))) { // We are running in a blockable thread and the callback can't block so // just wait here on the copy. Notification note; + Status status; CollectiveRemoteAccessLocal::MemCpyAsync( - ctx_->input_device_context(0), ctx_->op_device_context(), device_, - device_, ctx_->input_alloc_attr(0), ctx_->output_alloc_attr(0), input_, - output_, 0 /*dev_to_dev_stream_index*/, + col_ctx_->op_ctx->input_device_context(0), + col_ctx_->op_ctx->op_device_context(), col_ctx_->device, + col_ctx_->device, col_ctx_->op_ctx->input_alloc_attr(0), + col_ctx_->op_ctx->output_alloc_attr(0), col_ctx_->input, + col_ctx_->output, 0 /*dev_to_dev_stream_index*/, [this, ¬e, &status](const Status& s) { status.Update(s); note.Notify(); @@ -177,24 +238,43 @@ void RingReducer::Run(StatusCallback done) { ContinueAfterInputCopy(); } +string RingReducer::TensorDebugString(const Tensor& tensor) { + const DeviceBase::GpuDeviceInfo* gpu_device_info = + col_ctx_->op_ctx->device()->tensorflow_gpu_device_info(); + if (gpu_device_info) { + Tensor cpu_tensor(tensor.dtype(), tensor.shape()); + Notification note; + gpu_device_info->default_context->CopyDeviceTensorToCPU( + &tensor, "" /*tensor_name*/, col_ctx_->device, &cpu_tensor, + [¬e](const Status& s) { + CHECK(s.ok()); + note.Notify(); + }); + note.WaitForNotification(); + return cpu_tensor.SummarizeValue(64); + } else { + return tensor.SummarizeValue(64); + } +} + // Note that this function is blocking and must not run in any thread // which cannot be blocked. 
void RingReducer::ContinueAfterInputCopy() { - AllocatorAttributes attr = ctx_->output_alloc_attr(0); - ca_.reset(MakeCollectiveAdapter(output_, group_size_ * num_subdivs_, - device_->GetAllocator(attr))); + AllocatorAttributes attr = col_ctx_->op_ctx->output_alloc_attr(0); + ca_.reset(MakeCollectiveAdapter(col_ctx_->output, group_size_ * num_subdivs_, + col_ctx_->device->GetAllocator(attr))); - if (col_params_.final_op) { + if (col_params_->final_op) { // Create an on-device scalar value from group_size_ that may be needed // later. // TODO(tucker): Cache and reuse across invocations? Or maybe the scalar // can be provided to the kernel in host memory? Tensor group_size_val = ca_->Scalar(group_size_); - if (col_params_.group.device_type != "CPU") { - group_size_tensor_ = - ca_->Scalar(device_->GetAllocator(ctx_->input_alloc_attr(0))); - DeviceContext* op_dev_ctx = ctx_->op_device_context(); - op_dev_ctx->CopyCPUTensorToDevice(&group_size_val, device_, + if (col_params_->group.device_type != "CPU") { + group_size_tensor_ = ca_->Scalar(col_ctx_->device->GetAllocator( + col_ctx_->op_ctx->input_alloc_attr(0))); + DeviceContext* op_dev_ctx = col_ctx_->op_ctx->op_device_context(); + op_dev_ctx->CopyCPUTensorToDevice(&group_size_val, col_ctx_->device, &group_size_tensor_, [this](const Status& s) { if (!s.ok()) { @@ -231,14 +311,14 @@ void RingReducer::StartAbort(const Status& s) { // cancellation on all of the outstanding CollectiveRemoteAccess // actions. if (abort_started) { - col_exec_->StartAbort(s); + col_ctx_->col_exec->StartAbort(s); } } void RingReducer::Finish(bool ok) { if (ok) { // Recover the output from the adaptor. - ca_->ConsumeFinalValue(output_); + ca_->ConsumeFinalValue(col_ctx_->output); } Status s; { @@ -275,7 +355,7 @@ Status RingReducer::ComputeBinOp(Device* device, OpKernel* op, Tensor* output, // TODO(tucker): Is it possible to cache and reuse these objects? They're // mostly identical inside one device execution. 
std::unique_ptr sub_ctx( - new SubContext(ctx_, op_params_, op, output, input)); + new SubContext(col_ctx_->op_ctx, col_ctx_->op_params, op, output, input)); device->Compute(op, sub_ctx->sub_ctx_); return sub_ctx->sub_ctx_->status(); } @@ -295,18 +375,18 @@ void RingReducer::InitRingField(RingField* rf, int chunk_idx, int subdiv_idx, rf->chunk_idx = chunk_idx; rf->subdiv_idx = subdiv_idx; rf->sc_idx = field_idx; - rf->rank = col_params_.subdiv_rank[subdiv_idx]; + rf->rank = col_params_->subdiv_rank[subdiv_idx]; rf->second_pass = false; rf->action = RF_INIT; // Recv from the device with preceding rank within the subdivision. int recv_from_rank = (rf->rank + (group_size_ - 1)) % group_size_; int send_to_rank = (rf->rank + 1) % group_size_; - rf->recv_dev_idx = col_params_.instance.impl_details + rf->recv_dev_idx = col_params_->instance.impl_details .subdiv_permutations[subdiv_idx][recv_from_rank]; - int send_dev_idx = col_params_.instance.impl_details + int send_dev_idx = col_params_->instance.impl_details .subdiv_permutations[subdiv_idx][send_to_rank]; - rf->recv_is_remote = !col_params_.task.is_local[rf->recv_dev_idx]; - rf->send_is_remote = !col_params_.task.is_local[send_dev_idx]; + rf->recv_is_remote = !col_params_->task.is_local[rf->recv_dev_idx]; + rf->send_is_remote = !col_params_->task.is_local[send_dev_idx]; if (ca_->ChunkBytes(rf->sc_idx) > 0) { // In pass 0 we skip Recv when rank = chunk_idx rf->do_recv = (rf->chunk_idx != rf->rank); @@ -360,45 +440,47 @@ string RingReducer::RingField::DebugString() const { void RingReducer::DispatchSend(RingField* rf, const StatusCallback& done) { CHECK(rf->do_send); - string send_buf_key = - RingReduceBufKey(exec_key_, rf->second_pass, rf->sc_idx, rf->rank); - VLOG(3) << "DispatchSend rank=" << col_params_.default_rank << " send key " + string send_buf_key = RingReduceBufKey(col_ctx_->exec_key, rf->second_pass, + rf->sc_idx, rf->rank); + VLOG(3) << "DispatchSend rank=" << col_params_->default_rank << " send key " << 
send_buf_key << " chunk " << ca_->TBounds(rf->chunk) << " sc_idx " << rf->sc_idx; int send_to_rank = (rf->rank + 1) % group_size_; - int send_to_dev_idx = col_params_.instance.impl_details + int send_to_dev_idx = col_params_->instance.impl_details .subdiv_permutations[rf->subdiv_idx][send_to_rank]; - col_exec_->PostToPeer(col_params_.instance.device_names[send_to_dev_idx], - col_params_.instance.task_names[send_to_dev_idx], - send_buf_key, device_, ctx_->op_device_context(), - ctx_->output_alloc_attr(0), &rf->chunk, - device_locality_, done); + col_ctx_->col_exec->PostToPeer( + col_params_->instance.device_names[send_to_dev_idx], + col_params_->instance.task_names[send_to_dev_idx], send_buf_key, + col_ctx_->device, col_ctx_->op_ctx->op_device_context(), + col_ctx_->op_ctx->output_alloc_attr(0), &rf->chunk, + col_ctx_->device_locality, done); } void RingReducer::DispatchRecv(RingField* rf, const StatusCallback& done) { CHECK(rf->do_recv); string recv_buf_key = - RingReduceBufKey(exec_key_, rf->second_pass, rf->sc_idx, + RingReduceBufKey(col_ctx_->exec_key, rf->second_pass, rf->sc_idx, (rf->rank + (group_size_ - 1)) % group_size_); - VLOG(3) << "DispatchRecv rank=" << col_params_.default_rank << " recv key " + VLOG(3) << "DispatchRecv rank=" << col_params_->default_rank << " recv key " << recv_buf_key << " chunk " << ca_->TBounds(rf->chunk) << " into " - << ((col_params_.merge_op != nullptr) ? "tmp_chunk" : "chunk"); - Tensor* dst_tensor = (!rf->second_pass && (col_params_.merge_op != nullptr)) + << ((col_params_->merge_op != nullptr) ? "tmp_chunk" : "chunk"); + Tensor* dst_tensor = (!rf->second_pass && (col_params_->merge_op != nullptr)) ? 
&rf->tmp_chunk : &rf->chunk; - col_exec_->RecvFromPeer(col_params_.instance.device_names[rf->recv_dev_idx], - col_params_.instance.task_names[rf->recv_dev_idx], - col_params_.task.is_local[rf->recv_dev_idx], - recv_buf_key, device_, ctx_->op_device_context(), - ctx_->output_alloc_attr(0), dst_tensor, - device_locality_, rf->subdiv_idx, done); + col_ctx_->col_exec->RecvFromPeer( + col_params_->instance.device_names[rf->recv_dev_idx], + col_params_->instance.task_names[rf->recv_dev_idx], + col_params_->task.is_local[rf->recv_dev_idx], recv_buf_key, + col_ctx_->device, col_ctx_->op_ctx->op_device_context(), + col_ctx_->op_ctx->output_alloc_attr(0), dst_tensor, + col_ctx_->device_locality, rf->subdiv_idx, done); } string RingReducer::FieldState() { - string s = strings::StrCat("RingReducer ", - strings::Hex(reinterpret_cast(this)), - " exec ", exec_key_, " step_id=", step_id_, - " state of all ", rfv_.size(), " fields:"); + string s = strings::StrCat( + "RingReducer ", strings::Hex(reinterpret_cast(this)), " exec ", + col_ctx_->exec_key, " step_id=", col_ctx_->step_id, " state of all ", + rfv_.size(), " fields:"); for (int i = 0; i < rfv_.size(); ++i) { s.append("\n"); s.append(rfv_[i].DebugString()); @@ -468,8 +550,9 @@ bool RingReducer::RunAsyncParts() { --recv_pending_count; if (!rf->second_pass) { rf->action = RF_REDUCE; - Status s = ComputeBinOp(device_, col_params_.merge_op.get(), - &rf->chunk, &rf->tmp_chunk); + Status s = + ComputeBinOp(col_ctx_->device, col_params_->merge_op.get(), + &rf->chunk, &rf->tmp_chunk); if (!s.ok()) { aborted = true; StartAbort(s); @@ -479,11 +562,12 @@ bool RingReducer::RunAsyncParts() { } break; case RF_REDUCE: - if (!rf->second_pass && col_params_.final_op.get() && rf->is_final) { + if (!rf->second_pass && col_params_->final_op.get() && rf->is_final) { rf->action = RF_FINALIZE; group_size_tensor_ready_.WaitForNotification(); - Status s = ComputeBinOp(device_, col_params_.final_op.get(), - &rf->chunk, &group_size_tensor_); + Status 
s = + ComputeBinOp(col_ctx_->device, col_params_->final_op.get(), + &rf->chunk, &group_size_tensor_); if (!s.ok()) { aborted = true; StartAbort(s); @@ -552,9 +636,11 @@ bool RingReducer::RunAsyncParts() { CHECK_EQ(send_pending_count, 0); CHECK_EQ(recv_pending_count, 0); - VLOG(2) << this << " rank=" << rank_ << " finish;" + VLOG(2) << this << " device=" << col_ctx_->device_name << " finish;" << " final value " << TensorDebugString(ca_->Value()); return !aborted; } +REGISTER_COLLECTIVE(RingReduce, RingReducer); + } // namespace tensorflow diff --git a/tensorflow/core/common_runtime/ring_reducer.h b/tensorflow/core/common_runtime/ring_reducer.h index 3e1988e787..0848e37b52 100644 --- a/tensorflow/core/common_runtime/ring_reducer.h +++ b/tensorflow/core/common_runtime/ring_reducer.h @@ -16,25 +16,35 @@ limitations under the License. #define TENSORFLOW_CORE_COMMON_RUNTIME_RING_REDUCER_H_ #include +#include +#include +#include #include "tensorflow/core/common_runtime/base_collective_executor.h" #include "tensorflow/core/framework/collective.h" -#include "tensorflow/core/framework/device_attributes.pb.h" namespace tensorflow { -class DeviceMgr; +class Device; // Ring-algorithm implementation of collective all-reduce. -class RingReducer { +class RingReducer : public CollectiveImplementationInterface { public: - RingReducer(CollectiveExecutor* col_exec, const DeviceMgr* dev_mgr, - OpKernelContext* ctx, OpKernelContext::Params* op_params, - const CollectiveParams& col_params, const string& exec_key, - int64 step_id, const Tensor* input, Tensor* output); + RingReducer(); + ~RingReducer() override; - virtual ~RingReducer(); + // Establishes the requested number of subdivision permutations based on the + // ring order implicit in the device order. + Status InitializeCollectiveParams(CollectiveParams* col_params) override; - void Run(StatusCallback done); + // Initializes members of CollectiveContext not yet initialized, i.e. device + // and device_locality. 
Also saves the CollectiveContext in this object. + Status InitializeCollectiveContext(CollectiveContext* col_ctx) override; + + // Begins async execution of the ring reduce algorithm. + // Must be called in a blockable thread. + // TODO(b/80529858): remove the previous warning when we have a dedicated + // collective threadpool. + void Run(StatusCallback done) override; private: // Called when a bad status is received that implies we should terminate @@ -101,7 +111,7 @@ class RingReducer { // For constructing log messages for debugging. string FieldState(); - string TensorDebugString(Tensor tensor); + string TensorDebugString(const Tensor& tensor); // Producer/Consumer Queue of RingField structs. class PCQueue { @@ -116,30 +126,19 @@ class RingReducer { std::deque deque_ GUARDED_BY(pcq_mu_); }; - CollectiveExecutor* col_exec_; // Not owned - const DeviceMgr* dev_mgr_; // Not owned - OpKernelContext* ctx_; // Not owned - OpKernelContext::Params* op_params_; // Not owned - const CollectiveParams& col_params_; - const string exec_key_; - const Tensor* input_; // Not owned - Tensor* output_; // Not owned - const int rank_; - const int64 step_id_; - const int group_size_; - const int num_subdivs_; + CollectiveContext* col_ctx_; // Not owned + const CollectiveParams* col_params_; // Not owned + StatusCallback done_; + int group_size_; + int num_subdivs_; Tensor group_size_tensor_; Notification group_size_tensor_ready_; std::unique_ptr ca_; - StatusCallback done_; - Device* device_; // The device for which this instance labors - const string device_name_; - DeviceLocality device_locality_; - mutex status_mu_; Status status_ GUARDED_BY(status_mu_); - std::vector rfv_; + + friend class RingReducerTest; }; } // namespace tensorflow diff --git a/tensorflow/core/common_runtime/ring_reducer_test.cc b/tensorflow/core/common_runtime/ring_reducer_test.cc index fcdf9deff8..5e079dbce6 100644 --- a/tensorflow/core/common_runtime/ring_reducer_test.cc +++ 
b/tensorflow/core/common_runtime/ring_reducer_test.cc @@ -37,7 +37,6 @@ limitations under the License. #include "tensorflow/core/public/version.h" namespace tensorflow { -namespace { // Wraps CollectiveRemoteAccessLocal with the ability to return an // error status to the N'th action. @@ -135,27 +134,28 @@ class RingReducerTest : public ::testing::Test { protected: RingReducerTest() : device_type_(DEVICE_CPU) {} - void SetUp() override { -#if GOOGLE_CUDA +#ifdef GOOGLE_CUDA + void InitGPUDevices() { auto device_factory = DeviceFactory::GetFactory("GPU"); CHECK(device_factory); SessionOptions options; Status s = device_factory->CreateDevices( options, "/job:worker/replica:0/task:0", &gpu_devices_); CHECK(s.ok()); -#endif } +#endif ~RingReducerTest() override { stop_ = true; - for (auto i : instances_) { - delete i; - } + for (auto i : instances_) delete i; if (col_exec_) col_exec_->Unref(); } void Init(int num_workers, int num_devices, DataType dtype, const DeviceType& device_type, int num_subdivs, int fail_after) { +#ifdef GOOGLE_CUDA + InitGPUDevices(); +#endif device_type_ = device_type; std::vector local_devices; SessionOptions sess_opts; @@ -201,6 +201,7 @@ class RingReducerTest : public ::testing::Test { col_params_.instance.instance_key = kInstanceKey; col_params_.instance.impl_details.subdiv_offsets.clear(); col_params_.instance.type = REDUCTION_COLLECTIVE; + col_params_.instance.impl_details.collective_name = "RingReduce"; col_params_.instance.data_type = dtype; col_params_.instance.impl_details.subdiv_permutations.resize(num_subdivs); col_params_.subdiv_rank.resize(num_subdivs); @@ -373,6 +374,22 @@ class RingReducerTest : public ::testing::Test { return GetKernel(node_def, device_type, device); } + void RunSubdivPermsTest( + CollectiveParams* cp, + const std::vector>& expected_subdiv_perms, + const std::vector& expected_subdiv_rank) { + col_exec_ = nullptr; + cp->instance.impl_details.subdiv_permutations.clear(); + cp->subdiv_rank.clear(); + // Create a 
stub ring reducer only for testing param initialization. + RingReducer reducer; + TF_CHECK_OK(reducer.InitializeCollectiveParams(cp)); + EXPECT_EQ(expected_subdiv_perms, + cp->instance.impl_details.subdiv_permutations); + EXPECT_EQ(expected_subdiv_rank, cp->subdiv_rank); + reducer.group_size_tensor_ready_.Notify(); // To unblock destructor. + } + class DeviceInstance { public: DeviceInstance(int rank, const string& dev_name, @@ -475,8 +492,8 @@ class RingReducerTest : public ::testing::Test { op_params.op_kernel = op.get(); OpKernelContext ctx(&op_params, 1); - // We never actually execute the kernel, so we need to do the - // output allocation that it would do, ourselves. + // We never actually execute the kernel, so we need to do the output + // allocation it would do, ourselves. Tensor* output_tensor_ptr = nullptr; TF_CHECK_OK(ctx.forward_input_or_allocate_output({0}, 0, tensor_.shape(), &output_tensor_ptr)); @@ -485,20 +502,17 @@ class RingReducerTest : public ::testing::Test { // Prepare a RingReducer instance. string exec_key = strings::StrCat(col_params_.instance.instance_key, ":0:0"); - RingReducer rr(parent_->col_exec_, parent_->dev_mgr_.get(), &ctx, - &op_params, col_params_, exec_key, kStepId, &tensor_, - &tensor_); - - // Start execution in a threadpool then wait for completion. - Notification notification; - SchedClosure([this, &notification, &rr]() { - rr.Run([this, &notification](Status s) { - status_ = s; - notification.Notify(); - }); - }); - notification.WaitForNotification(); - CHECK(tensor_.CopyFrom(*ctx.mutable_output(0), tensor_.shape())); + RingReducer reducer; + CollectiveContext col_ctx(parent_->col_exec_, parent_->dev_mgr_.get(), + &ctx, &op_params, col_params_, exec_key, + kStepId, &tensor_, &tensor_); + TF_CHECK_OK(reducer.InitializeCollectiveContext(&col_ctx)); + + // Run the all-reduce. 
+ reducer.Run([this](Status s) { status_ = s; }); + if (status_.ok()) { + CHECK(tensor_.CopyFrom(*ctx.mutable_output(0), tensor_.shape())); + } dev_ctx->Unref(); } @@ -531,6 +545,57 @@ class RingReducerTest : public ::testing::Test { int32 reduce_counter_ GUARDED_BY(mu_) = 0; }; +TEST_F(RingReducerTest, InitializeParams) { + static const int kNumDevsPerTask = 8; + static const int kNumTasks = 3; + static const int kNumDevs = kNumDevsPerTask * kNumTasks; + CollectiveParams cp; + std::vector device_names; + std::vector task_names; + cp.group.group_key = 1; + cp.group.group_size = kNumDevs; + cp.group.device_type = DeviceType("GPU"); + cp.group.num_tasks = kNumTasks; + cp.instance.instance_key = 3; + cp.instance.type = REDUCTION_COLLECTIVE; + cp.instance.data_type = DataType(DT_FLOAT); + cp.instance.shape = TensorShape({5}); + cp.instance.impl_details.collective_name = "RingReduce"; + cp.instance.impl_details.subdiv_offsets.push_back(0); + cp.is_source = false; + for (int i = 0; i < kNumDevs; ++i) { + int task_id = i / kNumDevsPerTask; + int dev_id = i % kNumDevsPerTask; + string task_name = strings::StrCat("/job:worker/replica:0/task:", task_id); + task_names.push_back(task_name); + string device_name = strings::StrCat(task_name, "/device:GPU:", dev_id); + device_names.push_back(device_name); + cp.instance.task_names.push_back(task_name); + cp.instance.device_names.push_back(device_name); + } + + int test_rank = 0; + cp.default_rank = test_rank; + cp.instance.impl_details.subdiv_offsets = {0, 4}; + RunSubdivPermsTest(&cp, + {{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}, + {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, + 8, 9, 10, 11, 20, 21, 22, 23, 16, 17, 18, 19}}, + {0, 4}); + + test_rank = 3; + cp.default_rank = test_rank; + cp.instance.impl_details.subdiv_offsets = {3, -3}; + RunSubdivPermsTest(&cp, + {{3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, + 15, 8, 9, 10, 19, 20, 21, 22, 23, 16, 17, 18}, + {4, 3, 2, 1, 0, 7, 6, 5, 12, 11, 
10, 9, + 8, 15, 14, 13, 20, 19, 18, 17, 16, 23, 22, 21}}, + {0, 1}); +} + +// TODO(b/113171733): change to use TEST_P. #define DEF_TEST(B, T, W, D, S, L, A) \ TEST_F(RingReducerTest, \ DaTy##B##_DevTy##T##_Wkr##W##_Dev##D##_Sdiv##S##_Len##L##_Abrt##A) { \ @@ -604,5 +669,4 @@ DEF_TEST(FLOAT, GPU, 1, 8, 1, 9408, 2) DEF_TEST(FLOAT, GPU, 1, 8, 2, 9408, 5) #endif -} // namespace } // namespace tensorflow diff --git a/tensorflow/core/framework/collective.cc b/tensorflow/core/framework/collective.cc index d4ac50cbbe..4cb277d5a8 100644 --- a/tensorflow/core/framework/collective.cc +++ b/tensorflow/core/framework/collective.cc @@ -21,6 +21,31 @@ limitations under the License. namespace tensorflow { +namespace { +// A RegistrationInfo object stores a collective implementation registration +// details. `factory` is used to create instances of the collective +// implementation. +struct RegistrationInfo { + // This constructor also creates, and stores in `param_resolver_instance`, + // what is effectively a static instance of the collective implementation. + // During param resolution of collective ops we return this static instance. + // The actual op execution gets a fresh instance using `factory`. 
+ RegistrationInfo(const string& n, CollectiveRegistry::Factory f) + : name(n), + factory(std::move(f)), + param_resolver_instance(this->factory()) {} + string name; + CollectiveRegistry::Factory factory; + CollectiveImplementationInterface* param_resolver_instance; +}; + +std::vector* MutableCollectiveRegistry() { + static std::vector* registry = + new std::vector; + return registry; +} +} // namespace + string CollGroupParams::ToString() const { return strings::StrCat("CollGroupParams {group_key=", group_key, " group_size=", group_size, @@ -102,7 +127,8 @@ string CollectiveParams::ToString() const { strings::StrAppend(&v, " ", instance.ToString()); strings::StrAppend(&v, " ", task.ToString()); strings::StrAppend(&v, " default_rank=", default_rank, - " is_source=", is_source, " subdiv_rank={"); + " is_source=", is_source, " source_rank=", source_rank, + " subdiv_rank={"); for (const auto& r : subdiv_rank) { strings::StrAppend(&v, r, ","); } @@ -115,7 +141,81 @@ string CollectiveParams::ToString() const { return ctx->params_; } +CollectiveContext::CollectiveContext(CollectiveExecutor* col_exec, + const DeviceMgr* dev_mgr, + OpKernelContext* ctx, + OpKernelContext::Params* op_params, + const CollectiveParams& col_params, + const string& exec_key, int64 step_id, + const Tensor* input, Tensor* output) + : col_exec(col_exec), + dev_mgr(dev_mgr), + op_ctx(ctx), + op_params(op_params), + col_params(col_params), + exec_key(exec_key), + step_id(step_id), + input(input), + output(output), + device(nullptr), + device_name(col_params.instance.device_names[col_params.default_rank]) {} + /*static*/ int64 CollectiveExecutor::kInvalidId = -1; +/*static*/ +Status CollectiveRegistry::Lookup( + const string& collective_name, + CollectiveImplementationInterface** implementation) { + return LookupHelper(collective_name, implementation, false); +} + +/*static*/ +Status CollectiveRegistry::LookupParamResolverInstance( + const string& collective_name, + 
CollectiveImplementationInterface** implementation) { + return LookupHelper(collective_name, implementation, true); +} + +/*static*/ +void CollectiveRegistry::GetAll( + std::vector* implementations) { + std::vector* registry = MutableCollectiveRegistry(); + for (const RegistrationInfo& reg_info : *registry) + implementations->emplace_back(reg_info.factory()); +} + +/*static*/ +Status CollectiveRegistry::Register(const string& collective_name, + Factory factory) { + std::vector* registry = MutableCollectiveRegistry(); + for (const RegistrationInfo& reg_info : *registry) { + if (reg_info.name == collective_name) + return errors::Internal("Already registered collective ", + collective_name); + } + registry->emplace_back(collective_name, std::move(factory)); + return Status::OK(); +} + +/*static*/ +Status CollectiveRegistry::LookupHelper( + const string& collective_name, + CollectiveImplementationInterface** implementation, bool param_resolver) { + std::vector* registry = MutableCollectiveRegistry(); + for (const RegistrationInfo& reg_info : *registry) { + if (reg_info.name == collective_name) { + if (param_resolver) { + *implementation = reg_info.param_resolver_instance; + } else { + *implementation = reg_info.factory(); + } + return Status::OK(); + } + } + return errors::Internal( + "CollectiveRegistry::Lookup did not find collective implementation ", + collective_name); +} + } // namespace tensorflow diff --git a/tensorflow/core/framework/collective.h b/tensorflow/core/framework/collective.h index 0b37b3a88c..e35edb09d0 100644 --- a/tensorflow/core/framework/collective.h +++ b/tensorflow/core/framework/collective.h @@ -18,6 +18,7 @@ limitations under the License. 
#include #include +#include "tensorflow/core/framework/device_attributes.pb.h" #include "tensorflow/core/framework/device_base.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/lib/core/refcount.h" @@ -30,7 +31,8 @@ class CompleteGroupRequest; class CompleteGroupResponse; class CompleteInstanceRequest; class CompleteInstanceResponse; -class DeviceLocality; +class Device; +class DeviceMgr; class GetStepSequenceRequest; class GetStepSequenceResponse; class Op; @@ -64,10 +66,10 @@ struct CollGroupParams { // interpretation. On first execution the runtime will update this // structure with decisions that will guide all subsequent executions. struct CollImplDetails { + string collective_name; std::vector> subdiv_permutations; std::vector subdiv_offsets; - // broadcast only: rank of source in each subdiv - std::vector subdiv_source_rank; + std::vector subdiv_source_rank; // rank of source in each subdiv }; // Data common to all members of a collective instance. @@ -104,6 +106,7 @@ struct CollectiveParams { string name = ""; // node name used only for log or error messages int default_rank = -1; // index of this op within device_names bool is_source = false; // broadcast only + int source_rank = -1; // broadcast only // Rank of this device in each subdivision permutation. 
std::vector subdiv_rank; std::unique_ptr merge_op; // reduction only @@ -306,6 +309,110 @@ class PerStepCollectiveRemoteAccess : public CollectiveRemoteAccess { virtual void StartAbort(const Status& s) = 0; }; +class CollectiveContext { + public: + CollectiveContext(CollectiveExecutor* col_exec, const DeviceMgr* dev_mgr, + OpKernelContext* ctx, OpKernelContext::Params* op_params, + const CollectiveParams& col_params, const string& exec_key, + int64 step_id, const Tensor* input, Tensor* output); + + virtual ~CollectiveContext() = default; + + CollectiveExecutor* col_exec; // Not owned + const DeviceMgr* dev_mgr; // Not owned + OpKernelContext* op_ctx; // Not owned + OpKernelContext::Params* op_params; // Not owned + const CollectiveParams& col_params; + const string exec_key; + const int64 step_id; + const Tensor* input; // Not owned + Tensor* output; // Not owned + Device* device; // The device for which this instance labors + const string device_name; + DeviceLocality device_locality; +}; + +// Interface of a Collective Op implementation. Each specific CollectiveOp will +// implement this interface and register the implementation via the +// CollectiveRegistry detailed below. See common_runtime/ring_reducer and +// common_runtime/hierarchical_tree_broadcaster for examples. +class CollectiveImplementationInterface { + public: + virtual ~CollectiveImplementationInterface() = default; + + // Initializes the portions of `col_params` specific to this + // implementation. Called exactly once for every Collective instance during + // the CollectiveParams resolution process when the graph is first executed. + // NOTE(ayushd): This is effectively a static function because it modifies the + // `col_params` passed in and should not manipulate any data members. However + // because it is virtual and needs to be implemented by every derived class we + // do not mark it as static. 
+ virtual Status InitializeCollectiveParams(CollectiveParams* col_params) = 0; + + // Prepares the CollectiveContext for executing this CollectiveImplementation. + // Called from CollectiveExecutor right before calling Run(). The + // CollectiveContext passed in must outlive the CollectiveImplementation + // object. + virtual Status InitializeCollectiveContext(CollectiveContext* col_ctx) = 0; + + // Processes and moves data according to the logic of this Collective + // implementation. Relies on appropriate initialization of op-specific + // CollectiveParams in InitializeCollectiveParams(), as well as appropriate + // context initialization in InitializeCollectiveContext(). + virtual void Run(StatusCallback done) = 0; +}; + +// Static-methods only class for registering and looking up collective +// implementations. +class CollectiveRegistry { + public: + using Factory = std::function; + // Looks up a previously registered CollectiveImplementation under + // `collective_name`. If found, creates an instance of the implementation and + // assign to `implementation`. + static Status Lookup(const string& collective_name, + CollectiveImplementationInterface** implementation); + + // Looks up a previously registered CollectiveImplementation under + // `collective_name`. If found, returns the static instance of this + // implementation via `implementation`. This instance should only be used to + // call InitializateCollectiveParams. + static Status LookupParamResolverInstance( + const string& collective_name, + CollectiveImplementationInterface** implementation); + + // Returns all registered collective implementations. + static void GetAll( + std::vector* implementations); + + private: + friend class CollectiveRegistration; + // Registers a CollectiveImplementation with name `collective_name` and + // factory `factory`. The latter is a function used to create instances of + // the CollectiveImplementation. 
Also creates a static instance of the + // implementation - this instance is used during param resolution and should + // only be used to call InitializeCollectiveParams. + static Status Register(const string& collective_name, Factory factory); + + static Status LookupHelper(const string& collective_name, + CollectiveImplementationInterface** implementation, + bool param_resolver); +}; + +// Class used to call CollectiveRegistry::Register. This should only be used to +// create a global static object. +class CollectiveRegistration { + public: + CollectiveRegistration(const string& collective_name, + CollectiveRegistry::Factory factory) { + TF_CHECK_OK(CollectiveRegistry::Register(collective_name, factory)); + } +}; + +#define REGISTER_COLLECTIVE(name, implementation) \ + static CollectiveRegistration register_##name##_collective( \ + #name, []() { return new implementation; }); + } // namespace tensorflow #endif // TENSORFLOW_CORE_FRAMEWORK_COLLECTIVE_H_ -- GitLab From fb9a2fbfe461020b7ae167f97832c8a2f060319d Mon Sep 17 00:00:00 2001 From: Gunhan Gulsoy Date: Mon, 27 Aug 2018 14:27:48 -0700 Subject: [PATCH 177/598] Disable flaky estimator_training_test PiperOrigin-RevId: 210431699 --- tensorflow/contrib/distribute/python/BUILD | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD index f5b236e35f..5a982f1e04 100644 --- a/tensorflow/contrib/distribute/python/BUILD +++ b/tensorflow/contrib/distribute/python/BUILD @@ -473,8 +473,11 @@ cuda_py_test( "//tensorflow/python:summary", ], tags = [ + "manual", "multi_and_single_gpu", "no_pip", + "nogpu", + "notap", ], ) -- GitLab From 3b4df1b62c5b2c1302ebf23a9040cc749f0dd23d Mon Sep 17 00:00:00 2001 From: Olivia Nordquist Date: Mon, 27 Aug 2018 14:29:45 -0700 Subject: [PATCH 178/598] adding args and returns docstrings to freeze_graph public functions to make them more user friendly PiperOrigin-RevId: 210432021 --- 
tensorflow/python/tools/freeze_graph.py | 64 ++++++++++++++++++++++++- 1 file changed, 62 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/tools/freeze_graph.py b/tensorflow/python/tools/freeze_graph.py index c7f414c5dc..893309f35a 100644 --- a/tensorflow/python/tools/freeze_graph.py +++ b/tensorflow/python/tools/freeze_graph.py @@ -89,7 +89,37 @@ def freeze_graph_with_def_protos(input_graph_def, input_saved_model_dir=None, saved_model_tags=None, checkpoint_version=saver_pb2.SaverDef.V2): - """Converts all variables in a graph and checkpoint into constants.""" + """Converts all variables in a graph and checkpoint into constants. + + Args: + input_graph_def: A `GraphDef`. + input_saver_def: A `SaverDef` (optional). + input_checkpoint: The prefix of a V1 or V2 checkpoint, with V2 taking + priority. Typically the result of `Saver.save()` or that of + `tf.train.latest_checkpoint()`, regardless of sharded/non-sharded or + V1/V2. + output_node_names: The name(s) of the output nodes, comma separated. + restore_op_name: Unused. + filename_tensor_name: Unused. + output_graph: String where to write the frozen `GraphDef`. + clear_devices: A Bool whether to remove device specifications. + initializer_nodes: Comma separated string of initializer nodes to run before + freezing. + variable_names_whitelist: The set of variable names to convert (optional, by + default, all variables are converted). + variable_names_blacklist: The set of variable names to omit converting + to constants (optional). + input_meta_graph_def: A `MetaGraphDef` (optional), + input_saved_model_dir: Path to the dir with TensorFlow 'SavedModel' file + and variables (optional). + saved_model_tags: Group of comma separated tag(s) of the MetaGraphDef to + load, in string format (optional). + checkpoint_version: Tensorflow variable file format (saver_pb2.SaverDef.V1 + or saver_pb2.SaverDef.V2) + + Returns: + Location of the output_graph_def. 
+ """ del restore_op_name, filename_tensor_name # Unused by updated loading code. # 'input_checkpoint' may be a prefix if we're using Saver V2 format @@ -271,7 +301,37 @@ def freeze_graph(input_graph, input_saved_model_dir=None, saved_model_tags=tag_constants.SERVING, checkpoint_version=saver_pb2.SaverDef.V2): - """Converts all variables in a graph and checkpoint into constants.""" + """Converts all variables in a graph and checkpoint into constants. + + Args: + input_graph: A `GraphDef` file to load. + input_saver: A TensorFlow Saver file. + input_binary: A Bool. True means input_graph is .pb, False indicates .pbtxt. + input_checkpoint: The prefix of a V1 or V2 checkpoint, with V2 taking + priority. Typically the result of `Saver.save()` or that of + `tf.train.latest_checkpoint()`, regardless of sharded/non-sharded or + V1/V2. + output_node_names: The name(s) of the output nodes, comma separated. + restore_op_name: Unused. + filename_tensor_name: Unused. + output_graph: String where to write the frozen `GraphDef`. + clear_devices: A Bool whether to remove device specifications. + initializer_nodes: Comma separated list of initializer nodes to run before + freezing. + variable_names_whitelist: The set of variable names to convert (optional, by + default, all variables are converted), + variable_names_blacklist: The set of variable names to omit converting + to constants (optional). + input_meta_graph: A `MetaGraphDef` file to load (optional). + input_saved_model_dir: Path to the dir with TensorFlow 'SavedModel' file and + variables (optional). + saved_model_tags: Group of comma separated tag(s) of the MetaGraphDef to + load, in string format. + checkpoint_version: Tensorflow variable file format (saver_pb2.SaverDef.V1 + or saver_pb2.SaverDef.V2). + Returns: + String that is the location of frozen GraphDef. 
+ """ input_graph_def = None if input_saved_model_dir: input_graph_def = saved_model_utils.get_meta_graph_def( -- GitLab From d57f5a82025702d573d478091dc9c385adf53c09 Mon Sep 17 00:00:00 2001 From: Justin Lebar Date: Mon, 27 Aug 2018 14:50:49 -0700 Subject: [PATCH 179/598] [XLA] Switch to absl::StrFormat. Unlike Printf, StrFormat does not require type-length qualifiers, e.g %z, %ll. Nor does it require that you call c_str() to print strings. So these are fixed up here as well. PiperOrigin-RevId: 210435915 --- tensorflow/compiler/xla/BUILD | 4 + tensorflow/compiler/xla/array4d.h | 1 - tensorflow/compiler/xla/client/BUILD | 1 + .../xla/client/executable_build_options.cc | 6 +- .../compiler/xla/client/lib/constants.cc | 2 +- .../compiler/xla/client/lib/constants.h | 6 +- tensorflow/compiler/xla/client/lib/numeric.cc | 5 +- .../compiler/xla/client/local_client.cc | 23 +- tensorflow/compiler/xla/client/padding.cc | 4 +- tensorflow/compiler/xla/client/xla_builder.cc | 65 +- tensorflow/compiler/xla/layout_util.cc | 24 +- tensorflow/compiler/xla/legacy_flags/BUILD | 3 + .../xla/legacy_flags/debug_options_parsers.h | 1 - .../legacy_flags/parse_flags_from_env_test.cc | 6 +- tensorflow/compiler/xla/literal.cc | 60 +- tensorflow/compiler/xla/literal_comparison.cc | 124 ++-- tensorflow/compiler/xla/literal_util.cc | 1 - tensorflow/compiler/xla/map_util.h | 2 +- .../compiler/xla/metric_table_report.cc | 5 +- .../compiler/xla/packed_literal_reader.cc | 2 +- tensorflow/compiler/xla/python/BUILD | 1 + .../xla/python/local_computation_builder.cc | 5 +- .../xla/python/local_computation_builder.i | 5 +- .../compiler/xla/python/numpy_bridge.cc | 5 +- tensorflow/compiler/xla/rpc/BUILD | 2 + .../compiler/xla/rpc/grpc_client_test.cc | 9 +- .../compiler/xla/rpc/grpc_service_main.cc | 4 +- tensorflow/compiler/xla/service/BUILD | 21 +- .../xla/service/allocation_tracker.cc | 13 +- tensorflow/compiler/xla/service/backend.cc | 2 +- .../compiler/xla/service/buffer_assignment.cc | 60 +- 
.../compiler/xla/service/buffer_liveness.cc | 16 +- tensorflow/compiler/xla/service/call_graph.cc | 20 +- .../compiler/xla/service/call_inliner.cc | 2 +- .../compiler/xla/service/channel_tracker.cc | 12 +- tensorflow/compiler/xla/service/compiler.cc | 2 +- .../xla/service/computation_placer.cc | 2 +- tensorflow/compiler/xla/service/cpu/BUILD | 7 +- .../compiler/xla/service/cpu/cpu_compiler.cc | 3 +- .../xla/service/cpu/cpu_executable.cc | 20 +- .../service/cpu/cpu_hlo_support_checker.cc | 5 +- .../xla/service/cpu/cpu_runtime_test.cc | 18 +- .../xla/service/cpu/cpu_transfer_manager.cc | 10 +- .../compiler/xla/service/cpu/disassembler.cc | 6 +- .../xla/service/cpu/dot_op_emitter.cc | 2 +- .../compiler/xla/service/cpu/ir_emitter.cc | 28 +- .../xla/service/cpu/parallel_loop_emitter.cc | 8 +- .../xla/service/cpu/sample_harness.cc | 6 +- .../service/cpu/tests/cpu_intrinsic_test.cc | 4 +- .../xla/service/device_memory_allocator.cc | 9 +- .../compiler/xla/service/dfs_hlo_visitor.cc | 4 +- .../xla/service/elemental_ir_emitter.cc | 32 +- tensorflow/compiler/xla/service/executable.cc | 8 +- .../compiler/xla/service/execution_tracker.cc | 4 +- .../compiler/xla/service/gather_expander.cc | 2 +- tensorflow/compiler/xla/service/gpu/BUILD | 5 + .../xla/service/gpu/buffer_allocations.cc | 4 +- .../xla/service/gpu/buffer_comparator.cc | 2 +- .../xla/service/gpu/conditional_thunk.cc | 2 +- .../xla/service/gpu/convolution_thunk.cc | 1 - .../xla/service/gpu/cudnn_batchnorm_thunk.cc | 1 - .../gpu/cudnn_convolution_algorithm_picker.cc | 7 +- .../service/gpu/cudnn_convolution_runner.cc | 4 +- .../xla/service/gpu/elemental_ir_emitter.cc | 8 +- .../compiler/xla/service/gpu/fft_thunk.cc | 8 +- .../compiler/xla/service/gpu/gemm_thunk.cc | 2 +- .../xla/service/gpu/gpu_executable.cc | 9 +- .../service/gpu/gpu_hlo_support_checker.cc | 5 +- .../xla/service/gpu/gpu_transfer_manager.cc | 4 +- .../xla/service/gpu/hlo_schedule_test.cc | 3 +- .../compiler/xla/service/gpu/infeed_thunk.cc | 2 +- 
.../service/gpu/instruction_fusion_test.cc | 2 +- .../compiler/xla/service/gpu/ir_emitter.cc | 4 +- .../xla/service/gpu/ir_emitter_unnested.cc | 3 +- .../compiler/xla/service/gpu/kernel_thunk.cc | 4 +- .../xla/service/gpu/llvm_gpu_backend/BUILD | 1 + .../gpu/llvm_gpu_backend/dump_ir_pass.cc | 7 +- .../gpu/llvm_gpu_backend/nvptx_backend_lib.cc | 1 - .../compiler/xla/service/gpu/outfeed_thunk.cc | 2 +- .../xla/service/gpu/partition_assignment.cc | 11 +- .../xla/service/gpu/stream_assignment_test.cc | 4 +- .../compiler/xla/service/gpu/while_thunk.cc | 2 +- .../compiler/xla/service/hlo_computation.cc | 9 +- .../xla/service/hlo_dataflow_analysis.cc | 2 +- .../compiler/xla/service/hlo_evaluator.cc | 14 +- .../compiler/xla/service/hlo_evaluator.h | 4 +- .../xla/service/hlo_evaluator_typed_visitor.h | 14 +- .../compiler/xla/service/hlo_graph_dumper.cc | 152 ++-- .../compiler/xla/service/hlo_instruction.cc | 8 +- .../xla/service/hlo_instruction_test.cc | 2 +- tensorflow/compiler/xla/service/hlo_lexer.cc | 3 +- .../xla/service/hlo_module_group_metadata.cc | 28 +- .../xla/service/hlo_module_group_util.cc | 2 +- tensorflow/compiler/xla/service/hlo_opcode.cc | 2 +- .../compiler/xla/service/hlo_ordering.cc | 19 +- tensorflow/compiler/xla/service/hlo_parser.cc | 107 ++- .../compiler/xla/service/hlo_pass_pipeline.cc | 7 +- .../xla/service/hlo_rematerialization.cc | 13 +- .../compiler/xla/service/hlo_scheduling.cc | 1 - .../compiler/xla/service/hlo_verifier.cc | 131 ++-- .../service/human_readable_profile_builder.cc | 35 +- .../xla/service/interpreter/platform.cc | 6 +- .../compiler/xla/service/layout_assignment.cc | 73 +- tensorflow/compiler/xla/service/llvm_ir/BUILD | 1 + .../xla/service/llvm_ir/fused_ir_emitter.cc | 2 +- .../compiler/xla/service/llvm_ir/llvm_loop.cc | 1 - .../xla/service/llvm_ir/loop_emitter.cc | 4 +- .../compiler/xla/service/local_service.cc | 14 +- .../compiler/xla/service/platform_util.cc | 14 +- .../compiler/xla/service/scatter_expander.cc | 2 +- 
tensorflow/compiler/xla/service/service.cc | 66 +- .../compiler/xla/service/shape_inference.cc | 699 +++++++++--------- .../compiler/xla/service/shaped_buffer.cc | 10 +- .../compiler/xla/service/source_map_util.cc | 8 +- .../compiler/xla/service/source_map_util.h | 34 +- .../compiler/xla/service/transfer_manager.cc | 17 +- .../xla/service/tuple_points_to_analysis.cc | 17 +- tensorflow/compiler/xla/shape_layout.cc | 8 +- tensorflow/compiler/xla/shape_util.cc | 41 +- tensorflow/compiler/xla/tests/BUILD | 7 +- .../compiler/xla/tests/literal_test_util.cc | 5 +- .../xla/tests/matrix_ops_simple_test.cc | 5 +- tensorflow/compiler/xla/tests/reduce_test.cc | 12 +- tensorflow/compiler/xla/tests/reverse_test.cc | 7 +- tensorflow/compiler/xla/tests/slice_test.cc | 7 +- tensorflow/compiler/xla/tests/test_utils.cc | 4 +- .../compiler/xla/text_literal_reader.cc | 16 +- tensorflow/compiler/xla/tools/BUILD | 1 + .../dumped_computation_to_operation_list.cc | 9 +- .../compiler/xla/tools/replay_computation.cc | 2 +- tensorflow/compiler/xla/util.cc | 81 -- tensorflow/compiler/xla/util.h | 76 +- tensorflow/compiler/xla/window_util.cc | 1 - 133 files changed, 1241 insertions(+), 1372 deletions(-) diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD index 26bd1ac4f7..ddeba1d91d 100644 --- a/tensorflow/compiler/xla/BUILD +++ b/tensorflow/compiler/xla/BUILD @@ -175,6 +175,7 @@ cc_library( "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/container:inlined_vector", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", ], ) @@ -305,6 +306,7 @@ cc_library( "//tensorflow/core:lib", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", ], ) @@ -366,6 +368,7 @@ cc_library( ":util", "//tensorflow/core:lib", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", ], ) @@ -378,6 +381,7 @@ cc_library( ":util", "//tensorflow/core:lib", 
"@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", ], ) diff --git a/tensorflow/compiler/xla/array4d.h b/tensorflow/compiler/xla/array4d.h index 14e7bf1814..8557bb8fe4 100644 --- a/tensorflow/compiler/xla/array4d.h +++ b/tensorflow/compiler/xla/array4d.h @@ -31,7 +31,6 @@ limitations under the License. #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/lib/gtl/array_slice.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" diff --git a/tensorflow/compiler/xla/client/BUILD b/tensorflow/compiler/xla/client/BUILD index 9ad8ee2014..2638dea1bd 100644 --- a/tensorflow/compiler/xla/client/BUILD +++ b/tensorflow/compiler/xla/client/BUILD @@ -92,6 +92,7 @@ cc_library( "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/core:lib", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", "@com_google_absl//absl/types:optional", ], ) diff --git a/tensorflow/compiler/xla/client/executable_build_options.cc b/tensorflow/compiler/xla/client/executable_build_options.cc index 5a73408db5..0f1745366b 100644 --- a/tensorflow/compiler/xla/client/executable_build_options.cc +++ b/tensorflow/compiler/xla/client/executable_build_options.cc @@ -15,8 +15,8 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/executable_build_options.h" +#include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/shape_util.h" -#include "tensorflow/core/lib/strings/stringprintf.h" namespace xla { @@ -59,10 +59,10 @@ string ExecutableBuildOptions::ToString() const { if (generate_hlo_graph_.has_value()) { generate_hlo_graph = generate_hlo_graph_.value(); } - return tensorflow::strings::Printf( + return absl::StrFormat( "ExecutableBuildOptions{device_ordinal=%d, result_layout=%s, " "generate_hlo_graph=%s}", - device_ordinal_, result_layout.c_str(), generate_hlo_graph.c_str()); + device_ordinal_, result_layout, generate_hlo_graph); } ExecutableBuildOptions& ExecutableBuildOptions::set_generate_hlo_graph( diff --git a/tensorflow/compiler/xla/client/lib/constants.cc b/tensorflow/compiler/xla/client/lib/constants.cc index 031d62e4ff..1ada7b4a96 100644 --- a/tensorflow/compiler/xla/client/lib/constants.cc +++ b/tensorflow/compiler/xla/client/lib/constants.cc @@ -56,7 +56,7 @@ XlaOp Epsilon(XlaBuilder* builder, PrimitiveType type) { std::numeric_limits::epsilon()); default: return builder->ReportError(InvalidArgument( - "Invalid type for Epsilon (%s).", PrimitiveType_Name(type).c_str())); + "Invalid type for Epsilon (%s).", PrimitiveType_Name(type))); } } diff --git a/tensorflow/compiler/xla/client/lib/constants.h b/tensorflow/compiler/xla/client/lib/constants.h index 0c8a9b8cc0..81624614c1 100644 --- a/tensorflow/compiler/xla/client/lib/constants.h +++ b/tensorflow/compiler/xla/client/lib/constants.h @@ -37,13 +37,13 @@ XlaOp ConstantR0WithType(XlaBuilder* builder, PrimitiveType type, T value) { primitive_util::IsComplexType(type))) { return builder->ReportError(InvalidArgument( "Invalid cast from floating point type to %s in ConstantR0WithType.", - PrimitiveType_Name(type).c_str())); + PrimitiveType_Name(type))); } if (std::is_same::value && !primitive_util::IsComplexType(type)) { return builder->ReportError(InvalidArgument( "Invalid cast 
from complex type to %s in ConstantR0WithType.", - PrimitiveType_Name(type).c_str())); + PrimitiveType_Name(type))); } switch (type) { case F16: @@ -71,7 +71,7 @@ XlaOp ConstantR0WithType(XlaBuilder* builder, PrimitiveType type, T value) { default: return builder->ReportError( InvalidArgument("Invalid type for ConstantR0WithType (%s).", - PrimitiveType_Name(type).c_str())); + PrimitiveType_Name(type))); } } diff --git a/tensorflow/compiler/xla/client/lib/numeric.cc b/tensorflow/compiler/xla/client/lib/numeric.cc index 1c91237ae1..02bed80162 100644 --- a/tensorflow/compiler/xla/client/lib/numeric.cc +++ b/tensorflow/compiler/xla/client/lib/numeric.cc @@ -65,9 +65,8 @@ XlaOp Iota(XlaBuilder* builder, PrimitiveType type, int64 size) { case C64: return MakeIota(builder, size); default: - return builder->ReportError( - InvalidArgument("Unimplemented type for Iota: %s.", - PrimitiveType_Name(type).c_str())); + return builder->ReportError(InvalidArgument( + "Unimplemented type for Iota: %s.", PrimitiveType_Name(type))); } } diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc index 1cd3e9b22f..db7a8fc047 100644 --- a/tensorflow/compiler/xla/client/local_client.cc +++ b/tensorflow/compiler/xla/client/local_client.cc @@ -59,7 +59,7 @@ Status LocalExecutable::ValidateExecutionOptions( // Check argument number, shapes, and layouts. 
if (arguments.size() != computation_layout.parameter_count()) { return InvalidArgument( - "invalid number of arguments for computation: expected %d, got %zu", + "invalid number of arguments for computation: expected %d, got %u", computation_layout.parameter_count(), arguments.size()); } for (int i = 0; i < arguments.size(); ++i) { @@ -71,9 +71,9 @@ Status LocalExecutable::ValidateExecutionOptions( "parameter " "%d: want %s, got %s", i, - ShapeUtil::HumanString(computation_layout.parameter_layout(i).shape()) - .c_str(), - ShapeUtil::HumanString(arguments[i]->on_host_shape()).c_str()); + ShapeUtil::HumanString( + computation_layout.parameter_layout(i).shape()), + ShapeUtil::HumanString(arguments[i]->on_host_shape())); } } @@ -88,8 +88,7 @@ Status LocalExecutable::ValidateExecutionOptions( if (stream_platform != backend_->platform()) { return InvalidArgument( "stream is for platform %s, but service targets platform %s", - stream_platform->Name().c_str(), - backend_->platform()->Name().c_str()); + stream_platform->Name(), backend_->platform()->Name()); } // Cannot specify device_ordinal with a stream. 
The stream determines these @@ -120,10 +119,10 @@ Status LocalExecutable::ValidateExecutionOptions( return InvalidArgument( "executable is built for device %s of type \"%s\"; cannot run it on " "device %s of type \"%s\"", - backend_->device_name(build_device_ordinal()).c_str(), - build_executor->GetDeviceDescription().name().c_str(), - backend_->device_name(run_device_ordinal).c_str(), - run_executor->GetDeviceDescription().name().c_str()); + backend_->device_name(build_device_ordinal()), + build_executor->GetDeviceDescription().name(), + backend_->device_name(run_device_ordinal), + run_executor->GetDeviceDescription().name()); } if (!run_options.allocator()) { @@ -133,8 +132,8 @@ Status LocalExecutable::ValidateExecutionOptions( if (run_options.allocator()->platform() != backend.platform()) { return InvalidArgument( "allocator platform (%s) does not match service platform (%s)", - run_options.allocator()->platform()->Name().c_str(), - backend.platform()->Name().c_str()); + run_options.allocator()->platform()->Name(), + backend.platform()->Name()); } return Status::OK(); diff --git a/tensorflow/compiler/xla/client/padding.cc b/tensorflow/compiler/xla/client/padding.cc index 6a9cf466ac..ed4dc8e9f6 100644 --- a/tensorflow/compiler/xla/client/padding.cc +++ b/tensorflow/compiler/xla/client/padding.cc @@ -31,8 +31,8 @@ Status ValidatePaddingValues( input_dimensions.size() == window_strides.size(); if (!ok) { return InvalidArgument( - "Want input dimensions size %zu = window dimensions size %zu = window " - "strides size %zu", + "Want input dimensions size %u = window dimensions size %u = window " + "strides size %u", input_dimensions.size(), window_dimensions.size(), window_strides.size()); } diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc index 9f902d7298..65b110e285 100644 --- a/tensorflow/compiler/xla/client/xla_builder.cc +++ b/tensorflow/compiler/xla/client/xla_builder.cc @@ -72,7 +72,7 @@ XlaOp 
operator>>(const XlaOp& x, const XlaOp& y) { if (!ShapeUtil::ElementIsIntegral(shape)) { return InvalidArgument( "Argument to >> operator does not have an integral type (%s).", - ShapeUtil::HumanString(shape).c_str()); + ShapeUtil::HumanString(shape)); } if (ShapeUtil::ElementIsSigned(shape)) { return ShiftRightArithmetic(x, y); @@ -492,7 +492,7 @@ XlaOp XlaBuilder::Parameter(int64 parameter_number, const Shape& shape, return ReportErrorOrReturn([&]() -> StatusOr { HloInstructionProto instr; if (!parameter_numbers_.insert(parameter_number).second) { - return InvalidArgument("parameter %lld already registered", + return InvalidArgument("parameter %d already registered", parameter_number); } instr.set_parameter_number(parameter_number); @@ -766,7 +766,7 @@ XlaOp XlaBuilder::GetTupleElement(const XlaOp& tuple_data, int64 index) { if (!ShapeUtil::IsTuple(tuple_shape)) { return InvalidArgument( "Operand to GetTupleElement() is not a tuple; got %s", - ShapeUtil::HumanString(tuple_shape).c_str()); + ShapeUtil::HumanString(tuple_shape)); } *instr.mutable_shape() = ShapeUtil::GetTupleElementShape(tuple_shape, index); @@ -847,16 +847,14 @@ Status XlaBuilder::VerifyConvolution( return InvalidArgument( "Convolution arguments must have same number of " "dimensions. Got: %s and %s", - ShapeUtil::HumanString(lhs_shape).c_str(), - ShapeUtil::HumanString(rhs_shape).c_str()); + ShapeUtil::HumanString(lhs_shape), ShapeUtil::HumanString(rhs_shape)); } int num_dims = ShapeUtil::Rank(lhs_shape); if (num_dims < 2) { return InvalidArgument( "Convolution expects argument arrays with >= 3 dimensions. 
" "Got: %s and %s", - ShapeUtil::HumanString(lhs_shape).c_str(), - ShapeUtil::HumanString(rhs_shape).c_str()); + ShapeUtil::HumanString(lhs_shape), ShapeUtil::HumanString(rhs_shape)); } int num_spatial_dims = num_dims - 2; @@ -870,7 +868,7 @@ Status XlaBuilder::VerifyConvolution( } for (int i = 0; i < numbers.size(); ++i) { if (numbers.Get(i) < 0 || numbers.Get(i) >= num_dims) { - return InvalidArgument("Convolution %s[%d] is out of bounds: %lld", + return InvalidArgument("Convolution %s[%d] is out of bounds: %d", field_name, i, numbers.Get(i)); } } @@ -1016,8 +1014,7 @@ StatusOr XlaBuilder::MakeWindow( "Window has different number of window dimensions than of ", x_name, "\nNumber of window dimensions: ", window_dimensions.size(), - "\nNumber of ", x_name, ": ", x, "\n") - .c_str()); + "\nNumber of ", x_name, ": ", x, "\n")); } }; TF_RETURN_IF_ERROR(verify_size(window_strides.size(), "window strides")); @@ -1193,8 +1190,8 @@ void XlaBuilder::Outfeed(const XlaOp& operand, const Shape& shape_with_layout, if (!ShapeUtil::Compatible(operand_shape, shape_with_layout)) { return InvalidArgument( "Outfeed shape %s must be compatible with operand shape %s", - ShapeUtil::HumanStringWithLayout(shape_with_layout).c_str(), - ShapeUtil::HumanStringWithLayout(operand_shape).c_str()); + ShapeUtil::HumanStringWithLayout(shape_with_layout), + ShapeUtil::HumanStringWithLayout(operand_shape)); } *instr.mutable_outfeed_shape() = shape_with_layout; @@ -1246,8 +1243,8 @@ XlaOp XlaBuilder::OutfeedWithToken(const XlaOp& operand, const XlaOp& token, if (!ShapeUtil::Compatible(operand_shape, shape_with_layout)) { return InvalidArgument( "Outfeed shape %s must be compatible with operand shape %s", - ShapeUtil::HumanStringWithLayout(shape_with_layout).c_str(), - ShapeUtil::HumanStringWithLayout(operand_shape).c_str()); + ShapeUtil::HumanStringWithLayout(shape_with_layout), + ShapeUtil::HumanStringWithLayout(operand_shape)); } *instr.mutable_outfeed_shape() = shape_with_layout; @@ -1286,7 
+1283,7 @@ XlaOp XlaBuilder::CustomCall(const string& call_target_name, return InvalidArgument( "Invalid custom_call_target \"%s\": Call targets that start with '$' " "are reserved for internal use.", - call_target_name.c_str()); + call_target_name); } *instr.mutable_shape() = shape; instr.set_custom_call_target(call_target_name); @@ -1590,7 +1587,7 @@ XlaOp XlaBuilder::RngOp(RandomDistribution distribution, if (parameters.size() != 2) { return InvalidArgument( "RNG distribution (%s) expects 2 parameters, but got %ld", - RandomDistribution_Name(distribution).c_str(), parameters.size()); + RandomDistribution_Name(distribution), parameters.size()); } break; default: @@ -2140,13 +2137,13 @@ XlaOp XlaBuilder::SendToHost(const XlaOp& operand, const XlaOp& token, if (!ShapeUtil::Compatible(operand_shape, shape_with_layout)) { return InvalidArgument( "SendToHost shape %s must be compatible with operand shape %s", - ShapeUtil::HumanStringWithLayout(shape_with_layout).c_str(), - ShapeUtil::HumanStringWithLayout(operand_shape).c_str()); + ShapeUtil::HumanStringWithLayout(shape_with_layout), + ShapeUtil::HumanStringWithLayout(operand_shape)); } // TODO(b/111544877): Support tuple shapes. if (!ShapeUtil::IsArray(operand_shape)) { return InvalidArgument("SendToHost only supports array shapes, shape: %s", - ShapeUtil::HumanString(operand_shape).c_str()); + ShapeUtil::HumanString(operand_shape)); } if (handle.type() != ChannelHandle::DEVICE_TO_HOST) { @@ -2185,7 +2182,7 @@ XlaOp XlaBuilder::RecvFromHost(const XlaOp& token, const Shape& shape, if (!ShapeUtil::IsArray(shape)) { return InvalidArgument( "RecvFromHost only supports array shapes, shape: %s", - ShapeUtil::HumanString(shape).c_str()); + ShapeUtil::HumanString(shape)); } if (handle.type() != ChannelHandle::HOST_TO_DEVICE) { @@ -2240,7 +2237,7 @@ StatusOr XlaBuilder::BuildConstantSubGraph( "of being evaluated at XLA compile time.\n\n" "Please file a usability bug with the framework being used (e.g. 
" "TensorFlow).", - op_string.c_str()); + op_string); } TF_ASSIGN_OR_RETURN(const HloInstructionProto* root, @@ -2348,8 +2345,8 @@ XlaBuilder::CreateDefaultConvDimensionNumbers(int num_spatial_dims) { dnum.input_spatial_dimensions(0), dnum.input_spatial_dimensions(1)}) .size() != 4) { return FailedPrecondition( - "dimension numbers for the input are not unique: (%lld, %lld, %lld, " - "%lld)", + "dimension numbers for the input are not unique: (%d, %d, %d, " + "%d)", dnum.input_batch_dimension(), dnum.input_feature_dimension(), dnum.input_spatial_dimensions(0), dnum.input_spatial_dimensions(1)); } @@ -2359,8 +2356,8 @@ XlaBuilder::CreateDefaultConvDimensionNumbers(int num_spatial_dims) { dnum.kernel_spatial_dimensions(1)}) .size() != 4) { return FailedPrecondition( - "dimension numbers for the weight are not unique: (%lld, %lld, %lld, " - "%lld)", + "dimension numbers for the weight are not unique: (%d, %d, %d, " + "%d)", dnum.kernel_output_feature_dimension(), dnum.kernel_input_feature_dimension(), dnum.kernel_spatial_dimensions(0), dnum.kernel_spatial_dimensions(1)); @@ -2371,8 +2368,8 @@ XlaBuilder::CreateDefaultConvDimensionNumbers(int num_spatial_dims) { dnum.output_spatial_dimensions(1)}) .size() != 4) { return FailedPrecondition( - "dimension numbers for the output are not unique: (%lld, %lld, %lld, " - "%lld)", + "dimension numbers for the output are not unique: (%d, %d, %d, " + "%d)", dnum.output_batch_dimension(), dnum.output_feature_dimension(), dnum.output_spatial_dimensions(0), dnum.output_spatial_dimensions(1)); } @@ -2392,13 +2389,11 @@ StatusOr XlaBuilder::AddInstruction( } for (const auto& operand : operands) { if (operand.builder_ == nullptr) { - return InvalidArgument("invalid XlaOp with handle %lld", - operand.handle()); + return InvalidArgument("invalid XlaOp with handle %d", operand.handle()); } if (operand.builder_ != this) { return InvalidArgument("Do not add XlaOp from builder %s to builder %s", - operand.builder_->name().c_str(), - 
this->name().c_str()); + operand.builder_->name(), this->name()); } instr.add_operand_ids(operand.handle()); } @@ -2428,18 +2423,18 @@ StatusOr XlaBuilder::LookUpInstruction( if (op.builder_ == nullptr) { return InvalidArgument( - "invalid XlaOp with handle %lld; the builder of this op is freed", + "invalid XlaOp with handle %d; the builder of this op is freed", op.handle()); } if (op.builder_ != this) { return InvalidArgument( - "XlaOp with handle %lld is built by builder '%s', but is trying to use " + "XlaOp with handle %d is built by builder '%s', but is trying to use " "it in builder '%s'", - op.handle(), op.builder_->name().c_str(), this->name().c_str()); + op.handle(), op.builder_->name(), this->name()); } if (op.handle() >= instructions_.size() || op.handle() < 0) { - return InvalidArgument("no XlaOp value %lld", op.handle()); + return InvalidArgument("no XlaOp value %d", op.handle()); } return &instructions_[op.handle()]; } diff --git a/tensorflow/compiler/xla/layout_util.cc b/tensorflow/compiler/xla/layout_util.cc index 61c26434b1..cce1838ef3 100644 --- a/tensorflow/compiler/xla/layout_util.cc +++ b/tensorflow/compiler/xla/layout_util.cc @@ -169,7 +169,7 @@ Layout CreateDefaultLayoutForRank(int64 rank) { } else if (ShapeUtil::IsArray(shape)) { if (!shape.has_layout()) { return InvalidArgument("shape %s does not have a layout", - ShapeUtil::HumanString(shape).c_str()); + ShapeUtil::HumanString(shape)); } return ValidateLayoutForShape(shape.layout(), shape); } else { @@ -177,7 +177,7 @@ Layout CreateDefaultLayoutForRank(int64 rank) { if (shape.has_layout()) { return InvalidArgument( "shape of primitive type %s should not have a layout", - PrimitiveType_Name(shape.element_type()).c_str()); + PrimitiveType_Name(shape.element_type())); } return Status::OK(); } @@ -194,7 +194,7 @@ Layout CreateDefaultLayoutForRank(int64 rank) { layout.padded_dimensions_size() != 0) { return InvalidArgument( "shape of primitive type %s should not have a non-trivial layout", - 
PrimitiveType_Name(shape.element_type()).c_str()); + PrimitiveType_Name(shape.element_type())); } return Status::OK(); } @@ -202,17 +202,17 @@ Layout CreateDefaultLayoutForRank(int64 rank) { if (layout.format() == INVALID_FORMAT) { return InvalidArgument( "Layout does not have a valid format: layout {%s}, shape {%s}", - layout.ShortDebugString().c_str(), shape.ShortDebugString().c_str()); + layout.ShortDebugString(), shape.ShortDebugString()); } if (layout.format() == DENSE) { if (layout.minor_to_major_size() != ShapeUtil::Rank(shape)) { return InvalidArgument( "layout minor_to_major field contains %d elements, " - "but shape is rank %lld: {%s}; shape: %s", + "but shape is rank %d: {%s}; shape: %s", layout.minor_to_major_size(), ShapeUtil::Rank(shape), - absl::StrJoin(layout.minor_to_major(), ", ").c_str(), - shape.ShortDebugString().c_str()); + absl::StrJoin(layout.minor_to_major(), ", "), + shape.ShortDebugString()); } std::vector dimensions_in_layout(ShapeUtil::Rank(shape), false); @@ -221,12 +221,12 @@ Layout CreateDefaultLayoutForRank(int64 rank) { if (dim < 0 || dim >= ShapeUtil::Rank(shape)) { return InvalidArgument( "layout minor_to_major field has out-of-bounds value: %s", - HumanString(layout).c_str()); + HumanString(layout)); } if (dimensions_in_layout[dim]) { return InvalidArgument( "layout minor_to_major field has duplicate values: {%s}", - HumanString(layout).c_str()); + HumanString(layout)); } dimensions_in_layout[dim] = true; } @@ -234,14 +234,14 @@ Layout CreateDefaultLayoutForRank(int64 rank) { if (layout.padded_dimensions_size() > 0) { if (layout.padded_dimensions_size() != ShapeUtil::Rank(shape)) { return InvalidArgument( - "layout has %d padded dimensions, but shape is rank %lld", + "layout has %d padded dimensions, but shape is rank %d", layout.padded_dimensions_size(), ShapeUtil::Rank(shape)); } for (int i = 0; i < layout.padded_dimensions_size(); ++i) { if (layout.padded_dimensions(i) < shape.dimensions(i)) { return InvalidArgument( - "for 
dimension %d, dimension padding (%lld) is smaller than " - "the dimension size (%lld) of the shape", + "for dimension %d, dimension padding (%d) is smaller than " + "the dimension size (%d) of the shape", i, layout.padded_dimensions(i), shape.dimensions(i)); } } diff --git a/tensorflow/compiler/xla/legacy_flags/BUILD b/tensorflow/compiler/xla/legacy_flags/BUILD index 989035896b..3e79129aaf 100644 --- a/tensorflow/compiler/xla/legacy_flags/BUILD +++ b/tensorflow/compiler/xla/legacy_flags/BUILD @@ -26,6 +26,7 @@ cc_library( "//tensorflow/compiler/xla:types", "//tensorflow/core:framework_internal", "//tensorflow/core:lib", + "@com_google_absl//absl/strings", ], ) @@ -39,6 +40,7 @@ tf_cc_test( "//tensorflow/core:framework_internal", "//tensorflow/core:lib", "//tensorflow/core:test", + "@com_google_absl//absl/strings:str_format", ], ) @@ -75,5 +77,6 @@ tf_cc_test( "//tensorflow/core:lib", "//tensorflow/core:test", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", ], ) diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_parsers.h b/tensorflow/compiler/xla/legacy_flags/debug_options_parsers.h index acda438395..ee7eb019c0 100644 --- a/tensorflow/compiler/xla/legacy_flags/debug_options_parsers.h +++ b/tensorflow/compiler/xla/legacy_flags/debug_options_parsers.h @@ -21,7 +21,6 @@ limitations under the License. #include "absl/strings/str_split.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/xla.pb.h" -#include "tensorflow/core/lib/strings/stringprintf.h" namespace xla { namespace legacy_flags { diff --git a/tensorflow/compiler/xla/legacy_flags/parse_flags_from_env_test.cc b/tensorflow/compiler/xla/legacy_flags/parse_flags_from_env_test.cc index 7b6ae311c1..138c0c852e 100644 --- a/tensorflow/compiler/xla/legacy_flags/parse_flags_from_env_test.cc +++ b/tensorflow/compiler/xla/legacy_flags/parse_flags_from_env_test.cc @@ -21,8 +21,8 @@ limitations under the License. 
#include #include +#include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/subprocess.h" #include "tensorflow/core/platform/test.h" @@ -106,8 +106,8 @@ TEST(ParseFlagsFromEnv, File) { if (tmp_dir == nullptr) { tmp_dir = kTempDir; } - string tmp_file = tensorflow::strings::Printf("%s/parse_flags_from_env.%d", - tmp_dir, getpid()); + string tmp_file = + absl::StrFormat("%s/parse_flags_from_env.%d", tmp_dir, getpid()); FILE* fp = fopen(tmp_file.c_str(), "w"); CHECK_NE(fp, nullptr) << "can't write to " << tmp_file; for (int i = 0; kTestFlagString[i] != '\0'; i++) { diff --git a/tensorflow/compiler/xla/literal.cc b/tensorflow/compiler/xla/literal.cc index 0c0b619d50..93e808469a 100644 --- a/tensorflow/compiler/xla/literal.cc +++ b/tensorflow/compiler/xla/literal.cc @@ -24,6 +24,7 @@ limitations under the License. #include "absl/memory/memory.h" #include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/index_util.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -33,7 +34,6 @@ limitations under the License. 
#include "tensorflow/core/lib/core/casts.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/hash/hash.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" @@ -41,7 +41,7 @@ namespace xla { namespace { using absl::StrCat; -using tensorflow::strings::Printf; +using absl::StrFormat; constexpr bool kLittleEndian = __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__; @@ -303,7 +303,7 @@ MutableLiteralBase::CreateFromProto(const LiteralProto& proto) { if (proto_element->tuple_literals_size() != ShapeUtil::TupleElementCount(piece->subshape())) { return InvalidArgument( - "Expected %lld tuple elements in LiteralProto, has %d", + "Expected %d tuple elements in LiteralProto, has %d", ShapeUtil::TupleElementCount(piece->subshape()), proto_element->tuple_literals_size()); } @@ -404,7 +404,7 @@ Status LiteralBase::Piece::CopyFrom(const LiteralBase::Piece& src) { default: return Unimplemented( "Copying a Literal object with element type %s is not implemented.", - PrimitiveType_Name(subshape().element_type()).c_str()); + PrimitiveType_Name(subshape().element_type())); } } return Status::OK(); @@ -420,8 +420,8 @@ Status MutableLiteralBase::CopyFrom(const LiteralSlice& src_literal, if (!ShapeUtil::Compatible(dest_subshape, src_subshape)) { return InvalidArgument( "Destination subshape incompatible with source subshape: %s vs %s", - ShapeUtil::HumanString(dest_subshape).c_str(), - ShapeUtil::HumanString(src_subshape).c_str()); + ShapeUtil::HumanString(dest_subshape), + ShapeUtil::HumanString(src_subshape)); } return root_piece_->ForEachMutableSubpieceWithStatus( [&](const ShapeIndex& index, Piece* piece) { @@ -458,8 +458,8 @@ Status Literal::MoveFrom(Literal&& src_literal, if (!ShapeUtil::Equal(dest_subshape, src_literal.shape())) { return InvalidArgument( "Destination subshape not equal to source shape: %s vs %s", - ShapeUtil::HumanString(dest_subshape).c_str(), - 
ShapeUtil::HumanString(src_literal.shape()).c_str()); + ShapeUtil::HumanString(dest_subshape), + ShapeUtil::HumanString(src_literal.shape())); } src_literal.root_piece_->ForEachSubpiece( @@ -654,8 +654,8 @@ StatusOr> LiteralBase::Reshape( return InvalidArgument( "Shapes before and after Literal::Reshape have different numbers " "of elements: %s vs %s.", - ShapeUtil::HumanString(shape()).c_str(), - ShapeUtil::HumanString(output->shape()).c_str()); + ShapeUtil::HumanString(shape()), + ShapeUtil::HumanString(output->shape())); } return std::move(output); } @@ -874,9 +874,8 @@ StatusOr LiteralBase::GetIntegralAsS64( case U64: return Get(multi_index); default: - return FailedPrecondition( - "Array element type is not integral: %s", - PrimitiveType_Name(shape().element_type()).c_str()); + return FailedPrecondition("Array element type is not integral: %s", + PrimitiveType_Name(shape().element_type())); } } @@ -924,9 +923,8 @@ Status MutableLiteralBase::SetIntegralAsS64( Set(multi_index, value); break; default: - return FailedPrecondition( - "Array element type is not integral: %s", - PrimitiveType_Name(shape().element_type()).c_str()); + return FailedPrecondition("Array element type is not integral: %s", + PrimitiveType_Name(shape().element_type())); } return Status::OK(); } @@ -1116,9 +1114,9 @@ void ToStringHelper(const LiteralBase& literal, const ShapeIndex& shape_index, pieces->push_back(shape_to_string(subshape)); pieces->push_back(" {\n"); for (int64 i0 = 0; i0 < subshape.dimensions(0); ++i0) { - pieces->push_back(Printf(" { /*i0=%lld*/\n", i0)); + pieces->push_back(StrFormat(" { /*i0=%d*/\n", i0)); for (int64 i1 = 0; i1 < subshape.dimensions(1); ++i1) { - pieces->push_back(Printf(" { /*i1=%lld*/\n", i1)); + pieces->push_back(StrFormat(" { /*i1=%d*/\n", i1)); for (int64 i2 = 0; i2 < subshape.dimensions(2); ++i2) { pieces->push_back(" {"); for (int64 i3 = 0; i3 < subshape.dimensions(3); ++i3) { @@ -1136,11 +1134,11 @@ void ToStringHelper(const LiteralBase& literal, 
const ShapeIndex& shape_index, pieces->push_back(shape_to_string(subshape)); pieces->push_back(" {\n"); for (int64 i0 = 0; i0 < subshape.dimensions(0); ++i0) { - pieces->push_back(Printf(" { /*i0=%lld*/\n", i0)); + pieces->push_back(StrFormat(" { /*i0=%d*/\n", i0)); for (int64 i1 = 0; i1 < subshape.dimensions(1); ++i1) { - pieces->push_back(Printf(" { /*i1=%lld*/\n", i1)); + pieces->push_back(StrFormat(" { /*i1=%d*/\n", i1)); for (int64 i2 = 0; i2 < subshape.dimensions(2); ++i2) { - pieces->push_back(Printf(" { /*i2=%lld*/\n", i2)); + pieces->push_back(StrFormat(" { /*i2=%d*/\n", i2)); for (int64 i3 = 0; i3 < subshape.dimensions(3); ++i3) { pieces->push_back(" {"); for (int64 i4 = 0; i4 < subshape.dimensions(4); ++i4) { @@ -1312,10 +1310,9 @@ StatusOr> ConvertIfDestTypeMatches( default: break; } - return Unimplemented( - "Converting from type %s to type %s is not implemented.", - PrimitiveType_Name(src_literal.shape().element_type()).c_str(), - PrimitiveType_Name(primitive_dest_type).c_str()); + return Unimplemented("Converting from type %s to type %s is not implemented.", + PrimitiveType_Name(src_literal.shape().element_type()), + PrimitiveType_Name(primitive_dest_type)); } StatusOr> ConvertSwitch( @@ -1344,11 +1341,10 @@ StatusOr> ConvertSwitch( #undef CONVERT_IF_DEST_TYPE_MATCHES // Other types are not yet supported. default: - return Unimplemented( - "%s from type %s to type %s is not implemented.", - (bitcast ? "Bitcast converting" : "Converting"), - PrimitiveType_Name(literal.shape().element_type()).c_str(), - PrimitiveType_Name(primitive_dest_type).c_str()); + return Unimplemented("%s from type %s to type %s is not implemented.", + (bitcast ? 
"Bitcast converting" : "Converting"), + PrimitiveType_Name(literal.shape().element_type()), + PrimitiveType_Name(primitive_dest_type)); } } @@ -1366,8 +1362,8 @@ StatusOr> LiteralBase::BitcastConvert( return InvalidArgument( "Cannot bitcast convert from %s to %s, bit widths are different: %d != " "%d", - PrimitiveType_Name(shape().element_type()).c_str(), - PrimitiveType_Name(primitive_dest_type).c_str(), + PrimitiveType_Name(shape().element_type()), + PrimitiveType_Name(primitive_dest_type), primitive_util::BitWidth(shape().element_type()), primitive_util::BitWidth(primitive_dest_type)); } diff --git a/tensorflow/compiler/xla/literal_comparison.cc b/tensorflow/compiler/xla/literal_comparison.cc index 67a69c2403..43388ac9d1 100644 --- a/tensorflow/compiler/xla/literal_comparison.cc +++ b/tensorflow/compiler/xla/literal_comparison.cc @@ -20,15 +20,15 @@ limitations under the License. #include #include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/casts.h" #include "tensorflow/core/platform/env.h" using absl::StrAppend; +using absl::StrAppendFormat; using absl::StrCat; -using tensorflow::strings::Appendf; -using tensorflow::strings::Printf; namespace xla { namespace literal_comparison { @@ -48,9 +48,9 @@ Status CompareFloatsBitwiseEqual( return InvalidArgument( "floating values are not bitwise-equal; and equality testing " "was requested: %s=%g=%a vs %s=%g=%a at array index %s", - StrCat(absl::Hex(ulhs)).c_str(), lhs_double, lhs_double, - StrCat(absl::Hex(urhs)).c_str(), rhs_double, rhs_double, - LiteralUtil::MultiIndexAsString(multi_index).c_str()); + StrCat(absl::Hex(ulhs)), lhs_double, lhs_double, + StrCat(absl::Hex(urhs)), rhs_double, rhs_double, + LiteralUtil::MultiIndexAsString(multi_index)); } return Status::OK(); } @@ -67,8 +67,7 @@ Status CompareEqual(NativeT lhs, NativeT rhs, return InvalidArgument( "first mismatch 
at array index %s:\n expected value: %s\n actual " "value: %s", - LiteralUtil::MultiIndexAsString(multi_index).c_str(), StrCat(lhs).c_str(), - StrCat(rhs).c_str()); + LiteralUtil::MultiIndexAsString(multi_index), StrCat(lhs), StrCat(rhs)); } // Specializations for floating types that do bitwise comparisons when equality @@ -168,12 +167,12 @@ bool NanMismatch(half expected, half actual, bool relaxed_nans) { // Converts the given floating-point value to a string. template string FpValueToString(NativeT value) { - return Printf("%8.4g", static_cast(value)); + return absl::StrFormat("%8.4g", static_cast(value)); } template <> string FpValueToString(complex64 value) { - return Printf("%8.4g + %8.4fi", value.real(), value.imag()); + return absl::StrFormat("%8.4g + %8.4fi", value.real(), value.imag()); } // Returns the absolute value of the given floating point value. This function @@ -228,13 +227,12 @@ class NearComparator { } string ToString(const Shape& shape) const { - return Printf( + return absl::StrFormat( "actual %s, expected %s, index %s, rel error %8.3g, abs error %8.3g", - FpValueToString(actual).c_str(), FpValueToString(expected).c_str(), + FpValueToString(actual), FpValueToString(expected), LiteralUtil::MultiIndexAsString( IndexUtil::LinearIndexToMultidimensionalIndex(shape, - linear_index)) - .c_str(), + linear_index)), rel_error, abs_error); } }; @@ -258,7 +256,7 @@ class NearComparator { TF_RETURN_IF_ERROR(EqualShapes(expected_.shape(), actual_.shape())); if (!ShapeUtil::IsArray(expected_.shape())) { return InvalidArgument("Expected array shape; got %s.", - ShapeUtil::HumanString(expected_.shape()).c_str()); + ShapeUtil::HumanString(expected_.shape())); } mismatches_ = Literal(ShapeUtil::ChangeElementType(actual_.shape(), PRED)); @@ -271,7 +269,7 @@ class NearComparator { } else if (!VLOG_IS_ON(1) && miscompare_callback_ != nullptr) { miscompare_callback_(expected_, actual_, mismatches_); } - return InvalidArgument("%s", ErrorMessage().c_str()); + return 
InvalidArgument("%s", ErrorMessage()); } // Insert the given absolute value into the absolute value bucket vector. The @@ -410,23 +408,23 @@ class NearComparator { auto percent_string = [](float a, float b) { float pct = b == 0.0 ? 0.0 : 100.0 * a / b; - return Printf("%0.4f%%", pct); + return absl::StrFormat("%0.4f%%", pct); }; - Appendf(&out, - "\nMismatch count %lld (%s) in shape %s (%lld elements), abs bound " - "%g, rel bound %g\n", - num_mismatches_, - percent_string(num_mismatches_, element_count).c_str(), - ShapeUtil::HumanString(actual_.shape()).c_str(), - ShapeUtil::ElementsIn(actual_.shape()), error_.abs, error_.rel); + StrAppendFormat( + &out, + "\nMismatch count %d (%s) in shape %s (%d elements), abs bound " + "%g, rel bound %g\n", + num_mismatches_, percent_string(num_mismatches_, element_count), + ShapeUtil::HumanString(actual_.shape()), + ShapeUtil::ElementsIn(actual_.shape()), error_.abs, error_.rel); if (num_nan_mismatches_ > 0) { StrAppend(&out, "nan mismatches ", num_nan_mismatches_, "\n"); } - Appendf(&out, "Top relative error mismatches:\n"); + StrAppendFormat(&out, "Top relative error mismatches:\n"); for (auto it = top_rel_mismatches_.rbegin(); it != top_rel_mismatches_.rend(); ++it) { - StrAppend(&out, " ", it->ToString(actual_.shape()).c_str(), "\n"); + StrAppend(&out, " ", it->ToString(actual_.shape()), "\n"); } if (!detailed_message_) { @@ -438,36 +436,37 @@ class NearComparator { for (int i = 0; i < abs_value_buckets_.size(); ++i) { const int64 bucket_size = abs_value_buckets_[i].first; const int64 bucket_mismatches = abs_value_buckets_[i].second; - string mismatch_str = bucket_mismatches > 0 - ? Printf(", mismatches %lld", bucket_mismatches) - : ""; - Appendf(&out, " %-6g <= x < %-6g : %7lld (%9s)%s\n", - kAbsValueBucketBounds[i], kAbsValueBucketBounds[i + 1], - bucket_size, percent_string(bucket_size, element_count).c_str(), - mismatch_str.c_str()); + string mismatch_str = + bucket_mismatches > 0 + ? 
absl::StrFormat(", mismatches %d", bucket_mismatches) + : ""; + StrAppendFormat(&out, " %-6g <= x < %-6g : %7d (%9s)%s\n", + kAbsValueBucketBounds[i], kAbsValueBucketBounds[i + 1], + bucket_size, percent_string(bucket_size, element_count), + mismatch_str); } auto print_accum_buckets = [&](const string& header, int64 total, tensorflow::gtl::ArraySlice buckets) { StrAppend(&out, header, ":\n"); - Appendf(&out, " < %-6g : %7lld (%s)\n", kErrorBucketBounds[0], - total - buckets[0], - percent_string(total - buckets[0], total).c_str()); + StrAppendFormat(&out, " < %-6g : %7d (%s)\n", kErrorBucketBounds[0], + total - buckets[0], + percent_string(total - buckets[0], total)); CHECK_EQ(buckets.size(), kErrorBucketBounds.size()); for (int i = 0; i < kErrorBucketBounds.size(); ++i) { - Appendf(&out, " >= %-6g : %7lld (%s)\n", kErrorBucketBounds[i], - buckets[i], percent_string(buckets[i], total).c_str()); + StrAppendFormat(&out, " >= %-6g : %7d (%s)\n", kErrorBucketBounds[i], + buckets[i], percent_string(buckets[i], total)); } }; - Appendf(&out, "Elements exceeding abs error bound %g: %lld (%s)\n", - error_.abs, num_abs_mismatches_, - percent_string(num_abs_mismatches_, element_count).c_str()); + StrAppendFormat(&out, "Elements exceeding abs error bound %g: %d (%s)\n", + error_.abs, num_abs_mismatches_, + percent_string(num_abs_mismatches_, element_count)); print_accum_buckets( "Relative error breakdown of elements exceeding abs error bound", num_abs_mismatches_, rel_error_buckets_); - Appendf(&out, "Elements exceeding rel error bound %g: %lld (%s)\n", - error_.rel, num_rel_mismatches_, - percent_string(num_rel_mismatches_, element_count).c_str()); + StrAppendFormat(&out, "Elements exceeding rel error bound %g: %d (%s)\n", + error_.rel, num_rel_mismatches_, + percent_string(num_rel_mismatches_, element_count)); print_accum_buckets( "Absolute error breakdown of elements exceeding rel error bound", num_rel_mismatches_, abs_error_buckets_); @@ -612,9 +611,9 @@ Status 
NearHelper(const LiteralSlice& expected, const LiteralSlice& actual, NearHelper(expected_element, actual_element, error, detailed_message, miscompare_callback, element_index); if (!element_result.ok()) { - element_result = InvalidArgument( - "Array at shape index %s, %s", element_index.ToString().c_str(), - element_result.error_message().c_str()); + element_result = InvalidArgument("Array at shape index %s, %s", + element_index.ToString(), + element_result.error_message()); if (return_status.ok()) { return_status = element_result; } else { @@ -627,10 +626,10 @@ Status NearHelper(const LiteralSlice& expected, const LiteralSlice& actual, // Emit a top-level error message containing the top-level shape in case // of mismatch. int64 total_elements = RecursiveElementCount(actual.shape()); - return_status = InvalidArgument( - "\nMismatches in shape %s (%lld elements):\n%s", - ShapeUtil::HumanString(actual.shape()).c_str(), total_elements, - return_status.error_message().c_str()); + return_status = + InvalidArgument("\nMismatches in shape %s (%d elements):\n%s", + ShapeUtil::HumanString(actual.shape()), + total_elements, return_status.error_message()); } return return_status; } @@ -674,14 +673,14 @@ Status NearHelper(const LiteralSlice& expected, const LiteralSlice& actual, Status EqualShapes(const Shape& expected, const Shape& actual) { if (expected.element_type() != actual.element_type()) { return InvalidArgument("element type mismatch, want: %s got %s", - ShapeUtil::HumanString(expected).c_str(), - ShapeUtil::HumanString(actual).c_str()); + ShapeUtil::HumanString(expected), + ShapeUtil::HumanString(actual)); } if (ShapeUtil::IsTuple(expected)) { if (ShapeUtil::TupleElementCount(expected) != ShapeUtil::TupleElementCount(actual)) { return InvalidArgument( - "want tuple element count: %lld got tuple element count: %lld", + "want tuple element count: %d got tuple element count: %d", ShapeUtil::TupleElementCount(expected), ShapeUtil::TupleElementCount(actual)); } @@ -695,14 
+694,13 @@ Status EqualShapes(const Shape& expected, const Shape& actual) { } else if (ShapeUtil::IsArray(expected)) { if (ShapeUtil::Rank(expected) != ShapeUtil::Rank(actual)) { return InvalidArgument("want rank of %s got rank of %s", - ShapeUtil::HumanString(expected).c_str(), - ShapeUtil::HumanString(actual).c_str()); + ShapeUtil::HumanString(expected), + ShapeUtil::HumanString(actual)); } if (expected.element_type() != actual.element_type()) { - return InvalidArgument( - "mismatch in primitive type %s vs %s", - PrimitiveType_Name(expected.element_type()).c_str(), - PrimitiveType_Name(actual.element_type()).c_str()); + return InvalidArgument("mismatch in primitive type %s vs %s", + PrimitiveType_Name(expected.element_type()), + PrimitiveType_Name(actual.element_type())); } if (expected.dimensions_size() != actual.dimensions_size()) { return InvalidArgument("want dimensions_size %d got dimensions_size %d", @@ -713,8 +711,7 @@ Status EqualShapes(const Shape& expected, const Shape& actual) { if (expected.dimensions(i) != actual.dimensions(i)) { return InvalidArgument( "mismatch in dimension #%d expected: %s actual: %s", i, - ShapeUtil::HumanString(expected).c_str(), - ShapeUtil::HumanString(actual).c_str()); + ShapeUtil::HumanString(expected), ShapeUtil::HumanString(actual)); } } } @@ -733,9 +730,8 @@ Status EmitLiteralsInErrorMessage(const Status& result, return result; } return InvalidArgument("%s\n\nExpected literal:\n%s\n\nActual literal:\n%s", - result.error_message().c_str(), - ToStringTruncated(expected).c_str(), - ToStringTruncated(actual).c_str()); + result.error_message(), ToStringTruncated(expected), + ToStringTruncated(actual)); } } // namespace diff --git a/tensorflow/compiler/xla/literal_util.cc b/tensorflow/compiler/xla/literal_util.cc index 95d93acfe8..931d2c631b 100644 --- a/tensorflow/compiler/xla/literal_util.cc +++ b/tensorflow/compiler/xla/literal_util.cc @@ -33,7 +33,6 @@ limitations under the License. 
#include "tensorflow/core/lib/core/casts.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/hash/hash.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/mem.h" #include "tensorflow/core/platform/types.h" diff --git a/tensorflow/compiler/xla/map_util.h b/tensorflow/compiler/xla/map_util.h index 3c74e070da..fcff48b6b1 100644 --- a/tensorflow/compiler/xla/map_util.h +++ b/tensorflow/compiler/xla/map_util.h @@ -60,7 +60,7 @@ MaybeFind(const Collection& collection, if (it == collection.end()) { std::ostringstream os; os << key; - return NotFound("key not found: %s", os.str().c_str()); + return NotFound("key not found: %s", os.str()); } return {it->second}; } diff --git a/tensorflow/compiler/xla/metric_table_report.cc b/tensorflow/compiler/xla/metric_table_report.cc index 2f22e02c3e..4eab4fa429 100644 --- a/tensorflow/compiler/xla/metric_table_report.cc +++ b/tensorflow/compiler/xla/metric_table_report.cc @@ -19,7 +19,7 @@ limitations under the License. 
#include #include "absl/strings/str_cat.h" -#include "tensorflow/core/lib/strings/stringprintf.h" +#include "absl/strings/str_format.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" @@ -264,8 +264,7 @@ string MetricTableReport::MetricString(double metric) { } string MetricTableReport::MetricPercent(double metric) { - return tensorflow::strings::Printf("%5.2f%%", - metric / expected_metric_sum_ * 100.0); + return absl::StrFormat("%5.2f%%", metric / expected_metric_sum_ * 100.0); } } // namespace xla diff --git a/tensorflow/compiler/xla/packed_literal_reader.cc b/tensorflow/compiler/xla/packed_literal_reader.cc index 012df87551..6e42775f6f 100644 --- a/tensorflow/compiler/xla/packed_literal_reader.cc +++ b/tensorflow/compiler/xla/packed_literal_reader.cc @@ -54,7 +54,7 @@ StatusOr> PackedLiteralReader::Read( if (shape.element_type() != F32) { return Unimplemented( "not yet implemented element type for packed literal reading: %s", - PrimitiveType_Name(shape.element_type()).c_str()); + PrimitiveType_Name(shape.element_type())); } auto result = absl::make_unique(literal_shape); diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD index 2d8fe434b0..fe91dc0618 100644 --- a/tensorflow/compiler/xla/python/BUILD +++ b/tensorflow/compiler/xla/python/BUILD @@ -40,6 +40,7 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/python:numpy_lib", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", ], ) diff --git a/tensorflow/compiler/xla/python/local_computation_builder.cc b/tensorflow/compiler/xla/python/local_computation_builder.cc index 00e36c3c86..b5fd747cfa 100644 --- a/tensorflow/compiler/xla/python/local_computation_builder.cc +++ b/tensorflow/compiler/xla/python/local_computation_builder.cc @@ -251,7 +251,7 @@ StatusOr> CompiledLocalComputation::Execute( return InternalError( "Failed running replica %d (other replicas may have failed as well): " "%s.", - replica, 
statusor.status().ToString().c_str()); + replica, statusor.status().ToString()); } } @@ -696,8 +696,7 @@ StatusOr DestructureLocalShapedBufferTuple( "Attemped to destructure a LocalShapedBuffer that did not have a tuple " "shape; shape: %s", ShapeUtil::HumanString( - local_shaped_buffer->shaped_buffer()->on_device_shape()) - .c_str()); + local_shaped_buffer->shaped_buffer()->on_device_shape())); } DeviceMemoryAllocator* allocator = diff --git a/tensorflow/compiler/xla/python/local_computation_builder.i b/tensorflow/compiler/xla/python/local_computation_builder.i index 08dccb3ee1..f6169ebf19 100644 --- a/tensorflow/compiler/xla/python/local_computation_builder.i +++ b/tensorflow/compiler/xla/python/local_computation_builder.i @@ -110,6 +110,7 @@ limitations under the License. #include "tensorflow/python/lib/core/numpy.h" #include "third_party/absl/strings/str_cat.h" +#include "third_party/absl/strings/str_format.h" #include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -155,8 +156,8 @@ bool HandleStringAttribute(PyObject* o, return true; // The attribute is None, which we consider ok. } if (!PyString_Check(attr)) { - string message = tensorflow::strings::Printf("%s must be a string or none; got %s", - attr_name, numpy::PyObjectCppRepr(attr).c_str()); + string message = absl::StrFormat("%s must be a string or none; got %s", + attr_name, numpy::PyObjectCppRepr(attr)); PyErr_SetString(PyExc_TypeError, message.c_str()); Py_DECREF(attr); return false; // Type error, not ok. diff --git a/tensorflow/compiler/xla/python/numpy_bridge.cc b/tensorflow/compiler/xla/python/numpy_bridge.cc index f2f99c1745..fc6511bef5 100644 --- a/tensorflow/compiler/xla/python/numpy_bridge.cc +++ b/tensorflow/compiler/xla/python/numpy_bridge.cc @@ -15,6 +15,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/python/numpy_bridge.h" #include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/core/platform/logging.h" @@ -150,9 +151,7 @@ static int NumpyTypenum(PyObject* o) { // // NOTE: this is an internal helper for conversion to a C++, and so decrefs r. static string ExtractStringAndDecref(PyObject* r) { - auto error = [r] { - return tensorflow::strings::Printf("", r); - }; + auto error = [r] { return absl::StrFormat("", r); }; if (r == nullptr) { return error(); } diff --git a/tensorflow/compiler/xla/rpc/BUILD b/tensorflow/compiler/xla/rpc/BUILD index 44b22a5586..97fcd37f6b 100644 --- a/tensorflow/compiler/xla/rpc/BUILD +++ b/tensorflow/compiler/xla/rpc/BUILD @@ -43,6 +43,7 @@ tf_cc_binary( "//tensorflow/compiler/xla/service:cpu_plugin", "//tensorflow/core:framework_internal", "//tensorflow/core:lib", + "@com_google_absl//absl/strings:str_format", ], ) @@ -62,6 +63,7 @@ tf_cc_test( "//tensorflow/core:lib", "//tensorflow/core:test", "//tensorflow/core:test_main", + "@com_google_absl//absl/strings:str_format", ], ) diff --git a/tensorflow/compiler/xla/rpc/grpc_client_test.cc b/tensorflow/compiler/xla/rpc/grpc_client_test.cc index 6788676181..43fd8fe1bd 100644 --- a/tensorflow/compiler/xla/rpc/grpc_client_test.cc +++ b/tensorflow/compiler/xla/rpc/grpc_client_test.cc @@ -23,12 +23,12 @@ limitations under the License. 
#include "grpcpp/create_channel.h" #include "grpcpp/security/credentials.h" +#include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/client/client.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/rpc/grpc_stub.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/core/lib/io/path.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/net.h" #include "tensorflow/core/platform/subprocess.h" @@ -46,7 +46,7 @@ class GRPCClientTestBase : public ::testing::Test { int port = tensorflow::internal::PickUnusedPortOrDie(); subprocess_.SetProgram( service_main_path, - {service_main_path, tensorflow::strings::Printf("--port=%d", port)}); + {service_main_path, absl::StrFormat("--port=%d", port)}); subprocess_.SetChannelAction(tensorflow::CHAN_STDOUT, tensorflow::ACTION_DUPPARENT); subprocess_.SetChannelAction(tensorflow::CHAN_STDERR, @@ -54,9 +54,8 @@ class GRPCClientTestBase : public ::testing::Test { CHECK(subprocess_.Start()); LOG(INFO) << "Launched subprocess"; - auto channel = - ::grpc::CreateChannel(tensorflow::strings::Printf("localhost:%d", port), - ::grpc::InsecureChannelCredentials()); + auto channel = ::grpc::CreateChannel(absl::StrFormat("localhost:%d", port), + ::grpc::InsecureChannelCredentials()); channel->WaitForConnected(gpr_time_add( gpr_now(GPR_CLOCK_REALTIME), gpr_time_from_seconds(10, GPR_TIMESPAN))); LOG(INFO) << "Channel to server is connected on port " << port; diff --git a/tensorflow/compiler/xla/rpc/grpc_service_main.cc b/tensorflow/compiler/xla/rpc/grpc_service_main.cc index c68c857c30..d6b5149a24 100644 --- a/tensorflow/compiler/xla/rpc/grpc_service_main.cc +++ b/tensorflow/compiler/xla/rpc/grpc_service_main.cc @@ -18,8 +18,8 @@ limitations under the License. 
#include "grpcpp/security/server_credentials.h" #include "grpcpp/server.h" #include "grpcpp/server_builder.h" +#include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/rpc/grpc_service.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/init_main.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/util/command_line_flags.h" @@ -44,7 +44,7 @@ int RealMain(int argc, char** argv) { xla::GRPCService::NewService().ConsumeValueOrDie(); ::grpc::ServerBuilder builder; - string server_address(tensorflow::strings::Printf("localhost:%d", port)); + string server_address(absl::StrFormat("localhost:%d", port)); builder.AddListeningPort(server_address, ::grpc::InsecureServerCredentials()); builder.RegisterService(service.get()); diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index 47d376c8ac..f164a614f1 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -178,6 +178,7 @@ cc_library( "//tensorflow/core:lib", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", ], ) @@ -465,6 +466,7 @@ cc_library( "//tensorflow/core:lib", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", ], ) @@ -637,6 +639,7 @@ cc_library( "//tensorflow/core:stream_executor_no_cuda", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", ], alwayslink = 1, ) @@ -671,6 +674,7 @@ cc_library( "//tensorflow/core:stream_executor_no_cuda", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", ], ) @@ -746,6 +750,7 @@ cc_library( "//tensorflow/core:stream_executor_no_cuda", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", ], ) @@ -795,6 
+800,7 @@ cc_library( "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/stream_executor", "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings:str_format", ], ) @@ -946,6 +952,7 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", ], ) @@ -992,6 +999,7 @@ cc_library( "//tensorflow/core:lib_internal", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", ], ) @@ -1040,6 +1048,7 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", ], ) @@ -1746,6 +1755,7 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", ], ) @@ -2135,6 +2145,7 @@ cc_library( "@com_google_absl//absl/container:inlined_vector", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", ], ) @@ -2187,6 +2198,7 @@ cc_library( "//tensorflow/core:lib", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", ], ) @@ -2325,6 +2337,7 @@ cc_library( "//tensorflow/core:lib_internal", "@com_google_absl//absl/container:inlined_vector", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", ], ) @@ -2448,6 +2461,7 @@ cc_library( "//tensorflow/core:lib", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", ], ) @@ -2803,6 +2817,7 @@ cc_library( "//tensorflow/core:lib_internal", "//tensorflow/core:regexp_internal", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", "@com_google_absl//absl/types:optional", ], alwayslink = 1, @@ -3143,13 +3158,13 @@ cc_library( cc_library( name = "source_map_util", - srcs = 
["source_map_util.cc"], + srcs = [], hdrs = ["source_map_util.h"], deps = [ ":executable", "//tensorflow/compiler/xla:status", - "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", + "@com_google_absl//absl/strings:str_format", ], ) @@ -3199,11 +3214,11 @@ cc_library( "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", ], ) diff --git a/tensorflow/compiler/xla/service/allocation_tracker.cc b/tensorflow/compiler/xla/service/allocation_tracker.cc index 5115a14df0..1ed6142dce 100644 --- a/tensorflow/compiler/xla/service/allocation_tracker.cc +++ b/tensorflow/compiler/xla/service/allocation_tracker.cc @@ -69,8 +69,7 @@ StatusOr AllocationTracker::RegisterInternal( return InvalidArgument( "AllocationTracker for platform %s cannot register buffer from " "platform %s", - backend_->platform()->Name().c_str(), - shaped_buffer.platform()->Name().c_str()); + backend_->platform()->Name(), shaped_buffer.platform()->Name()); } } @@ -125,7 +124,7 @@ Status AllocationTracker::Unregister(const GlobalDataHandle& data) { // "handle does not exist". auto it = handle_to_shaped_buffers_.find(data.handle()); if (it == handle_to_shaped_buffers_.end()) { - return NotFound("no allocation record for global data handle: %lld", + return NotFound("no allocation record for global data handle: %d", data.handle()); } for (auto& shaped_buffer : it->second) { @@ -144,7 +143,7 @@ StatusOr> AllocationTracker::DeconstructTuple( // the same for all buffers across replicas. 
const ShapedBuffer* shaped_buffer = replicated_buffers[0]; if (!ShapeUtil::IsTuple(shaped_buffer->on_host_shape())) { - return InvalidArgument("global data handle %lld is not a tuple", + return InvalidArgument("global data handle %d is not a tuple", data.handle()); } // If the on-host representation is a tuple, then the on-device one should be @@ -201,14 +200,14 @@ StatusOr> AllocationTracker::ResolveInternal( VLOG(2) << "resolve:" << data.handle(); auto it = handle_to_shaped_buffers_.find(data.handle()); if (it == handle_to_shaped_buffers_.end()) { - return NotFound("no allocation record for global data handle: %lld", + return NotFound("no allocation record for global data handle: %d", data.handle()); } std::vector replicated_buffers; for (const auto& shaped_buffer : it->second) { if (shaped_buffer == nullptr) { - return InvalidArgument( - "global data handle %lld was previously deallocated", data.handle()); + return InvalidArgument("global data handle %d was previously deallocated", + data.handle()); } replicated_buffers.push_back(shaped_buffer.get()); } diff --git a/tensorflow/compiler/xla/service/backend.cc b/tensorflow/compiler/xla/service/backend.cc index 841d0fa85b..a6889cb171 100644 --- a/tensorflow/compiler/xla/service/backend.cc +++ b/tensorflow/compiler/xla/service/backend.cc @@ -177,7 +177,7 @@ StatusOr Backend::stream_executor( } } return InvalidArgument("device %s not supported by XLA service", - device_name(device_ordinal).c_str()); + device_name(device_ordinal)); } StatusOr Backend::devices_equivalent(int device_ordinal_a, diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc index c8c36ae60e..b11f15ec7b 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment.cc +++ b/tensorflow/compiler/xla/service/buffer_assignment.cc @@ -24,6 +24,7 @@ limitations under the License. 
#include "absl/memory/memory.h" #include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/map_util.h" #include "tensorflow/compiler/xla/service/buffer_value_containers.h" #include "tensorflow/compiler/xla/service/heap_simulator.h" @@ -37,17 +38,15 @@ limitations under the License. #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/hash/hash.h" #include "tensorflow/core/lib/strings/numbers.h" -#include "tensorflow/core/lib/strings/stringprintf.h" namespace xla { namespace { using absl::StrAppend; +using absl::StrAppendFormat; using ::tensorflow::gtl::FlatMap; using ::tensorflow::gtl::FlatSet; -using ::tensorflow::strings::Appendf; using ::tensorflow::strings::HumanReadableNumBytes; -using ::tensorflow::strings::Printf; template string ColocatedBufferSetsToString(const T& container, const char* title) { @@ -105,7 +104,7 @@ Status GatherComputationsByAllocationType( return InvalidArgument( "computation %s has conflicting allocation requirements (global " "and thread-local)", - computation->name().c_str()); + computation->name()); } if (is_thread_local) { @@ -128,7 +127,7 @@ Status GatherComputationsByAllocationType( return InvalidArgument( "computation %s cannot contain call/while op because it " "requires thread-local buffer allocations", - computation->name().c_str()); + computation->name()); } worklist.push_back(std::make_pair(subcomputation, false)); // Not thread local. @@ -145,9 +144,8 @@ Status GatherComputationsByAllocationType( true)); // Thread local. 
break; default: - return InternalError( - "Unexpected calling opcode: %s", - HloOpcodeString(instruction->opcode()).c_str()); + return InternalError("Unexpected calling opcode: %s", + HloOpcodeString(instruction->opcode())); } } } @@ -296,7 +294,7 @@ BufferAllocationProto BufferAllocation::ToProto() const { string BufferAllocation::ToString() const { string output; - Appendf(&output, "allocation %lld: %p, size %lld", index_, this, size()); + StrAppendFormat(&output, "allocation %d: %p, size %d", index_, this, size()); if (color().value() != 0) { StrAppend(&output, ", color ", color().value()); } @@ -328,11 +326,10 @@ string BufferAllocation::ToString() const { }); for (const LogicalBuffer* buffer : sorted_buffers) { const OffsetSize& offset_size = FindOrDie(assigned_buffers_, buffer); - StrAppend(&output, - tensorflow::strings::Printf( - " %s [%lld,%lld]: %s\n", buffer->ToString().c_str(), - offset_size.offset, offset_size.size, - ShapeUtil::HumanStringWithLayout(buffer->shape()).c_str())); + StrAppend(&output, absl::StrFormat( + " %s [%d,%d]: %s\n", buffer->ToString(), + offset_size.offset, offset_size.size, + ShapeUtil::HumanStringWithLayout(buffer->shape()))); } return output; } @@ -425,7 +422,7 @@ StatusOr BufferAssignment::GetUniqueSlice( return FailedPrecondition( "BufferAllocation::Slice for instruction %s at index %s cannot " "be determined at compile-time.", - instruction->name().c_str(), index.ToString().c_str()); + instruction->name(), index.ToString()); } } else { VLOG(3) << "No allocation"; @@ -434,7 +431,7 @@ StatusOr BufferAssignment::GetUniqueSlice( if (result.allocation() == nullptr) { return FailedPrecondition( "BufferAllocation::Slice not assigned for instruction %s at index %s", - instruction->name().c_str(), index.ToString().c_str()); + instruction->name(), index.ToString()); } return result; } @@ -646,30 +643,29 @@ Status BufferAssignment::ComputeSummaryStats() { string BufferAssignment::Stats::ToString() const { string s; - Appendf(&s, 
"BufferAssignment stats:\n"); - Appendf(&s, " parameter allocation: %10s\n", - HumanReadableNumBytes(parameter_allocation_bytes).c_str()); - Appendf(&s, " constant allocation: %10s\n", - HumanReadableNumBytes(constant_allocation_bytes).c_str()); - Appendf(&s, " maybe_live_out allocation: %10s\n", - HumanReadableNumBytes(maybe_live_out_allocation_bytes).c_str()); - Appendf(&s, " preallocated temp allocation: %10s\n", - HumanReadableNumBytes(preallocated_temp_allocation_bytes).c_str()); + StrAppendFormat(&s, "BufferAssignment stats:\n"); + StrAppendFormat(&s, " parameter allocation: %10s\n", + HumanReadableNumBytes(parameter_allocation_bytes)); + StrAppendFormat(&s, " constant allocation: %10s\n", + HumanReadableNumBytes(constant_allocation_bytes)); + StrAppendFormat(&s, " maybe_live_out allocation: %10s\n", + HumanReadableNumBytes(maybe_live_out_allocation_bytes)); + StrAppendFormat(&s, " preallocated temp allocation: %10s\n", + HumanReadableNumBytes(preallocated_temp_allocation_bytes)); if (preallocated_temp_fragmentation_bytes >= 0) { const double percent = 100. * preallocated_temp_fragmentation_bytes / preallocated_temp_allocation_bytes; - Appendf( + StrAppendFormat( &s, " preallocated temp fragmentation: %10s (%.2f%%)\n", - HumanReadableNumBytes(preallocated_temp_fragmentation_bytes).c_str(), - percent); + HumanReadableNumBytes(preallocated_temp_fragmentation_bytes), percent); } - Appendf(&s, " total allocation: %10s\n", - HumanReadableNumBytes(total_allocation_bytes).c_str()); + StrAppendFormat(&s, " total allocation: %10s\n", + HumanReadableNumBytes(total_allocation_bytes)); if (total_fragmentation_bytes >= 0) { const double percent = 100. 
* total_fragmentation_bytes / total_allocation_bytes; - Appendf(&s, " total fragmentation: %10s (%.2f%%)\n", - HumanReadableNumBytes(total_fragmentation_bytes).c_str(), percent); + StrAppendFormat(&s, " total fragmentation: %10s (%.2f%%)\n", + HumanReadableNumBytes(total_fragmentation_bytes), percent); } return s; } diff --git a/tensorflow/compiler/xla/service/buffer_liveness.cc b/tensorflow/compiler/xla/service/buffer_liveness.cc index 8d0ac3b84a..9b2783a214 100644 --- a/tensorflow/compiler/xla/service/buffer_liveness.cc +++ b/tensorflow/compiler/xla/service/buffer_liveness.cc @@ -20,6 +20,7 @@ limitations under the License. #include #include +#include "absl/strings/str_format.h" #include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/logical_buffer.h" @@ -29,7 +30,6 @@ limitations under the License. #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" namespace xla { @@ -75,19 +75,17 @@ Status BufferLiveness::Analyze() { string BufferLiveness::ToString() const { std::vector pieces; - pieces.push_back(tensorflow::strings::Printf("BufferLiveness(module=%s):", - module_->name().c_str())); + pieces.push_back( + absl::StrFormat("BufferLiveness(module=%s):", module_->name())); pieces.push_back("HloOrdering:"); pieces.push_back(hlo_ordering_->ToString()); - pieces.push_back(tensorflow::strings::Printf("Aliased buffers:")); + pieces.push_back("Aliased buffers:"); for (const LogicalBuffer* buffer : aliased_buffers_) { - pieces.push_back( - tensorflow::strings::Printf(" %s", buffer->ToString().c_str())); + pieces.push_back(absl::StrFormat(" %s", buffer->ToString())); } - pieces.push_back(tensorflow::strings::Printf("Live out buffers:")); + pieces.push_back("Live out buffers:"); for (const LogicalBuffer* buffer : 
maybe_live_out_buffers_) { - pieces.push_back( - tensorflow::strings::Printf(" %s", buffer->ToString().c_str())); + pieces.push_back(absl::StrFormat(" %s", buffer->ToString())); } return absl::StrJoin(pieces, "\n"); } diff --git a/tensorflow/compiler/xla/service/call_graph.cc b/tensorflow/compiler/xla/service/call_graph.cc index 37523a73ff..23b2a32709 100644 --- a/tensorflow/compiler/xla/service/call_graph.cc +++ b/tensorflow/compiler/xla/service/call_graph.cc @@ -19,19 +19,19 @@ limitations under the License. #include "absl/memory/memory.h" #include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/map_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/types.h" namespace xla { +using absl::StrAppendFormat; using absl::StrCat; -using ::tensorflow::strings::Appendf; string CallContextToString(CallContext context) { switch (context) { @@ -356,20 +356,20 @@ CallGraph::NearestAncestorsInSameComputation(HloInstruction* a, string CallGraph::ToString() const { string out; - Appendf(&out, "Call graph for module %s:\n", module_->name().c_str()); + StrAppendFormat(&out, "Call graph for module %s:\n", module_->name()); for (const CallGraphNode& node : nodes()) { - Appendf(&out, "Computation %s:\n", node.computation()->name().c_str()); - Appendf(&out, " calls:\n"); + StrAppendFormat(&out, "Computation %s:\n", node.computation()->name()); + StrAppendFormat(&out, " calls:\n"); for (const HloComputation* callee : node.callees()) { - Appendf(&out, " %s\n", callee->name().c_str()); + StrAppendFormat(&out, " %s\n", callee->name()); } - Appendf(&out, " called by:\n"); + StrAppendFormat(&out, " called by:\n"); for (const HloComputation* caller : node.callers()) { - Appendf(&out, 
" %s\n", caller->name().c_str()); + StrAppendFormat(&out, " %s\n", caller->name()); } - Appendf(&out, " callsites:\n"); + StrAppendFormat(&out, " callsites:\n"); for (const CallSite& callsite : node.callsites()) { - Appendf(&out, " %s\n", callsite.ToString().c_str()); + StrAppendFormat(&out, " %s\n", callsite.ToString()); } } return out; diff --git a/tensorflow/compiler/xla/service/call_inliner.cc b/tensorflow/compiler/xla/service/call_inliner.cc index 256d05a73e..1d42140444 100644 --- a/tensorflow/compiler/xla/service/call_inliner.cc +++ b/tensorflow/compiler/xla/service/call_inliner.cc @@ -96,7 +96,7 @@ class SubcomputationInsertionVisitor : public DfsHloVisitorWithDefault { if (it == subcomputation_hlo_to_new_hlo_.end()) { return NotFound( "Could not find mapping from subcomputation HLO %s to a cloned HLO.", - subcomputation_hlo->ToString().c_str()); + subcomputation_hlo->ToString()); } return it->second; } diff --git a/tensorflow/compiler/xla/service/channel_tracker.cc b/tensorflow/compiler/xla/service/channel_tracker.cc index 601a3e9a01..3c2d1ae6d8 100644 --- a/tensorflow/compiler/xla/service/channel_tracker.cc +++ b/tensorflow/compiler/xla/service/channel_tracker.cc @@ -73,20 +73,20 @@ ChannelHandle ChannelTracker::AllocateHandle(ChannelHandle::ChannelType type) { Status ChannelTracker::RegisterSendInternal(const ChannelHandle& handle) { if (opaque_to_channel_.count(handle.handle()) == 0) { - return NotFound("channel handle not found: %lld", handle.handle()); + return NotFound("channel handle not found: %d", handle.handle()); } Channel& channel = opaque_to_channel_[handle.handle()]; if (channel.type == ChannelHandle::HOST_TO_DEVICE) { return FailedPrecondition( "host-to-device channels cannot be used with a Send operation; " - "channel handle: %lld", + "channel handle: %d", handle.handle()); } if (channel.has_sender) { return FailedPrecondition( "when registering send, passed a channel handle that is already used " - "by a sender: %lld", + "by a sender: %d", 
handle.handle()); } channel.has_sender = true; @@ -95,13 +95,13 @@ Status ChannelTracker::RegisterSendInternal(const ChannelHandle& handle) { Status ChannelTracker::RegisterRecvInternal(const ChannelHandle& handle) { if (opaque_to_channel_.count(handle.handle()) == 0) { - return NotFound("channel handle not found: %lld", handle.handle()); + return NotFound("channel handle not found: %d", handle.handle()); } Channel& channel = opaque_to_channel_[handle.handle()]; if (channel.type == ChannelHandle::DEVICE_TO_HOST) { return FailedPrecondition( "device-to-host channels cannot be used with a Recv operation; " - "channel handle: %lld", + "channel handle: %d", handle.handle()); } @@ -109,7 +109,7 @@ Status ChannelTracker::RegisterRecvInternal(const ChannelHandle& handle) { if (channel.receiver_count >= 1) { return FailedPrecondition( "when registering recv, passed a channel handle that is already used " - "by a receiver: %lld", + "by a receiver: %d", handle.handle()); } channel.receiver_count += 1; diff --git a/tensorflow/compiler/xla/service/compiler.cc b/tensorflow/compiler/xla/service/compiler.cc index 6b3b9820f0..687ecafe0c 100644 --- a/tensorflow/compiler/xla/service/compiler.cc +++ b/tensorflow/compiler/xla/service/compiler.cc @@ -101,7 +101,7 @@ Compiler::GetPlatformCompilers() { return NotFound( "could not find registered compiler for platform %s -- check " "target linkage", - platform->Name().c_str()); + platform->Name()); } // And then we invoke the factory, placing the result into the mapping. 
diff --git a/tensorflow/compiler/xla/service/computation_placer.cc b/tensorflow/compiler/xla/service/computation_placer.cc index 61b1dba6c9..2210a8578a 100644 --- a/tensorflow/compiler/xla/service/computation_placer.cc +++ b/tensorflow/compiler/xla/service/computation_placer.cc @@ -132,7 +132,7 @@ StatusOr ComputationPlacer::AssignDevices( return NotFound( "could not find registered computation placer for platform %s -- check " "target linkage", - platform->Name().c_str()); + platform->Name()); } if (it->second.placer == nullptr) { diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD index e01fecffd0..f0adfc5d45 100644 --- a/tensorflow/compiler/xla/service/cpu/BUILD +++ b/tensorflow/compiler/xla/service/cpu/BUILD @@ -235,6 +235,7 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", "@llvm//:orc_jit", ], ) @@ -283,6 +284,7 @@ cc_library( "//tensorflow/compiler/xla/service/llvm_ir:tuple_ops", "//tensorflow/core:lib", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", "@llvm//:code_gen", "@llvm//:core", "@llvm//:support", @@ -338,12 +340,12 @@ cc_library( hdrs = ["parallel_loop_emitter.h"], deps = [ ":ir_emission_utils", - "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service/llvm_ir:ir_array", "//tensorflow/compiler/xla/service/llvm_ir:llvm_loop", "//tensorflow/compiler/xla/service/llvm_ir:llvm_util", "//tensorflow/compiler/xla/service/llvm_ir:loop_emitter", "//tensorflow/core:lib", + "@com_google_absl//absl/strings:str_format", "@llvm//:core", ], ) @@ -391,6 +393,7 @@ tf_cc_binary( "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/core:lib", + "@com_google_absl//absl/strings:str_format", ], ) @@ -404,6 +407,7 @@ cc_library( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", 
"//tensorflow/core:lib", + "@com_google_absl//absl/strings:str_format", "@llvm//:mc", "@llvm//:mc_disassembler", "@llvm//:object", @@ -645,6 +649,7 @@ tf_cc_test( "//tensorflow/core:test", "//third_party/eigen3", "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings:str_format", ], ) diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc index 279aa42fe2..6420180b13 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc @@ -705,8 +705,7 @@ CpuCompiler::CompileAheadOfTime(std::vector> modules, const llvm::Target* target = llvm::TargetRegistry::lookupTarget(triple.getTriple(), error); if (target == nullptr) { - return InternalError("TargetRegistry::lookupTarget failed: %s", - error.c_str()); + return InternalError("TargetRegistry::lookupTarget failed: %s", error); } llvm::Reloc::Model reloc_model = llvm::Reloc::Static; diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc index fbcbbbd200..08773693fb 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc @@ -23,6 +23,7 @@ limitations under the License. #include #include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "absl/strings/str_join.h" #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h" #include "tensorflow/compiler/xla/service/buffer_assignment.h" @@ -37,7 +38,6 @@ limitations under the License. 
#include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/mem.h" @@ -171,20 +171,18 @@ Status CpuExecutable::ExecuteComputeFunction( void* result_buffer = buffer_pointers[result_slice.index()]; if (VLOG_IS_ON(3)) { VLOG(3) << "Executing compute function:"; - VLOG(3) << tensorflow::strings::Printf( - " func(void* result, void* params[null], void* temps[%zu], " - "uint64 profile_counters[%zu])", + VLOG(3) << absl::StrFormat( + " func(void* result, void* params[null], void* temps[%u], " + "uint64 profile_counters[%u])", buffer_pointers.size(), profile_counters_size); - VLOG(3) << tensorflow::strings::Printf(" result = %p", result_buffer); + VLOG(3) << absl::StrFormat(" result = %p", result_buffer); auto ptr_printer = [](string* out, const void* p) { - absl::StrAppend(out, tensorflow::strings::Printf("%p", p)); + absl::StrAppend(out, absl::StrFormat("%p", p)); }; VLOG(3) << " params = nullptr"; - VLOG(3) << tensorflow::strings::Printf( - " temps = [%s]", - absl::StrJoin(buffer_pointers, ", ", ptr_printer).c_str()); - VLOG(3) << tensorflow::strings::Printf(" profile_counters = %p", - profile_counters); + VLOG(3) << absl::StrFormat( + " temps = [%s]", absl::StrJoin(buffer_pointers, ", ", ptr_printer)); + VLOG(3) << absl::StrFormat(" profile_counters = %p", profile_counters); } compute_function_(result_buffer, run_options, nullptr, buffer_pointers.data(), diff --git a/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker.cc b/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker.cc index 7bd4741a04..7fbe0fa157 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker.cc @@ -34,9 +34,8 @@ StatusOr 
CpuHloSupportChecker::Run(HloModule* module) { return xla::Unimplemented( "CPU backend does not support HLO instruction %s with shape " "containing a sparse layout: %s", - instruction->ToString().c_str(), - ShapeUtil::HumanStringWithLayout(instruction->shape()) - .c_str()); + instruction->ToString(), + ShapeUtil::HumanStringWithLayout(instruction->shape())); } return Status::OK(); })); diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc index bc4cfc0999..1ae3aa5711 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc @@ -20,6 +20,7 @@ limitations under the License. #include #include "absl/memory/memory.h" +#include "absl/strings/str_format.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/client/local_client.h" @@ -28,7 +29,6 @@ limitations under the License. #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/common_runtime/eigen_thread_pool.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/test.h" @@ -142,10 +142,10 @@ class EigenMatMulTest : public CpuRuntimeTest, bool transpose_rhs = std::get<2>(info.param); bool single_threaded = std::get<3>(info.param); - return tensorflow::strings::Printf( - "EigenMatMul_%lld_%lld_%lld_%s%s%s_threaded", shape.m, shape.k, shape.n, - transpose_lhs ? "Tlhs_" : "", transpose_rhs ? "Trhs_" : "", - single_threaded ? "single" : "multi"); + return absl::StrFormat("EigenMatMul_%d_%d_%d_%s%s%s_threaded", shape.m, + shape.k, shape.n, transpose_lhs ? "Tlhs_" : "", + transpose_rhs ? "Trhs_" : "", + single_threaded ? 
"single" : "multi"); } }; @@ -178,10 +178,10 @@ class MKLMatMulTest : public CpuRuntimeTest, bool transpose_rhs = std::get<2>(info.param); bool single_threaded = std::get<3>(info.param); - return tensorflow::strings::Printf( - "MKLMatMul_%lld_%lld_%lld_%s%s%s_threaded", shape.m, shape.k, shape.n, - transpose_lhs ? "Tlhs_" : "", transpose_rhs ? "Trhs_" : "", - single_threaded ? "single" : "multi"); + return absl::StrFormat("MKLMatMul_%d_%d_%d_%s%s%s_threaded", shape.m, + shape.k, shape.n, transpose_lhs ? "Tlhs_" : "", + transpose_rhs ? "Trhs_" : "", + single_threaded ? "single" : "multi"); } }; diff --git a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc index b07cd675ff..0df2abf001 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc @@ -104,7 +104,7 @@ Status CpuTransferManager::TransferLiteralToInfeed( if (ShapeUtil::IsNestedTuple(shape)) { return Unimplemented( "Infeed with a nested tuple shape is not supported: %s", - ShapeUtil::HumanString(literal.shape()).c_str()); + ShapeUtil::HumanString(literal.shape())); } // For a tuple, we transfer each of its elements to the device and @@ -152,11 +152,11 @@ CpuTransferManager::TransferBufferToInfeedInternal(se::StreamExecutor* executor, int64 size, const void* source) { if (size > std::numeric_limits::max()) { - return InvalidArgument("Infeed shape is too large: needs %lld bytes", size); + return InvalidArgument("Infeed shape is too large: needs %d bytes", size); } if (size <= 0) { - return InvalidArgument("Infeed shape must have positive size; got %lld", + return InvalidArgument("Infeed shape must have positive size; got %d", size); } @@ -244,12 +244,12 @@ StatusOr CpuTransferManager::TransferBuffersFromOutfeedInternal( for (auto b : buffer_data) { int64 size = b.second; if (size > std::numeric_limits::max()) { - return InvalidArgument("Outfeed shape is too 
large: needs %lld bytes", + return InvalidArgument("Outfeed shape is too large: needs %d bytes", size); } if (size <= 0) { - return InvalidArgument("Outfeed shape must have positive size; got %lld", + return InvalidArgument("Outfeed shape must have positive size; got %d", size); } diff --git a/tensorflow/compiler/xla/service/cpu/disassembler.cc b/tensorflow/compiler/xla/service/cpu/disassembler.cc index e4c674e227..3ae64142cd 100644 --- a/tensorflow/compiler/xla/service/cpu/disassembler.cc +++ b/tensorflow/compiler/xla/service/cpu/disassembler.cc @@ -21,13 +21,13 @@ limitations under the License. #include #include +#include "absl/strings/str_format.h" #include "llvm/MC/MCInst.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" @@ -151,7 +151,7 @@ StatusOr Disassembler::DisassembleObjectFile( size = 1; } - ostream << tensorflow::strings::Printf("0x%08lx", index) << " "; + ostream << absl::StrFormat("0x%08lx", index) << " "; if (decode_status == llvm::MCDisassembler::Success) { // For branches, try to determine the actual address and emit it as an @@ -163,7 +163,7 @@ StatusOr Disassembler::DisassembleObjectFile( uint64_t target; if (inst_analysis_->evaluateBranch( instruction, section_address + index, size, target)) { - annotation = tensorflow::strings::Printf("[0x%08lx]", target); + annotation = absl::StrFormat("[0x%08lx]", target); } } inst_printer_->printInst(&instruction, ostream, annotation.c_str(), diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc index 4af16f4fa0..dd060f54a2 100644 --- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc +++ 
b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc @@ -1467,7 +1467,7 @@ Status DotOpEmitter::EmitCallToRuntime() { break; default: return Unimplemented("Invalid type %s for dot operation", - PrimitiveType_Name(type).c_str()); + PrimitiveType_Name(type)); } llvm::Type* float_ptr_type = float_type->getPointerTo(); diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc index 417a1dba1f..321c2e9896 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc @@ -28,6 +28,7 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" // IWYU pragma: no_include "llvm/IR/Intrinsics.gen.inc" #include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/BasicBlock.h" @@ -68,7 +69,6 @@ limitations under the License. #include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/lib/gtl/flatmap.h" #include "tensorflow/core/lib/gtl/flatset.h" -#include "tensorflow/core/lib/strings/stringprintf.h" namespace xla { @@ -230,9 +230,8 @@ Status IrEmitter::HandleCopy(HloInstruction* copy) { // Use the elemental emitter for array shapes. return DefaultAction(copy); } - return Unimplemented( - "unsupported operand type %s for copy instruction", - PrimitiveType_Name(copy->shape().element_type()).c_str()); + return Unimplemented("unsupported operand type %s for copy instruction", + PrimitiveType_Name(copy->shape().element_type())); } // Calculate the alignment of a buffer allocated for a given primitive type. 
@@ -389,7 +388,7 @@ Status IrEmitter::EmitXfeedTransfer(XfeedKind kind, const Shape& shape, int64 length = ByteSizeOf(shape); if (length <= 0 || length > std::numeric_limits::max()) { return InvalidArgument( - "xfeed (infeed or outfeed) buffer length %lld is outside the valid " + "xfeed (infeed or outfeed) buffer length %d is outside the valid " "size range", length); } @@ -1620,9 +1619,8 @@ StatusOr IrEmitter::EmitVectorizedReduce( int64 dimension = LayoutUtil::Minor(reduce->shape().layout(), i); int64 start_index = 0; int64 end_index = reduce->shape().dimensions(dimension); - std::unique_ptr loop = - loop_nest.AddLoop(start_index, end_index, - tensorflow::strings::Printf("dim.%lld", dimension)); + std::unique_ptr loop = loop_nest.AddLoop( + start_index, end_index, absl::StrFormat("dim.%d", dimension)); array_index[dimension] = loop->GetIndVarValue(); } @@ -1641,9 +1639,9 @@ StatusOr IrEmitter::EmitVectorizedReduce( int64 start_index = 0; int64 end_index = (innermost_dimension_size / vectorization_factor) * vectorization_factor; - std::unique_ptr loop = loop_nest.AddLoop( - start_index, end_index, vectorization_factor, - tensorflow::strings::Printf("dim.%lld", innermost_dimension)); + std::unique_ptr loop = + loop_nest.AddLoop(start_index, end_index, vectorization_factor, + absl::StrFormat("dim.%d", innermost_dimension)); array_index[innermost_dimension] = loop->GetIndVarValue(); SetToFirstInsertPoint(loop->GetBodyBasicBlock(), &b_); @@ -2170,8 +2168,8 @@ Status IrEmitter::HandleWhile(HloInstruction* xla_while) { return InternalError( "instruction %s %s does not share slice with " "instruction %s %s", - a->ToString().c_str(), slice_a.ToString().c_str(), - b->ToString().c_str(), slice_b.ToString().c_str()); + a->ToString(), slice_a.ToString(), b->ToString(), + slice_b.ToString()); } return Status::OK(); }; @@ -2826,8 +2824,8 @@ Status IrEmitter::ElementTypesSameAndSupported( if (std::find(supported_types.begin(), supported_types.end(), primitive_type) == 
supported_types.end()) { return Unimplemented("unsupported operand type %s in op %s", - PrimitiveType_Name(primitive_type).c_str(), - HloOpcodeString(instruction.opcode()).c_str()); + PrimitiveType_Name(primitive_type), + HloOpcodeString(instruction.opcode())); } return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc index aedb069dce..f8441c3e34 100644 --- a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc @@ -15,9 +15,9 @@ limitations under the License. #include "tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h" +#include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" -#include "tensorflow/core/lib/strings/stringprintf.h" namespace xla { namespace cpu { @@ -52,15 +52,15 @@ ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(absl::string_view loop_name, llvm::Value* end_index = (*dynamic_loop_bounds_)[bounds_index].second; std::unique_ptr loop = loop_nest.AddLoop( - /*suffix=*/tensorflow::strings::Printf("dim.%lld", dimension), - start_index, end_index); + /*suffix=*/absl::StrFormat("dim.%d", dimension), start_index, + end_index); array_index[dimension] = loop->GetIndVarValue(); } else { // Emit static loop bounds for this dimension. 
std::unique_ptr loop = loop_nest.AddLoop( /*start_index=*/0, /*end_index=*/shape_.dimensions(dimension), - /*suffix=*/tensorflow::strings::Printf("dim.%lld", dimension)); + /*suffix=*/absl::StrFormat("dim.%d", dimension)); array_index[dimension] = loop->GetIndVarValue(); } } diff --git a/tensorflow/compiler/xla/service/cpu/sample_harness.cc b/tensorflow/compiler/xla/service/cpu/sample_harness.cc index f227e4ae13..942e2ddd39 100644 --- a/tensorflow/compiler/xla/service/cpu/sample_harness.cc +++ b/tensorflow/compiler/xla/service/cpu/sample_harness.cc @@ -16,6 +16,7 @@ limitations under the License. #include #include +#include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/array4d.h" #include "tensorflow/compiler/xla/client/client.h" #include "tensorflow/compiler/xla/client/client_library.h" @@ -27,7 +28,6 @@ limitations under the License. #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/init_main.h" #include "tensorflow/core/platform/logging.h" @@ -67,8 +67,8 @@ int main(int argc, char** argv) { /*execution_profile=*/&profile); std::unique_ptr actual = result.ConsumeValueOrDie(); - LOG(INFO) << tensorflow::strings::Printf("computation took %lldns", - profile.compute_time_ns()); + LOG(INFO) << absl::StrFormat("computation took %dns", + profile.compute_time_ns()); LOG(INFO) << actual->ToString(); return 0; diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc index 9457e57d7b..a434c04a98 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc @@ -65,8 +65,8 @@ class CpuUnaryIntrinsicTest features = ""; } - return absl::StrCat(opcode.c_str(), "_On_", triple.c_str(), - features.empty() ? 
"" : "_With", features.c_str()); + return absl::StrCat(opcode, "_On_", triple, + (features.empty() ? "" : "_With"), features); } }; diff --git a/tensorflow/compiler/xla/service/device_memory_allocator.cc b/tensorflow/compiler/xla/service/device_memory_allocator.cc index e228bb56bc..1d0297cfbf 100644 --- a/tensorflow/compiler/xla/service/device_memory_allocator.cc +++ b/tensorflow/compiler/xla/service/device_memory_allocator.cc @@ -36,9 +36,8 @@ StatusOr StreamExecutorMemoryAllocator::Allocate( se::DeviceMemoryBase result = stream_executor->AllocateArray(size); if (size > 0 && result == nullptr) { return ResourceExhausted( - "Failed to allocate request for %s (%lluB) on device ordinal %d", - tensorflow::strings::HumanReadableNumBytes(size).c_str(), size, - device_ordinal); + "Failed to allocate request for %s (%uB) on device ordinal %d", + tensorflow::strings::HumanReadableNumBytes(size), size, device_ordinal); } return OwningDeviceMemory(result, device_ordinal, this); } @@ -61,12 +60,12 @@ StatusOr StreamExecutorMemoryAllocator::GetStreamExecutor( } if (device_ordinal >= stream_executors_.size()) { return InvalidArgument( - "device ordinal value (%d) >= number of devices (%zu)", device_ordinal, + "device ordinal value (%d) >= number of devices (%u)", device_ordinal, stream_executors_.size()); } if (stream_executors_[device_ordinal] == nullptr) { return NotFound("Device %s:%d present but not supported", - platform()->Name().c_str(), device_ordinal); + platform()->Name(), device_ordinal); } return stream_executors_[device_ordinal]; } diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.cc b/tensorflow/compiler/xla/service/dfs_hlo_visitor.cc index 2172ae0a29..3e7373adc5 100644 --- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.cc +++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.cc @@ -28,14 +28,14 @@ template Status DfsHloVisitorBase::HandleElementwiseUnary( HloInstructionPtr hlo) { return Unimplemented("DfsHloVisitor::HandleElementwiseUnary: %s", - 
HloOpcodeString(hlo->opcode()).c_str()); + HloOpcodeString(hlo->opcode())); } template Status DfsHloVisitorBase::HandleElementwiseBinary( HloInstructionPtr hlo) { return Unimplemented("DfsHloVisitor::HandleElementwiseBinary: %s", - HloOpcodeString(hlo->opcode()).c_str()); + HloOpcodeString(hlo->opcode())); } template diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc index 26af67cc1c..2e5930fb70 100644 --- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc @@ -264,8 +264,8 @@ StatusOr ElementalIrEmitter::EmitIntegerUnaryOp( } } return Unimplemented("conversion from primitive type %s to %s", - PrimitiveType_Name(from_type).c_str(), - PrimitiveType_Name(to_type).c_str()); + PrimitiveType_Name(from_type), + PrimitiveType_Name(to_type)); } case HloOpcode::kBitcastConvert: { PrimitiveType from_type = op->operand(0)->shape().element_type(); @@ -282,8 +282,7 @@ StatusOr ElementalIrEmitter::EmitIntegerUnaryOp( return InvalidArgument( "bitcast conversion from primitive type %s to %s with unequal " "bit-widths (%u versus %u) ", - PrimitiveType_Name(from_type).c_str(), - PrimitiveType_Name(to_type).c_str(), + PrimitiveType_Name(from_type), PrimitiveType_Name(to_type), primitive_util::BitWidth(from_type), primitive_util::BitWidth(to_type)); } @@ -332,7 +331,7 @@ StatusOr ElementalIrEmitter::EmitIntegerUnaryOp( } default: return Unimplemented("unary integer op '%s'", - HloOpcodeString(op->opcode()).c_str()); + HloOpcodeString(op->opcode())); } } @@ -389,8 +388,8 @@ StatusOr ElementalIrEmitter::EmitFloatUnaryOp( operand_value, llvm_ir::PrimitiveTypeToIrType(to_type, module_)); } return Unimplemented("unhandled conversion operation: %s => %s", - PrimitiveType_Name(from_type).c_str(), - PrimitiveType_Name(to_type).c_str()); + PrimitiveType_Name(from_type), + PrimitiveType_Name(to_type)); } case HloOpcode::kBitcastConvert: { PrimitiveType 
from_type = op->operand(0)->shape().element_type(); @@ -407,8 +406,7 @@ StatusOr ElementalIrEmitter::EmitFloatUnaryOp( return InvalidArgument( "bitcast conversion from primitive type %s to %s with unequal " "bit-widths (%u versus %u) ", - PrimitiveType_Name(from_type).c_str(), - PrimitiveType_Name(to_type).c_str(), + PrimitiveType_Name(from_type), PrimitiveType_Name(to_type), primitive_util::BitWidth(from_type), primitive_util::BitWidth(to_type)); } @@ -471,7 +469,7 @@ StatusOr ElementalIrEmitter::EmitFloatUnaryOp( return llvm::ConstantFP::get(operand_value->getType(), 0.0); default: return Unimplemented("unary floating-point op '%s'", - HloOpcodeString(op->opcode()).c_str()); + HloOpcodeString(op->opcode())); } } @@ -683,7 +681,7 @@ StatusOr ElementalIrEmitter::EmitComplexUnaryOp( return EmitExtractImag(operand_value); default: return Unimplemented("unary complex op '%s'", - HloOpcodeString(op->opcode()).c_str()); + HloOpcodeString(op->opcode())); } } @@ -755,7 +753,7 @@ StatusOr ElementalIrEmitter::EmitFloatBinaryOp( return EmitAtan2(op->shape().element_type(), lhs_value, rhs_value); default: return Unimplemented("binary floating point op '%s'", - HloOpcodeString(op->opcode()).c_str()); + HloOpcodeString(op->opcode())); } } @@ -873,7 +871,7 @@ StatusOr ElementalIrEmitter::EmitComplexBinaryOp( } default: return Unimplemented("binary complex op '%s'", - HloOpcodeString(op->opcode()).c_str()); + HloOpcodeString(op->opcode())); } } @@ -1247,7 +1245,7 @@ StatusOr ElementalIrEmitter::EmitIntegerBinaryOp( /*saturate_to_sign_bit=*/false); default: return Unimplemented("binary integer op '%s'", - HloOpcodeString(op->opcode()).c_str()); + HloOpcodeString(op->opcode())); } } @@ -1378,7 +1376,7 @@ StatusOr ElementalIrEmitter::ConvertValueForDistribution( default: return InvalidArgument( "unhandled distribution %s", - RandomDistribution_Name(hlo->random_distribution()).c_str()); + RandomDistribution_Name(hlo->random_distribution())); } } @@ -1610,7 +1608,7 @@ StatusOr 
ElementalIrEmitter::EmitElementalClamp( max_value, EmitIntegralMax(min_value, arg_value, is_signed), is_signed); } else { return Unimplemented("Clamp unimplemented for %s", - PrimitiveType_Name(prim_type).c_str()); + PrimitiveType_Name(prim_type)); } } @@ -2232,7 +2230,7 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator( default: return [hlo](const IrArray::Index& index) { return Unimplemented("Unhandled opcode for elemental IR emission: %s", - HloOpcodeString(hlo->opcode()).c_str()); + HloOpcodeString(hlo->opcode())); }; } } diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc index 1c9f396b68..78edf918a4 100644 --- a/tensorflow/compiler/xla/service/executable.cc +++ b/tensorflow/compiler/xla/service/executable.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/executable.h" #include "absl/memory/memory.h" +#include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h" #include "tensorflow/compiler/xla/status.h" @@ -23,7 +24,6 @@ limitations under the License. 
#include "tensorflow/core/lib/hash/hash.h" #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/lib/strings/proto_serialization.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/env.h" using tensorflow::gtl::ArraySlice; @@ -155,9 +155,9 @@ Status Executable::DumpHloSnapshot() { const string& directory_path = module_config().debug_options().xla_dump_executions_to(); const auto& module = hlo_snapshot_->hlo().hlo_module(); - string filename = tensorflow::strings::Printf( - "computation_%lld__%s__execution_%lld", module.id(), - module.entry_computation_name().c_str(), ++execution_count_); + string filename = + absl::StrFormat("computation_%d__%s__execution_%d", module.id(), + module.entry_computation_name(), ++execution_count_); return Executable::DumpToDirectory(directory_path, filename, *hlo_snapshot_); } diff --git a/tensorflow/compiler/xla/service/execution_tracker.cc b/tensorflow/compiler/xla/service/execution_tracker.cc index 70a78c8a2b..997db7c058 100644 --- a/tensorflow/compiler/xla/service/execution_tracker.cc +++ b/tensorflow/compiler/xla/service/execution_tracker.cc @@ -66,7 +66,7 @@ Status ExecutionTracker::Unregister(const ExecutionHandle& handle) { tensorflow::mutex_lock lock(execution_mutex_); auto it = handle_to_execution_.find(handle.handle()); if (it == handle_to_execution_.end()) { - return NotFound("no execution record for execution handle: %lld", + return NotFound("no execution record for execution handle: %d", handle.handle()); } handle_to_execution_.erase(handle.handle()); @@ -78,7 +78,7 @@ StatusOr ExecutionTracker::Resolve( tensorflow::mutex_lock lock(execution_mutex_); auto it = handle_to_execution_.find(handle.handle()); if (it == handle_to_execution_.end()) { - return NotFound("no execution record for execution handle: %lld", + return NotFound("no execution record for execution handle: %d", handle.handle()); } return it->second.get(); diff --git 
a/tensorflow/compiler/xla/service/gather_expander.cc b/tensorflow/compiler/xla/service/gather_expander.cc index d889fd8e88..3f1a881372 100644 --- a/tensorflow/compiler/xla/service/gather_expander.cc +++ b/tensorflow/compiler/xla/service/gather_expander.cc @@ -323,7 +323,7 @@ StatusOr GatherExpander::ExpandGather( return Unimplemented( "Gather operations with more than 2147483647 gather indices are not " "supported. This error occurred for %s.", - gather_instr->ToString().c_str()); + gather_instr->ToString()); } TF_ASSIGN_OR_RETURN( diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index e53f525517..87b799e78e 100644 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -57,6 +57,7 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings:str_format", ], ) @@ -110,6 +111,7 @@ tf_cc_test( "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings:str_format", ], ) @@ -351,6 +353,7 @@ cc_library( "//tensorflow/stream_executor", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", "@com_google_absl//absl/types:optional", ], ) @@ -389,6 +392,7 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", "@com_google_absl//absl/types:optional", ], ) @@ -819,6 +823,7 @@ tf_cc_test( "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings:str_format", ], ) diff --git a/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc b/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc index e208ad61e3..86af83b6b9 100644 
--- a/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc +++ b/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc @@ -62,7 +62,7 @@ StatusOr> BufferAllocations::Builder::Build( if (reinterpret_cast(address.opaque()) % expected_alignment != 0) { return InternalError( - "Address of registered buffer %lld must be a multiple of %llx, but " + "Address of registered buffer %d must be a multiple of %x, but " "was %p", i, kEntryParameterAlignBytes, address.opaque()); } @@ -83,7 +83,7 @@ StatusOr> BufferAllocations::Builder::Build( 0) { return InternalError( "Address returned by memory_allocator->Allocate must be a " - "multiple of %llx, but was %p", + "multiple of %x, but was %p", kXlaAllocatedBufferAlignBytes, buffer.opaque()); } // We do manual memory management within BufferAllocations. Be sure not diff --git a/tensorflow/compiler/xla/service/gpu/buffer_comparator.cc b/tensorflow/compiler/xla/service/gpu/buffer_comparator.cc index f22c2a8add..13c83c9199 100644 --- a/tensorflow/compiler/xla/service/gpu/buffer_comparator.cc +++ b/tensorflow/compiler/xla/service/gpu/buffer_comparator.cc @@ -124,7 +124,7 @@ StatusOr F16BufferComparator::Create( StatusOr F16BufferComparator::CompareEqualImpl( se::DeviceMemory test_buffer) { if (ref_buffer_.root_buffer().size() != test_buffer.size()) { - return InternalError("Mismatched buffer size: %lld vs %lld", + return InternalError("Mismatched buffer size: %d vs %d", ref_buffer_.root_buffer().size(), test_buffer.size()); } diff --git a/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc b/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc index 8b0426aa27..9ed523998b 100644 --- a/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc @@ -59,7 +59,7 @@ Status ConditionalThunk::ExecuteOnStream( Status block_status = stream->BlockHostUntilDone(); if (!block_status.ok()) { return InternalError("Failed to retrieve predicate value on stream %p: %s.", - 
stream, block_status.error_message().c_str()); + stream, block_status.error_message()); } // Execute the true or the false computation depending on the value of the diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc index 854a2f50b2..eea31f3de1 100644 --- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc @@ -22,7 +22,6 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc index 18a76e8c26..bc3c6f72f6 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc @@ -22,7 +22,6 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc index 3d421ebb69..dbdf8e7a0e 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc @@ -15,6 +15,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h" #include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "absl/types/optional.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h" @@ -59,8 +60,8 @@ StatusOr> ScratchAllocator::AllocateBytes( if (byte_size > GetMemoryLimitInBytes(stream)) { return se::port::Status( se::port::error::RESOURCE_EXHAUSTED, - tensorflow::strings::Printf( - "Allocating %lld bytes exceeds the memory limit of %lld bytes.", + absl::StrFormat( + "Allocating %d bytes exceeds the memory limit of %d bytes.", byte_size, GetMemoryLimitInBytes(stream))); } @@ -361,7 +362,7 @@ CudnnConvolutionAlgorithmPicker::PickBestAlgorithm( return InternalError( "All algorithms tried for convolution %s failed. Falling back to " "default algorithm.", - instr->ToString().c_str()); + instr->ToString()); } StatusOr CudnnConvolutionAlgorithmPicker::RunOnInstruction( diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc index 68086c86e9..07b96fbd3f 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc @@ -197,8 +197,8 @@ Status RunCudnnConvolution( if (!stream->ok()) { return InternalError( - "Unable to launch convolution with type %s and algorithm (%lld, %lld)", - CudnnConvKindToString(kind).c_str(), algorithm.algorithm().algo_id(), + "Unable to launch convolution with type %s and algorithm (%d, %d)", + CudnnConvKindToString(kind), algorithm.algorithm().algo_id(), algorithm.algorithm_no_scratch().algo_id()); } return Status::OK(); diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc index 2460d951bd..afcf9fa2ea 100644 --- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc 
+++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc @@ -107,7 +107,7 @@ StatusOr GpuElementalIrEmitter::EmitLibdeviceMathCall( break; default: return Unimplemented("Bad type for libdevice math call: %s", - PrimitiveType_Name(output_type).c_str()); + PrimitiveType_Name(output_type)); } llvm::Value* result = EmitMathCall(munged_callee, converted_operands, converted_input_types, output_type) @@ -138,7 +138,7 @@ StatusOr GpuElementalIrEmitter::EmitLlvmIntrinsicMathCall( break; default: return Unimplemented("Bad type for llvm intrinsic math call: %s", - PrimitiveType_Name(output_type).c_str()); + PrimitiveType_Name(output_type)); } return EmitMathCall(munged_callee, operands, input_types, output_type); } @@ -152,8 +152,8 @@ StatusOr GpuElementalIrEmitter::EmitMathCall( for (PrimitiveType input_type : input_types) { if (output_type != input_type) { return Unimplemented("Input type ≠ output type: %s ≠ %s", - PrimitiveType_Name(input_type).c_str(), - PrimitiveType_Name(output_type).c_str()); + PrimitiveType_Name(input_type), + PrimitiveType_Name(output_type)); } } diff --git a/tensorflow/compiler/xla/service/gpu/fft_thunk.cc b/tensorflow/compiler/xla/service/gpu/fft_thunk.cc index def595d217..11549cdac5 100644 --- a/tensorflow/compiler/xla/service/gpu/fft_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/fft_thunk.cc @@ -18,10 +18,10 @@ limitations under the License. 
#include #include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" @@ -43,8 +43,8 @@ StatusOr> FftScratchAllocator::AllocateBytes( if (byte_size > GetMemoryLimitInBytes(stream)) { return se::port::Status( se::port::error::RESOURCE_EXHAUSTED, - tensorflow::strings::Printf( - "Allocating %lld bytes exceeds the memory limit of %lld bytes.", + absl::StrFormat( + "Allocating %d bytes exceeds the memory limit of %d bytes.", byte_size, GetMemoryLimitInBytes(stream))); } @@ -213,7 +213,7 @@ Status FftThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, return Status::OK(); } return InternalError("Unable to launch fft for thunk %p with type %s", this, - FftTypeToString(fft_type_).c_str()); + FftTypeToString(fft_type_)); } } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc b/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc index 2c02ec2584..9c4a490366 100644 --- a/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc @@ -186,7 +186,7 @@ StatusOr DoGemmAutotune( } return InternalError( - "Unable to autotune cuBLAS gemm on stream %p; none of the %zu algorithms " + "Unable to autotune cuBLAS gemm on stream %p; none of the %u algorithms " "ran successfully", stream, algorithms.size()); } diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc index 88be63e267..71a02e70df 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc @@ -160,7 +160,7 @@ Status GpuExecutable::ExecuteThunks( if (!block_status.ok()) { return 
InternalError( "Failed to complete all kernels launched on stream %p: %s", - main_stream, block_status.error_message().c_str()); + main_stream, block_status.error_message()); } } @@ -260,10 +260,9 @@ StatusOr GpuExecutable::ExecuteOnStream( if (buffer.is_null() && buffer.size() > 0) { return FailedPrecondition( "Cannot run XLA computation because pointer to (sub-)buffer at " - "index %s of parameter %lld was null. All pointers to " - "(sub-)buffers must not be null, unless the (sub-)buffer has zero " - "elements.", - allocation.param_shape_index().ToString().c_str(), param_no); + "index %s of parameter %d was null. All pointers to (sub-)buffers " + "must not be null, unless the (sub-)buffer has zero elements.", + allocation.param_shape_index().ToString(), param_no); } buffer_allocations_builder.RegisterBuffer(i, buffer); diff --git a/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.cc b/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.cc index 4944c41f7d..4268fb2c7a 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.cc @@ -34,9 +34,8 @@ StatusOr GpuHloSupportChecker::Run(HloModule* module) { return xla::Unimplemented( "GPU backend does not support HLO instruction %s with shape " "containing a sparse layout: %s", - instruction->ToString().c_str(), - ShapeUtil::HumanStringWithLayout(instruction->shape()) - .c_str()); + instruction->ToString(), + ShapeUtil::HumanStringWithLayout(instruction->shape())); } return Status::OK(); })); diff --git a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc index 44303724bb..f3c2744292 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc @@ -84,7 +84,7 @@ Status GpuTransferManager::EnqueueBuffersToInfeed( Status block_status = stream->BlockHostUntilDone(); if 
(!block_status.ok()) { return InternalError("Failed to complete data transfer on stream %p: %s", - stream, block_status.error_message().c_str()); + stream, block_status.error_message()); } infeed_manager->EnqueueDestination(std::move(buffers)); @@ -97,7 +97,7 @@ Status GpuTransferManager::EnqueueBuffersToInfeed( StatusOr GpuTransferManager::TransferBufferToInfeedInternal( se::StreamExecutor* executor, int64 size, const void* source) { if (size > std::numeric_limits::max()) { - return InvalidArgument("Infeed shape is too large: needs %lld bytes", size); + return InvalidArgument("Infeed shape is too large: needs %d bytes", size); } if (size == 0) { diff --git a/tensorflow/compiler/xla/service/gpu/hlo_schedule_test.cc b/tensorflow/compiler/xla/service/gpu/hlo_schedule_test.cc index d4a96cd5b3..bb147c8d98 100644 --- a/tensorflow/compiler/xla/service/gpu/hlo_schedule_test.cc +++ b/tensorflow/compiler/xla/service/gpu/hlo_schedule_test.cc @@ -19,6 +19,7 @@ limitations under the License. #include #include "absl/memory/memory.h" +#include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/service/gpu/stream_assignment.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" @@ -266,7 +267,7 @@ TEST_F(HloScheduleTest, LatticeMatMul) { params.reserve(6); for (int i = 0; i < 6; ++i) { params.push_back(builder.AddInstruction(HloInstruction::CreateParameter( - i, f32_2x2_, /*name=*/tensorflow::strings::Printf("param%d", i)))); + i, f32_2x2_, /*name=*/absl::StrFormat("param%d", i)))); } HloInstruction* d00 = builder.AddInstruction( HloInstruction::CreateCanonicalDot(f32_2x2_, params[2], params[3])); diff --git a/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc b/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc index fee6d2af3b..8c3a026740 100644 --- a/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc @@ -96,7 +96,7 @@ Status 
InfeedThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, Status block_status = stream->BlockHostUntilDone(); if (!block_status.ok()) { return InternalError("Failed to complete data transfer on stream %p: %s", - stream, block_status.error_message().c_str()); + stream, block_status.error_message()); } VLOG(2) << "Infeeding to GPU complete"; diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc index 8d0522bd8f..f53dfaee3d 100644 --- a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc +++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc @@ -365,7 +365,7 @@ static StatusOr FindHloInstruction( } return NotFound( "Computation '%s' does not contain an instruction with op code '%s'.", - computation.name().c_str(), HloOpcodeString(op).c_str()); + computation.name(), HloOpcodeString(op)); } TEST_F(InstructionFusionTest, MultiOutputFusion) { diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc index 7111b53944..4cbb6d75a8 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc @@ -384,8 +384,8 @@ Status IrEmitter::EmitAtomicOperationForNestedComputation( // TODO(b/30258929): We only accept binary computations so far. 
return Unimplemented( "We only support atomic functions with exactly two parameters, but " - "computation %s has %lld.", - computation.name().c_str(), computation.num_parameters()); + "computation %s has %d.", + computation.name(), computation.num_parameters()); } if (MaybeEmitDirectAtomicOperation(computation, output_address, diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 9c7b508e10..4d98955c58 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -2674,8 +2674,7 @@ Status CheckHloBuffersShareAllocation( if (slice_a != slice_b) { return InternalError( "instruction %s %s does not share allocation with instruction %s %s", - a->ToString().c_str(), slice_a.ToString().c_str(), - b->ToString().c_str(), slice_b.ToString().c_str()); + a->ToString(), slice_a.ToString(), b->ToString(), slice_b.ToString()); } return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc index d856299889..3259eaa2a2 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc @@ -63,7 +63,7 @@ Status KernelThunk::Initialize(const GpuExecutable& executable, if (kernel_cache_.end() == it) { it = kernel_cache_.emplace(executor, se::KernelBase(executor)).first; if (!executor->GetKernel(*loader_spec_, &it->second)) { - return InternalError("Unable to load kernel %s", kernel_name_.c_str()); + return InternalError("Unable to load kernel %s", kernel_name_); } } @@ -107,7 +107,7 @@ Status KernelThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, stream, se::ThreadDim(launch_dimensions.threads_per_block()), se::BlockDim(launch_dimensions.block_count()), *kernel, *kernel_args)) { - return InternalError("Unable to launch kernel %s", kernel_name_.c_str()); + return 
InternalError("Unable to launch kernel %s", kernel_name_); } return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD index ccf082c4c6..698d2d51cc 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD @@ -36,6 +36,7 @@ cc_library( "//tensorflow/core:lib_internal", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", "@llvm//:amdgpu_code_gen", "@llvm//:analysis", "@llvm//:bit_reader", diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/dump_ir_pass.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/dump_ir_pass.cc index a3c74507dd..85bc58cb44 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/dump_ir_pass.cc +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/dump_ir_pass.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/dump_ir_pass.h" +#include "absl/strings/str_format.h" #include "absl/strings/string_view.h" #include "llvm/IR/Module.h" #include "llvm/Support/FileSystem.h" @@ -22,7 +23,6 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/lib/io/path.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" namespace xla { @@ -87,9 +87,10 @@ void IrDumpingPassManager::run(llvm::Module &module) { llvm::PassRegistry::getPassRegistry()->getPassInfo(P->getPassID()); const string basename = ReplaceFilenameExtension( absl::string_view(tensorflow::io::Basename(input_filename_)), - tensorflow::strings::Printf( + absl::StrFormat( "pass-%02d.before.%s.ll", i, - (PI == nullptr ? "unknown" : PI->getPassArgument().data()))); + absl::string_view(PI == nullptr ? 
"unknown" + : PI->getPassArgument().data()))); llvm::legacy::PassManager::add( new DumpIrPass(tensorflow::io::JoinPath(output_dir_, basename))); } diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc index e18d7e764a..8751e3a9c2 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc @@ -57,7 +57,6 @@ limitations under the License. #include "llvm/Transforms/Scalar.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/lib/io/path.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/tracing.h" diff --git a/tensorflow/compiler/xla/service/gpu/outfeed_thunk.cc b/tensorflow/compiler/xla/service/gpu/outfeed_thunk.cc index b99d998c4d..e0f3e84a4c 100644 --- a/tensorflow/compiler/xla/service/gpu/outfeed_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/outfeed_thunk.cc @@ -96,7 +96,7 @@ Status OutfeedThunk::ExecuteOnStream( Status block_status = stream->BlockHostUntilDone(); if (!block_status.ok()) { return InternalError("Failed to complete data transfer on stream %p: %s", - stream, block_status.error_message().c_str()); + stream, block_status.error_message()); } VLOG(2) << "Outfeeding from GPU complete"; diff --git a/tensorflow/compiler/xla/service/gpu/partition_assignment.cc b/tensorflow/compiler/xla/service/gpu/partition_assignment.cc index c927c5ee16..cf9f102d31 100644 --- a/tensorflow/compiler/xla/service/gpu/partition_assignment.cc +++ b/tensorflow/compiler/xla/service/gpu/partition_assignment.cc @@ -19,6 +19,7 @@ limitations under the License. 
#include #include "absl/memory/memory.h" +#include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/map_util.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" @@ -26,7 +27,6 @@ limitations under the License. #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/bits.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" namespace xla { @@ -34,9 +34,8 @@ namespace gpu { std::ostream& operator<<(std::ostream& out, const LaunchDimensions& launch_dims) { - out << tensorflow::strings::Printf("[block: %lld, thread: %lld]", - launch_dims.block_count(), - launch_dims.threads_per_block()); + out << absl::StrFormat("[block: %d, thread: %d]", launch_dims.block_count(), + launch_dims.threads_per_block()); return out; } @@ -91,9 +90,9 @@ LaunchDimensions CalculateLaunchDimensions( } int64 block_count = CeilOfRatio(num_elements, threads_per_block); - VLOG(2) << tensorflow::strings::Printf( + VLOG(2) << absl::StrFormat( "Initialized the block count to ceil(# of elements / threads per " - "block) = ceil(%lld/%lld) = %lld", + "block) = ceil(%d/%d) = %d", num_elements, threads_per_block, block_count); return LaunchDimensions(block_count, threads_per_block); diff --git a/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc b/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc index 3f75d8b559..091aca23e5 100644 --- a/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc +++ b/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc @@ -16,13 +16,13 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/gpu/stream_assignment.h" #include "absl/memory/memory.h" +#include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/lib/strings/stringprintf.h" namespace xla { namespace gpu { @@ -98,7 +98,7 @@ TEST_F(StreamAssignmentTest, LatticeMatMul) { params.reserve(6); for (int i = 0; i < 6; ++i) { params.push_back(builder.AddInstruction(HloInstruction::CreateParameter( - i, f32_2x2_, /*name=*/tensorflow::strings::Printf("param%d", i)))); + i, f32_2x2_, /*name=*/absl::StrFormat("param%d", i)))); } HloInstruction* d00 = builder.AddInstruction( HloInstruction::CreateCanonicalDot(f32_2x2_, params[2], params[3])); diff --git a/tensorflow/compiler/xla/service/gpu/while_thunk.cc b/tensorflow/compiler/xla/service/gpu/while_thunk.cc index 828fc2884b..c4754fe378 100644 --- a/tensorflow/compiler/xla/service/gpu/while_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/while_thunk.cc @@ -70,7 +70,7 @@ Status WhileThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, if (!block_status.ok()) { return InternalError( "Failed to complete all kernels launched on stream %p: %s", stream, - block_status.error_message().c_str()); + block_status.error_message()); } if (!condition_result) { diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc index cf95b112d7..4a59380ed9 100644 --- a/tensorflow/compiler/xla/service/hlo_computation.cc +++ b/tensorflow/compiler/xla/service/hlo_computation.cc @@ -625,16 +625,15 @@ StatusOr HloComputation::DeepCopyInstruction( if (instruction->parent() != this) { return FailedPrecondition( "Can't deep copy instruction %s: 
instruction is not in computation %s", - instruction->name().c_str(), name().c_str()); + instruction->name(), name()); } if (indices_to_copy != nullptr && !ShapeUtil::Compatible(instruction->shape(), indices_to_copy->shape())) { return FailedPrecondition( "Can't deep copy instruction %s: given shape tree of indices to copy " "has incompatible shapes: %s vs. %s", - instruction->name().c_str(), - ShapeUtil::HumanString(instruction->shape()).c_str(), - ShapeUtil::HumanString(indices_to_copy->shape()).c_str()); + instruction->name(), ShapeUtil::HumanString(instruction->shape()), + ShapeUtil::HumanString(indices_to_copy->shape())); } ShapeIndex index; @@ -664,7 +663,7 @@ StatusOr HloComputation::DeepCopyInstructionWithCustomCopier( if (instruction->parent() != this) { return FailedPrecondition( "Can't deep copy instruction %s: instruction is not in computation %s", - instruction->name().c_str(), name().c_str()); + instruction->name(), name()); } ShapeIndex index; return DeepCopyHelper(instruction, &index, copy_leaf); diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc index 1d35757b42..3376d170e6 100644 --- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc @@ -837,7 +837,7 @@ Status HloDataflowAnalysis::InitializeInstructionValueSets() { return Unimplemented( "Computation %s is called in both a parallel (eg, kMap) and " "sequential (eg, kCall) context", - computation->name().c_str()); + computation->name()); } if (call_graph_node.caller_callsites().empty() || call_graph_node.context() == CallContext::kParallel) { diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc index ca1c4dd0e9..71f91fde93 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator.cc +++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc @@ -435,7 +435,7 @@ Status 
HloEvaluator::HandleIsFinite(HloInstruction* is_finite) { if (!ShapeUtil::ElementIsFloating(operand->shape())) { return InvalidArgument( "expected element type in shape to be float for IsFinite op, got: %s", - PrimitiveType_Name(operand->shape().element_type()).c_str()); + PrimitiveType_Name(operand->shape().element_type())); } switch (operand->shape().element_type()) { @@ -476,9 +476,9 @@ Status HloEvaluator::HandleCompare(HloInstruction* compare) { return Unimplemented( "Implicit broadcasting is currently unsupported in HLO evaluator " "Shape Mismatch: %s vs %s vs %s", - ShapeUtil::HumanString(compare->shape()).c_str(), - ShapeUtil::HumanString(lhs->shape()).c_str(), - ShapeUtil::HumanString(rhs->shape()).c_str()); + ShapeUtil::HumanString(compare->shape()), + ShapeUtil::HumanString(lhs->shape()), + ShapeUtil::HumanString(rhs->shape())); } TF_RET_CHECK(lhs->shape().element_type() == rhs->shape().element_type()); @@ -1105,8 +1105,8 @@ Status HloEvaluator::HandleWhile(HloInstruction* while_hlo) { HloEvaluator loop_body_evaluator(max_loop_iterations_); while (keep_going) { if (max_loop_iterations_ >= 0 && iteration_count++ > max_loop_iterations_) { - return InvalidArgument("Loop %s exceeded loop iteration limit (%lld).", - while_hlo->name().c_str(), max_loop_iterations_); + return InvalidArgument("Loop %s exceeded loop iteration limit (%d).", + while_hlo->name(), max_loop_iterations_); } TF_ASSIGN_OR_RETURN(auto cond_val, cond_evaluator.Evaluate( *cond_comp, {lcv.get()})); @@ -1262,7 +1262,7 @@ Status HloEvaluator::HandleSort(HloInstruction* sort) { const int64 rank = ShapeUtil::Rank(sort->operand(0)->shape()); if (sort_dim != rank - 1) { return Unimplemented( - "Trying to support along dimension %lld, which is not the last " + "Trying to support along dimension %d, which is not the last " "dimension", sort_dim); } diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.h b/tensorflow/compiler/xla/service/hlo_evaluator.h index 7588916de5..0ea7089552 100644 --- 
a/tensorflow/compiler/xla/service/hlo_evaluator.h +++ b/tensorflow/compiler/xla/service/hlo_evaluator.h @@ -222,8 +222,8 @@ class HloEvaluator : public DfsHloVisitorWithDefault { return Unimplemented( "Implicit broadcasting is currently unsupported in HLO evaluator " "Shape Mismatch: %s vs %s", - ShapeUtil::HumanString(shape).c_str(), - ShapeUtil::HumanString(operand->shape()).c_str()); + ShapeUtil::HumanString(shape), + ShapeUtil::HumanString(operand->shape())); } auto result = absl::make_unique(shape); diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h index 2da2cc2d71..b6566ebefe 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h +++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h @@ -143,7 +143,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { Status DefaultAction(HloInstruction* hlo_instruction) override { return Unimplemented("unhandled HLO ops for HloEvaluator: %s.", - HloOpcodeString(hlo_instruction->opcode()).c_str()); + HloOpcodeString(hlo_instruction->opcode())); } // TODO(b/35950897): many of the stl functions used in the handlers are not @@ -2654,9 +2654,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { return Unimplemented( "Implicit broadcasting is currently unsupported in HLO evaluator " "Shape Mismatch: %s vs %s vs %s: ", - ShapeUtil::HumanString(shape).c_str(), - ShapeUtil::HumanString(lhs->shape()).c_str(), - ShapeUtil::HumanString(rhs->shape()).c_str()); + ShapeUtil::HumanString(shape), ShapeUtil::HumanString(lhs->shape()), + ShapeUtil::HumanString(rhs->shape())); } const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs); @@ -2690,10 +2689,9 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { return Unimplemented( "Implicit broadcasting is currently unsupported in HLO evaluator " "Shape Mismatch: %s vs %s vs %s vs %s: ", - 
ShapeUtil::HumanString(shape).c_str(), - ShapeUtil::HumanString(lhs->shape()).c_str(), - ShapeUtil::HumanString(rhs->shape()).c_str(), - ShapeUtil::HumanString(ehs->shape()).c_str()); + ShapeUtil::HumanString(shape), ShapeUtil::HumanString(lhs->shape()), + ShapeUtil::HumanString(rhs->shape()), + ShapeUtil::HumanString(ehs->shape())); } const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs); diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc index 59c628e945..6cf7730fdc 100644 --- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc +++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc @@ -28,6 +28,7 @@ limitations under the License. #include "absl/strings/match.h" #include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "absl/strings/str_join.h" #include "absl/strings/str_replace.h" #include "absl/types/optional.h" @@ -44,7 +45,6 @@ limitations under the License. #include "tensorflow/core/lib/gtl/map_util.h" #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/lib/strings/numbers.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/regexp.h" @@ -57,32 +57,12 @@ using absl::nullopt; using absl::optional; using absl::StrAppend; using absl::StrCat; +using absl::StrFormat; using absl::StrJoin; using tensorflow::Env; using tensorflow::WriteStringToFile; using tensorflow::io::JoinPath; -// Helpers for Printf and Appendf. -template -struct PrintfConvert { - const T& operator()(const T& t) const { return t; } -}; -template <> -struct PrintfConvert { - const char* operator()(const string& s) const { return s.c_str(); } -}; - -// Like tensorflow::strings::Printf/Appendf, but you don't need to call c_str() -// on strings. -template -string Printf(const char* fmt, const Ts&... 
ts) { - return tensorflow::strings::Printf(fmt, PrintfConvert()(ts)...); -} -template -void Appendf(string* s, const char* fmt, const Ts&... ts) { - tensorflow::strings::Appendf(s, fmt, PrintfConvert()(ts)...); -} - // Used to indicate how we should treat a given HLOInstruction in the graph. // should we treat it like normal, hide it, and so on? enum NodeFilterResult { @@ -210,10 +190,9 @@ NodeColors NodeColorsForScheme(ColorScheme color) { string NodeColorAttributes(ColorScheme color) { NodeColors node_colors = NodeColorsForScheme(color); - return Printf( - R"(style="%s", fontcolor="%s", color="%s", fillcolor="%s")", - node_colors.style, node_colors.font_color, node_colors.stroke_color, - node_colors.fill_color); + return StrFormat(R"(style="%s", fontcolor="%s", color="%s", fillcolor="%s")", + node_colors.style, node_colors.font_color, + node_colors.stroke_color, node_colors.fill_color); } // Replaces <> with <>, so that this string is safe(er) for use in a @@ -448,7 +427,7 @@ string HloDotDumper::Dump() { } string HloDotDumper::Header() { - const char* fmt = R"(digraph G { + constexpr char fmt[] = R"(digraph G { rankdir = TB; compound = true; label = <%s>; @@ -481,8 +460,8 @@ stylesheet=< } if (profile_ != nullptr) { auto cycles = profile_->total_cycles_executed(*computation_); - Appendf(&graph_label, "
total cycles = %lld (%s)", cycles, - tensorflow::strings::HumanReadableNum(cycles)); + absl::StrAppendFormat(&graph_label, "
total cycles = %d (%s)", cycles, + tensorflow::strings::HumanReadableNum(cycles)); } // Create CSS rules that say, when you hover over the given node or cluster, @@ -509,14 +488,14 @@ stylesheet=< // One could imagine other ways of writing this CSS rule that involve // less duplication, but this way seems to be relatively performant. edge_css_rules.push_back( - Printf(" #%s%d:hover ~ #edge%lld text { fill: %s; }\n" - " #%s%d:hover ~ #edge%lld path { " - "stroke: %s; stroke-width: .2em; }\n" - " #%s%d:hover ~ #edge%lld polygon { " - "fill: %s; stroke: %s; stroke-width: .2em; }\n", - elem_type, elem_id, edge_id, color, // - elem_type, elem_id, edge_id, color, // - elem_type, elem_id, edge_id, color, color)); + StrFormat(" #%s%d:hover ~ #edge%d text { fill: %s; }\n" + " #%s%d:hover ~ #edge%d path { " + "stroke: %s; stroke-width: .2em; }\n" + " #%s%d:hover ~ #edge%d polygon { " + "fill: %s; stroke: %s; stroke-width: .2em; }\n", + elem_type, elem_id, edge_id, color, // + elem_type, elem_id, edge_id, color, // + elem_type, elem_id, edge_id, color, color)); }; // The "to_node" value may be a NULL, indicating that this points to the @@ -559,7 +538,7 @@ stylesheet=< } } - return Printf(fmt, graph_label, StrJoin(edge_css_rules, "\n")); + return StrFormat(fmt, graph_label, StrJoin(edge_css_rules, "\n")); } string HloDotDumper::Footer() { return StrCat(StrJoin(edges_, "\n"), "\n}"); } @@ -600,9 +579,9 @@ string HloDotDumper::DumpSubcomputation(const HloComputation* subcomp, VLOG(2) << "Edge: from " << from->name() << " to " << parent_instr->name() << " as " << next_edge_id_; edge_ids_.insert({{from, parent_instr}, next_edge_id_++}); - const char* edge_fmt = + constexpr char edge_fmt[] = R"(%s -> %s [ltail="%s", style="dashed" tooltip="%s -> %s"];)"; - edges_.push_back(Printf( + edges_.push_back(StrFormat( edge_fmt, InstructionId(from), InstructionId(parent_instr), SubcomputationId(subcomp), subcomp->name(), parent_instr->name())); } @@ -619,9 +598,10 @@ string 
HloDotDumper::DumpSubcomputation(const HloComputation* subcomp, string subcomp_label, style; if (parent_instr->opcode() == HloOpcode::kFusion) { - subcomp_label = Printf("Fused expression for %s
%s", - HtmlLikeStringSanitize(parent_instr->name()), - HtmlLikeStringSanitize(parent_instr->ToCategory())); + subcomp_label = + StrFormat("Fused expression for %s
%s", + HtmlLikeStringSanitize(parent_instr->name()), + HtmlLikeStringSanitize(parent_instr->ToCategory())); string extra_info = GetInstructionNodeExtraInfo(parent_instr); if (!extra_info.empty()) { StrAppend(&subcomp_label, "
", extra_info); @@ -647,18 +627,18 @@ string HloDotDumper::DumpSubcomputation(const HloComputation* subcomp, strokecolor = highlight ? "#b71c1c" : "#c2c2c2"; } style = - Printf(R"(style="rounded,filled,bold"; fillcolor="%s"; color="%s;")", - fillcolor, strokecolor); + StrFormat(R"(style="rounded,filled,bold"; fillcolor="%s"; color="%s;")", + fillcolor, strokecolor); } else { - subcomp_label = Printf("Subcomputation for %s
%s", - HtmlLikeStringSanitize(parent_instr->name()), - HtmlLikeStringSanitize(subcomp->name())); + subcomp_label = StrFormat("Subcomputation for %s
%s", + HtmlLikeStringSanitize(parent_instr->name()), + HtmlLikeStringSanitize(subcomp->name())); style = "style=rounded; color=black;"; } string comp_body = DumpComputation(subcomp); - const char* computation_fmt = R"(subgraph %s { + constexpr char computation_fmt[] = R"(subgraph %s { %s label = <%s>; labelloc = t; @@ -667,7 +647,7 @@ tooltip = " "; } // %s )"; - return Printf(computation_fmt, id, style, subcomp_label, comp_body, id); + return StrFormat(computation_fmt, id, style, subcomp_label, comp_body, id); } string HloDotDumper::DumpComputation(const HloComputation* comp) { @@ -718,11 +698,11 @@ string HloDotDumper::DumpRootTag() { VLOG(2) << "Adding edge from " << from->name() << " to root tag as " << next_edge_id_; edge_ids_.insert({{from, to}, next_edge_id_++}); - edges_.push_back(Printf(R"(%s -> %s [tooltip=" "];)", from_id, to_id)); + edges_.push_back(StrFormat(R"(%s -> %s [tooltip=" "];)", from_id, to_id)); - return Printf(R"(%s [label=<%s>, shape=%s, tooltip=" ", %s];)" - "\n", - to_id, node_body, node_shape, NodeColorAttributes(color)); + return StrFormat(R"(%s [label=<%s>, shape=%s, tooltip=" ", %s];)" + "\n", + to_id, node_body, node_shape, NodeColorAttributes(color)); } static const HloConstantInstruction* TryGetFusionParameterConstant( @@ -817,10 +797,10 @@ string HloDotDumper::DumpInstruction(const HloInstruction* instr) { } } - return Printf(R"(%s [label=<%s>, shape=%s, tooltip="%s", %s];)" - "\n", - InstructionId(instr), node_body, node_shape, node_metadata, - NodeColorAttributes(color)); + return StrFormat(R"(%s [label=<%s>, shape=%s, tooltip="%s", %s];)" + "\n", + InstructionId(instr), node_body, node_shape, node_metadata, + NodeColorAttributes(color)); } string HloDotDumper::GetInstructionNodeInlinedOperands( @@ -833,7 +813,7 @@ string HloDotDumper::GetInstructionNodeInlinedOperands( // enumerates all of its empty dimensions (e.g. "{ { {}, {} }, ..."), which // is just noise. 
if (ShapeUtil::IsZeroElementArray(shape)) { - return Printf("{} (%s)", ShapeUtil::HumanString(constant->shape())); + return StrFormat("{} (%s)", ShapeUtil::HumanString(constant->shape())); } // Print the literal value of constants with <= K elements. @@ -848,8 +828,8 @@ string HloDotDumper::GetInstructionNodeInlinedOperands( // collected from profiling tools. Those constants may not have a valid // literal. if (elem_count.has_value() && *elem_count <= 8 && constant->HasLiteral()) { - return Printf("%s (%s)", constant->literal().ToString(), - ShapeUtil::HumanString(constant->shape())); + return StrFormat("%s (%s)", constant->literal().ToString(), + ShapeUtil::HumanString(constant->shape())); } // Otherwise, print e.g. "%constant.42 (s32[100])". @@ -859,8 +839,8 @@ string HloDotDumper::GetInstructionNodeInlinedOperands( } else { constant_name = StrCat("constant ", constant->name()); } - return Printf("%s %s", constant_name, - ShapeUtil::HumanString(constant->shape())); + return StrFormat("%s %s", constant_name, + ShapeUtil::HumanString(constant->shape())); }; std::vector lines; @@ -881,7 +861,7 @@ string HloDotDumper::GetInstructionNodeInlinedOperands( TryGetFusionParameterConstant(operand)) { operand_str = stringify_constant(constant); } else { - operand_str = Printf("Parameter %lld", operand->parameter_number()); + operand_str = StrFormat("Parameter %d", operand->parameter_number()); } } else { operand_str = operand->name(); @@ -890,9 +870,9 @@ string HloDotDumper::GetInstructionNodeInlinedOperands( if (operand_str) { if (instr->operand_count() > 1) { - lines.push_back(Printf("operand %lld = %s", i, *operand_str)); + lines.push_back(StrFormat("operand %d = %s", i, *operand_str)); } else { - lines.push_back(Printf("operand = %s", *operand_str)); + lines.push_back(StrFormat("operand = %s", *operand_str)); } } } @@ -1079,13 +1059,13 @@ string HloDotDumper::GetInstructionNodeShape(const HloInstruction* instr) { string HloDotDumper::GetInstructionNodeLabel(const 
HloInstruction* instr) { // If we have a parameter, put the param number in the name. if (instr->opcode() == HloOpcode::kParameter) { - return Printf("Parameter %lld", instr->parameter_number()); + return StrFormat("Parameter %d", instr->parameter_number()); } // The HLO instruction name contains usually the opcode, e.g. "%add.42" is // an add instruction. In this case we render just the name. if (absl::StartsWith(instr->name(), HloOpcodeString(instr->opcode()))) { - return Printf("%s", HtmlLikeStringSanitize(instr->name())); + return StrFormat("%s", HtmlLikeStringSanitize(instr->name())); } string extended_opcode = StrCat(HloOpcodeString(instr->opcode()), @@ -1093,8 +1073,8 @@ string HloDotDumper::GetInstructionNodeLabel(const HloInstruction* instr) { ? "" : StrCat(":", xla::ToString(instr->fusion_kind()))); // If the name does not contain the opcode, render both. - return Printf("%s
%s", HtmlLikeStringSanitize(extended_opcode), - HtmlLikeStringSanitize(instr->name())); + return StrFormat("%s
%s", HtmlLikeStringSanitize(extended_opcode), + HtmlLikeStringSanitize(instr->name())); } string HloDotDumper::GetInstructionNodeMetadata(const HloInstruction* instr) { @@ -1103,13 +1083,13 @@ string HloDotDumper::GetInstructionNodeMetadata(const HloInstruction* instr) { lines.push_back(HtmlLikeStringSanitize(instr->metadata().op_name())); } if (!instr->metadata().op_type().empty()) { - lines.push_back(Printf( + lines.push_back(StrFormat( "op_type: %s", HtmlLikeStringSanitize(instr->metadata().op_type()))); } if (!instr->metadata().source_file().empty() && instr->metadata().source_line() != 0) { - lines.push_back(Printf("op_type: %s", instr->metadata().source_file(), - instr->metadata().source_line())); + lines.push_back(StrFormat("op_type: %s:%d", instr->metadata().source_file(), + instr->metadata().source_line())); } return StrJoin(lines, "
"); @@ -1164,7 +1144,7 @@ string HloDotDumper::GetInstructionNodeExtraInfo(const HloInstruction* instr) { lines.push_back(instr_shape); } if (debug_options_.xla_hlo_graph_addresses()) { - lines.push_back(Printf("[%p]", instr)); + lines.push_back(StrFormat("[%p]", instr)); } if (profile_ != nullptr) { double hlo_cycles_executed = profile_->GetCyclesTakenBy(*instr); @@ -1172,8 +1152,8 @@ string HloDotDumper::GetInstructionNodeExtraInfo(const HloInstruction* instr) { profile_->total_cycles_executed(*instr->parent()); if (hlo_cycles_executed > 0 && total_cycles_executed > 0) { lines.push_back( - Printf("%% of cycles executed=%.2f", - 100 * hlo_cycles_executed / total_cycles_executed)); + StrFormat("%% of cycles executed=%.2f", + 100 * hlo_cycles_executed / total_cycles_executed)); } } return StrJoin(lines, "
"); @@ -1208,7 +1188,8 @@ void HloDotDumper::AddInstructionIncomingEdges(const HloInstruction* instr) { string edge_label; if (instr->operand_count() > 1 && !control_edge) { - edge_label = Printf(R"( headlabel="%lld", labeldistance=2)", operand_num); + edge_label = + StrFormat(R"( headlabel="%d", labeldistance=2)", operand_num); } else if (control_edge) { edge_label = "style=\"dotted\" color=\"gray\" label=\"ctrl\""; } @@ -1218,10 +1199,11 @@ void HloDotDumper::AddInstructionIncomingEdges(const HloInstruction* instr) { // means. bool is_big_array = TotalElementsInShape(from->shape()) >= 4096; - const char* kEdgeFmt = R"(%s -> %s [arrowhead=%s tooltip="%s -> %s" %s];)"; - edges_.push_back(Printf(kEdgeFmt, InstructionId(from), InstructionId(to), - (is_big_array ? "normal" : "empty"), from->name(), - to->name(), edge_label)); + constexpr char kEdgeFmt[] = + R"(%s -> %s [arrowhead=%s tooltip="%s -> %s" %s];)"; + edges_.push_back(StrFormat(kEdgeFmt, InstructionId(from), InstructionId(to), + (is_big_array ? "normal" : "empty"), + from->name(), to->name(), edge_label)); }; // Add edges from instr's operands to instr. Parameters within fusion @@ -1262,11 +1244,11 @@ string HloDotDumper::GetInstructionTrivialComputationStr( continue; } if (instr->called_computations().size() == 1) { - lines.push_back(Printf("Subcomputation: %s", - HtmlLikeStringSanitize(*computation_type))); + lines.push_back(StrFormat("Subcomputation: %s", + HtmlLikeStringSanitize(*computation_type))); } else { - lines.push_back(Printf("Subcomputation %lld: %s", i, - HtmlLikeStringSanitize(*computation_type))); + lines.push_back(StrFormat("Subcomputation %d: %s", i, + HtmlLikeStringSanitize(*computation_type))); } } return StrJoin(lines, "
"); diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc index 3e077d8aec..6b4f3c4eb8 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction.cc @@ -2381,7 +2381,7 @@ Status HloInstruction::Visit(DfsHloVisitorBase* visitor) { return InternalError( "Unhandled HloOpcode for DfsHloVisitor: %s. This should not happen - " "please file a bug for XLA.", - HloOpcodeString(opcode_).c_str()); + HloOpcodeString(opcode_)); } // Explicit instantiations. @@ -2464,7 +2464,7 @@ static Status PostOrderDFS(HloInstruction* root, Visitor* visitor, if (!TF_PREDICT_TRUE(PushDFSChild(visitor, &dfs_stack, child))) { return FailedPrecondition( "A cycle is detected while visiting instruction %s", - current_node->ToString().c_str()); + current_node->ToString()); } } @@ -2473,7 +2473,7 @@ static Status PostOrderDFS(HloInstruction* root, Visitor* visitor, if (!TF_PREDICT_TRUE(PushDFSChild(visitor, &dfs_stack, child))) { return FailedPrecondition( "A cycle is detected while visiting instruction %s", - current_node->ToString().c_str()); + current_node->ToString()); } } } @@ -2789,7 +2789,7 @@ StatusOr StringToFusionKind( if (kind_name == "kCustom") { return HloInstruction::FusionKind::kCustom; } - return InvalidArgument("Unknown fusion kind: %s", kind_name.c_str()); + return InvalidArgument("Unknown fusion kind: %s", kind_name); } string PaddingConfigToString(const PaddingConfig& padding) { diff --git a/tensorflow/compiler/xla/service/hlo_instruction_test.cc b/tensorflow/compiler/xla/service/hlo_instruction_test.cc index 504b13043f..8b0b90dfb3 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction_test.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction_test.cc @@ -53,7 +53,7 @@ class OpAndUserCollectingVisitor : public DfsHloVisitorWithDefault { public: Status DefaultAction(HloInstruction* hlo_instruction) override { return Unimplemented("not implemented %s", 
- HloOpcodeString(hlo_instruction->opcode()).c_str()); + HloOpcodeString(hlo_instruction->opcode())); } Status HandleParameter(HloInstruction* parameter) override { diff --git a/tensorflow/compiler/xla/service/hlo_lexer.cc b/tensorflow/compiler/xla/service/hlo_lexer.cc index 0e49d343d6..5b23ee7d00 100644 --- a/tensorflow/compiler/xla/service/hlo_lexer.cc +++ b/tensorflow/compiler/xla/service/hlo_lexer.cc @@ -306,8 +306,7 @@ TokKind HloLexer::LexNumberOrPattern() { R"([-]?((\d+|\d+[.]\d*|\d*[.]\d+)([eE][+-]?\d+))|[-]?(\d+[.]\d*|\d*[.]\d+))"}; if (RE2::Consume(&consumable, *float_pattern)) { current_ptr_ = consumable.begin(); - CHECK(absl::SimpleAtod(string(token_start_, current_ptr_).c_str(), - &decimal_val_)); + CHECK(absl::SimpleAtod(string(token_start_, current_ptr_), &decimal_val_)); return TokKind::kDecimal; } diff --git a/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc b/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc index f52a37bc74..a9c5d48983 100644 --- a/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc +++ b/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc @@ -163,7 +163,7 @@ Status HloModuleGroupMetadata::VerifyCompanionSets() const { ss << " " << hlo->name() << std::endl; } ss << "has multiple instructions on the same device"; - return FailedPrecondition("%s", ss.str().c_str()); + return FailedPrecondition("%s", ss.str()); } } } @@ -411,16 +411,16 @@ Status HloModuleGroupMetadata::AddCompanion(HloInstruction* instruction1, Status HloModuleGroupMetadata::VerifyChannelInstructions() { for (const Channel& channel : channels_) { if (channel.send == nullptr) { - return FailedPrecondition("missing send for id : %lld", channel.id); + return FailedPrecondition("missing send for id : %d", channel.id); } if (channel.recv == nullptr) { - return FailedPrecondition("missing recv for id : %lld", channel.id); + return FailedPrecondition("missing recv for id : %d", channel.id); } if (channel.send_done == nullptr) { - 
return FailedPrecondition("missing send-done for id : %lld", channel.id); + return FailedPrecondition("missing send-done for id : %d", channel.id); } if (channel.recv_done == nullptr) { - return FailedPrecondition("missing recv-done for id : %lld", channel.id); + return FailedPrecondition("missing recv-done for id : %d", channel.id); } } @@ -436,33 +436,33 @@ Status HloModuleGroupMetadata::VerifyChannelInstructions() { auto send_done_device = GetInstructionDevice(*channel.send_done); if (!send_device) { return FailedPrecondition("send instruction must have a device: %s", - channel.send->ToString().c_str()); + channel.send->ToString()); } if (!send_done_device) { return FailedPrecondition("send_done instruction must have a device: %s", - channel.send_done->ToString().c_str()); + channel.send_done->ToString()); } if (*send_device != *send_done_device) { return FailedPrecondition( - "send and send-done (channel=%lld) must be on the same device: %lld " - "vs. %lld", + "send and send-done (channel=%d) must be on the same device: %d " + "vs. %d", channel.id, *send_device, *send_done_device); } auto recv_device = GetInstructionDevice(*channel.recv); auto recv_done_device = GetInstructionDevice(*channel.recv_done); if (!recv_done_device) { return FailedPrecondition("recv_done instruction must have a device: %s", - channel.recv_done->ToString().c_str()); + channel.recv_done->ToString()); } if (*recv_device != *recv_done_device) { return FailedPrecondition( - "recv and recv-done (channel=%lld) must be on the same device: %lld " - "vs. %lld", + "recv and recv-done (channel=%d) must be on the same device: %d " + "vs. 
%d", channel.id, *recv_device, *recv_done_device); } if (*send_device == *recv_device) { return FailedPrecondition( - "send and recv (channel=%lld) must be on different devices: %lld", + "send and recv (channel=%d) must be on different devices: %d", channel.id, *send_device); } } @@ -483,7 +483,7 @@ Status HloModuleGroupMetadata::VerifyChannelInstructions() { !CheckCompanionPathsCompatibility( path, GetCompanionsPath(channel.recv_done))) { return FailedPrecondition( - "Nest companion paths do not match for channel %lld", channel.id); + "Nest companion paths do not match for channel %d", channel.id); } } return Status::OK(); diff --git a/tensorflow/compiler/xla/service/hlo_module_group_util.cc b/tensorflow/compiler/xla/service/hlo_module_group_util.cc index b5c7681edd..d70328c8a3 100644 --- a/tensorflow/compiler/xla/service/hlo_module_group_util.cc +++ b/tensorflow/compiler/xla/service/hlo_module_group_util.cc @@ -282,7 +282,7 @@ Status HloModuleGroupUtil::VisitTopologicalOrder( "following nodes. 
Note that the order of the nodes is arbitrary " "and that the list may include nodes that are not part of the " "cycle.\n%s", - predecessor->ToString().c_str(), cyclic_instructions.c_str()); + predecessor->ToString(), cyclic_instructions); } stack.push(predecessor); } diff --git a/tensorflow/compiler/xla/service/hlo_opcode.cc b/tensorflow/compiler/xla/service/hlo_opcode.cc index d1eaf35785..2d4e38589f 100644 --- a/tensorflow/compiler/xla/service/hlo_opcode.cc +++ b/tensorflow/compiler/xla/service/hlo_opcode.cc @@ -39,7 +39,7 @@ StatusOr StringToHloOpcode(const string& opcode_name) { }); auto it = opcode_map->find(opcode_name); if (it == opcode_map->end()) { - return InvalidArgument("Unknown opcode: %s", opcode_name.c_str()); + return InvalidArgument("Unknown opcode: %s", opcode_name); } return it->second; } diff --git a/tensorflow/compiler/xla/service/hlo_ordering.cc b/tensorflow/compiler/xla/service/hlo_ordering.cc index 8fe91c7278..0581d5c404 100644 --- a/tensorflow/compiler/xla/service/hlo_ordering.cc +++ b/tensorflow/compiler/xla/service/hlo_ordering.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include "absl/strings/str_format.h" #include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -26,7 +27,6 @@ limitations under the License. 
#include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" namespace xla { @@ -306,17 +306,15 @@ string PredecessorHloOrdering::ToStringHelper(const string& name) const { std::vector pieces; pieces.push_back(name); for (auto* computation : module_->MakeNonfusionComputations()) { - pieces.push_back(tensorflow::strings::Printf("computation %s:", - computation->name().c_str())); + pieces.push_back(absl::StrFormat("computation %s:", computation->name())); const auto all = computation->MakeInstructionPostOrder(); for (auto instruction : all) { - pieces.push_back(tensorflow::strings::Printf( - " %s predecessors:", instruction->name().c_str())); + pieces.push_back( + absl::StrFormat(" %s predecessors:", instruction->name())); for (auto predecessor : all) { if (predecessors_.at(computation) ->IsReachable(predecessor, instruction)) { - pieces.push_back( - tensorflow::strings::Printf(" %s", predecessor->name().c_str())); + pieces.push_back(absl::StrFormat(" %s", predecessor->name())); } } } @@ -372,8 +370,8 @@ string SequentialHloOrdering::ToString() const { std::vector pieces; pieces.push_back("SequentialHloOrdering"); for (auto* computation : module_->computations()) { - pieces.push_back(tensorflow::strings::Printf("computation %s order:", - computation->name().c_str())); + pieces.push_back( + absl::StrFormat("computation %s order:", computation->name())); // Gather all instructions in the module sequence for this computation and // sort them by their position. 
std::vector instructions; @@ -388,8 +386,7 @@ string SequentialHloOrdering::ToString() const { return order_position_.at(a) < order_position_.at(b); }); for (auto instruction : instructions) { - pieces.push_back( - tensorflow::strings::Printf(" %s", instruction->name().c_str())); + pieces.push_back(absl::StrFormat(" %s", instruction->name())); } } return absl::StrJoin(pieces, "\n"); diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc index df789e6222..ba0f07dd14 100644 --- a/tensorflow/compiler/xla/service/hlo_parser.cc +++ b/tensorflow/compiler/xla/service/hlo_parser.cc @@ -18,6 +18,7 @@ limitations under the License. #include "absl/algorithm/container.h" #include "absl/memory/memory.h" #include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "absl/strings/str_join.h" #include "absl/strings/str_split.h" #include "tensorflow/compiler/xla/literal.h" @@ -29,7 +30,6 @@ limitations under the License. #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/gtl/map_util.h" -#include "tensorflow/core/lib/strings/stringprintf.h" namespace xla { @@ -39,8 +39,8 @@ using absl::nullopt; using absl::optional; using absl::StrAppend; using absl::StrCat; +using absl::StrFormat; using absl::StrJoin; -using ::tensorflow::strings::Printf; const double kF16max = 65504; @@ -1586,8 +1586,7 @@ bool HloParser::ParseInstructionNames( } std::pair* instr = FindInstruction(name); if (!instr) { - return TokenError( - Printf("instruction '%s' is not defined", name.c_str())); + return TokenError(StrFormat("instruction '%s' is not defined", name)); } instructions->push_back(instr->first); } while (EatIfPresent(TokKind::kComma)); @@ -1829,17 +1828,17 @@ bool HloParser::ParseDenseLiteral(std::unique_ptr* literal, case TokKind::kLbrace: { nest_level++; if (nest_level > rank) { - return TokenError(Printf( - "expects nested array in rank %lld, but sees larger", 
rank)); + return TokenError(absl::StrFormat( + "expects nested array in rank %d, but sees larger", rank)); } if (nest_level > 1) { elems_seen_per_dim[nest_level - 2]++; if (elems_seen_per_dim[nest_level - 2] > shape.dimensions(nest_level - 2)) { - return TokenError(Printf( - "expects %lld elements in the %sth element, but sees more", + return TokenError(absl::StrFormat( + "expects %d elements in the %sth element, but sees more", shape.dimensions(nest_level - 2), - get_index_str(nest_level - 2).c_str())); + get_index_str(nest_level - 2))); } } lexer_.Lex(); @@ -1848,9 +1847,9 @@ bool HloParser::ParseDenseLiteral(std::unique_ptr* literal, case TokKind::kRbrace: { nest_level--; if (elems_seen_per_dim[nest_level] != shape.dimensions(nest_level)) { - return TokenError(Printf( - "expects %lld elements in the %sth element, but sees %lld", - shape.dimensions(nest_level), get_index_str(nest_level).c_str(), + return TokenError(absl::StrFormat( + "expects %d elements in the %sth element, but sees %d", + shape.dimensions(nest_level), get_index_str(nest_level), elems_seen_per_dim[nest_level])); } elems_seen_per_dim[nest_level] = 0; @@ -1871,15 +1870,15 @@ bool HloParser::ParseDenseLiteral(std::unique_ptr* literal, if (rank > 0) { if (nest_level != rank) { return TokenError( - Printf("expects nested array in rank %lld, but sees %lld", rank, - nest_level)); + absl::StrFormat("expects nested array in rank %d, but sees %d", + rank, nest_level)); } elems_seen_per_dim[rank - 1]++; if (elems_seen_per_dim[rank - 1] > shape.dimensions(rank - 1)) { - return TokenError( - Printf("expects %lld elements on the minor-most dimension, but " - "sees more", - shape.dimensions(rank - 1))); + return TokenError(absl::StrFormat( + "expects %d elements on the minor-most dimension, but " + "sees more", + shape.dimensions(rank - 1))); } } if (lexer_.GetKind() == TokKind::kw_true || @@ -2135,8 +2134,8 @@ bool HloParser::ParseSubAttributes( for (const auto& attr_it : attrs) { if (attr_it.second.required 
&& seen_attrs.find(attr_it.first) == seen_attrs.end()) { - return Error(loc, Printf("sub-attribute %s is expected but not seen", - attr_it.first.c_str())); + return Error(loc, StrFormat("sub-attribute %s is expected but not seen", + attr_it.first)); } } return ParseToken(TokKind::kRbrace, "expects '}' to end sub attributes"); @@ -2156,8 +2155,8 @@ bool HloParser::ParseAttributes( for (const auto& attr_it : attrs) { if (attr_it.second.required && seen_attrs.find(attr_it.first) == seen_attrs.end()) { - return Error(loc, Printf("attribute %s is expected but not seen", - attr_it.first.c_str())); + return Error(loc, StrFormat("attribute %s is expected but not seen", + attr_it.first)); } } return true; @@ -2173,7 +2172,7 @@ bool HloParser::ParseAttributeHelper( } VLOG(1) << "Parsing attribute " << name; if (!seen_attrs->insert(name).second) { - return Error(loc, Printf("attribute %s already exists", name.c_str())); + return Error(loc, StrFormat("attribute %s already exists", name)); } auto attr_it = attrs.find(name); if (attr_it == attrs.end()) { @@ -2188,8 +2187,8 @@ bool HloParser::ParseAttributeHelper( StrAppend(out, kv.first); })); } - return Error(loc, Printf("unexpected attribute \"%s\". %s", name.c_str(), - allowed_attrs.c_str())); + return Error(loc, StrFormat("unexpected attribute \"%s\". 
%s", name, + allowed_attrs)); } AttrTy attr_type = attr_it->second.attr_type; void* attr_out_ptr = attr_it->second.result; @@ -2384,7 +2383,7 @@ bool HloParser::ParseAttributeHelper( } }(); if (!success) { - return Error(loc, Printf("error parsing attribute %s", name.c_str())); + return Error(loc, StrFormat("error parsing attribute %s", name)); } return true; } @@ -2548,7 +2547,7 @@ bool HloParser::ParseConvolutionDimensionNumbers( dnums->set_input_spatial_dimensions(c - '0', i); } else { return TokenError( - Printf("expects [0-%lldbf] in lhs dimension numbers", rank - 1)); + StrFormat("expects [0-%dbf] in lhs dimension numbers", rank - 1)); } } } @@ -2571,7 +2570,7 @@ bool HloParser::ParseConvolutionDimensionNumbers( dnums->set_kernel_spatial_dimensions(c - '0', i); } else { return TokenError( - Printf("expects [0-%lldio] in rhs dimension numbers", rank - 1)); + StrFormat("expects [0-%dio] in rhs dimension numbers", rank - 1)); } } } @@ -2593,8 +2592,8 @@ bool HloParser::ParseConvolutionDimensionNumbers( } else if (c < '0' + rank && c >= '0') { dnums->set_output_spatial_dimensions(c - '0', i); } else { - return TokenError( - Printf("expects [0-%lldbf] in output dimension numbers", rank - 1)); + return TokenError(StrFormat( + "expects [0-%dbf] in output dimension numbers", rank - 1)); } } } @@ -2640,9 +2639,10 @@ bool HloParser::ParseSliceRanges(SliceRanges* result) { } const auto& range = ranges.back(); if (range.size() != 2 && range.size() != 3) { - return Error(loc, Printf("expects [start:limit:step] or [start:limit], " - "but sees %ld elements.", - range.size())); + return Error(loc, + StrFormat("expects [start:limit:step] or [start:limit], " + "but sees %d elements.", + range.size())); } } while (EatIfPresent(TokKind::kComma)); @@ -2828,14 +2828,13 @@ bool HloParser::ParseDxD(const string& name, std::vector* result) { LocTy loc = lexer_.GetLoc(); if (!result->empty()) { - return Error(loc, - Printf("sub-attribute '%s=' already exists", name.c_str())); + return 
Error(loc, StrFormat("sub-attribute '%s=' already exists", name)); } // 1D if (lexer_.GetKind() == TokKind::kInt) { tensorflow::int64 number; if (!ParseInt64(&number)) { - return Error(loc, Printf("expects sub-attribute '%s=i'", name.c_str())); + return Error(loc, StrFormat("expects sub-attribute '%s=i'", name)); } result->push_back(number); return true; @@ -2844,8 +2843,7 @@ bool HloParser::ParseDxD(const string& name, if (lexer_.GetKind() == TokKind::kDxD) { string str = lexer_.GetStrVal(); if (!SplitToInt64s(str, 'x', result)) { - return Error(loc, - Printf("expects sub-attribute '%s=ixj...'", name.c_str())); + return Error(loc, StrFormat("expects sub-attribute '%s=ixj...'", name)); } lexer_.Lex(); return true; @@ -2940,9 +2938,8 @@ bool HloParser::ParseOpcode(HloOpcode* result) { string val = lexer_.GetStrVal(); auto status_or_result = StringToHloOpcode(val); if (!status_or_result.ok()) { - return TokenError( - Printf("expects opcode but sees: %s, error: %s", val.c_str(), - status_or_result.status().error_message().c_str())); + return TokenError(StrFormat("expects opcode but sees: %s, error: %s", val, + status_or_result.status().error_message())); } *result = status_or_result.ValueOrDie(); lexer_.Lex(); @@ -2956,7 +2953,7 @@ bool HloParser::ParseFftType(FftType* result) { } string val = lexer_.GetStrVal(); if (!FftType_Parse(val, result) || !FftType_IsValid(*result)) { - return TokenError(Printf("expects fft type but sees: %s", val.c_str())); + return TokenError(StrFormat("expects fft type but sees: %s", val)); } lexer_.Lex(); return true; @@ -2970,9 +2967,9 @@ bool HloParser::ParseFusionKind(HloInstruction::FusionKind* result) { string val = lexer_.GetStrVal(); auto status_or_result = StringToFusionKind(val); if (!status_or_result.ok()) { - return TokenError( - Printf("expects fusion kind but sees: %s, error: %s", val.c_str(), - status_or_result.status().error_message().c_str())); + return TokenError(StrFormat("expects fusion kind but sees: %s, error: %s", + 
val, + status_or_result.status().error_message())); } *result = status_or_result.ValueOrDie(); lexer_.Lex(); @@ -2988,8 +2985,8 @@ bool HloParser::ParseRandomDistribution(RandomDistribution* result) { auto status_or_result = StringToRandomDistribution(val); if (!status_or_result.ok()) { return TokenError( - Printf("expects random distribution but sees: %s, error: %s", - val.c_str(), status_or_result.status().error_message().c_str())); + StrFormat("expects random distribution but sees: %s, error: %s", val, + status_or_result.status().error_message())); } *result = status_or_result.ValueOrDie(); lexer_.Lex(); @@ -3004,9 +3001,9 @@ bool HloParser::ParsePrecision(PrecisionConfigProto::Precision* result) { string val = lexer_.GetStrVal(); auto status_or_result = StringToPrecision(val); if (!status_or_result.ok()) { - return TokenError( - Printf("expects precision but sees: %s, error: %s", val.c_str(), - status_or_result.status().error_message().c_str())); + return TokenError(StrFormat("expects precision but sees: %s, error: %s", + val, + status_or_result.status().error_message())); } *result = status_or_result.ValueOrDie(); lexer_.Lex(); @@ -3100,7 +3097,7 @@ StatusOr HloParser::ParseShardingOnly() { lexer_.Lex(); OpSharding op_sharding; if (!ParseSharding(&op_sharding)) { - return InvalidArgument("Syntax error:\n%s", GetError().c_str()); + return InvalidArgument("Syntax error:\n%s", GetError()); } if (lexer_.GetKind() != TokKind::kEof) { return InvalidArgument("Syntax error:\nExtra content after sharding"); @@ -3112,7 +3109,7 @@ StatusOr HloParser::ParseWindowOnly() { lexer_.Lex(); Window window; if (!ParseWindow(&window, /*expect_outer_curlies=*/false)) { - return InvalidArgument("Syntax error:\n%s", GetError().c_str()); + return InvalidArgument("Syntax error:\n%s", GetError()); } if (lexer_.GetKind() != TokKind::kEof) { return InvalidArgument("Syntax error:\nExtra content after window"); @@ -3125,7 +3122,7 @@ HloParser::ParseConvolutionDimensionNumbersOnly() { 
lexer_.Lex(); ConvolutionDimensionNumbers dnums; if (!ParseConvolutionDimensionNumbers(&dnums)) { - return InvalidArgument("Syntax error:\n%s", GetError().c_str()); + return InvalidArgument("Syntax error:\n%s", GetError()); } if (lexer_.GetKind() != TokKind::kEof) { return InvalidArgument( @@ -3163,7 +3160,7 @@ Status HloParser::ParseSingleInstruction(HloComputation::Builder* builder, // Parse the instruction with the registered hook. if (!ParseInstruction(builder, root_name)) { - return InvalidArgument("Syntax error:\n%s", GetError().c_str()); + return InvalidArgument("Syntax error:\n%s", GetError()); } return Status::OK(); } @@ -3174,7 +3171,7 @@ StatusOr> ParseHloString( absl::string_view str, const HloModuleConfig& config) { HloParser parser(str, config); if (!parser.Run()) { - return InvalidArgument("Syntax error:\n%s", parser.GetError().c_str()); + return InvalidArgument("Syntax error:\n%s", parser.GetError()); } return parser.ConsumeHloModule(); } diff --git a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc index df99e131d8..de7ad6d209 100644 --- a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc +++ b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc @@ -18,6 +18,7 @@ limitations under the License. 
#include #include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h" #include "tensorflow/compiler/xla/service/hlo_proto_util.h" @@ -48,9 +49,9 @@ void DumpModuleProto(const HloModule& module, const string& dump_to, tensorflow::mutex_lock lock(mu); const int64 pass_number = (*module_id_to_pass_number)[module.unique_id()]++; - const string mod_name = SanitizeFileName(tensorflow::strings::Printf( - "module_%04d.%04lld.%s.after_%s", module.unique_id(), pass_number, - pipeline_name.c_str(), pass_name.c_str())); + const string mod_name = SanitizeFileName( + absl::StrFormat("module_%04d.%04d.%s.after_%s", module.unique_id(), + pass_number, pipeline_name, pass_name)); TF_QCHECK_OK(protobuf_util::DumpProtoToDirectory(MakeHloProto(module), dump_to, mod_name)); diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.cc b/tensorflow/compiler/xla/service/hlo_rematerialization.cc index 6c6e7c6fec..569d2e5d2d 100644 --- a/tensorflow/compiler/xla/service/hlo_rematerialization.cc +++ b/tensorflow/compiler/xla/service/hlo_rematerialization.cc @@ -22,6 +22,7 @@ limitations under the License. #include "absl/container/inlined_vector.h" #include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/map_util.h" #include "tensorflow/compiler/xla/primitive_util.h" @@ -40,7 +41,6 @@ limitations under the License. 
#include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" namespace xla { @@ -1353,12 +1353,11 @@ StatusOr HloRematerialization::Run( XLA_VLOG_LINES(3, "After HloRematerialization:\n" + module->ToString()); if (current_peak_memory > memory_limit_bytes) { - LOG(WARNING) << tensorflow::strings::Printf( - "Can't reduce memory use below %s (%lld bytes) by rematerialization; " - "only reduced to %s (%lld bytes)", - HumanReadableNumBytes(memory_limit_bytes).c_str(), memory_limit_bytes, - HumanReadableNumBytes(current_peak_memory).c_str(), - current_peak_memory); + LOG(WARNING) << absl::StrFormat( + "Can't reduce memory use below %s (%d bytes) by rematerialization; " + "only reduced to %s (%d bytes)", + HumanReadableNumBytes(memory_limit_bytes), memory_limit_bytes, + HumanReadableNumBytes(current_peak_memory), current_peak_memory); } return changed; diff --git a/tensorflow/compiler/xla/service/hlo_scheduling.cc b/tensorflow/compiler/xla/service/hlo_scheduling.cc index 56b14f9fef..0fc3b268c0 100644 --- a/tensorflow/compiler/xla/service/hlo_scheduling.cc +++ b/tensorflow/compiler/xla/service/hlo_scheduling.cc @@ -30,7 +30,6 @@ limitations under the License. 
#include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/gtl/map_util.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" namespace xla { diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc index f60c4eab42..81ffb5ac43 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.cc +++ b/tensorflow/compiler/xla/service/hlo_verifier.cc @@ -128,10 +128,9 @@ Status ShapeVerifier::CheckIsTokenOperand(const HloInstruction* instruction, const HloInstruction* token = instruction->operand(operand_no); if (!ShapeUtil::Equal(token->shape(), ShapeUtil::MakeTokenShape())) { return InternalError( - "Expected operand %lld to be token-shaped, actual shape is " + "Expected operand %d to be token-shaped, actual shape is " "%s:\n%s", - operand_no, StringifyShape(token->shape()).c_str(), - instruction->ToString().c_str()); + operand_no, StringifyShape(token->shape()), instruction->ToString()); } return Status::OK(); } @@ -144,9 +143,8 @@ Status ShapeVerifier::CheckOperandAndParameter( computation->parameter_instruction(parameter_number); if (!ShapesSame(operand->shape(), parameter->shape())) { return InternalError("Operand %s shape does not match parameter's %s in %s", - operand->ToString().c_str(), - parameter->ToString().c_str(), - instruction->ToString().c_str()); + operand->ToString(), parameter->ToString(), + instruction->ToString()); } return Status::OK(); } @@ -171,9 +169,8 @@ Status ShapeVerifier::HandleOutfeed(HloInstruction* instruction) { return InternalError( "Expected outfeed shape to be equal to operand's shape %s, " "actual shape is %s:\n%s", - StringifyShape(outfeed->operand(0)->shape()).c_str(), - StringifyShape(outfeed->outfeed_shape()).c_str(), - outfeed->ToString().c_str()); + StringifyShape(outfeed->operand(0)->shape()), + StringifyShape(outfeed->outfeed_shape()), outfeed->ToString()); } return 
CheckShape(outfeed, ShapeUtil::MakeTokenShape()); } @@ -191,7 +188,7 @@ bool ShapeVerifier::HasCompatibleElementTypes(const Shape& shape_0, Status ShapeVerifier::HandleRng(HloInstruction* instruction) { if (instruction->operand_count() != 2) { return InternalError("Expected two operands for Rng instruction: %s", - instruction->ToString().c_str()); + instruction->ToString()); } const Shape& shape_0 = instruction->operand(0)->shape(); @@ -199,14 +196,14 @@ Status ShapeVerifier::HandleRng(HloInstruction* instruction) { if (!ShapeUtil::IsScalar(shape_0) || !ShapeUtil::IsScalar(shape_1)) { return InternalError( "Expected scalar types for the two operands of Rng instruction: %s", - instruction->ToString().c_str()); + instruction->ToString()); } if (!HasCompatibleElementTypes(shape_0, shape_1, instruction->shape())) { return InternalError( "Expected compatible element types for the result and the two operands" " of Rng instruction: %s", - instruction->ToString().c_str()); + instruction->ToString()); } PrimitiveType element_type = shape_0.element_type(); @@ -219,7 +216,7 @@ Status ShapeVerifier::HandleRng(HloInstruction* instruction) { "Element type not supported." " Expected element to be of floating point type, integral type or" " predicate type for RngUniform: %s", - instruction->ToString().c_str()); + instruction->ToString()); } break; @@ -228,13 +225,13 @@ Status ShapeVerifier::HandleRng(HloInstruction* instruction) { return InternalError( "Element type not supported." 
" Expected element to be FloatingPointType for RngNormal: %s", - instruction->ToString().c_str()); + instruction->ToString()); } break; default: return InternalError( "Invalid Rng distribution %s", - RandomDistribution_Name(instruction->random_distribution()).c_str()); + RandomDistribution_Name(instruction->random_distribution())); } return Status::OK(); @@ -253,8 +250,8 @@ Status ShapeVerifier::HandleSort(HloInstruction* sort) { return InternalError( "Expected sort to have to have the same dimensions for the keys and " "the values. Keys shape is: %s\n, Values shape is: %s", - StringifyShape(sort->operand(0)->shape()).c_str(), - StringifyShape(sort->operand(1)->shape()).c_str()); + StringifyShape(sort->operand(0)->shape()), + StringifyShape(sort->operand(1)->shape())); } return CheckVariadicShape(sort); } @@ -333,7 +330,7 @@ Status ShapeVerifier::HandleFusion(HloInstruction* fusion) { int64 param_no = fused_param->parameter_number(); if (!ShapesSame(fused_param->shape(), fusion->operand(param_no)->shape())) { return InternalError( - "Shape mismatch between parameter number %lld and its operand in " + "Shape mismatch between parameter number %d and its operand in " "%s.", param_no, fusion->ToString().c_str()); } @@ -425,7 +422,7 @@ Status ShapeVerifier::HandleWhile(HloInstruction* xla_while) { return InternalError( "Conditional computation shape does not lead to a scalar predicate " "shape: %s", - StringifyShape(conditional_shape).c_str()); + StringifyShape(conditional_shape)); } // The shape of kWhile should match the shape of the body computation it // calls. 
@@ -556,7 +553,7 @@ Status CheckMixedPrecisionOperands(const HloInstruction* instruction) { return InternalError( "Seen floating point types of different precisions in " "%s, but mixed precision is disallowed.", - instruction->ToString().c_str()); + instruction->ToString()); } return Status::OK(); })); @@ -646,9 +643,8 @@ Status ShapeVerifier::CheckShape(const HloInstruction* instruction, return InternalError( "Expected instruction to have shape equal to %s, actual " "shape is %s:\n%s", - StringifyShape(inferred_shape).c_str(), - StringifyShape(instruction->shape()).c_str(), - instruction->ToString().c_str()); + StringifyShape(inferred_shape), StringifyShape(instruction->shape()), + instruction->ToString()); } return Status::OK(); } @@ -713,23 +709,23 @@ Status VerifyHloStructure(HloModule* module) { for (const HloComputation* computation : module->computations()) { if (computation->parent() == nullptr) { return InternalError("Computation %s has a null parent pointer", - computation->name().c_str()); + computation->name()); } if (computation->parent() != module) { return InternalError( "Computation %s parent() does not point to parent module", - computation->name().c_str()); + computation->name()); } for (const HloInstruction* instruction : computation->instructions()) { if (instruction->parent() == nullptr) { return InternalError("Instruction %s has a null parent pointer", - instruction->name().c_str()); + instruction->name()); } if (instruction->parent() != computation) { return InternalError( "Instruction %s parent() does not point to parent computation", - instruction->name().c_str()); + instruction->name()); } } } @@ -746,9 +742,8 @@ Status VerifyHloStructure(HloModule* module) { return InternalError( "Operand %d (%s) of instruction %s is in a different " "computation: %s vs %s", - i, operand->name().c_str(), instruction->name().c_str(), - operand->parent()->name().c_str(), - instruction->parent()->name().c_str()); + i, operand->name(), instruction->name(), + 
operand->parent()->name(), instruction->parent()->name()); } } } @@ -764,7 +759,7 @@ Status HloVerifier::CheckFusionInstruction(HloInstruction* fusion) const { "Instruction of fused computation does not match expected " "instruction " "%s.", - fusion->ToString().c_str()); + fusion->ToString()); } // Fused root instruction and fused parameters must all be owned by the @@ -778,7 +773,7 @@ Status HloVerifier::CheckFusionInstruction(HloInstruction* fusion) const { if (fused_root == instruction) { if (root_owned) { return InternalError("Root appears more than once in %s.", - fusion->ToString().c_str()); + fusion->ToString()); } root_owned = true; } @@ -786,7 +781,7 @@ Status HloVerifier::CheckFusionInstruction(HloInstruction* fusion) const { if (fused_parameters[i] == instruction) { if (parameter_owned[i]) { return InternalError("Parameter appears more than once in %s.", - fusion->ToString().c_str()); + fusion->ToString()); } parameter_owned[i] = true; } @@ -794,20 +789,19 @@ Status HloVerifier::CheckFusionInstruction(HloInstruction* fusion) const { } if (!root_owned) { return InternalError("Root not found in computation of %s.", - fusion->ToString().c_str()); + fusion->ToString()); } // Make sure all the parameter_owned entries are set for (int i = 0; i < parameter_owned.size(); i++) { if (!parameter_owned[i]) { return InternalError("Parameter %d not found in computation of %s.", i, - fusion->ToString().c_str()); + fusion->ToString()); } } // Fused root must have no users. 
if (fused_root->user_count() != 0) { - return InternalError("Root of %s may not have users.", - fusion->ToString().c_str()); + return InternalError("Root of %s may not have users.", fusion->ToString()); } // All uses of fused instructions must be in the fusion computation, and @@ -817,14 +811,13 @@ Status HloVerifier::CheckFusionInstruction(HloInstruction* fusion) const { if (instruction != fused_root) { if (instruction->user_count() == 0) { return InternalError("Non-root instruction %s in %s must have users.", - instruction->ToString().c_str(), - fusion->ToString().c_str()); + instruction->ToString(), fusion->ToString()); } for (auto& user : instruction->users()) { if (fused_computation != user->parent()) { return InternalError( "Non-root instruction %s in %s may not have external users.", - instruction->ToString().c_str(), fusion->ToString().c_str()); + instruction->ToString(), fusion->ToString()); } } } @@ -837,19 +830,19 @@ Status HloVerifier::CheckFusionInstruction(HloInstruction* fusion) const { for (auto fused_param : fused_parameters) { int64 param_no = fused_param->parameter_number(); if (param_no < 0) { - return InternalError("Unexpected negative parameter number %lld in %s.", - param_no, fusion->ToString().c_str()); + return InternalError("Unexpected negative parameter number %d in %s.", + param_no, fusion->ToString()); } if (param_no >= fused_parameters.size()) { return InternalError( - "Unexpected parameter number %lld in %s: higher then number of " + "Unexpected parameter number %d in %s: higher then number of " "parameters %lu.", - param_no, fusion->ToString().c_str(), fused_parameters.size()); + param_no, fusion->ToString(), fused_parameters.size()); } if (parameter_numbers[param_no]) { return InternalError( - "Did not expect parameter number %lld more than once in %s.", - param_no, fusion->ToString().c_str()); + "Did not expect parameter number %d more than once in %s.", param_no, + fusion->ToString()); } parameter_numbers[param_no] = true; } @@ 
-857,7 +850,7 @@ Status HloVerifier::CheckFusionInstruction(HloInstruction* fusion) const { for (int i = 0; i < parameter_numbers.size(); i++) { if (!parameter_numbers[i]) { return InternalError("Did not see parameter number %d in %s.", i, - fusion->ToString().c_str()); + fusion->ToString()); } } @@ -872,18 +865,18 @@ Status HloVerifier::CheckWhileInstruction(HloInstruction* instruction) { auto* while_body = instruction->while_body(); if (while_cond->num_parameters() != 1) { return FailedPrecondition( - "While condition must have exactly 1 parameter; had %lld : %s", - while_cond->num_parameters(), while_cond->ToString().c_str()); + "While condition must have exactly 1 parameter; had %d : %s", + while_cond->num_parameters(), while_cond->ToString()); } if (while_body->num_parameters() != 1) { return FailedPrecondition( - "While body must have exactly 1 parameter; had %lld : %s", - while_body->num_parameters(), while_body->ToString().c_str()); + "While body must have exactly 1 parameter; had %d : %s", + while_body->num_parameters(), while_body->ToString()); } if (instruction->operand_count() != 1) { return FailedPrecondition( - "While loop must have exactly one operand; had %lld : %s", - instruction->operand_count(), instruction->ToString().c_str()); + "While loop must have exactly one operand; had %d : %s", + instruction->operand_count(), instruction->ToString()); } return Status::OK(); } @@ -891,16 +884,14 @@ Status HloVerifier::CheckWhileInstruction(HloInstruction* instruction) { Status HloVerifier::CheckConditionalInstruction(HloInstruction* instruction) { if (instruction->true_computation()->num_parameters() != 1) { return FailedPrecondition( - "True computation %s of %s must have 1 parameter insted of %lld", - instruction->true_computation()->name().c_str(), - instruction->ToString().c_str(), + "True computation %s of %s must have 1 parameter insted of %d", + instruction->true_computation()->name(), instruction->ToString(), 
instruction->true_computation()->num_parameters()); } if (instruction->false_computation()->num_parameters() != 1) { return FailedPrecondition( - "False computation %s of %s must have 1 parameter insted of %lld", - instruction->false_computation()->name().c_str(), - instruction->ToString().c_str(), + "False computation %s of %s must have 1 parameter insted of %d", + instruction->false_computation()->name(), instruction->ToString(), instruction->false_computation()->num_parameters()); } return Status::OK(); @@ -915,9 +906,9 @@ Status HloVerifier::CheckElementwiseInstruction(HloInstruction* instruction) { "Implicit broadcast is not allowed in HLO." "Found different shapes for instruction %s.\n" "output: %s\noperand: %s\n", - HloOpcodeString(instruction->opcode()).c_str(), - ShapeUtil::HumanString(out_shape).c_str(), - ShapeUtil::HumanString(operand_shape).c_str()); + HloOpcodeString(instruction->opcode()), + ShapeUtil::HumanString(out_shape), + ShapeUtil::HumanString(operand_shape)); } } return Status::OK(); @@ -948,7 +939,7 @@ Status VerifyEntryAndExitShapes(const HloModule& module) { if (ShapeContainsToken(param->shape())) { return InternalError( "Entry parameter %d is or contains a token shape: %s", i, - ShapeUtil::HumanString(param->shape()).c_str()); + ShapeUtil::HumanString(param->shape())); } } return Status::OK(); @@ -960,9 +951,9 @@ Status CheckSameChannel(const HloInstruction* instr1, if (instr1->channel_id() != instr2->channel_id()) { return InternalError( "Expected to have the same channel id, actual channel ids are: %s " - "(%lld), %s (%lld)", - instr1->ToString().c_str(), instr1->channel_id(), - instr2->ToString().c_str(), instr2->channel_id()); + "(%d), %s (%d)", + instr1->ToString(), instr1->channel_id(), instr2->ToString(), + instr2->channel_id()); } return Status::OK(); } @@ -983,7 +974,7 @@ Status CheckSameIsHostTransfer(const HloInstruction* instr1, "Expected instructions to have the same is-host-transfer property: " "%s, " "%s ", - 
instr1->ToString().c_str(), instr2->ToString().c_str()); + instr1->ToString(), instr2->ToString()); } return Status::OK(); } @@ -1000,12 +991,12 @@ Status VerifySendsAndRecvs(const HloModule& module) { host_channels.insert({sendrecv->channel_id(), sendrecv}); if (!it_inserted.second) { return FailedPrecondition( - "Channel %lld is used for multiple host send/recv instructions: " + "Channel %d is used for multiple host send/recv instructions: " "%s " "and " "%s", - sendrecv->channel_id(), sendrecv->ToString().c_str(), - it_inserted.first->second->ToString().c_str()); + sendrecv->channel_id(), sendrecv->ToString(), + it_inserted.first->second->ToString()); } } diff --git a/tensorflow/compiler/xla/service/human_readable_profile_builder.cc b/tensorflow/compiler/xla/service/human_readable_profile_builder.cc index 581b3ce1e0..e76b93107c 100644 --- a/tensorflow/compiler/xla/service/human_readable_profile_builder.cc +++ b/tensorflow/compiler/xla/service/human_readable_profile_builder.cc @@ -15,26 +15,26 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/human_readable_profile_builder.h" #include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/metric_table_report.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/strings/numbers.h" -#include "tensorflow/core/lib/strings/stringprintf.h" namespace xla { using absl::StrAppend; +using absl::StrAppendFormat; using absl::StrCat; -using tensorflow::strings::Appendf; +using absl::StrFormat; using tensorflow::strings::HumanReadableElapsedTime; using tensorflow::strings::HumanReadableNumBytes; -using tensorflow::strings::Printf; string HumanReadableProfileBuilder::ToString() const { string s; - Appendf(&s, "Execution profile for %s: (%s @ f_nom)\n", - computation_name_.c_str(), - HumanReadableElapsedTime(CyclesToSeconds(total_cycles_)).c_str()); + StrAppendFormat(&s, "Execution profile for %s: (%s @ f_nom)\n", + computation_name_, + HumanReadableElapsedTime(CyclesToSeconds(total_cycles_))); int64 cumulative_cycles = 0; auto print_op = [&](const OpInfo& op, bool is_total = false) { @@ -56,7 +56,7 @@ string HumanReadableProfileBuilder::ToString() const { if (op.bytes_accessed > op.cycles) { bytes_per_cycle = StrCat(HumanReadableNumBytes(bpc), "/cycle"); } else { - bytes_per_cycle = Printf("%.3fB/cycle", bpc); + bytes_per_cycle = StrFormat("%.3fB/cycle", bpc); } } @@ -77,27 +77,24 @@ string HumanReadableProfileBuilder::ToString() const { // columns in the output. 
cycles_percent_str = "100.% 100Σ"; } else { - cycles_percent_str = - Printf("%5.2f%% %2.0fΣ", cycles_percent, cumulative_cycles_percent); + cycles_percent_str = StrFormat("%5.2f%% %2.0fΣ", cycles_percent, + cumulative_cycles_percent); } double nsecs = op.cycles / clock_rate_ghz_; - Appendf( + StrAppendFormat( &s, - "%15lld cycles (%s) :: %12.1f usec %22s :: %18s :: %18s :: %14s :: " + "%15d cycles (%s) :: %12.1f usec %22s :: %18s :: %18s :: %14s :: " "%16s :: %s\n", - op.cycles, cycles_percent_str.c_str(), CyclesToMicroseconds(op.cycles), + op.cycles, cycles_percent_str, CyclesToMicroseconds(op.cycles), op.optimal_seconds < 0 ? "" - : Printf("(%12.1f optimal)", op.optimal_seconds * 1e6).c_str(), - op.flop_count <= 0 - ? "" - : HumanReadableNumFlops(op.flop_count, nsecs).c_str(), + : StrFormat("(%12.1f optimal)", op.optimal_seconds * 1e6), + op.flop_count <= 0 ? "" : HumanReadableNumFlops(op.flop_count, nsecs), op.transcendental_count <= 0 ? "" - : HumanReadableNumTranscendentalOps(op.transcendental_count, nsecs) - .c_str(), - bytes_per_sec.c_str(), bytes_per_cycle.c_str(), op.name.c_str()); + : HumanReadableNumTranscendentalOps(op.transcendental_count, nsecs), + bytes_per_sec, bytes_per_cycle, op.name); }; float optimal_seconds_sum = 0.0; diff --git a/tensorflow/compiler/xla/service/interpreter/platform.cc b/tensorflow/compiler/xla/service/interpreter/platform.cc index e57a9b3672..c9b40d3c61 100644 --- a/tensorflow/compiler/xla/service/interpreter/platform.cc +++ b/tensorflow/compiler/xla/service/interpreter/platform.cc @@ -18,13 +18,13 @@ limitations under the License. 
#include #include "absl/memory/memory.h" +#include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/service/interpreter/executor.h" #include "tensorflow/stream_executor/device_options.h" #include "tensorflow/stream_executor/lib/initialize.h" #include "tensorflow/stream_executor/lib/ptr_util.h" #include "tensorflow/stream_executor/lib/status.h" #include "tensorflow/stream_executor/lib/status_macros.h" -#include "tensorflow/stream_executor/lib/stringprintf.h" #include "tensorflow/stream_executor/multi_platform_manager.h" #include "tensorflow/stream_executor/platform.h" @@ -77,9 +77,9 @@ XlaInterpreterPlatform::GetUncachedExecutor( if (!init_status.ok()) { return port::Status{ port::error::INTERNAL, - port::Printf( + absl::StrFormat( "failed initializing StreamExecutor for device ordinal %d: %s", - config.ordinal, init_status.ToString().c_str())}; + config.ordinal, init_status.ToString())}; } return std::move(executor); diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc index 5741864282..75d6d22a48 100644 --- a/tensorflow/compiler/xla/service/layout_assignment.cc +++ b/tensorflow/compiler/xla/service/layout_assignment.cc @@ -28,6 +28,7 @@ limitations under the License. #include "absl/memory/memory.h" #include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/map_util.h" @@ -51,7 +52,6 @@ limitations under the License. 
#include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/gtl/array_slice.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/protobuf.h" @@ -71,9 +71,8 @@ BufferLayoutConstraint::BufferLayoutConstraint(const Layout& layout, } string BufferLayoutConstraint::ToString() const { - return tensorflow::strings::Printf("BufferLayoutConstraint %s: %s", - buffer_->ToString().c_str(), - LayoutUtil::HumanString(layout_).c_str()); + return absl::StrFormat("BufferLayoutConstraint %s: %s", buffer_->ToString(), + LayoutUtil::HumanString(layout_)); } OperandLayoutConstraint::OperandLayoutConstraint( @@ -92,15 +91,14 @@ OperandLayoutConstraint::OperandLayoutConstraint( } string OperandLayoutConstraint::ToString() const { - return tensorflow::strings::Printf( - "OperandLayoutConstraint %s, operand %lld: %s", - instruction_->name().c_str(), operand_no_, - shape_layout_.ToString().c_str()); + return absl::StrFormat("OperandLayoutConstraint %s, operand %d: %s", + instruction_->name(), operand_no_, + shape_layout_.ToString()); } string ResultLayoutConstraint::ToString() const { - return tensorflow::strings::Printf("ResultLayoutConstraint: %s", - shape_layout_.ToString().c_str()); + return absl::StrFormat("ResultLayoutConstraint: %s", + shape_layout_.ToString()); } LayoutConstraints::LayoutConstraints( @@ -168,8 +166,7 @@ Status LayoutConstraints::SetBufferLayout(const Layout& layout, return FailedPrecondition( "Layout of buffer %s cannot be constrained because buffer is not " "array-shaped, has shape: %s", - buffer.ToString().c_str(), - ShapeUtil::HumanString(buffer.shape()).c_str()); + buffer.ToString(), ShapeUtil::HumanString(buffer.shape())); } TF_RETURN_IF_ERROR( LayoutUtil::ValidateLayoutForShape(layout, buffer.shape())); @@ -185,9 +182,8 @@ Status LayoutConstraints::SetBufferLayout(const Layout& layout, return FailedPrecondition( 
"Buffer %s already has the layout constraint %s, cannot add " "incompatible constraint %s", - buffer.ToString().c_str(), - LayoutUtil::HumanString(curr_constraint.layout()).c_str(), - LayoutUtil::HumanString(layout).c_str()); + buffer.ToString(), LayoutUtil::HumanString(curr_constraint.layout()), + LayoutUtil::HumanString(layout)); } iter->second = BufferLayoutConstraint(layout, buffer, mandatory, dfs); } else { @@ -221,11 +217,11 @@ Status LayoutConstraints::SetOperandLayout(const Shape& shape_with_layout, } if (curr_shape_layout->mandatory()) { return FailedPrecondition( - "Operand %lld of instruction %s already has a layout constraint " + "Operand %d of instruction %s already has a layout constraint " "%s, cannot add incompatible constraint %s", - operand_no, instruction->name().c_str(), - curr_shape_layout->shape_layout().ToString().c_str(), - ShapeUtil::HumanStringWithLayout(shape_with_layout).c_str()); + operand_no, instruction->name(), + curr_shape_layout->shape_layout().ToString(), + ShapeUtil::HumanStringWithLayout(shape_with_layout)); } } @@ -234,9 +230,9 @@ Status LayoutConstraints::SetOperandLayout(const Shape& shape_with_layout, // layouts beyond this immediate use and is complicated to handle. 
if (OperandBufferForwarded(instruction, operand_no)) { return FailedPrecondition( - "Cannot constraint layout of operand %lld of instruction %s " + "Cannot constraint layout of operand %d of instruction %s " "because instruction forwards operand's LogicalBuffer(s)", - operand_no, instruction->name().c_str()); + operand_no, instruction->name()); } auto key = std::make_pair(instruction, operand_no); @@ -278,8 +274,8 @@ Status LayoutConstraints::SetResultLayout(const Shape& shape_with_layout, return FailedPrecondition( "Result of computation %s already has the layout constraint %s, " "cannot add incompatible constraint %s", - computation_->name().c_str(), curr_shape_layout->ToString().c_str(), - ShapeUtil::HumanStringWithLayout(shape_with_layout).c_str()); + computation_->name(), curr_shape_layout->ToString(), + ShapeUtil::HumanStringWithLayout(shape_with_layout)); } // New constraint matches existing constraint. Nothing to do. return Status::OK(); @@ -301,9 +297,8 @@ Status LayoutConstraints::SetInstructionLayout( if (!ShapeUtil::Compatible(shape_with_layout, instruction->shape())) { return FailedPrecondition( "Instruction %s of shape %s cannot be assigned incompatible layout %s", - instruction->name().c_str(), - ShapeUtil::HumanString(instruction->shape()).c_str(), - ShapeUtil::HumanStringWithLayout(shape_with_layout).c_str()); + instruction->name(), ShapeUtil::HumanString(instruction->shape()), + ShapeUtil::HumanStringWithLayout(shape_with_layout)); } // Create a BufferLayoutConstraint for each array shape in the output of the @@ -753,7 +748,7 @@ Status CheckParameterLayout(HloInstruction* parameter, return InternalError( "parameter instruction %s does not match layout of computation " "shape: %s", - parameter->ToString().c_str(), parameter_layout.ToString().c_str()); + parameter->ToString(), parameter_layout.ToString()); } return Status::OK(); } @@ -764,8 +759,8 @@ Status CheckConstantLayout(HloInstruction* constant) { constant->shape())) { return InternalError( 
"constant instruction %s does not match the layout of its literal %s", - constant->ToString().c_str(), - ShapeUtil::HumanStringWithLayout(constant->literal().shape()).c_str()); + constant->ToString(), + ShapeUtil::HumanStringWithLayout(constant->literal().shape())); } return Status::OK(); } @@ -898,13 +893,10 @@ Status LayoutAssignment::CheckLayouts(HloModule* module) { return InternalError( "Layout of instruction %s at index {%s} does not match " "source LogicalBuffer %s: %s vs %s", - instruction->name().c_str(), - absl::StrJoin(index, ",").c_str(), - buffer->ToString().c_str(), - ShapeUtil::HumanStringWithLayout(instruction_subshape) - .c_str(), - ShapeUtil::HumanStringWithLayout(buffer->shape()) - .c_str()); + instruction->name(), absl::StrJoin(index, ","), + buffer->ToString(), + ShapeUtil::HumanStringWithLayout(instruction_subshape), + ShapeUtil::HumanStringWithLayout(buffer->shape())); } } } @@ -1375,7 +1367,7 @@ StatusOr InferArrayLayout( // This should not happen because we've assigned layouts to all // instructions preceding this one. return InternalError("LogicalBuffer %s does not have a layout", - source_buffer->ToString().c_str()); + source_buffer->ToString()); } if (first_buffer_layout == nullptr) { @@ -1390,9 +1382,8 @@ StatusOr InferArrayLayout( return FailedPrecondition( "Array at index {%s} in instruction %s aliases buffers %s " "and %s which have different layouts", - absl::StrJoin(index, ",").c_str(), instruction->name().c_str(), - source_buffers[0]->ToString().c_str(), - source_buffer->ToString().c_str()); + absl::StrJoin(index, ","), instruction->name(), + source_buffers[0]->ToString(), source_buffer->ToString()); } } @@ -1560,7 +1551,7 @@ Status LayoutAssignment::ClearComputationLayouts(HloComputation* computation) { // present in the IR before layout assignment is a bug. 
return InternalError( "Unexpected bitcast operation seen during layout assignment: %s.", - instruction->ToString().c_str()); + instruction->ToString()); } if (instruction->opcode() != HloOpcode::kInfeed) { LayoutUtil::ClearLayout(instruction->mutable_shape()); diff --git a/tensorflow/compiler/xla/service/llvm_ir/BUILD b/tensorflow/compiler/xla/service/llvm_ir/BUILD index fc3289f30d..786448ea76 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/BUILD +++ b/tensorflow/compiler/xla/service/llvm_ir/BUILD @@ -125,6 +125,7 @@ cc_library( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", + "@com_google_absl//absl/strings:str_format", "@llvm//:core", ], ) diff --git a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc index 72ede377e1..6d637cad6d 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc @@ -98,7 +98,7 @@ Status FusedIrEmitter::HandleGetTupleElement( return Unimplemented( "GetTupleElement fusion currently only supports" " parameter operands, but found operand: %s", - operand->name().c_str()); + operand->name()); } // Emit code to lookup tuple element pointer, and store it in 'gte_values_'. llvm::Value* tuple_element_ptr = llvm_ir::EmitGetTupleElement( diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc index 978fa5b453..2f6720b042 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc @@ -26,7 +26,6 @@ limitations under the License. 
#include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" namespace xla { diff --git a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc index cf7445804c..1553b4fc91 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc @@ -18,13 +18,13 @@ limitations under the License. #include #include +#include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/types.h" @@ -105,7 +105,7 @@ std::vector LoopEmitter::EmitIndexAndSetExitBasicBlock( std::unique_ptr loop = loop_nest.AddLoop( /*start_index=*/0, /*end_index=*/shape_.dimensions(dimension), - /*suffix=*/tensorflow::strings::Printf("dim.%lld", dimension)); + /*suffix=*/absl::StrFormat("dim.%d", dimension)); array_index[dimension] = loop->GetIndVarValue(); } diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc index ea59adadea..768105d9e1 100644 --- a/tensorflow/compiler/xla/service/local_service.cc +++ b/tensorflow/compiler/xla/service/local_service.cc @@ -21,6 +21,7 @@ limitations under the License. 
#include "absl/memory/memory.h" #include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/client/executable_build_options.h" #include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/execution_options_util.h" @@ -149,7 +150,7 @@ StatusOr> LocalService::CompileExecutable( // Validate incoming layouts. if (argument_layouts.size() != program_shape.parameters_size()) { return InvalidArgument( - "Invalid number of arguments for computation: expected %d, got %zu.", + "Invalid number of arguments for computation: expected %d, got %u.", program_shape.parameters_size(), argument_layouts.size()); } @@ -167,16 +168,15 @@ StatusOr> LocalService::CompileExecutable( CHECK(metadata.value() != nullptr); const OpMetadata& m = *metadata.value(); if (!m.source_file().empty()) { - return tensorflow::strings::Printf( - " (%s:%d)", m.source_file().c_str(), m.source_line()); + return absl::StrFormat(" (%s:%d)", m.source_file(), m.source_line()); } return ""; }; return InvalidArgument( "Invalid argument shape for argument %d%s, expected %s, got %s.", i, - metadata_string().c_str(), - ShapeUtil::HumanString(program_shape.parameters(i)).c_str(), - ShapeUtil::HumanString(argument_shape).c_str()); + metadata_string(), + ShapeUtil::HumanString(program_shape.parameters(i)), + ShapeUtil::HumanString(argument_shape)); } } if (build_options.result_layout() != nullptr) { @@ -214,7 +214,7 @@ StatusOr LocalService::GlobalDataToShapedBuffer( TF_ASSIGN_OR_RETURN(auto buffers, allocation_tracker_.Resolve(data)); if (replica_number >= buffers.size()) { return InvalidArgument( - "replica_number %d out of range; must be less than num_replicas = %zu.", + "replica_number %d out of range; must be less than num_replicas = %u.", replica_number, buffers.size()); } return buffers[replica_number]; diff --git a/tensorflow/compiler/xla/service/platform_util.cc b/tensorflow/compiler/xla/service/platform_util.cc index 
150af0cd93..ae1e13d8a6 100644 --- a/tensorflow/compiler/xla/service/platform_util.cc +++ b/tensorflow/compiler/xla/service/platform_util.cc @@ -98,7 +98,7 @@ PlatformUtil::GetSupportedPlatforms() { [](string* out, const se::Platform* p) { out->append(p->Name()); }); return InvalidArgument( "must specify platform because more than one platform found: %s", - platforms_string.c_str()); + platforms_string); } /* static */ StatusOr PlatformUtil::GetDefaultPlatform() { @@ -123,7 +123,7 @@ PlatformUtil::GetSupportedPlatforms() { return InvalidArgument( "must specify platform because more than one platform (except for the " "interpreter platform) found: %s", - platforms_string.c_str()); + platforms_string); } /*static*/ StatusOr PlatformUtil::GetPlatform( @@ -135,7 +135,7 @@ PlatformUtil::GetSupportedPlatforms() { return platform; } } - return InvalidArgument("platform %s not found", platform_name.c_str()); + return InvalidArgument("platform %s not found", platform_name); } /*static*/ StatusOr PlatformUtil::GetPlatformExceptFor( @@ -151,7 +151,7 @@ PlatformUtil::GetSupportedPlatforms() { } if (matched.empty()) { return InvalidArgument("unable to find platform that is not %s", - platform_name.c_str()); + platform_name); } if (matched.size() == 1) { return matched[0]; @@ -161,7 +161,7 @@ PlatformUtil::GetSupportedPlatforms() { [](string* out, const se::Platform* p) { out->append(p->Name()); }); return InvalidArgument( "found multiple platforms %s, but expected one platform except for %s", - matched_string.c_str(), platform_name.c_str()); + matched_string, platform_name); } // Returns whether the device underlying the given StreamExecutor is supported @@ -192,7 +192,7 @@ static bool IsDeviceSupported(se::StreamExecutor* executor) { PlatformUtil::GetStreamExecutors(se::Platform* platform) { int device_count = platform->VisibleDeviceCount(); if (device_count <= 0) { - return NotFound("no %s devices found", platform->Name().c_str()); + return NotFound("no %s devices found", 
platform->Name()); } if (platform->id() == se::host::kHostPlatformId) { // On host "devices", StreamExecutor exports a device for each hardware @@ -231,7 +231,7 @@ PlatformUtil::GetStreamExecutors(se::Platform* platform) { if (std::all_of(stream_executors.begin(), stream_executors.end(), [](se::StreamExecutor* s) { return s == nullptr; })) { return InternalError("no supported devices found for platform %s", - platform->Name().c_str()); + platform->Name()); } return stream_executors; } diff --git a/tensorflow/compiler/xla/service/scatter_expander.cc b/tensorflow/compiler/xla/service/scatter_expander.cc index 338f0c09e9..2077b57c05 100644 --- a/tensorflow/compiler/xla/service/scatter_expander.cc +++ b/tensorflow/compiler/xla/service/scatter_expander.cc @@ -291,7 +291,7 @@ StatusOr ScatterExpander::ExpandScatter( return Unimplemented( "Scatter operations with more than 2147483647 scatter indices are not " "supported. This error occurred for %s.", - scatter->ToString().c_str()); + scatter->ToString()); } // Canonicalize the scatter_indices, after which the size of its most-major diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc index d39a5191b8..e10c1d9927 100644 --- a/tensorflow/compiler/xla/service/service.cc +++ b/tensorflow/compiler/xla/service/service.cc @@ -22,6 +22,7 @@ limitations under the License. #include "absl/memory/memory.h" #include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/execution_options_util.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" @@ -47,7 +48,6 @@ limitations under the License. 
#include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/gtl/cleanup.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/protobuf.h" @@ -55,13 +55,12 @@ limitations under the License. #include "tensorflow/core/platform/types.h" #include "tensorflow/core/util/ptr_util.h" -using absl::StrCat; -using ::tensorflow::strings::Printf; - namespace xla { - namespace { +using absl::StrCat; +using absl::StrFormat; + // Records the arguments used to invoke a computation in an HloSnapshot proto. Status RecordArguments( const tensorflow::gtl::ArraySlice arguments, @@ -148,19 +147,19 @@ Service::Service(const ServiceOptions& options, CHECK_GE(execute_backend_->device_count(), options_.number_of_replicas()) << "Requested more replicas than there are devices."; } - LOG(INFO) << Printf( + LOG(INFO) << StrFormat( "XLA service %p executing computations on platform %s. 
Devices:", this, - execute_backend_->platform()->Name().c_str()); + execute_backend_->platform()->Name()); for (int i = 0; i < execute_backend_->device_count(); ++i) { if (execute_backend_->device_ordinal_supported(i)) { se::StreamExecutor* executor = execute_backend_->stream_executor(i).ValueOrDie(); const auto& description = executor->GetDeviceDescription(); - LOG(INFO) << Printf(" StreamExecutor device (%d): %s, %s", i, - description.name().c_str(), - description.platform_version().c_str()); + LOG(INFO) << StrFormat(" StreamExecutor device (%d): %s, %s", i, + description.name(), + description.platform_version()); } else { - LOG(INFO) << Printf(" StreamExecutor device (%d) not supported", i); + LOG(INFO) << StrFormat(" StreamExecutor device (%d) not supported", i); } } } else { @@ -200,8 +199,8 @@ Status Service::ValidateResultShape(const Shape& client_shape, return InvalidArgument( "Shape used to set computation result layout %s is not compatible " "with result shape %s", - ShapeUtil::HumanStringWithLayout(client_shape).c_str(), - ShapeUtil::HumanString(result_shape).c_str()); + ShapeUtil::HumanStringWithLayout(client_shape), + ShapeUtil::HumanString(result_shape)); } return Status::OK(); } @@ -231,9 +230,9 @@ Service::ResolveAndValidateArguments( return InvalidArgument( "argument %lu is on device %s:%d but computation will be executed " "on device %s", - i, shaped_buffer->platform()->Name().c_str(), + i, shaped_buffer->platform()->Name(), shaped_buffer->device_ordinal(), - execute_backend_->device_name(replica_device_ordinal).c_str()); + execute_backend_->device_name(replica_device_ordinal)); } replicated_arguments[replica].push_back(shaped_buffer); } @@ -249,7 +248,7 @@ StatusOr> Service::CreateModuleConfig( ComputationLayout* computation_layout = config->mutable_entry_computation_layout(); if (program_shape.parameters_size() != argument_shapes.size()) { - return InvalidArgument("computation takes %d parameters, but %zu given", + return 
InvalidArgument("computation takes %d parameters, but %u given", program_shape.parameters_size(), argument_shapes.size()); } @@ -261,8 +260,8 @@ StatusOr> Service::CreateModuleConfig( return InvalidArgument( "Argument does not match shape of computation parameter %d: want " "%s, got %s", - i, ShapeUtil::HumanString(program_shape.parameters(i)).c_str(), - ShapeUtil::HumanString(*argument_shapes[i]).c_str()); + i, ShapeUtil::HumanString(program_shape.parameters(i)), + ShapeUtil::HumanString(*argument_shapes[i])); } TF_RETURN_IF_ERROR( computation_layout->mutable_parameter_layout(i)->CopyLayoutFromShape( @@ -314,7 +313,7 @@ StatusOr>> Service::BuildExecutables( std::vector> module_configs, Backend* backend, std::vector> executors, DeviceMemoryAllocator* device_allocator) { - VLOG(1) << Printf("BuildExecutable on service %p", this); + VLOG(1) << StrFormat("BuildExecutable on service %p", this); // Dump computation proto state if flag is set. std::vector> hlo_snapshots; @@ -329,9 +328,8 @@ StatusOr>> Service::BuildExecutables( auto hlo_snapshot = absl::make_unique(); *hlo_snapshot->mutable_hlo()->mutable_hlo_module() = *module_protos[i]; if (!directory_path.empty()) { - string filename = - Printf("computation_%lld__%s", module_protos[i]->id(), - module_protos[i]->entry_computation_name().c_str()); + string filename = StrFormat("computation_%d__%s", module_protos[i]->id(), + module_protos[i]->entry_computation_name()); TF_RETURN_IF_ERROR( Executable::DumpToDirectory(directory_path, filename, *hlo_snapshot)); } @@ -454,8 +452,8 @@ Service::ExecuteParallelAndRegisterResult( for (int64 i = 0; i < streams.size(); ++i) { Status block_status = streams[i]->BlockHostUntilDone(); if (!block_status.ok()) { - return InternalError("failed to complete execution for stream %lld: %s", - i, block_status.error_message().c_str()); + return InternalError("failed to complete execution for stream %d: %s", i, + block_status.error_message()); } } @@ -580,7 +578,7 @@ StatusOr> 
Service::GetExecutors( if (requests_size > 1 && execution_options.device_handles_size() > 1) { return InvalidArgument( "Parallel requests with multiple device handles is not supported. " - "Found %lld parallel requests, with request %lld containing %d device " + "Found %d parallel requests, with request %d containing %d device " "handles.", requests_size, request_index, execution_options.device_handles_size()); } @@ -745,8 +743,8 @@ Status Service::GetDeviceHandles(const GetDeviceHandlesRequest* arg, } if (available_device_count < arg->device_count() * replica_count) { return ResourceExhausted( - "Requested device count (%lld) exceeds the number of available devices " - "on the target (%lld)", + "Requested device count (%d) exceeds the number of available devices " + "on the target (%d)", arg->device_count(), available_device_count); } @@ -796,9 +794,9 @@ StatusOr> Service::BuildExecutable( const HloModuleProto& module_proto, std::unique_ptr module_config, Backend* backend, se::StreamExecutor* executor, DeviceMemoryAllocator* device_allocator) { - VLOG(1) << Printf( + VLOG(1) << StrFormat( "BuildExecutable on service %p with serialized module proto: %s", this, - module_proto.name().c_str()); + module_proto.name()); // Dump computation proto state if flag is set. 
auto hlo_snapshot = absl::make_unique(); @@ -809,8 +807,8 @@ StatusOr> Service::BuildExecutable( if (!directory_path.empty() || !execution_directory_path.empty()) { *hlo_snapshot->mutable_hlo()->mutable_hlo_module() = module_proto; if (!directory_path.empty()) { - string filename = Printf("computation_%lld__%s", module_proto.id(), - module_proto.entry_computation_name().c_str()); + string filename = StrFormat("computation_%d__%s", module_proto.id(), + module_proto.entry_computation_name()); TF_RETURN_IF_ERROR( Executable::DumpToDirectory(directory_path, filename, *hlo_snapshot)); } @@ -1010,8 +1008,7 @@ Status Service::TransferToInfeed(const TransferToInfeedRequest* arg, "%s", StrCat("The replica_id=", arg->replica_id(), " on TransferToInfeedRequest not in range [0, replica_count=", - replica_count, ").") - .c_str()); + replica_count, ").")); } se::StreamExecutor* executor; @@ -1037,8 +1034,7 @@ Status Service::TransferFromOutfeed(const TransferFromOutfeedRequest* arg, const int64 replica_count = options_.number_of_replicas(); if (arg->replica_id() < 0 || arg->replica_id() >= replica_count) { return FailedPrecondition( - "The replica_id=%lld on TransferFromOutfeedRequest not in range [0, " - "%lld)", + "The replica_id=%d on TransferFromOutfeedRequest not in range [0, %d)", arg->replica_id(), replica_count); } diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc index 6a22f8bef4..ae6a366d25 100644 --- a/tensorflow/compiler/xla/service/shape_inference.cc +++ b/tensorflow/compiler/xla/service/shape_inference.cc @@ -23,6 +23,7 @@ limitations under the License. #include "absl/algorithm/container.h" #include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "absl/strings/str_join.h" #include "absl/strings/string_view.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -34,15 +35,14 @@ limitations under the License. 
#include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/gtl/flatset.h" #include "tensorflow/core/lib/math/math_util.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/protobuf.h" namespace xla { namespace { +using absl::StrFormat; using absl::StrJoin; -using tensorflow::strings::Printf; // Returns true if no element is present in slice more than once. bool AllUnique(tensorflow::gtl::ArraySlice slice) { @@ -52,8 +52,7 @@ bool AllUnique(tensorflow::gtl::ArraySlice slice) { Status ExpectArray(const Shape& shape, absl::string_view op_type) { if (!ShapeUtil::IsArray(shape)) { return InvalidArgument("Expected array argument for %s, but got %s.", - std::string(op_type).c_str(), - ShapeUtil::HumanString(shape).c_str()); + std::string(op_type), ShapeUtil::HumanString(shape)); } return Status::OK(); } @@ -65,7 +64,7 @@ Status VerifyReducerShape( int64 inputs) { if (reducer_shape.parameters_size() != inputs * 2) { return InvalidArgument( - "Reduction function must take %lld parameters, but " + "Reduction function must take %d parameters, but " "takes %d parameter(s).", inputs * 2, reducer_shape.parameters_size()); } @@ -75,7 +74,7 @@ Status VerifyReducerShape( if (ShapeUtil::IsArray(accumulator_shape)) { if (inputs != 1) { return InvalidArgument( - "Reduction function must produce a tuple with %lld elements, but " + "Reduction function must produce a tuple with %d elements, but " "produces a scalar", inputs); } @@ -83,8 +82,8 @@ Status VerifyReducerShape( } else if (ShapeUtil::IsTuple(accumulator_shape)) { if (ShapeUtil::TupleElementCount(accumulator_shape) != inputs) { return InvalidArgument( - "Reduction function must produce a tuple with %lld elements, but has " - "%lld elements", + "Reduction function must produce a tuple with %d elements, but has " + "%d elements", inputs, ShapeUtil::TupleElementCount(accumulator_shape)); } for (const Shape& element_shape : 
accumulator_shape.tuple_shapes()) { @@ -94,7 +93,7 @@ Status VerifyReducerShape( return InvalidArgument( "Reduction function must produce a scalar or tuple of scalars, but has " "shape: %s", - ShapeUtil::HumanString(accumulator_shape).c_str()); + ShapeUtil::HumanString(accumulator_shape)); } for (const Shape* element_shape : accumulator_subshapes) { @@ -102,7 +101,7 @@ Status VerifyReducerShape( return InvalidArgument( "Reduction function must return a scalar or tuple of scalars but " "returns shape: %s", - ShapeUtil::HumanString(accumulator_shape).c_str()); + ShapeUtil::HumanString(accumulator_shape)); } } @@ -113,19 +112,19 @@ Status VerifyReducerShape( if (!ShapeUtil::Compatible(*accumulator_subshapes[i], reducer_shape.parameters(i))) { return InvalidArgument( - "Reduction function's %lld-th parameter shape differs from the " + "Reduction function's %d-th parameter shape differs from the " "result shape: %s vs %s", - i, ShapeUtil::HumanString(reducer_shape.parameters(i)).c_str(), - ShapeUtil::HumanString(*accumulator_subshapes[i]).c_str()); + i, ShapeUtil::HumanString(reducer_shape.parameters(i)), + ShapeUtil::HumanString(*accumulator_subshapes[i])); } // Check that init_value's shapes are suitable for reducer_shape. if (!ShapeUtil::CompatibleIgnoringFpPrecision(*accumulator_subshapes[i], *init_value_shapes[i])) { return InvalidArgument( - "Reduction function's accumulator shape at index %lld differs from " + "Reduction function's accumulator shape at index %d differs from " "the init_value shape: %s vs %s", - i, ShapeUtil::HumanString(*accumulator_subshapes[i]).c_str(), - ShapeUtil::HumanString(*init_value_shapes[i]).c_str()); + i, ShapeUtil::HumanString(*accumulator_subshapes[i]), + ShapeUtil::HumanString(*init_value_shapes[i])); } // Check that the inputs can be passed in as the non-accumulator arguments. 
const Shape input_element_shape = @@ -133,11 +132,11 @@ Status VerifyReducerShape( if (!ShapeUtil::CompatibleIgnoringFpPrecision( input_element_shape, reducer_shape.parameters(inputs + i))) { return InvalidArgument( - "Reduction function's %lld-th parameter shape differs from the " + "Reduction function's %d-th parameter shape differs from the " "input type element type: %s vs %s", inputs + i, - ShapeUtil::HumanString(reducer_shape.parameters(inputs + i)).c_str(), - ShapeUtil::HumanString(input_element_shape).c_str()); + ShapeUtil::HumanString(reducer_shape.parameters(inputs + i)), + ShapeUtil::HumanString(input_element_shape)); } // Check that the accumulator and inputs to the reducer function match. // If the accumulator is scalar, it must have the same type as the inputs @@ -147,11 +146,11 @@ Status VerifyReducerShape( if (!ShapeUtil::CompatibleIgnoringFpPrecision( *accumulator_subshapes[i], reducer_shape.parameters(inputs + i))) { return InvalidArgument( - "Reduction function's %lld-th parameter shape must " + "Reduction function's %d-th parameter shape must " "match the result shape, but got %s vs %s.", inputs + i, - ShapeUtil::HumanString(reducer_shape.parameters(inputs + i)).c_str(), - ShapeUtil::HumanString(*accumulator_subshapes[i]).c_str()); + ShapeUtil::HumanString(reducer_shape.parameters(inputs + i)), + ShapeUtil::HumanString(*accumulator_subshapes[i])); } } @@ -164,7 +163,7 @@ StatusOr InferWindowOutputShape(const Shape& base_shape, bool allow_negative_padding) { if (window.dimensions_size() != ShapeUtil::Rank(base_shape)) { return InvalidArgument( - "Window has dimension %d but base shape has dimension %lld.", + "Window has dimension %d but base shape has dimension %d.", window.dimensions_size(), ShapeUtil::Rank(base_shape)); } @@ -173,29 +172,29 @@ StatusOr InferWindowOutputShape(const Shape& base_shape, const auto& dim = window.dimensions(i); if (dim.size() <= 0) { return InvalidArgument("Window %s has a non-positive dimension.", - 
window.DebugString().c_str()); + window.DebugString()); } if (dim.stride() <= 0) { return InvalidArgument("Window %s has a non-positive stride.", - window.DebugString().c_str()); + window.DebugString()); } if (!allow_negative_padding && dim.padding_low() < 0) { return InvalidArgument("Window %s has a negative low padding.", - window.DebugString().c_str()); + window.DebugString()); } if (!allow_negative_padding && dim.padding_high() < 0) { return InvalidArgument("Window %s has a negative high padding.", - window.DebugString().c_str()); + window.DebugString()); } if (dim.base_dilation() < 1) { return InvalidArgument( "Window %s has a non-positive base area dilation factor.", - window.DebugString().c_str()); + window.DebugString()); } if (dim.window_dilation() < 1) { return InvalidArgument( "Window %s has a non-positive window dilation factor.", - window.DebugString().c_str()); + window.DebugString()); } const int64 dilated_base = window_util::DilatedBound( @@ -238,8 +237,7 @@ StatusOr InferWindowOutputShape(const Shape& base_shape, return InvalidArgument( "Expected element type in shape to be floating for %s operation; " "got %s.", - HloOpcodeString(opcode).c_str(), - PrimitiveType_Name(shape.element_type()).c_str()); + HloOpcodeString(opcode), PrimitiveType_Name(shape.element_type())); } return shape; case HloOpcode::kCos: @@ -254,8 +252,7 @@ StatusOr InferWindowOutputShape(const Shape& base_shape, return InvalidArgument( "Expected element type in shape to be floating or complex for %s " "operation; got %s.", - HloOpcodeString(opcode).c_str(), - PrimitiveType_Name(shape.element_type()).c_str()); + HloOpcodeString(opcode), PrimitiveType_Name(shape.element_type())); } return shape; case HloOpcode::kReal: @@ -268,8 +265,7 @@ StatusOr InferWindowOutputShape(const Shape& base_shape, return InvalidArgument( "Expected element type in shape to be floating or complex for " "%s operation; got %s.", - HloOpcodeString(opcode).c_str(), - 
PrimitiveType_Name(shape.element_type()).c_str()); + HloOpcodeString(opcode), PrimitiveType_Name(shape.element_type())); } case HloOpcode::kAbs: if (ShapeUtil::ElementIsComplex(shape)) { @@ -281,15 +277,14 @@ StatusOr InferWindowOutputShape(const Shape& base_shape, return InvalidArgument( "Expected element type in shape to be floating or complex for " "%s operation; got %s.", - HloOpcodeString(opcode).c_str(), - PrimitiveType_Name(shape.element_type()).c_str()); + HloOpcodeString(opcode), PrimitiveType_Name(shape.element_type())); } case HloOpcode::kClz: if (!ShapeUtil::ElementIsIntegral(shape)) { return InvalidArgument( "Expected an integral element type in argument to Clz " "operation; got %s.", - PrimitiveType_Name(shape.element_type()).c_str()); + PrimitiveType_Name(shape.element_type())); } return shape; case HloOpcode::kNegate: @@ -299,8 +294,7 @@ StatusOr InferWindowOutputShape(const Shape& base_shape, return InvalidArgument( "Expected element type in shape to be integral, floating or " "complex for %s operation; got %s.", - HloOpcodeString(opcode).c_str(), - PrimitiveType_Name(shape.element_type()).c_str()); + HloOpcodeString(opcode), PrimitiveType_Name(shape.element_type())); } return shape; case HloOpcode::kSign: @@ -309,8 +303,7 @@ StatusOr InferWindowOutputShape(const Shape& base_shape, return InvalidArgument( "Expected element type in shape to be signed or complex for " "%s operation; got %s.", - HloOpcodeString(opcode).c_str(), - PrimitiveType_Name(shape.element_type()).c_str()); + HloOpcodeString(opcode), PrimitiveType_Name(shape.element_type())); } return shape; @@ -320,7 +313,7 @@ StatusOr InferWindowOutputShape(const Shape& base_shape, return InvalidArgument( "Expected pred or an integral element type in argument to Not " "operation; got %s.", - PrimitiveType_Name(shape.element_type()).c_str()); + PrimitiveType_Name(shape.element_type())); } return shape; @@ -330,14 +323,14 @@ StatusOr InferWindowOutputShape(const Shape& base_shape, "Expected 
element type in shape to be floating " "point for IsFinite " "operation; got %s.", - PrimitiveType_Name(shape.element_type()).c_str()); + PrimitiveType_Name(shape.element_type())); } return ShapeUtil::ChangeElementType(shape, PRED); default: return InvalidArgument( "Unknown operation for unary shape inference: \"%s\".", - HloOpcodeString(opcode).c_str()); + HloOpcodeString(opcode)); } } @@ -348,7 +341,7 @@ StatusOr InferWindowOutputShape(const Shape& base_shape, return InvalidArgument("Concatenate expects at least one argument."); } if (dimension < 0 || dimension >= ShapeUtil::Rank(*arg_shapes[0])) { - return InvalidArgument("Concatenate dimension out of bounds: %lld.", + return InvalidArgument("Concatenate dimension out of bounds: %d.", dimension); } const Shape* arg_shape = nullptr; @@ -362,17 +355,16 @@ StatusOr InferWindowOutputShape(const Shape& base_shape, } if (ShapeUtil::Rank(*arg_shape) != ShapeUtil::Rank(*shape)) { return InvalidArgument( - "Cannot concatenate arrays with different ranks: %lld (%s) vs %lld " + "Cannot concatenate arrays with different ranks: %d (%s) vs %d " "(%s).", - ShapeUtil::Rank(*arg_shape), - ShapeUtil::HumanString(*arg_shape).c_str(), ShapeUtil::Rank(*shape), - ShapeUtil::HumanString(*shape).c_str()); + ShapeUtil::Rank(*arg_shape), ShapeUtil::HumanString(*arg_shape), + ShapeUtil::Rank(*shape), ShapeUtil::HumanString(*shape)); } if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(*arg_shape, *shape)) { return InvalidArgument( "Cannot concatenate arrays with different element types: %s vs %s.", - PrimitiveType_Name(arg_shape->element_type()).c_str(), - PrimitiveType_Name(shape->element_type()).c_str()); + PrimitiveType_Name(arg_shape->element_type()), + PrimitiveType_Name(shape->element_type())); } for (int64 dimension_number = 0; dimension_number < ShapeUtil::Rank(*arg_shape); ++dimension_number) { @@ -385,9 +377,9 @@ StatusOr InferWindowOutputShape(const Shape& base_shape, return InvalidArgument( "Cannot concatenate arrays that 
differ in dimensions other than " "the one being concatenated (the other array dimensions must be " - "the same): %s vs %s in dimension %lld.", - ShapeUtil::HumanString(*arg_shape).c_str(), - ShapeUtil::HumanString(*shape).c_str(), dimension); + "the same): %s vs %s in dimension %d.", + ShapeUtil::HumanString(*arg_shape), ShapeUtil::HumanString(*shape), + dimension); } } element_type = ShapeUtil::HigherPrecisionElementType(*shape, *arg_shape); @@ -419,8 +411,8 @@ StatusOr InferWindowOutputShape(const Shape& base_shape, !primitive_util::IsComplexType(new_element_type)) { return Unimplemented( "Conversion from complex to real type %s => %s is not implemented.", - ShapeUtil::HumanString(operand_shape).c_str(), - PrimitiveType_Name(new_element_type).c_str()); + ShapeUtil::HumanString(operand_shape), + PrimitiveType_Name(new_element_type)); } if (!ShapeUtil::IsArray(operand_shape) || !primitive_util::IsArrayType(new_element_type)) { @@ -429,8 +421,8 @@ StatusOr InferWindowOutputShape(const Shape& base_shape, // are valid. For now we just reject them, though. 
return InvalidArgument( "Convert does not allow non-arrays, so cannot convert from %s to %s.", - ShapeUtil::HumanString(operand_shape).c_str(), - PrimitiveType_Name(new_element_type).c_str()); + ShapeUtil::HumanString(operand_shape), + PrimitiveType_Name(new_element_type)); } return ShapeUtil::ChangeElementType(operand_shape, new_element_type); @@ -442,8 +434,8 @@ StatusOr InferWindowOutputShape(const Shape& base_shape, if (primitive_util::IsComplexType(old_element_type) != primitive_util::IsComplexType(new_element_type)) { return InvalidArgument("Conversion from complex to real type %s => %s.", - ShapeUtil::HumanString(operand_shape).c_str(), - PrimitiveType_Name(new_element_type).c_str()); + ShapeUtil::HumanString(operand_shape), + PrimitiveType_Name(new_element_type)); } if (!ShapeUtil::IsArray(operand_shape) || !primitive_util::IsArrayType(new_element_type)) { @@ -452,15 +444,15 @@ StatusOr InferWindowOutputShape(const Shape& base_shape, // are valid. For now we just reject them, though. 
return InvalidArgument( "Cannot convert from or to tuple type; requested conversion: %s => %s.", - ShapeUtil::HumanString(operand_shape).c_str(), - PrimitiveType_Name(new_element_type).c_str()); + ShapeUtil::HumanString(operand_shape), + PrimitiveType_Name(new_element_type)); } if (primitive_util::BitWidth(old_element_type) != primitive_util::BitWidth(new_element_type)) { return InvalidArgument( "Cannot bitcast types with different bit-widths: %s => %s.", - PrimitiveType_Name(old_element_type).c_str(), - PrimitiveType_Name(new_element_type).c_str()); + PrimitiveType_Name(old_element_type), + PrimitiveType_Name(new_element_type)); } return ShapeUtil::ChangeElementType(operand_shape, new_element_type); @@ -473,7 +465,7 @@ StatusOr InferWindowOutputShape(const Shape& base_shape, return InvalidArgument( "Expected element type in shape to be floating point for " "ReducePrecision operation; got %s.", - PrimitiveType_Name(operand_shape.element_type()).c_str()); + PrimitiveType_Name(operand_shape.element_type())); } if (exponent_bits < 1) { // One exponent bit is necessary to distinguish 0 from infinity. Having @@ -505,8 +497,8 @@ StatusOr InferWindowOutputShape(const Shape& base_shape, return InvalidArgument( "The rank of the operand and the padding configuration do not match: " "%s vs %s.", - ShapeUtil::HumanString(operand_shape).c_str(), - padding_config.ShortDebugString().c_str()); + ShapeUtil::HumanString(operand_shape), + padding_config.ShortDebugString()); } if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(operand_shape, padding_value_shape)) { @@ -573,7 +565,7 @@ Status ValidateDotDimensionNumbers( !dims_in_range(ShapeUtil::Rank(rhs), rhs_contracting_dimensions, rhs_batch_dimensions)) { return InvalidArgument("A dimension number is out of range in Dot: %s.", - dimension_numbers.DebugString().c_str()); + dimension_numbers.DebugString()); } // Check that dimension numbers are unique. 
@@ -591,7 +583,7 @@ Status ValidateDotDimensionNumbers( if (!dims_unique(lhs_contracting_dimensions, lhs_batch_dimensions) || !dims_unique(rhs_contracting_dimensions, rhs_batch_dimensions)) { return InvalidArgument("A dimension number is not unique in Dot: %s.", - dimension_numbers.DebugString().c_str()); + dimension_numbers.DebugString()); } // Check that the count of non-contracting-non-batch dimensions is in {0, 1}. @@ -636,14 +628,13 @@ Status ValidateDotDimensionNumbers( TF_RETURN_IF_ERROR(ExpectArray(rhs, "rhs of dot")); auto fail = [lhs, rhs](const string& addendum) -> Status { - string message = tensorflow::strings::Printf( - "Cannot infer shape for dot operation: %s %s.", - ShapeUtil::HumanString(lhs).c_str(), - ShapeUtil::HumanString(rhs).c_str()); + string message = + StrFormat("Cannot infer shape for dot operation: %s %s.", + ShapeUtil::HumanString(lhs), ShapeUtil::HumanString(rhs)); if (!addendum.empty()) { message += " " + addendum; } - return InvalidArgument("%s", message.c_str()); + return InvalidArgument("%s", message); }; // Check if both element types are the same. @@ -739,9 +730,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, } else { return InvalidArgument( "Binary op %s with incompatible shapes: %s and %s.", - HloOpcodeString(operation).c_str(), - ShapeUtil::HumanString(lhs).c_str(), - ShapeUtil::HumanString(rhs).c_str()); + HloOpcodeString(operation), ShapeUtil::HumanString(lhs), + ShapeUtil::HumanString(rhs)); } } return ShapeUtil::MakeShape(ShapeUtil::HigherPrecisionElementType(lhs, rhs), @@ -756,14 +746,14 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, // the user to provide an explicit broadcast dimension in this case. // See b/25177275 for more details. 
return InvalidArgument("Automatic shape inference not supported: %s and %s", - ShapeUtil::HumanString(smaller_shape).c_str(), - ShapeUtil::HumanString(larger_shape).c_str()); + ShapeUtil::HumanString(smaller_shape), + ShapeUtil::HumanString(larger_shape)); } else if (broadcast_dimensions.size() != ShapeUtil::Rank(smaller_shape)) { return InvalidArgument( "Size of broadcast_dimensions has to match lower-rank operand's " "rank; " - " lower-rank operand's rank is %lld, size of broadcast_dimensions is " - "%zu.", + " lower-rank operand's rank is %d, size of broadcast_dimensions is " + "%u.", ShapeUtil::Rank(smaller_shape), broadcast_dimensions.size()); } @@ -813,12 +803,12 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, int64 dimension_to_match = broadcast_dimensions.at(i); if (dimension_to_match < 0) { return InvalidArgument( - "Broadcast dimension number (%lld) cannot be negative.", + "Broadcast dimension number (%d) cannot be negative.", dimension_to_match); } if (dimension_to_match >= larger_shape.dimensions_size()) { return InvalidArgument( - "Broadcast dimension number (%lld) too large; higher-rank " + "Broadcast dimension number (%d) too large; higher-rank " "operand has rank %d.", dimension_to_match, larger_shape.dimensions_size()); } @@ -830,16 +820,16 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, if (small_dimension_size != large_dimension_size && small_dimension_size != 1 && large_dimension_size != 1) { return InvalidArgument( - "Broadcast dimension %d mismatch: %lld != %lld; %s and %s.", i, + "Broadcast dimension %d mismatch: %d != %d; %s and %s.", i, small_dimension_size, large_dimension_size, - ShapeUtil::HumanString(smaller_shape).c_str(), - ShapeUtil::HumanString(larger_shape).c_str()); + ShapeUtil::HumanString(smaller_shape), + ShapeUtil::HumanString(larger_shape)); } // Make sure the broadcast dimensions are listed in a strictly increasing // order. 
if (i > 0 && broadcast_dimensions.at(i - 1) >= dimension_to_match) { return InvalidArgument( - "Broadcast dimensions order is wrong: %lld comes after %lld.", + "Broadcast dimensions order is wrong: %d comes after %d.", dimension_to_match, broadcast_dimensions.at(i - 1)); } @@ -858,8 +848,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(lhs, rhs)) { return InvalidArgument( "Binary op %s with different element types: %s and %s.", - HloOpcodeString(operation).c_str(), ShapeUtil::HumanString(lhs).c_str(), - ShapeUtil::HumanString(rhs).c_str()); + HloOpcodeString(operation), ShapeUtil::HumanString(lhs), + ShapeUtil::HumanString(rhs)); } if (ShapeUtil::Rank(lhs) == ShapeUtil::Rank(rhs)) { @@ -909,11 +899,10 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, /* static */ StatusOr ShapeInference::InferBinaryOpShape( HloOpcode opcode, const Shape& lhs, const Shape& rhs, tensorflow::gtl::ArraySlice broadcast_dimensions) { - VLOG(2) << tensorflow::strings::Printf( + VLOG(2) << StrFormat( "inferring shape for <%s>(%s, %s) with broadcast_dimensions={%s}", - HloOpcodeString(opcode).c_str(), ShapeUtil::HumanString(lhs).c_str(), - ShapeUtil::HumanString(rhs).c_str(), - StrJoin(broadcast_dimensions, ", ").c_str()); + HloOpcodeString(opcode), ShapeUtil::HumanString(lhs), + ShapeUtil::HumanString(rhs), StrJoin(broadcast_dimensions, ", ")); TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(lhs)); TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(rhs)); @@ -942,7 +931,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, return InvalidArgument( "Expected element type in shape to be floating for complex compose " "operation; got %s.", - PrimitiveType_Name(lhs.element_type()).c_str()); + PrimitiveType_Name(lhs.element_type())); } TF_ASSIGN_OR_RETURN(const Shape& shape, InferElementwiseBinaryOpShape(opcode, lhs, rhs, @@ -961,7 +950,7 @@ 
ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, return InvalidArgument( "Expected pred or integral type in argument to and/or operation; " "got %s.", - PrimitiveType_Name(lhs.element_type()).c_str()); + PrimitiveType_Name(lhs.element_type())); } return InferElementwiseBinaryOpShape(opcode, lhs, rhs, broadcast_dimensions); @@ -979,8 +968,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, default: return Unimplemented( "Binary op shape inference: %s; lhs: %s; rhs: %s is not implemented.", - HloOpcodeString(opcode).c_str(), lhs.ShortDebugString().c_str(), - rhs.ShortDebugString().c_str()); + HloOpcodeString(opcode), lhs.ShortDebugString(), + rhs.ShortDebugString()); } } @@ -1003,8 +992,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, case HloOpcode::kTupleSelect: return InferTupleSelectShape(lhs, rhs, ehs); default: - return InvalidArgument("Unknown operation %s.", - HloOpcodeString(opcode).c_str()); + return InvalidArgument("Unknown operation %s.", HloOpcodeString(opcode)); } } @@ -1043,8 +1031,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, return InvalidArgument( "Sort keys and values dimensions must match. 
" "Keys shape is: %s\n, Values shape is: %s", - ShapeUtil::HumanString(*operand_shapes[0]).c_str(), - ShapeUtil::HumanString(*operand_shapes[1]).c_str()); + ShapeUtil::HumanString(*operand_shapes[0]), + ShapeUtil::HumanString(*operand_shapes[1])); } return ShapeUtil::MakeTupleShape( {*operand_shapes[0], *operand_shapes[1]}); @@ -1052,8 +1040,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, return InvalidArgument("Unexpected number of operands for sort"); } default: - return InvalidArgument("Unknown operation %s.", - HloOpcodeString(opcode).c_str()); + return InvalidArgument("Unknown operation %s.", HloOpcodeString(opcode)); } } @@ -1091,7 +1078,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, return InvalidArgument( "Map operation requires all operands to have the same shape; got: " "%s.", - StrJoin(pieces, ", ").c_str()); + StrJoin(pieces, ", ")); } // Check that dimensions.size == arg_shape.dimensions_size() (we currently @@ -1099,7 +1086,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, if (dimensions.size() != arg_shape->dimensions_size()) { return InvalidArgument( "Map applied to a subset of dimensions currently not supported: " - "arg_dimension_size: %d, requested_map_dimensions_size: %zu.", + "arg_dimension_size: %d, requested_map_dimensions_size: %u.", arg_shape->dimensions_size(), dimensions.size()); } @@ -1108,7 +1095,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, if (dimensions[i] != i) { return InvalidArgument( "Map requires monotonically increasing dimension numbers; got: %s.", - StrJoin(dimensions, ", ").c_str()); + StrJoin(dimensions, ", ")); } } @@ -1116,7 +1103,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, if (arg_shapes.size() != to_apply.parameters_size()) { return InvalidArgument( "Map applied function arity must match number of arguments; got: " - "arity: %d, arguments: %zu.", + 
"arity: %d, arguments: %u.", to_apply.parameters_size(), arg_shapes.size()); } @@ -1125,7 +1112,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, if (!ShapeUtil::IsScalar(output_shape)) { return InvalidArgument( "Mapped computation's result has to be a scalar; got: %s.", - ShapeUtil::HumanString(output_shape).c_str()); + ShapeUtil::HumanString(output_shape)); } for (int i = 0; i < to_apply.parameters_size(); ++i) { @@ -1135,7 +1122,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, return InvalidArgument( "Mapped computation's parameter has to be a scalar; " "got parameter %d shape: %s.", - i, ShapeUtil::HumanString(parameter_shape).c_str()); + i, ShapeUtil::HumanString(parameter_shape)); } if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(parameter_shape, @@ -1143,8 +1130,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, return InvalidArgument( "Mapped computation's parameter type has to match argument element " "type; got parameter %d shape: %s, argument shape: %s.", - i, ShapeUtil::HumanString(parameter_shape).c_str(), - ShapeUtil::HumanString(*arg_shape).c_str()); + i, ShapeUtil::HumanString(parameter_shape), + ShapeUtil::HumanString(*arg_shape)); } } @@ -1173,35 +1160,35 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, return InvalidArgument( "Expected feature_index of batch-norm-training to be " "smaller than the rank of operand_shape; " - "got feature_index %lld, and rank %lld.", + "got feature_index %d, and rank %d.", feature_index, ShapeUtil::Rank(operand_shape)); } if (feature_index < 0) { return InvalidArgument( "Expected feature_index of batch-norm-training to " - "be a non-negative number, got %lld.", + "be a non-negative number, got %d.", feature_index); } if (ShapeUtil::Rank(operand_shape) < 1) { return InvalidArgument( "Expected the rank of operand to " - "batch-norm-training to be at least 1; got %lld.", + "batch-norm-training 
to be at least 1; got %d.", ShapeUtil::Rank(operand_shape)); } if (ShapeUtil::Rank(offset_shape) != 1) { return InvalidArgument( "Offset input of batch-norm-training must have" - " rank 1, but has rank %lld.", + " rank 1, but has rank %d.", ShapeUtil::Rank(offset_shape)); } if (ShapeUtil::Rank(scale_shape) != 1) { return InvalidArgument( "Scale input of batch-norm-training must have" - " rank 1, but has rank %lld.", + " rank 1, but has rank %d.", ShapeUtil::Rank(scale_shape)); } @@ -1209,7 +1196,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, return InvalidArgument( "The operand to batch-norm-training must have a floating point " "element type, but the shape is %s.", - PrimitiveType_Name(operand_shape.element_type()).c_str()); + PrimitiveType_Name(operand_shape.element_type())); } if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(offset_shape, @@ -1218,8 +1205,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, "The inputs should have the same element type for batch-norm-training, " "but the shape of offset factor is %s " "and the shape of operand is %s.", - PrimitiveType_Name(offset_shape.element_type()).c_str(), - PrimitiveType_Name(operand_shape.element_type()).c_str()); + PrimitiveType_Name(offset_shape.element_type()), + PrimitiveType_Name(operand_shape.element_type())); } if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(scale_shape, @@ -1228,8 +1215,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, "The inputs should have the same element type for batch-norm-training, " "but the shape of scale factor is %s " "and the shape of operand is %s.", - PrimitiveType_Name(scale_shape.element_type()).c_str(), - PrimitiveType_Name(operand_shape.element_type()).c_str()); + PrimitiveType_Name(scale_shape.element_type()), + PrimitiveType_Name(operand_shape.element_type())); } const int64 feature_count = operand_shape.dimensions(feature_index); @@ -1239,16 +1226,16 @@ 
ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, if (ShapeUtil::GetDimension(offset_shape, 0) != feature_count) { return InvalidArgument( "The size of offset factor should be the same as feature count," - "but the size of offset factor is %lld " - "and the feature count is %lld.", + "but the size of offset factor is %d " + "and the feature count is %d.", ShapeUtil::GetDimension(offset_shape, 0), feature_count); } if (ShapeUtil::GetDimension(scale_shape, 0) != feature_count) { return InvalidArgument( "The size of scale factor should be the same as feature count," - "but the size of scale factor is %lld " - "and the feature count is %lld.", + "but the size of scale factor is %d " + "and the feature count is %d.", ShapeUtil::GetDimension(scale_shape, 0), feature_count); } @@ -1283,35 +1270,35 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, return InvalidArgument( "Expected feature_index of batch-norm-inference to be " "smaller than the rank of operand_shape; " - "got feature_index %lld, and rank %lld.", + "got feature_index %d, and rank %d.", feature_index, ShapeUtil::Rank(operand_shape)); } if (feature_index < 0) { return InvalidArgument( "Expected feature_index of batch-norm-inference to " - "be a non-negative number, got %lld.", + "be a non-negative number, got %d.", feature_index); } if (ShapeUtil::Rank(operand_shape) < 1) { return InvalidArgument( "Expected the rank of operand to " - "batch-norm-inference to be at least 1; got %lld.", + "batch-norm-inference to be at least 1; got %d.", ShapeUtil::Rank(operand_shape)); } if (ShapeUtil::Rank(offset_shape) != 1) { return InvalidArgument( "Offset input of batch-norm-inference must have" - " rank 1, but has rank %lld.", + " rank 1, but has rank %d.", ShapeUtil::Rank(offset_shape)); } if (ShapeUtil::Rank(scale_shape) != 1) { return InvalidArgument( "Scale input of batch-norm-inference must have" - " rank 1, but has rank %lld.", + " rank 1, but has rank %d.", 
ShapeUtil::Rank(scale_shape)); } @@ -1319,7 +1306,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, return InvalidArgument( "The operand to batch-norm-inference must have a floating point " "element type, but the shape is %s.", - PrimitiveType_Name(operand_shape.element_type()).c_str()); + PrimitiveType_Name(operand_shape.element_type())); } if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(offset_shape, @@ -1329,8 +1316,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, "batch-norm-inference, " "but the shape of offset factor is %s " "and the shape of operand is %s.", - PrimitiveType_Name(offset_shape.element_type()).c_str(), - PrimitiveType_Name(operand_shape.element_type()).c_str()); + PrimitiveType_Name(offset_shape.element_type()), + PrimitiveType_Name(operand_shape.element_type())); } if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(scale_shape, @@ -1340,8 +1327,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, "batch-norm-inference, " "but the shape of scale factor is %s " "and the shape of operand is %s.", - PrimitiveType_Name(scale_shape.element_type()).c_str(), - PrimitiveType_Name(operand_shape.element_type()).c_str()); + PrimitiveType_Name(scale_shape.element_type()), + PrimitiveType_Name(operand_shape.element_type())); } if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(mean_shape, @@ -1351,8 +1338,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, "batch-norm-inference, " "but the shape of mean is %s " "and the shape of operand is %s.", - PrimitiveType_Name(mean_shape.element_type()).c_str(), - PrimitiveType_Name(operand_shape.element_type()).c_str()); + PrimitiveType_Name(mean_shape.element_type()), + PrimitiveType_Name(operand_shape.element_type())); } if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(variance_shape, @@ -1362,8 +1349,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, 
"batch-norm-inference, " "but the shape of variance is %s " "and the shape of operand is %s.", - PrimitiveType_Name(mean_shape.element_type()).c_str(), - PrimitiveType_Name(variance_shape.element_type()).c_str()); + PrimitiveType_Name(mean_shape.element_type()), + PrimitiveType_Name(variance_shape.element_type())); } const int64 feature_count = operand_shape.dimensions(feature_index); @@ -1373,32 +1360,32 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, if (ShapeUtil::GetDimension(offset_shape, 0) != feature_count) { return InvalidArgument( "The size of offset factor should be the same as feature count," - "but the size of offset factor is %lld " - "and the feature count is %lld.", + "but the size of offset factor is %d " + "and the feature count is %d.", ShapeUtil::GetDimension(offset_shape, 0), feature_count); } if (ShapeUtil::GetDimension(scale_shape, 0) != feature_count) { return InvalidArgument( "The size of scale factor should be the same as feature count," - "but the size of scale factor is %lld " - "and the feature count is %lld.", + "but the size of scale factor is %d " + "and the feature count is %d.", ShapeUtil::GetDimension(scale_shape, 0), feature_count); } if (ShapeUtil::GetDimension(mean_shape, 0) != feature_count) { return InvalidArgument( "The size of mean should be the same as feature count," - "but the size of mean is %lld " - "and the feature count is %lld.", + "but the size of mean is %d " + "and the feature count is %d.", ShapeUtil::GetDimension(mean_shape, 0), feature_count); } if (ShapeUtil::GetDimension(variance_shape, 0) != feature_count) { return InvalidArgument( "The size of variance should be the same as feature count," - "but the size of variance is %lld " - "and the feature count is %lld.", + "but the size of variance is %d " + "and the feature count is %d.", ShapeUtil::GetDimension(variance_shape, 0), feature_count); } @@ -1428,36 +1415,36 @@ 
ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, return InvalidArgument( "Expected feature_index of batch-norm-grad to be " "smaller than the rank of operand_shape; " - "got feature_index %lld, and rank %lld.", + "got feature_index %d, and rank %d.", feature_index, ShapeUtil::Rank(operand_shape)); } if (ShapeUtil::Rank(operand_shape) != ShapeUtil::Rank(output_grad_shape)) { return InvalidArgument( "Expected operand_shape of batch-norm-grad to have the same rank as" - " output_grad_shape; got rank(oprand_shape) %lld, and" - " rank(output_grad_shape) %lld.", + " output_grad_shape; got rank(oprand_shape) %d, and" + " rank(output_grad_shape) %d.", ShapeUtil::Rank(operand_shape), ShapeUtil::Rank(output_grad_shape)); } if (ShapeUtil::Rank(mean_shape) != 1) { return InvalidArgument( "Mean input of batch-norm-grad must have" - " rank 1, but has rank %lld.", + " rank 1, but has rank %d.", ShapeUtil::Rank(mean_shape)); } if (ShapeUtil::Rank(scale_shape) != 1) { return InvalidArgument( "Scale input of batch-norm-grad must have" - " rank 1, but has rank %lld.", + " rank 1, but has rank %d.", ShapeUtil::Rank(scale_shape)); } if (ShapeUtil::Rank(var_shape) != 1) { return InvalidArgument( "Var input of batch-norm-grad must have" - " rank 1, but has rank %lld.", + " rank 1, but has rank %d.", ShapeUtil::Rank(var_shape)); } @@ -1465,14 +1452,14 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, return InvalidArgument( "The operand to batch-norm-grad must have a floating point " "element type, but the shape is %s.", - PrimitiveType_Name(operand_shape.element_type()).c_str()); + PrimitiveType_Name(operand_shape.element_type())); } if (!ShapeUtil::ElementIsFloating(output_grad_shape)) { return InvalidArgument( "The output_grad to batch-norm-grad must have a floating point " "element type, but the shape is %s.", - PrimitiveType_Name(output_grad_shape.element_type()).c_str()); + PrimitiveType_Name(output_grad_shape.element_type())); 
} if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(output_grad_shape, @@ -1481,8 +1468,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, "The inputs should have the same element type for batch-norm-grad, " "but the element type of output_grad is %s " "and the element type of operand is %s.", - PrimitiveType_Name(output_grad_shape.element_type()).c_str(), - PrimitiveType_Name(operand_shape.element_type()).c_str()); + PrimitiveType_Name(output_grad_shape.element_type()), + PrimitiveType_Name(operand_shape.element_type())); } if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(scale_shape, @@ -1491,8 +1478,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, "The inputs should have the same element type for batch-norm-grad, " "but the element type of scale factor is %s " "and the element type of operand is %s.", - PrimitiveType_Name(scale_shape.element_type()).c_str(), - PrimitiveType_Name(operand_shape.element_type()).c_str()); + PrimitiveType_Name(scale_shape.element_type()), + PrimitiveType_Name(operand_shape.element_type())); } if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(mean_shape, @@ -1501,8 +1488,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, "The inputs should have the same element type for batch-norm-grad, " "but the element type of mean is %s " "and the element type of operand is %s.", - PrimitiveType_Name(mean_shape.element_type()).c_str(), - PrimitiveType_Name(operand_shape.element_type()).c_str()); + PrimitiveType_Name(mean_shape.element_type()), + PrimitiveType_Name(operand_shape.element_type())); } if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(var_shape, @@ -1511,8 +1498,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, "The inputs should have the same element type for batch-norm-grad, " "but the element type of mean is %s " "and the element type of operand is %s.", - 
PrimitiveType_Name(mean_shape.element_type()).c_str(), - PrimitiveType_Name(operand_shape.element_type()).c_str()); + PrimitiveType_Name(mean_shape.element_type()), + PrimitiveType_Name(operand_shape.element_type())); } const int64 feature_count = operand_shape.dimensions(feature_index); @@ -1523,24 +1510,24 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, if (ShapeUtil::GetDimension(mean_shape, 0) != feature_count) { return InvalidArgument( "The size of mean should be the same as feature count," - "but the size of offset factor is %lld " - "and the feature count is %lld.", + "but the size of offset factor is %d " + "and the feature count is %d.", ShapeUtil::GetDimension(mean_shape, 0), feature_count); } if (ShapeUtil::GetDimension(scale_shape, 0) != feature_count) { return InvalidArgument( "The size of scale factor should be the same as feature count," - "but the size of scale factor is %lld " - "and the feature count is %lld.", + "but the size of scale factor is %d " + "and the feature count is %d.", ShapeUtil::GetDimension(scale_shape, 0), feature_count); } if (ShapeUtil::GetDimension(var_shape, 0) != feature_count) { return InvalidArgument( "The size of variance should be the same as feature count," - "but the size of variance is %lld " - "and the feature count is %lld.", + "but the size of variance is %d " + "and the feature count is %d.", ShapeUtil::GetDimension(var_shape, 0), feature_count); } @@ -1550,8 +1537,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, ShapeUtil::GetDimension(output_grad_shape, i)) { return InvalidArgument( "The bounds of operand shape should be the same as output_grad's," - "but the bound of operand_shape at dimension %lld is %lld " - "and the bound of output_grad_shape is %lld.", + "but the bound of operand_shape at dimension %d is %d " + "and the bound of output_grad_shape is %d.", i, ShapeUtil::GetDimension(operand_shape, i), ShapeUtil::GetDimension(output_grad_shape, 
i)); } @@ -1570,15 +1557,14 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(lhs, rhs)) { return InvalidArgument( "Convolution with different element types: %s and %s.", - ShapeUtil::HumanString(lhs).c_str(), - ShapeUtil::HumanString(rhs).c_str()); + ShapeUtil::HumanString(lhs), ShapeUtil::HumanString(rhs)); } if (dnums.input_spatial_dimensions_size() != dnums.kernel_spatial_dimensions_size()) { return InvalidArgument( "Both arguments to convolution must have same number of dimensions.\n" "Window: %s", - window.DebugString().c_str()); + window.DebugString()); } const int num_spatial_dims = dnums.input_spatial_dimensions_size(); @@ -1586,19 +1572,19 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, return InvalidArgument( "Window must have same number of dimensions as dimension numbers.\n" "Window: %s\nDimension numbers: %s.", - window.DebugString().c_str(), dnums.DebugString().c_str()); + window.DebugString(), dnums.DebugString()); } const int num_dims = num_spatial_dims + 2; if (ShapeUtil::Rank(lhs) != num_dims) { return InvalidArgument( "The LHS argument to a convolution should have rank %d; lhs: %s.", - num_dims, ShapeUtil::HumanString(lhs).c_str()); + num_dims, ShapeUtil::HumanString(lhs)); } if (ShapeUtil::Rank(rhs) != num_dims) { return InvalidArgument( "The RHS argument to a convolution should have rank %d; lhs: %s.", - num_dims, ShapeUtil::HumanString(lhs).c_str()); + num_dims, ShapeUtil::HumanString(lhs)); } TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(lhs)); TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(rhs)); @@ -1635,26 +1621,26 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, !std::all_of(output_dnums.begin(), output_dnums.end(), in_range)) { return InvalidArgument( "A dimension number is out of range in convolution: %s.", - dnums.DebugString().c_str()); + dnums.DebugString()); } if 
(input_dnums != expected_dnums) { return InvalidArgument( "Input dimensions of convolution must contain each dimension exactly " "once: %s.", - dnums.DebugString().c_str()); + dnums.DebugString()); } if (window_dnums != expected_dnums) { return InvalidArgument( "Window dimensions of convolution must contain each dimension exactly " "once: %s.", - dnums.DebugString().c_str()); + dnums.DebugString()); } if (output_dnums != expected_dnums) { return InvalidArgument( "Output dimensions of convolution must contain each dimension exactly " "once: %s.", - dnums.DebugString().c_str()); + dnums.DebugString()); } std::vector input_spatial_dims(num_spatial_dims); @@ -1675,13 +1661,13 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, if (input_features != kernel_input_features * feature_group_count) { return InvalidArgument( - "Expected LHS feature dimension (value %lld) to match RHS " - "input feature dimension * feature_group_count (value %lld); " + "Expected LHS feature dimension (value %d) to match RHS " + "input feature dimension * feature_group_count (value %d); " "got (%s, %s)\n" "Dimension numbers: {%s}.", input_features, kernel_input_features * feature_group_count, - ShapeUtil::HumanString(lhs).c_str(), - ShapeUtil::HumanString(rhs).c_str(), dnums.DebugString().c_str()); + ShapeUtil::HumanString(lhs), ShapeUtil::HumanString(rhs), + dnums.DebugString()); } std::vector window_dims(num_spatial_dims); for (int i = 0; i < num_spatial_dims; ++i) { @@ -1693,8 +1679,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, "RHS shape: %s\n\t" "Window: {%s}\n\t" "Dimension numbers: {%s}.", - ShapeUtil::HumanString(rhs).c_str(), window.ShortDebugString().c_str(), - dnums.ShortDebugString().c_str()); + ShapeUtil::HumanString(rhs), window.ShortDebugString(), + dnums.ShortDebugString()); } Shape base_shape = @@ -1720,29 +1706,29 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, const 
tensorflow::gtl::ArraySlice fft_length) { const int64 fft_rank = fft_length.size(); if (fft_rank < 1 || fft_rank > 3) { - return InvalidArgument("FFT only supports ranks 1-3; got %lld.", fft_rank); + return InvalidArgument("FFT only supports ranks 1-3; got %d.", fft_rank); } -#define RET_CHECK_RANK(x) \ - if (x.dimensions_size() < fft_rank) { \ - return InvalidArgument( \ - "FFT of rank %lld requires input of at least " \ - "same rank; got input of rank %d", \ - fft_rank, x.dimensions_size()); \ +#define RET_CHECK_RANK(x) \ + if (x.dimensions_size() < fft_rank) { \ + return InvalidArgument( \ + "FFT of rank %d requires input of at least " \ + "same rank; got input of rank %d", \ + fft_rank, x.dimensions_size()); \ } switch (fft_type) { case FFT: case IFFT: if (in.element_type() != C64) { return InvalidArgument("%s requires C64 input type, found %s.", - FftType_Name(fft_type).c_str(), - PrimitiveType_Name(in.element_type()).c_str()); + FftType_Name(fft_type), + PrimitiveType_Name(in.element_type())); } RET_CHECK_RANK(in); return in; case RFFT: { if (in.element_type() != F32) { return InvalidArgument("RFFT requires F32 input type, found %s.", - PrimitiveType_Name(in.element_type()).c_str()); + PrimitiveType_Name(in.element_type())); } RET_CHECK_RANK(in); for (int i = 0; i < fft_rank; i++) { @@ -1750,7 +1736,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, fft_length[i]) { return InvalidArgument( "RFFT requires innermost dimensions match fft_length but " - "dimension %lld is %lld and should be %lld.", + "dimension %d is %d and should be %d.", in.dimensions_size() - fft_rank + i, in.dimensions(in.dimensions_size() - fft_rank + i), fft_length[i]); @@ -1764,7 +1750,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, case IRFFT: { if (in.element_type() != C64) { return InvalidArgument("IRFFT requires C64 input type, found %s.", - PrimitiveType_Name(in.element_type()).c_str()); + 
PrimitiveType_Name(in.element_type())); } RET_CHECK_RANK(in); Shape result = ShapeUtil::ComplexComponentShape(in); @@ -1773,7 +1759,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, fft_length[i]) { return InvalidArgument( "IRFFT requires all but one innermost dimensions match " - "fft_length, but dimension %lld is %lld and should be %lld.", + "fft_length, but dimension %d is %d and should be %d.", in.dimensions_size() - fft_rank + i, in.dimensions(in.dimensions_size() - fft_rank + i), fft_length[i]); @@ -1783,7 +1769,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, fft_length[fft_rank - 1] / 2 + 1) { return InvalidArgument( "IRFFT requires innermost dimension matches fft_length/2+1, but " - "dimension %d is %lld and should be %lld.", + "dimension %d is %d and should be %d.", in.dimensions_size() - 1, in.dimensions(in.dimensions_size() - 1), fft_length[fft_rank - 1] / 2 + 1); } @@ -1819,18 +1805,18 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, TF_RET_CHECK(split_count > 0); if (split_dimension >= ShapeUtil::Rank(shape) || split_dimension < 0) { return InvalidArgument( - "AllToAll split_dimension %lld is out-of-bounds in shape %s.", - split_dimension, ShapeUtil::HumanString(shape).c_str()); + "AllToAll split_dimension %d is out-of-bounds in shape %s.", + split_dimension, ShapeUtil::HumanString(shape)); } if (concat_dimension >= ShapeUtil::Rank(shape) || concat_dimension < 0) { return InvalidArgument( - "AllToAll concat_dimension %lld is out-of-bounds in shape %s.", - concat_dimension, ShapeUtil::HumanString(shape).c_str()); + "AllToAll concat_dimension %d is out-of-bounds in shape %s.", + concat_dimension, ShapeUtil::HumanString(shape)); } if (shape.dimensions(split_dimension) % split_count != 0) { return InvalidArgument( - "AllToAll split dimension size %lld must be dividable by split_count " - "%lld.", + "AllToAll split dimension size %d must be dividable by split_count 
" + "%d.", shape.dimensions(split_dimension), split_count); } std::vector new_dimensions(shape.dimensions().begin(), @@ -1850,8 +1836,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, return InvalidArgument( "HLO all-to-all has operands with different shapes: the 0th " "operand shape %s, but the %dth operand has shape %s.", - ShapeUtil::HumanString(*operand_shapes[0]).c_str(), i, - ShapeUtil::HumanString(*operand_shapes[i]).c_str()); + ShapeUtil::HumanString(*operand_shapes[0]), i, + ShapeUtil::HumanString(*operand_shapes[i])); } } @@ -1880,9 +1866,9 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, if (!ShapeUtil::SameDimensions(*reduced_args[0], *reduced_args[i])) { return InvalidArgument( "All reduced tensors must have the sime dimension. Tensor 0 has " - "shape %s, Tensor %lld has shape %s", - ShapeUtil::HumanString(*reduced_args[0]).c_str(), i, - ShapeUtil::HumanString(*reduced_args[i]).c_str()); + "shape %s, Tensor %d has shape %s", + ShapeUtil::HumanString(*reduced_args[0]), i, + ShapeUtil::HumanString(*reduced_args[i])); } } @@ -1892,9 +1878,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, const Shape& arg = *reduced_args[0]; for (int64 dimension : dimensions_to_reduce) { if (dimension >= ShapeUtil::Rank(arg) || dimension < 0) { - return InvalidArgument( - "Reducing out-of-bounds dimension %lld in shape %s.", dimension, - ShapeUtil::HumanString(arg).c_str()); + return InvalidArgument("Reducing out-of-bounds dimension %d in shape %s.", + dimension, ShapeUtil::HumanString(arg)); } } @@ -1967,16 +1952,16 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, return InvalidArgument( "Select function's first parameter shape currently must " "match the operand element shape, but got %s vs %s.", - ShapeUtil::HumanString(select_shape.parameters(0)).c_str(), - ShapeUtil::HumanString(operand_element_shape).c_str()); + 
ShapeUtil::HumanString(select_shape.parameters(0)), + ShapeUtil::HumanString(operand_element_shape)); } if (!ShapeUtil::CompatibleIgnoringFpPrecision(operand_element_shape, select_shape.parameters(1))) { return InvalidArgument( "Select function's second parameter shape currently must " "match the operand element shape, but got %s vs %s.", - ShapeUtil::HumanString(select_shape.parameters(1)).c_str(), - ShapeUtil::HumanString(operand_element_shape).c_str()); + ShapeUtil::HumanString(select_shape.parameters(1)), + ShapeUtil::HumanString(operand_element_shape)); } // Check if the scatter function has a proper shape as a reduction. @@ -1994,8 +1979,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, return InvalidArgument( "Source shape does not match the shape of window-reduced operand: " "source(%s), window-reduced operand(%s).", - ShapeUtil::HumanString(source_shape).c_str(), - ShapeUtil::HumanString(window_result_shape).c_str()); + ShapeUtil::HumanString(source_shape), + ShapeUtil::HumanString(window_result_shape)); } return operand_shape; } @@ -2008,29 +1993,27 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, return InvalidArgument( "%s in slice operation; argument shape: %s; starts: {%s}; limits: " "{%s}; strides: {%s}.", - message.c_str(), ShapeUtil::HumanString(arg).c_str(), - StrJoin(starts, ",").c_str(), StrJoin(limits, ",").c_str(), - StrJoin(strides, ",").c_str()); + message, ShapeUtil::HumanString(arg), StrJoin(starts, ","), + StrJoin(limits, ","), StrJoin(strides, ",")); }; TF_RETURN_IF_ERROR(ExpectArray(arg, "operand of slice")); - VLOG(2) << tensorflow::strings::Printf( - "slicing shape %s starts={%s} limits={%s}", - ShapeUtil::HumanString(arg).c_str(), StrJoin(starts, ", ").c_str(), - StrJoin(limits, ", ").c_str()); + VLOG(2) << StrFormat("slicing shape %s starts={%s} limits={%s}", + ShapeUtil::HumanString(arg), StrJoin(starts, ", "), + StrJoin(limits, ", ")); if (starts.size() != limits.size()) 
{ - return error(Printf("slice start and limit sizes differ: %zu vs %zu", - starts.size(), limits.size())); + return error(StrFormat("slice start and limit sizes differ: %u vs %u", + starts.size(), limits.size())); } if (starts.size() != strides.size()) { - return error(Printf("slice start and strides sizes differ: %zu vs %zu", - starts.size(), strides.size())); + return error(StrFormat("slice start and strides sizes differ: %u vs %u", + starts.size(), strides.size())); } if (starts.size() != ShapeUtil::Rank(arg)) { return InvalidArgument( - "Slice index count does not match argument rank: %zu vs %lld.", + "Slice index count does not match argument rank: %u vs %d.", starts.size(), ShapeUtil::Rank(arg)); } @@ -2040,27 +2023,24 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, int64 limit_index = limits[dimension]; int64 stride = strides[dimension]; if (start_index < 0) { - return InvalidArgument("Negative start index to slice: %lld.", - start_index); + return InvalidArgument("Negative start index to slice: %d.", start_index); } if (limit_index > arg.dimensions(dimension)) { return error( - Printf("limit index (%lld) must be less than or equal to dimension " - "size (%lld)", - limit_index, arg.dimensions(dimension))); - } - VLOG(2) << tensorflow::strings::Printf("starts[%lld] = %lld", dimension, - start_index); - VLOG(2) << tensorflow::strings::Printf("limits[%lld] = %lld", dimension, - limit_index); + StrFormat("limit index (%d) must be less than or equal to dimension " + "size (%d)", + limit_index, arg.dimensions(dimension))); + } + VLOG(2) << StrFormat("starts[%d] = %d", dimension, start_index); + VLOG(2) << StrFormat("limits[%d] = %d", dimension, limit_index); if (start_index > limit_index) { return error( - Printf("limit index (%lld) must be greater or equal to " - "start index (%lld) in slice with positive stride", - limit_index, start_index)); + StrFormat("limit index (%d) must be greater or equal to " + "start index (%d) in slice 
with positive stride", + limit_index, start_index)); } if (stride <= 0) { - return InvalidArgument("Stride (%lld) must be positive.", stride); + return InvalidArgument("Stride (%d) must be positive.", stride); } sizes.push_back((limit_index - start_index + stride - 1) / stride); } @@ -2075,15 +2055,14 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, TF_RETURN_IF_ERROR( ExpectArray(start_indices_shape, "start indices of dynamic slice")); - VLOG(2) << tensorflow::strings::Printf( + VLOG(2) << StrFormat( "slicing shape %s at dynamic start_indices %s with slice_sizes={%s}", - ShapeUtil::HumanString(operand_shape).c_str(), - ShapeUtil::HumanString(start_indices_shape).c_str(), - StrJoin(slice_sizes, ", ").c_str()); + ShapeUtil::HumanString(operand_shape), + ShapeUtil::HumanString(start_indices_shape), StrJoin(slice_sizes, ", ")); if (ShapeUtil::Rank(start_indices_shape) != 1) { return InvalidArgument( - "Dynamic slice start indices of rank %lld must be rank1.", + "Dynamic slice start indices of rank %d must be rank1.", ShapeUtil::Rank(start_indices_shape)); } @@ -2095,16 +2074,15 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, const int64 start_num_dims = start_indices_shape.dimensions(0); if (ShapeUtil::Rank(operand_shape) != start_num_dims) { return InvalidArgument( - "Dynamic slice start number of dimensions %lld (%s) must match rank " - "%lld of slice input (%s).", - start_num_dims, ShapeUtil::HumanString(start_indices_shape).c_str(), - ShapeUtil::Rank(operand_shape), - ShapeUtil::HumanString(operand_shape).c_str()); + "Dynamic slice start number of dimensions %d (%s) must match rank " + "%d of slice input (%s).", + start_num_dims, ShapeUtil::HumanString(start_indices_shape), + ShapeUtil::Rank(operand_shape), ShapeUtil::HumanString(operand_shape)); } if (slice_sizes.size() != ShapeUtil::Rank(operand_shape)) { return InvalidArgument( - "Dynamic slice index count does not match argument rank: %zu vs %lld.", 
+ "Dynamic slice index count does not match argument rank: %u vs %d.", slice_sizes.size(), ShapeUtil::Rank(operand_shape)); } @@ -2112,16 +2090,15 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, const int64 input_dim_size = operand_shape.dimensions(dim); const int64 slice_dim_size = slice_sizes[dim]; if (slice_dim_size < 0) { - return InvalidArgument("Negative size index to dynamic slice: %lld.", + return InvalidArgument("Negative size index to dynamic slice: %d.", slice_dim_size); } if (slice_dim_size > input_dim_size) { return InvalidArgument( - "Slice dim size %lld greater than dynamic slice dimension: %lld.", + "Slice dim size %d greater than dynamic slice dimension: %d.", slice_dim_size, input_dim_size); } - VLOG(2) << tensorflow::strings::Printf("slice_sizes[%lld] = %lld", dim, - slice_dim_size); + VLOG(2) << StrFormat("slice_sizes[%d] = %d", dim, slice_dim_size); } return ShapeUtil::MakeShape(operand_shape.element_type(), slice_sizes); @@ -2137,16 +2114,16 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, TF_RETURN_IF_ERROR(ExpectArray(start_indices_shape, "start indices of dynamic update slice")); - VLOG(2) << tensorflow::strings::Printf( + VLOG(2) << StrFormat( "updating slice of shape %s at dynamic start_indices %s with update " "shape %s", - ShapeUtil::HumanString(operand_shape).c_str(), - ShapeUtil::HumanString(start_indices_shape).c_str(), - ShapeUtil::HumanString(update_shape).c_str()); + ShapeUtil::HumanString(operand_shape), + ShapeUtil::HumanString(start_indices_shape), + ShapeUtil::HumanString(update_shape)); if (ShapeUtil::Rank(start_indices_shape) != 1) { return InvalidArgument( - "Dynamic update slice start indices of rank %lld must be rank1.", + "Dynamic update slice start indices of rank %d must be rank1.", ShapeUtil::Rank(start_indices_shape)); } @@ -2158,17 +2135,16 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, const int64 start_num_dims = 
start_indices_shape.dimensions(0); if (ShapeUtil::Rank(operand_shape) != start_num_dims) { return InvalidArgument( - "Dynamic update slice start number of dimensions %lld (%s) must match " - "rank %lld of slice input (%s).", - start_num_dims, ShapeUtil::HumanString(start_indices_shape).c_str(), - ShapeUtil::Rank(operand_shape), - ShapeUtil::HumanString(operand_shape).c_str()); + "Dynamic update slice start number of dimensions %d (%s) must match " + "rank %d of slice input (%s).", + start_num_dims, ShapeUtil::HumanString(start_indices_shape), + ShapeUtil::Rank(operand_shape), ShapeUtil::HumanString(operand_shape)); } if (ShapeUtil::Rank(update_shape) != ShapeUtil::Rank(operand_shape)) { return InvalidArgument( "Dynamic update slice update rank does not match argument rank: " - "%lld vs %lld.", + "%d vs %d.", ShapeUtil::Rank(update_shape), ShapeUtil::Rank(operand_shape)); } @@ -2177,8 +2153,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, return InvalidArgument( "Dynamic update slice update element type does not match argument. 
" "operand.element_type: %s vs update.element_type: %s.", - PrimitiveType_Name(operand_shape.element_type()).c_str(), - PrimitiveType_Name(update_shape.element_type()).c_str()); + PrimitiveType_Name(operand_shape.element_type()), + PrimitiveType_Name(update_shape.element_type())); } for (int64 dim = 0; dim < ShapeUtil::Rank(operand_shape); ++dim) { @@ -2186,16 +2162,15 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, const int64 update_dim_size = update_shape.dimensions(dim); if (update_dim_size < 0) { return InvalidArgument( - "Size index %lld to dynamic update slice must be >= 0.", + "Size index %d to dynamic update slice must be >= 0.", update_dim_size); } if (update_dim_size > input_dim_size) { return InvalidArgument( - "Update dim size %lld greater than dynamic slice dimension: %lld.", + "Update dim size %d greater than dynamic slice dimension: %d.", update_dim_size, input_dim_size); } - VLOG(2) << tensorflow::strings::Printf("update_sizes[%lld] = %lld", dim, - update_dim_size); + VLOG(2) << StrFormat("update_sizes[%d] = %d", dim, update_dim_size); } return operand_shape; @@ -2210,8 +2185,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, for (int64 dimension : dimensions) { if (dimension >= ShapeUtil::Rank(operand_shape) || dimension < 0) { return InvalidArgument( - "One of the reverse dimensions (%lld) is out-of-bounds in shape %s.", - dimension, ShapeUtil::HumanString(operand_shape).c_str()); + "One of the reverse dimensions (%d) is out-of-bounds in shape %s.", + dimension, ShapeUtil::HumanString(operand_shape)); } } return operand_shape; @@ -2222,14 +2197,14 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, if (!ShapeUtil::IsTuple(arg)) { return InvalidArgument( "Cannot infer shape: attempting to index into non-tuple: %s.", - ShapeUtil::HumanString(arg).c_str()); + ShapeUtil::HumanString(arg)); } if (index >= arg.tuple_shapes_size()) { return InvalidArgument( - "Cannot 
infer shape: attempt to index out of tuple bounds: %lld " + "Cannot infer shape: attempt to index out of tuple bounds: %d " ">= %d in shape %s.", - index, arg.tuple_shapes_size(), ShapeUtil::HumanString(arg).c_str()); + index, arg.tuple_shapes_size(), ShapeUtil::HumanString(arg)); } return arg.tuple_shapes(index); @@ -2249,17 +2224,15 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, } auto shape_string = [&]() { - return tensorflow::strings::Printf( - "Condition: %s; body: %s; init: %s.", - ShapeUtil::HumanString(condition).c_str(), - ShapeUtil::HumanString(body).c_str(), - ShapeUtil::HumanString(init).c_str()); + return StrFormat( + "Condition: %s; body: %s; init: %s.", ShapeUtil::HumanString(condition), + ShapeUtil::HumanString(body), ShapeUtil::HumanString(init)); }; // Check the shapes of computation parameters and return types. if (!ShapeUtil::ShapeIs(condition.result(), PRED, {})) { return InvalidArgument("Condition must return a boolean; got %s.", - shape_string().c_str()); + shape_string()); } if (!ShapeUtil::Compatible(body.result(), condition.parameters(0)) || !ShapeUtil::Compatible(body.result(), body.parameters(0)) || @@ -2267,7 +2240,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, return InvalidArgument( "The parameter of condition and body, the result of the body, and init " "must all have the same shape; got %s.", - shape_string().c_str()); + shape_string()); } return init; @@ -2279,7 +2252,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, const ProgramShape& false_computation) { if (!ShapeUtil::ShapeIs(predicate, PRED, {})) { return InvalidArgument("Predicate must be a boolean; got %s.", - ShapeUtil::HumanString(predicate).c_str()); + ShapeUtil::HumanString(predicate)); } if (true_computation.parameters_size() != 1) { @@ -2288,15 +2261,14 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, } if 
(!ShapeUtil::Compatible(true_computation.parameters(0), true_operand)) { auto true_shape_string = [&]() { - return tensorflow::strings::Printf( - "true_operand: %s; true_computation: %s", - ShapeUtil::HumanString(true_operand).c_str(), - ShapeUtil::HumanString(true_computation).c_str()); + return StrFormat("true_operand: %s; true_computation: %s", + ShapeUtil::HumanString(true_operand), + ShapeUtil::HumanString(true_computation)); }; return InvalidArgument( "true_operand must match the shape of the only parameter of " "true_computation: got %s.", - true_shape_string().c_str()); + true_shape_string()); } if (false_computation.parameters_size() != 1) { @@ -2305,28 +2277,27 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, } if (!ShapeUtil::Compatible(false_computation.parameters(0), false_operand)) { auto false_shape_string = [&]() { - return tensorflow::strings::Printf( - "false_operand: %s; false_computation: %s", - ShapeUtil::HumanString(false_operand).c_str(), - ShapeUtil::HumanString(false_computation).c_str()); + return StrFormat("false_operand: %s; false_computation: %s", + ShapeUtil::HumanString(false_operand), + ShapeUtil::HumanString(false_computation)); }; return InvalidArgument( "false_operand must match the shape of the only parameter of " "false_computation: got %s.", - false_shape_string().c_str()); + false_shape_string()); } if (!ShapeUtil::Compatible(true_computation.result(), false_computation.result())) { auto shape_string = [&]() { - return tensorflow::strings::Printf( + return StrFormat( "true_computation result: %s; false_computation result: %s.", - ShapeUtil::HumanString(true_computation.result()).c_str(), - ShapeUtil::HumanString(false_computation.result()).c_str()); + ShapeUtil::HumanString(true_computation.result()), + ShapeUtil::HumanString(false_computation.result())); }; return InvalidArgument( "the result of true_computation and false_computation must have the " "same shape: got %s.", - 
shape_string().c_str()); + shape_string()); } return true_computation.result(); } @@ -2336,7 +2307,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, TF_RETURN_IF_ERROR(ExpectArray(operand, "operand of broadcast")); for (int64 size : broadcast_sizes) { if (size < 0) { - return InvalidArgument("Broadcast with negative dimension size %lld.", + return InvalidArgument("Broadcast with negative dimension size %d.", size); } } @@ -2361,11 +2332,11 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, if (ShapeUtil::ElementsIn(operand) != ShapeUtil::ElementsIn(inferred_shape)) { return InvalidArgument( - "Reshape operation has mismatched element counts: from=%lld (%s) " - "to=%lld (%s).", - ShapeUtil::ElementsIn(operand), ShapeUtil::HumanString(operand).c_str(), + "Reshape operation has mismatched element counts: from=%d (%s) " + "to=%d (%s).", + ShapeUtil::ElementsIn(operand), ShapeUtil::HumanString(operand), ShapeUtil::ElementsIn(inferred_shape), - ShapeUtil::HumanString(inferred_shape).c_str()); + ShapeUtil::HumanString(inferred_shape)); } std::vector indices(ShapeUtil::Rank(operand)); @@ -2376,8 +2347,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, return InvalidArgument( "Reshape dimensions [%s] are not a permutation of the operand " "dimensions (operand shape is %s).", - StrJoin(dimensions, ",").c_str(), - ShapeUtil::HumanString(operand).c_str()); + StrJoin(dimensions, ","), ShapeUtil::HumanString(operand)); } return inferred_shape; @@ -2412,9 +2382,9 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(min, operand) || !ShapeUtil::SameElementTypeIgnoringFpPrecision(max, operand)) { return InvalidArgument("Clamp with different operand types: %s, %s, %s.", - ShapeUtil::HumanString(min).c_str(), - ShapeUtil::HumanString(operand).c_str(), - ShapeUtil::HumanString(max).c_str()); + ShapeUtil::HumanString(min), 
+ ShapeUtil::HumanString(operand), + ShapeUtil::HumanString(max)); } if (((ShapeUtil::CompatibleIgnoringFpPrecision(min, operand) || ShapeUtil::IsScalar(min)) && @@ -2431,9 +2401,9 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, return ShapeUtil::ChangeElementType(min, operand.element_type()); } } - return Unimplemented( - "%s, %s %s is not implemented.", min.ShortDebugString().c_str(), - max.ShortDebugString().c_str(), operand.ShortDebugString().c_str()); + return Unimplemented("%s, %s %s is not implemented.", + min.ShortDebugString(), max.ShortDebugString(), + operand.ShortDebugString()); } // TODO(b/36794510): Make broadcast semantics more consistent, by supporting @@ -2444,13 +2414,12 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, if (!ShapeUtil::CompatibleIgnoringFpPrecision(on_true, on_false)) { return InvalidArgument( "Operands to select must be the same shape; got %s and %s.", - ShapeUtil::HumanString(on_true).c_str(), - ShapeUtil::HumanString(on_false).c_str()); + ShapeUtil::HumanString(on_true), ShapeUtil::HumanString(on_false)); } if (pred.element_type() != PRED) { return InvalidArgument( "Select's pred operand must have PRED element type; got %s.", - ShapeUtil::HumanString(pred).c_str()); + ShapeUtil::HumanString(pred)); } if (ShapeUtil::CompatibleIgnoringElementType(pred, on_true) || ShapeUtil::IsScalar(pred)) { @@ -2463,7 +2432,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, return InvalidArgument( "Select operation with non-scalar predicate with dimensionality " " different from the other operands: %s.", - ShapeUtil::HumanString(pred).c_str()); + ShapeUtil::HumanString(pred)); } } @@ -2474,18 +2443,17 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, if (!ShapeUtil::Compatible(on_true, on_false)) { return InvalidArgument( "Operands to tuple-select must be the same shape; got %s and %s.", - 
ShapeUtil::HumanString(on_true).c_str(), - ShapeUtil::HumanString(on_false).c_str()); + ShapeUtil::HumanString(on_true), ShapeUtil::HumanString(on_false)); } if (pred.element_type() != PRED) { return InvalidArgument( "TupleSelect's pred operand must have PRED element type; got %s.", - ShapeUtil::HumanString(pred).c_str()); + ShapeUtil::HumanString(pred)); } if (!ShapeUtil::IsScalar(pred)) { return InvalidArgument( "TupleSelect operation with non-scalar predicate: %s.", - ShapeUtil::HumanString(pred).c_str()); + ShapeUtil::HumanString(pred)); } return on_true; } @@ -2502,10 +2470,10 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, }); return InvalidArgument( "Call applied function arity must match number of arguments; got: " - "arity: %d, arguments: %zu; computation signature: %s; argument " + "arity: %d, arguments: %u; computation signature: %s; argument " "shapes: [%s].", - to_apply.parameters_size(), arg_shapes.size(), - computation_signature.c_str(), argument_shapes.c_str()); + to_apply.parameters_size(), arg_shapes.size(), computation_signature, + argument_shapes); } // All arguments must be compatible with the program shape. 
@@ -2516,8 +2484,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, return InvalidArgument( "Call parameter must match argument; got parameter %d shape: %s, " "argument shape: %s.", - i, ShapeUtil::HumanString(param_shape).c_str(), - ShapeUtil::HumanString(arg_shape).c_str()); + i, ShapeUtil::HumanString(param_shape), + ShapeUtil::HumanString(arg_shape)); } } @@ -2531,14 +2499,14 @@ static Status ValidateGatherDimensionNumbers( if (!absl::c_is_sorted(dim_numbers.offset_dims())) { return InvalidArgument( "Output window dimensions in gather op must be ascending; got: %s.", - StrJoin(dim_numbers.offset_dims(), ", ").c_str()); + StrJoin(dim_numbers.offset_dims(), ", ")); } if (absl::c_adjacent_find(dim_numbers.offset_dims()) != dim_numbers.offset_dims().end()) { return InvalidArgument( "Output window dimensions in gather op must not repeat; got: %s.", - StrJoin(dim_numbers.offset_dims(), ", ").c_str()); + StrJoin(dim_numbers.offset_dims(), ", ")); } const int64 output_offset_dim_count = dim_numbers.offset_dims_size(); @@ -2549,9 +2517,9 @@ static Status ValidateGatherDimensionNumbers( int64 offset_dim = dim_numbers.offset_dims(i); if (offset_dim < 0 || offset_dim >= output_shape_rank) { return InvalidArgument( - "Offset dimension %d in gather op is out of bounds; got %lld, but " + "Offset dimension %d in gather op is out of bounds; got %d, but " "should " - "have been in [0,%lld).", + "have been in [0,%d).", i, offset_dim, output_shape_rank); } } @@ -2560,8 +2528,8 @@ static Status ValidateGatherDimensionNumbers( start_indices_shape[dim_numbers.index_vector_dim()]) { return InvalidArgument( "Gather op has %d elements in start_index_map and the " - "bound of dimension index_vector_dim=%lld of start_indices is " - "%lld. These two numbers must be equal.", + "bound of dimension index_vector_dim=%d of start_indices is " + "%d. 
These two numbers must be equal.", dim_numbers.start_index_map_size(), dim_numbers.index_vector_dim(), start_indices_shape[dim_numbers.index_vector_dim()]); } @@ -2571,7 +2539,7 @@ static Status ValidateGatherDimensionNumbers( if (operand_dim_for_start_index_i < 0 || operand_dim_for_start_index_i >= input_shape.dimensions_size()) { return InvalidArgument( - "Invalid start_index_map; domain is [0, %d), got: %d->%lld.", + "Invalid start_index_map; domain is [0, %d), got: %d->%d.", input_shape.dimensions_size(), i, operand_dim_for_start_index_i); } } @@ -2587,14 +2555,14 @@ static Status ValidateGatherDimensionNumbers( return InvalidArgument( "Repeated dimensions are not allowed in start_index_map; " "got: %s.", - StrJoin(dim_numbers.start_index_map(), ", ").c_str()); + StrJoin(dim_numbers.start_index_map(), ", ")); } for (int64 collapsed_dim : dim_numbers.collapsed_slice_dims()) { if (collapsed_dim < 0 || collapsed_dim >= input_shape.dimensions_size()) { return InvalidArgument( "Invalid collapsed_slice_dims set in gather op; valid range is [0, " - "%d), got: %lld.", + "%d), got: %d.", input_shape.dimensions_size(), collapsed_dim); } } @@ -2602,7 +2570,7 @@ static Status ValidateGatherDimensionNumbers( if (!absl::c_is_sorted(dim_numbers.collapsed_slice_dims())) { return InvalidArgument( "collapsed_slice_dims in gather op must be sorted; got: %s", - StrJoin(dim_numbers.collapsed_slice_dims(), ", ").c_str()); + StrJoin(dim_numbers.collapsed_slice_dims(), ", ")); } if (absl::c_adjacent_find(dim_numbers.collapsed_slice_dims()) != @@ -2610,7 +2578,7 @@ static Status ValidateGatherDimensionNumbers( return InvalidArgument( "Repeated dimensions not allowed in collapsed_slice_dims in gather op; " "got: %s.", - StrJoin(dim_numbers.collapsed_slice_dims(), ", ").c_str()); + StrJoin(dim_numbers.collapsed_slice_dims(), ", ")); } return Status::OK(); @@ -2628,7 +2596,7 @@ static Status ValidateGatherDimensionNumbers( if (!ShapeUtil::ElementIsIntegral(start_indices_shape)) { return 
InvalidArgument( "Gather indices parameter must be an integral tensor; got %s.", - ShapeUtil::HumanString(start_indices_shape).c_str()); + ShapeUtil::HumanString(start_indices_shape)); } // We implicitly reshape gather indices of shape P[A,B,C] to P[A,B,C,1] if @@ -2641,7 +2609,7 @@ static Status ValidateGatherDimensionNumbers( return InvalidArgument( "Gather index leaf dimension must be within [0, rank(start_indices) + " "1). rank(start_indices) is %d and gather index leaf dimension is " - "%lld.", + "%d.", start_indices_shape.dimensions_size(), gather_dim_numbers.index_vector_dim()); } @@ -2672,9 +2640,8 @@ static Status ValidateGatherDimensionNumbers( "All components of the offset index in a gather op must either be a " "offset dimension or explicitly collapsed; got len(slice_sizes)=%lu, " "output_slice_sizes=%s, collapsed_slice_dims=%s.", - slice_sizes.size(), - StrJoin(gather_dim_numbers.offset_dims(), ",").c_str(), - StrJoin(gather_dim_numbers.collapsed_slice_dims(), ",").c_str()); + slice_sizes.size(), StrJoin(gather_dim_numbers.offset_dims(), ","), + StrJoin(gather_dim_numbers.collapsed_slice_dims(), ",")); } for (int i = 0; i < slice_sizes.size(); i++) { @@ -2683,7 +2650,7 @@ static Status ValidateGatherDimensionNumbers( if (slice_size < 0 || slice_size > corresponding_input_size) { return InvalidArgument( "Slice size at index %d in gather op is out of range, must be " - "within [0, %lld), got %lld.", + "within [0, %d), got %d.", i, corresponding_input_size + 1, slice_size); } } @@ -2692,7 +2659,7 @@ static Status ValidateGatherDimensionNumbers( if (slice_sizes[gather_dim_numbers.collapsed_slice_dims(i)] != 1) { return InvalidArgument( "Gather op can only collapse slice dims with bound 1, but bound is " - "%lld for index %lld at position %d.", + "%d for index %d at position %d.", slice_sizes[gather_dim_numbers.collapsed_slice_dims(i)], gather_dim_numbers.collapsed_slice_dims(i), i); } @@ -2737,20 +2704,20 @@ Status ValidateScatterDimensionNumbers( if 
(!absl::c_is_sorted(dim_numbers.update_window_dims())) { return InvalidArgument( "update_window_dims in scatter op must be sorted; got: %s.", - StrJoin(dim_numbers.update_window_dims(), ", ").c_str()); + StrJoin(dim_numbers.update_window_dims(), ", ")); } if (absl::c_adjacent_find(dim_numbers.update_window_dims()) != dim_numbers.update_window_dims().end()) { return InvalidArgument( "update_window_dims in scatter op must not repeat; got: %s.", - StrJoin(dim_numbers.update_window_dims(), ", ").c_str()); + StrJoin(dim_numbers.update_window_dims(), ", ")); } const int64 updates_rank = ShapeUtil::Rank(updates_shape); for (int64 window_dim : dim_numbers.update_window_dims()) { if (window_dim < 0 || window_dim >= updates_rank) { return InvalidArgument( "Invalid update_window_dims set in scatter op; valid range is [0, " - "%lld). got: %lld.", + "%d). got: %d.", updates_rank, window_dim); } } @@ -2759,19 +2726,19 @@ Status ValidateScatterDimensionNumbers( if (!absl::c_is_sorted(dim_numbers.inserted_window_dims())) { return InvalidArgument( "inserted_window_dims in scatter op must be sorted; got: %s.", - StrJoin(dim_numbers.inserted_window_dims(), ", ").c_str()); + StrJoin(dim_numbers.inserted_window_dims(), ", ")); } if (absl::c_adjacent_find(dim_numbers.inserted_window_dims()) != dim_numbers.inserted_window_dims().end()) { return InvalidArgument( "inserted_window_dims in scatter op must not repeat; got: %s.", - StrJoin(dim_numbers.inserted_window_dims(), ", ").c_str()); + StrJoin(dim_numbers.inserted_window_dims(), ", ")); } for (int64 inserted_dim : dim_numbers.inserted_window_dims()) { if (inserted_dim < 0 || inserted_dim >= operand_shape.dimensions_size()) { return InvalidArgument( "Invalid inserted_window_dims set in scatter op; valid range is [0, " - "%d), got: %lld.", + "%d), got: %d.", operand_shape.dimensions_size(), inserted_dim); } } @@ -2781,7 +2748,7 @@ Status ValidateScatterDimensionNumbers( scatter_indices_shape[dim_numbers.index_vector_dim()]) { return 
InvalidArgument( "Scatter op has %d elements in scatter_dims_to_operand_dims and the " - "bound of dimension index_vector_dim=%lld of scatter_indices is %lld. " + "bound of dimension index_vector_dim=%d of scatter_indices is %d. " "These two numbers must be equal.", dim_numbers.scatter_dims_to_operand_dims_size(), dim_numbers.index_vector_dim(), @@ -2794,7 +2761,7 @@ Status ValidateScatterDimensionNumbers( scatter_dim_to_operand_dim >= operand_shape.dimensions_size()) { return InvalidArgument( "Invalid scatter_dims_to_operand_dims mapping; domain is [0, %d), " - "got: %d->%lld.", + "got: %d->%d.", operand_shape.dimensions_size(), i, scatter_dim_to_operand_dim); } } @@ -2807,7 +2774,7 @@ Status ValidateScatterDimensionNumbers( return InvalidArgument( "Repeated dimensions not allowed in scatter_dims_to_operand_dims; " "got: %s.", - StrJoin(dim_numbers.scatter_dims_to_operand_dims(), ", ").c_str()); + StrJoin(dim_numbers.scatter_dims_to_operand_dims(), ", ")); } return Status::OK(); @@ -2828,7 +2795,7 @@ Status ValidateScatterDimensionNumbers( if (!ShapeUtil::ElementIsIntegral(scatter_indices_shape)) { return InvalidArgument( "Scatter indices parameter must be an integral tensor; got %s.", - ShapeUtil::HumanString(scatter_indices_shape).c_str()); + ShapeUtil::HumanString(scatter_indices_shape)); } if (scatter_indices_shape.dimensions_size() < @@ -2837,7 +2804,7 @@ Status ValidateScatterDimensionNumbers( return InvalidArgument( "Scatter index leaf dimension must be within [0, rank(scatter_indices)" " + 1). 
rank(scatter_indices) is %d and scatter index leaf dimension " - "is %lld.", + "is %d.", scatter_indices_shape.dimensions_size(), scatter_dim_numbers.index_vector_dim()); } @@ -2859,7 +2826,7 @@ Status ValidateScatterDimensionNumbers( int64 expected_updates_rank = expanded_scatter_indices_shape.size() - 1 + scatter_dim_numbers.update_window_dims_size(); if (ShapeUtil::Rank(updates_shape) != expected_updates_rank) { - return InvalidArgument("Updates tensor must be of rank %lld; got %lld.", + return InvalidArgument("Updates tensor must be of rank %d; got %d.", expected_updates_rank, ShapeUtil::Rank(updates_shape)); } @@ -2885,7 +2852,7 @@ Status ValidateScatterDimensionNumbers( return InvalidArgument( "Bounds of the window dimensions of updates must not exceed the " "bounds of the corresponding dimensions of operand. For dimension " - "%lld, updates bound is %lld, operand bound is %lld.", + "%d, updates bound is %d, operand bound is %d.", update_window_dim, updates_shape.dimensions(update_window_dim), max_update_slice_sizes[i]); } @@ -2906,8 +2873,8 @@ Status ValidateScatterDimensionNumbers( return InvalidArgument( "Bounds of the scatter dimensions of updates must be same as the " "bounds of the corresponding dimensions of scatter indices. For " - "scatter dimension %lld, updates bound is %lld, scatter_indices " - "bound is %lld.", + "scatter dimension %d, updates bound is %d, scatter_indices " + "bound is %d.", i, updates_shape.dimensions(i), expanded_scatter_indices_shape[scatter_dims_seen]); } diff --git a/tensorflow/compiler/xla/service/shaped_buffer.cc b/tensorflow/compiler/xla/service/shaped_buffer.cc index 5c12dc37b7..921a984589 100644 --- a/tensorflow/compiler/xla/service/shaped_buffer.cc +++ b/tensorflow/compiler/xla/service/shaped_buffer.cc @@ -20,19 +20,17 @@ limitations under the License. 
#include "absl/memory/memory.h" #include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/gtl/flatset.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" namespace xla { -using ::tensorflow::strings::Appendf; - ShapedBuffer::ShapedBuffer(const Shape& on_host_shape, const Shape& on_device_shape, const se::Platform* platform, int device_ordinal) @@ -93,9 +91,9 @@ string ShapedBuffer::ToString() const { shape_str = ShapeUtil::HumanStringWithLayout(subshape); } const se::DeviceMemoryBase& memory = buffer(index); - Appendf(&s, " %s%p (%lld bytes) : %s\n", - string(index.size() * 2, ' ').c_str(), memory.opaque(), - memory.size(), shape_str.c_str()); + absl::StrAppendFormat(&s, " %s%p (%d bytes) : %s\n", + string(index.size() * 2, ' '), memory.opaque(), + memory.size(), shape_str); }); return s; } diff --git a/tensorflow/compiler/xla/service/source_map_util.cc b/tensorflow/compiler/xla/service/source_map_util.cc index 8cbaac7b37..dd53c7531b 100644 --- a/tensorflow/compiler/xla/service/source_map_util.cc +++ b/tensorflow/compiler/xla/service/source_map_util.cc @@ -15,6 +15,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/source_map_util.h" +#include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/util.h" namespace xla { @@ -26,11 +27,10 @@ Status InvalidParameterArgumentV(const OpMetadata& op_metadata, string message; tensorflow::strings::Appendv(&message, format, args); if (!op_metadata.source_file().empty()) { - tensorflow::strings::Appendf(&message, " (%s:%d)", - op_metadata.source_file().c_str(), - op_metadata.source_line()); + absl::StrAppendFormat(&message, " (%s:%d)", op_metadata.source_file(), + op_metadata.source_line()); } - return InvalidArgument("%s", message.c_str()); + return InvalidArgument("%s", message); } } // namespace diff --git a/tensorflow/compiler/xla/service/source_map_util.h b/tensorflow/compiler/xla/service/source_map_util.h index 84607cd012..c5a7e17cb4 100644 --- a/tensorflow/compiler/xla/service/source_map_util.h +++ b/tensorflow/compiler/xla/service/source_map_util.h @@ -16,6 +16,7 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_SOURCE_MAP_UTIL_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_SOURCE_MAP_UTIL_H_ +#include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/service/executable.h" #include "tensorflow/compiler/xla/status.h" #include "tensorflow/core/platform/macros.h" @@ -23,6 +24,19 @@ limitations under the License. namespace xla { namespace source_map_util { +// Creates an INVALID_ARGUMENT status with the given format string. +template +Status InvalidParameterArgument(const OpMetadata& op_metadata, + const absl::FormatSpec& format, + const Args&... args) { + string message = absl::StrFormat(format, args...); + if (!op_metadata.source_file().empty()) { + absl::StrAppendFormat(&message, " (%s:%d)", op_metadata.source_file(), + op_metadata.source_line()); + } + return InvalidArgument("%s", message); +} + // Creates an INVALID_ARGUMENT status with the given format string. 
// // Also, attempts to extract the OpMetadata for parameter_number on executable @@ -30,15 +44,19 @@ namespace source_map_util { // // executable may be nullptr, but parameter_number should not be out of bounds // or a CHECK-failure may occur. +template Status InvalidParameterArgument(Executable* executable, int parameter_number, - const char* format, ...) - TF_PRINTF_ATTRIBUTE(3, 4); - -// As above, but takes the parameter metadata directly instead of extracting it -// from the executable. -Status InvalidParameterArgument(const OpMetadata& op_metadata, - const char* format, ...) - TF_PRINTF_ATTRIBUTE(2, 3); + const absl::FormatSpec& format, + const Args&... args) { + if (executable != nullptr && executable->has_module()) { + const HloModule& module = executable->module(); + const HloComputation& computation = *module.entry_computation(); + HloInstruction* param = computation.parameter_instruction(parameter_number); + const OpMetadata& metadata = param->metadata(); + return InvalidParameterArgument(metadata, format, args...); + } + return InvalidArgument(format, args...); +} } // namespace source_map_util } // namespace xla diff --git a/tensorflow/compiler/xla/service/transfer_manager.cc b/tensorflow/compiler/xla/service/transfer_manager.cc index 0c577ec67a..b8d2d546e5 100644 --- a/tensorflow/compiler/xla/service/transfer_manager.cc +++ b/tensorflow/compiler/xla/service/transfer_manager.cc @@ -149,7 +149,7 @@ Status TransferManager::TransferArrayToDeviceAsync( if (dest.size() < GetByteSizeRequirement(on_device_shape)) { return FailedPrecondition( "Allocation on device not large enough for array: " - "%lld < %lld", + "%d < %d", dest.size(), GetByteSizeRequirement(on_device_shape)); } ShapedBuffer shaped_buffer(/*on_host_shape=*/literal.shape(), on_device_shape, @@ -166,12 +166,12 @@ void TransferManager::TransferArrayFromDevice( auto error = StrCat("Shape ", ShapeUtil::HumanString(shape), " has a differently shaped representation on-device: ", 
ShapeUtil::HumanString(HostShapeToDeviceShape(shape))); - return done(FailedPrecondition("%s", error.c_str())); + return done(FailedPrecondition("%s", error)); } if (source.size() < GetByteSizeRequirement(shape)) { return done( FailedPrecondition("Allocation on device not large enough for array: " - "%lld < %lld", + "%d < %d", source.size(), GetByteSizeRequirement(shape))); } ShapedBuffer shaped_buffer(/*on_host_shape=*/shape, shape, @@ -203,7 +203,7 @@ void TransferManager::TransferArrayFromDevice( return NotFound( "could not find registered transfer manager for platform %s -- check " "target linkage", - platform->Name().c_str()); + platform->Name()); } if (it->second.manager == nullptr) { @@ -254,7 +254,7 @@ Status TransferManager::TransferBufferFromDevice( if (source.size() < size) { return FailedPrecondition( "Source allocation on device not large enough for data tranfer: " - "%lld < %lld", + "%d < %d", source.size(), size); } stream->ThenMemcpy(destination, source, size); @@ -267,7 +267,7 @@ Status TransferManager::TransferBufferToDevice( if (destination->size() < size) { return FailedPrecondition( "Destination allocation on device not large enough for data tranfer: " - "%lld < %lld", + "%d < %d", destination->size(), size); } stream->ThenMemcpy(destination, source, size); @@ -278,9 +278,8 @@ StatusOr TransferManager::AllocateScopedShapedBuffer( const Shape& on_host_shape, DeviceMemoryAllocator* allocator, int device_ordinal) { if (!LayoutUtil::HasLayout(on_host_shape)) { - return InvalidArgument( - "Shape must have a layout: %s", - ShapeUtil::HumanStringWithLayout(on_host_shape).c_str()); + return InvalidArgument("Shape must have a layout: %s", + ShapeUtil::HumanStringWithLayout(on_host_shape)); } TF_RETURN_IF_ERROR(ShapeUtil::ValidateShape(on_host_shape)); const Shape on_device_shape = HostShapeToDeviceShape(on_host_shape); diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc 
index cb07b8d4d3..cf00ca102b 100644 --- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc +++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc @@ -21,6 +21,7 @@ limitations under the License. #include "absl/memory/memory.h" #include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/map_util.h" #include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h" @@ -29,7 +30,6 @@ limitations under the License. #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" namespace xla { @@ -462,21 +462,20 @@ Status TuplePointsToAnalysis::VerifyBuffer(const LogicalBuffer& buffer) const { return FailedPrecondition( "LogicalBuffer %s is ill-defined: instruction %s does not define a " "buffer at that index", - buffer.ToString().c_str(), buffer.instruction()->name().c_str()); + buffer.ToString(), buffer.instruction()->name()); } } if (buffer.id() < 0 || buffer.id() >= logical_buffer_analysis_->num_logical_buffers()) { - return FailedPrecondition( - "LogicalBuffer %s is ill-defined: invalid id %lld", - buffer.ToString().c_str(), buffer.id()); + return FailedPrecondition("LogicalBuffer %s is ill-defined: invalid id %d", + buffer.ToString(), buffer.id()); } if (GetBuffer(buffer.id()).instruction() != buffer.instruction() || GetBuffer(buffer.id()).index() != buffer.index()) { return FailedPrecondition( "LogicalBuffer %s is ill-defined: buffer with same id differs: %s", - buffer.ToString().c_str(), GetBuffer(buffer.id()).ToString().c_str()); + buffer.ToString(), GetBuffer(buffer.id()).ToString()); } return Status::OK(); @@ -495,7 +494,7 @@ StatusOr TuplePointsToAnalysis::GetBufferDefinedAt( if (buffers.size() != 1 || buffers[0]->instruction() != instruction) { return FailedPrecondition( "instruction %s does 
not define buffer at index {%s}", - instruction->name().c_str(), absl::StrJoin(index, ",").c_str()); + instruction->name(), absl::StrJoin(index, ",")); } return buffers[0]; } @@ -556,8 +555,8 @@ PointsToSet& TuplePointsToAnalysis::CreateCopiedPointsToSet( } string TuplePointsToAnalysis::ToString() const { - string output = tensorflow::strings::Printf( - "TuplePointsToSet for module %s:\n", module_->name().c_str()); + string output = + absl::StrFormat("TuplePointsToSet for module %s:\n", module_->name()); for (const auto* computation : module_->MakeNonfusionComputations()) { const char* entry = computation == module_->entry_computation() ? "entry " : ""; diff --git a/tensorflow/compiler/xla/shape_layout.cc b/tensorflow/compiler/xla/shape_layout.cc index caad31d6ce..d44db89d57 100644 --- a/tensorflow/compiler/xla/shape_layout.cc +++ b/tensorflow/compiler/xla/shape_layout.cc @@ -25,8 +25,8 @@ namespace xla { Status ShapeLayout::CopyLayoutFromShape(const Shape& other_shape) { if (!ShapeUtil::Compatible(other_shape, shape_)) { return InvalidArgument("Shape %s is not compatible with shape %s", - ShapeUtil::HumanString(other_shape).c_str(), - ShapeUtil::HumanString(shape()).c_str()); + ShapeUtil::HumanString(other_shape), + ShapeUtil::HumanString(shape())); } shape_ = other_shape; return Status::OK(); @@ -35,8 +35,8 @@ Status ShapeLayout::CopyLayoutFromShape(const Shape& other_shape) { Status ShapeLayout::AssignLayoutToShape(Shape* to_shape) const { if (!ShapeUtil::Compatible(*to_shape, shape_)) { return InvalidArgument("Shape %s is not compatible with shape %s", - ShapeUtil::HumanString(*to_shape).c_str(), - ShapeUtil::HumanString(shape()).c_str()); + ShapeUtil::HumanString(*to_shape), + ShapeUtil::HumanString(shape())); } *to_shape = shape_; return Status::OK(); diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc index 31ddd57eef..5477a78a9a 100644 --- a/tensorflow/compiler/xla/shape_util.cc +++ 
b/tensorflow/compiler/xla/shape_util.cc @@ -147,7 +147,7 @@ StatusOr MakeShapeWithLayoutInternal( } if (element_type == OPAQUE || element_type == TUPLE) { return InvalidArgument("Unsupported element type: %s", - PrimitiveType_Name(element_type).c_str()); + PrimitiveType_Name(element_type)); } Shape shape = ShapeUtil::MakeShape(element_type, dimensions); auto min2maj = shape.mutable_layout()->mutable_minor_to_major(); @@ -491,8 +491,7 @@ StatusOr StringToPrimitiveType(const string& name) { }(); auto found = name_to_type->find(name); if (found == name_to_type->end()) { - return InvalidArgument("Invalid element type string: \"%s\".", - name.c_str()); + return InvalidArgument("Invalid element type string: \"%s\".", name); } return found->second; } @@ -564,8 +563,7 @@ StatusOr ParseShapeStringInternal(absl::string_view* s) { if (absl::ConsumePrefix(s, ")")) { break; } else if (must_end) { - return InvalidArgument("Expected end of tuple; got: \"%s\"", - string(*s).c_str()); + return InvalidArgument("Expected end of tuple; got: \"%s\"", *s); } shapes.emplace_back(); TF_ASSIGN_OR_RETURN(shapes.back(), ParseShapeStringInternal(s)); @@ -593,8 +591,8 @@ StatusOr ParseShapeStringInternal(absl::string_view* s) { int64 element; if (!absl::SimpleAtoi(input, &element)) { return InvalidArgument( - "Invalid s64 value in parsed shape string: \"%s\" in \"%s\"", - string(input).c_str(), string(*s).c_str()); + "Invalid s64 value in parsed shape string: \"%s\" in \"%s\"", input, + *s); } return element; }; @@ -618,7 +616,7 @@ StatusOr ParseShapeStringInternal(absl::string_view* s) { StringToPrimitiveType(element_type_string)); if (primitive_type == PRIMITIVE_TYPE_INVALID || primitive_type == TUPLE) { return InvalidArgument("Invalid element type string: \"%s\".", - element_type_string.c_str()); + element_type_string); } Shape result; @@ -648,16 +646,14 @@ StatusOr ParseShapeStringInternal(absl::string_view* s) { return std::move(result); } - return InvalidArgument("Invalid shape string to 
parse: \"%s\"", - string(*s).c_str()); + return InvalidArgument("Invalid shape string to parse: \"%s\"", *s); } } // namespace /* static */ StatusOr ShapeUtil::ParseShapeString(absl::string_view s) { TF_ASSIGN_OR_RETURN(Shape shape, ParseShapeStringInternal(&s)); if (!s.empty()) { - return InvalidArgument("Invalid shape string to parse: \"%s\"", - string(s).c_str()); + return InvalidArgument("Invalid shape string to parse: \"%s\"", s); } return shape; } @@ -822,7 +818,7 @@ StatusOr ParseShapeStringInternal(absl::string_view* s) { const Shape& shape) { if (shape.element_type() == PRIMITIVE_TYPE_INVALID) { return InvalidArgument("shape has invalid element type: %s", - shape.ShortDebugString().c_str()); + shape.ShortDebugString()); } if (shape.element_type() == TUPLE) { if (shape.dimensions_size() != 0) { @@ -845,21 +841,21 @@ StatusOr ParseShapeStringInternal(absl::string_view* s) { if (shape.dimensions_size() != 0) { return InvalidArgument( "shape has %s element type, but has dimensions field: %s", - LowercasePrimitiveTypeName(shape.element_type()).c_str(), - shape.ShortDebugString().c_str()); + LowercasePrimitiveTypeName(shape.element_type()), + shape.ShortDebugString()); } if (shape.has_layout()) { return InvalidArgument( "shape has %s element type, but has layout field: %s", - LowercasePrimitiveTypeName(shape.element_type()).c_str(), - shape.ShortDebugString().c_str()); + LowercasePrimitiveTypeName(shape.element_type()), + shape.ShortDebugString()); } return Status::OK(); } if (Rank(shape) != shape.dimensions_size()) { return InvalidArgument( - "shape's rank is mismatched with dimension count; rank=%lld " + "shape's rank is mismatched with dimension count; rank=%d " "dimensions_size=%d", Rank(shape), shape.dimensions_size()); } @@ -867,9 +863,8 @@ StatusOr ParseShapeStringInternal(absl::string_view* s) { int64 dimension = shape.dimensions(i); if (dimension < 0) { return InvalidArgument( - "shape's dimensions must not be < 0; dimension at index %lld was " - 
"%lld", - i, dimension); + "shape's dimensions must not be < 0; dimension at index %d was %d", i, + dimension); } } @@ -934,7 +929,7 @@ StatusOr ParseShapeStringInternal(absl::string_view* s) { if (shape_size < 0) { return InvalidArgument("Shape %s size may overflow int64.", - ShapeUtil::HumanString(shape).c_str()); + ShapeUtil::HumanString(shape)); } VLOG(3) << "Shape size is valid: " << shape_size; @@ -994,7 +989,7 @@ StatusOr ParseShapeStringInternal(absl::string_view* s) { i >= return_shape->tuple_shapes_size()) { return InvalidArgument( "Shape index %s not a valid subshape index for tuple with shape %s", - index.ToString().c_str(), shape.DebugString().c_str()); + index.ToString(), shape.DebugString()); } return_shape = &return_shape->tuple_shapes(i); } diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD index 6b29d833da..d5e3b747e7 100644 --- a/tensorflow/compiler/xla/tests/BUILD +++ b/tensorflow/compiler/xla/tests/BUILD @@ -99,6 +99,7 @@ cc_library( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", "//tensorflow/core:test", + "@com_google_absl//absl/strings:str_format", "@com_google_absl//absl/types:optional", ], ) @@ -1014,6 +1015,7 @@ xla_test( "//tensorflow/core:test", "@com_google_absl//absl/container:inlined_vector", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", ], ) @@ -1123,7 +1125,6 @@ xla_test( deps = [ "//tensorflow/compiler/xla:array2d", "//tensorflow/compiler/xla:array4d", - "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:reference_util", "//tensorflow/compiler/xla:shape_util", @@ -1142,6 +1143,7 @@ xla_test( "//tensorflow/core:lib", "//tensorflow/core:test", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", ], ) @@ -1428,7 +1430,6 @@ xla_test( "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:test_helpers", - 
"//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client:xla_builder", @@ -1441,6 +1442,7 @@ xla_test( "//tensorflow/core:test", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", ], ) @@ -1511,6 +1513,7 @@ xla_test( "//tensorflow/core:lib", "//tensorflow/core:test", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", ], ) diff --git a/tensorflow/compiler/xla/tests/literal_test_util.cc b/tensorflow/compiler/xla/tests/literal_test_util.cc index a4e3a998fc..554eb24d44 100644 --- a/tensorflow/compiler/xla/tests/literal_test_util.cc +++ b/tensorflow/compiler/xla/tests/literal_test_util.cc @@ -15,9 +15,9 @@ limitations under the License. #include "tensorflow/compiler/xla/tests/literal_test_util.h" +#include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/literal_comparison.h" #include "tensorflow/core/lib/io/path.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/test.h" namespace xla { @@ -35,8 +35,7 @@ void WriteLiteralToTempFile(const LiteralSlice& literal, const string& name) { int64 now_usec = tensorflow::Env::Default()->NowMicros(); string filename = tensorflow::io::JoinPath( tensorflow::testing::TmpDir(), - tensorflow::strings::Printf("tempfile-%s-%llx-%s", get_hostname().c_str(), - now_usec, name.c_str())); + absl::StrFormat("tempfile-%s-%x-%s", get_hostname(), now_usec, name)); TF_CHECK_OK(tensorflow::WriteBinaryProto(tensorflow::Env::Default(), filename, literal.ToProto())); LOG(ERROR) << "wrote to " << name << " file: " << filename; diff --git a/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc b/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc index 7956a034f8..edb592f43e 100644 --- a/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc +++ 
b/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc @@ -19,6 +19,7 @@ limitations under the License. #include "absl/memory/memory.h" #include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/xla_builder.h" @@ -33,7 +34,6 @@ limitations under the License. #include "tensorflow/compiler/xla/tests/test_macros.h" #include "tensorflow/compiler/xla/tests/test_utils.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/types.h" @@ -136,8 +136,7 @@ class TestLinspaceMaxParametric MakeLinspaceArray2D(from, to, rows, cols); auto arhs = absl::make_unique>(rows, cols, static_cast(1.0f)); - XlaBuilder builder( - tensorflow::strings::Printf("max_%lldx%lld_linspace", rows, cols)); + XlaBuilder builder(absl::StrFormat("max_%dx%d_linspace", rows, cols)); auto lhs = ConstantR2FromArray2D(&builder, *alhs); auto rhs = ConstantR2FromArray2D(&builder, *arhs); Max(lhs, rhs); diff --git a/tensorflow/compiler/xla/tests/reduce_test.cc b/tensorflow/compiler/xla/tests/reduce_test.cc index b93d838349..346f702488 100644 --- a/tensorflow/compiler/xla/tests/reduce_test.cc +++ b/tensorflow/compiler/xla/tests/reduce_test.cc @@ -32,6 +32,7 @@ limitations under the License. 
#include #include +#include "absl/strings/str_format.h" #include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/array4d.h" @@ -557,12 +558,11 @@ struct BoundsLayout { }; void PrintTo(const BoundsLayout& spec, std::ostream* os) { - *os << tensorflow::strings::Printf( - "R%luToR%lu%s_%s_Reduce%s", spec.bounds.size(), - spec.bounds.size() - spec.reduce_dims.size(), - absl::StrJoin(spec.bounds, "x").c_str(), - absl::StrJoin(spec.layout, "").c_str(), - absl::StrJoin(spec.reduce_dims, "").c_str()); + *os << absl::StrFormat("R%uToR%u%s_%s_Reduce%s", spec.bounds.size(), + spec.bounds.size() - spec.reduce_dims.size(), + absl::StrJoin(spec.bounds, "x"), + absl::StrJoin(spec.layout, ""), + absl::StrJoin(spec.reduce_dims, "")); } // Add-reduces a broadcasted scalar matrix among dimension 1 and 0. diff --git a/tensorflow/compiler/xla/tests/reverse_test.cc b/tensorflow/compiler/xla/tests/reverse_test.cc index 60084f143d..c755ff63c9 100644 --- a/tensorflow/compiler/xla/tests/reverse_test.cc +++ b/tensorflow/compiler/xla/tests/reverse_test.cc @@ -15,6 +15,7 @@ limitations under the License. #include +#include "absl/strings/str_format.h" #include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/array4d.h" @@ -43,9 +44,9 @@ struct ReverseSpec { bool use_bfloat16; string ToTestCaseName() const { - return tensorflow::strings::Printf( - "reverse_%s_in_dims_%s_%s", absl::StrJoin(input_dims, "x").c_str(), - absl::StrJoin(reversal, "x").c_str(), use_bfloat16 ? "bf16" : "f32"); + return absl::StrFormat( + "reverse_%s_in_dims_%s_%s", absl::StrJoin(input_dims, "x"), + absl::StrJoin(reversal, "x"), use_bfloat16 ? 
"bf16" : "f32"); } }; diff --git a/tensorflow/compiler/xla/tests/slice_test.cc b/tensorflow/compiler/xla/tests/slice_test.cc index c57bbbd1e4..69585ae39a 100644 --- a/tensorflow/compiler/xla/tests/slice_test.cc +++ b/tensorflow/compiler/xla/tests/slice_test.cc @@ -20,6 +20,7 @@ limitations under the License. #include "absl/container/inlined_vector.h" #include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/client/local_client.h" @@ -29,7 +30,6 @@ limitations under the License. #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" #include "tensorflow/core/lib/gtl/array_slice.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/types.h" @@ -223,9 +223,8 @@ class SliceR1LargeTest : public SliceR1Test {}; string SliceR1TestDataToString(const ::testing::TestParamInfo& data) { const R1Spec& spec = data.param; - return ::tensorflow::strings::Printf("%lld_%lld_%lld_%lld", spec.input_dim0, - spec.slice_start, spec.slice_limit, - spec.slice_stride); + return absl::StrFormat("%d_%d_%d_%d", spec.input_dim0, spec.slice_start, + spec.slice_limit, spec.slice_stride); } XLA_TEST_P(SliceR1Test, DoIt_F32) { Run(GetParam()); } diff --git a/tensorflow/compiler/xla/tests/test_utils.cc b/tensorflow/compiler/xla/tests/test_utils.cc index 21c58e075e..776f93d9f7 100644 --- a/tensorflow/compiler/xla/tests/test_utils.cc +++ b/tensorflow/compiler/xla/tests/test_utils.cc @@ -194,7 +194,7 @@ StatusOr> MakeFakeLiteralInternal( break; default: return Unimplemented("Unsupported type for fake literal generation: %s", - ShapeUtil::HumanString(shape).c_str()); + ShapeUtil::HumanString(shape)); } return std::move(literal); } @@ -342,7 +342,7 @@ StatusOr> CreateLiteralForConstrainedUses( default: return Unimplemented( "Constrained 
operand generation not implemented for %s.", - use->ToString().c_str()); + use->ToString()); } } int constraint_count = 0; diff --git a/tensorflow/compiler/xla/text_literal_reader.cc b/tensorflow/compiler/xla/text_literal_reader.cc index 9835e3d803..442e66321e 100644 --- a/tensorflow/compiler/xla/text_literal_reader.cc +++ b/tensorflow/compiler/xla/text_literal_reader.cc @@ -71,7 +71,7 @@ StatusOr> TextLiteralReader::ReadAllLines() { if (shape.element_type() != F32) { return Unimplemented( "unsupported element type for text literal reading: %s", - ShapeUtil::HumanString(shape).c_str()); + ShapeUtil::HumanString(shape)); } auto result = absl::make_unique(shape); @@ -88,16 +88,16 @@ StatusOr> TextLiteralReader::ReadAllLines() { absl::string_view value_string = absl::StripAsciiWhitespace(pieces[1]); if (!absl::ConsumePrefix(&coordinates_string, "(")) { return InvalidArgument( - "expected '(' at the beginning of coordinates: \"%s\"", line.c_str()); + "expected '(' at the beginning of coordinates: \"%s\"", line); } if (!absl::ConsumeSuffix(&coordinates_string, ")")) { return InvalidArgument("expected ')' at the end of coordinates: \"%s\"", - line.c_str()); + line); } float value; - if (!absl::SimpleAtof(absl::string_view(value_string), &value)) { + if (!absl::SimpleAtof(value_string, &value)) { return InvalidArgument("could not parse value as float: \"%s\"", - string(value_string).c_str()); + value_string); } coordinates = absl::StrSplit(coordinates_string, ','); coordinate_values.clear(); @@ -106,15 +106,15 @@ StatusOr> TextLiteralReader::ReadAllLines() { if (!absl::SimpleAtoi(piece, &coordinate_value)) { return InvalidArgument( "could not parse coordinate member as int64: \"%s\"", - std::string(piece).c_str()); + std::string(piece)); } coordinate_values.push_back(coordinate_value); } if (coordinate_values.size() != shape.dimensions_size()) { return InvalidArgument( - "line did not have expected number of coordinates; want %d got %zu: " + "line did not have expected 
number of coordinates; want %d got %u: " "\"%s\"", - shape.dimensions_size(), coordinate_values.size(), line.c_str()); + shape.dimensions_size(), coordinate_values.size(), line); } result->Set(coordinate_values, value); } diff --git a/tensorflow/compiler/xla/tools/BUILD b/tensorflow/compiler/xla/tools/BUILD index 1e45588148..f23c5b3ef1 100644 --- a/tensorflow/compiler/xla/tools/BUILD +++ b/tensorflow/compiler/xla/tools/BUILD @@ -193,6 +193,7 @@ tf_cc_binary( "//tensorflow/compiler/xla/service:interpreter_plugin", "//tensorflow/core:lib", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", ], ) diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc index 7aedd1da98..72e5abd274 100644 --- a/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc +++ b/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc @@ -19,6 +19,7 @@ limitations under the License. #include #include +#include "absl/strings/str_format.h" #include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/client/client.h" #include "tensorflow/compiler/xla/client/client_library.h" @@ -31,7 +32,6 @@ limitations under the License. #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/gtl/array_slice.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/init_main.h" #include "tensorflow/core/platform/logging.h" @@ -49,10 +49,9 @@ class OperationDumper : public DfsHloVisitorWithDefault { absl::StrAppend(out, ShapeUtil::HumanString(operand->shape())); }); // Spit `op_name(params...) -> result_type :: path` to stdout. 
- std::cout << tensorflow::strings::Printf( - "%s :: (%s) -> %s :: %s\n", HloOpcodeString(hlo->opcode()).c_str(), - params.c_str(), ShapeUtil::HumanString(hlo->shape()).c_str(), - path_.c_str()); + std::cout << absl::StrFormat("%s :: (%s) -> %s :: %s\n", + HloOpcodeString(hlo->opcode()), params, + ShapeUtil::HumanString(hlo->shape()), path_); return Status::OK(); } diff --git a/tensorflow/compiler/xla/tools/replay_computation.cc b/tensorflow/compiler/xla/tools/replay_computation.cc index 311a1bee8d..e776e6a4eb 100644 --- a/tensorflow/compiler/xla/tools/replay_computation.cc +++ b/tensorflow/compiler/xla/tools/replay_computation.cc @@ -250,7 +250,7 @@ StatusOr ParseInputFile(const string& filename, } fprintf(stderr, "%s: is not HLO text. Nothing left to try.\n", filename.c_str()); - return InvalidArgument("Could not parse %s.", filename.c_str()); + return InvalidArgument("Could not parse %s.", filename); } int RealMain(tensorflow::gtl::ArraySlice args, const Options& opts) { diff --git a/tensorflow/compiler/xla/util.cc b/tensorflow/compiler/xla/util.cc index 85f05b7b8d..0f607a0c8a 100644 --- a/tensorflow/compiler/xla/util.cc +++ b/tensorflow/compiler/xla/util.cc @@ -25,7 +25,6 @@ limitations under the License. #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/strings/numbers.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/stacktrace.h" @@ -68,86 +67,6 @@ Status AppendStatus(Status prior, absl::string_view context) { absl::StrCat(prior.error_message(), ": ", context)}; } -// Implementation note: we can't common these out (without using macros) because -// they all need to va_start/va_end their varargs in their frame. 
- -Status InvalidArgumentV(const char* format, va_list args) { - string message; - tensorflow::strings::Appendv(&message, format, args); - return WithLogBacktrace(tensorflow::errors::InvalidArgument(message)); -} - -Status InvalidArgument(const char* format, ...) { - va_list args; - va_start(args, format); - Status result = InvalidArgumentV(format, args); - va_end(args); - return result; -} - -Status Unimplemented(const char* format, ...) { - string message; - va_list args; - va_start(args, format); - tensorflow::strings::Appendv(&message, format, args); - va_end(args); - return WithLogBacktrace(tensorflow::errors::Unimplemented(message)); -} - -Status InternalError(const char* format, ...) { - string message; - va_list args; - va_start(args, format); - tensorflow::strings::Appendv(&message, format, args); - va_end(args); - return WithLogBacktrace(tensorflow::errors::Internal(message)); -} - -Status FailedPrecondition(const char* format, ...) { - string message; - va_list args; - va_start(args, format); - tensorflow::strings::Appendv(&message, format, args); - va_end(args); - return WithLogBacktrace(tensorflow::errors::FailedPrecondition(message)); -} - -Status Cancelled(const char* format, ...) { - string message; - va_list args; - va_start(args, format); - tensorflow::strings::Appendv(&message, format, args); - va_end(args); - return WithLogBacktrace(tensorflow::errors::Cancelled(message)); -} - -Status ResourceExhausted(const char* format, ...) { - string message; - va_list args; - va_start(args, format); - tensorflow::strings::Appendv(&message, format, args); - va_end(args); - return WithLogBacktrace(tensorflow::errors::ResourceExhausted(message)); -} - -Status NotFound(const char* format, ...) { - string message; - va_list args; - va_start(args, format); - tensorflow::strings::Appendv(&message, format, args); - va_end(args); - return WithLogBacktrace(tensorflow::errors::NotFound(message)); -} - -Status Unavailable(const char* format, ...) 
{ - string message; - va_list args; - va_start(args, format); - tensorflow::strings::Appendv(&message, format, args); - va_end(args); - return WithLogBacktrace(tensorflow::errors::Unavailable(message)); -} - string Reindent(absl::string_view original, const absl::string_view indentation) { std::vector pieces = diff --git a/tensorflow/compiler/xla/util.h b/tensorflow/compiler/xla/util.h index 671ef17f36..62f486369f 100644 --- a/tensorflow/compiler/xla/util.h +++ b/tensorflow/compiler/xla/util.h @@ -27,11 +27,13 @@ limitations under the License. #include "absl/algorithm/container.h" #include "absl/container/inlined_vector.h" #include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "absl/strings/string_view.h" #include "tensorflow/compiler/xla/status.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/lib/math/math_util.h" @@ -205,43 +207,73 @@ void StridedCopy(tensorflow::gtl::MutableArraySlice dest, int64 dest_base, Status AddStatus(Status prior, absl::string_view context); Status AppendStatus(Status prior, absl::string_view context); -// Status error shorthands -- printfs the arguments to be -// used as an error message and returns a status in the canonical -// error space. -Status InvalidArgument(const char* format, ...) TF_PRINTF_ATTRIBUTE(1, 2); -Status Unimplemented(const char* format, ...) TF_PRINTF_ATTRIBUTE(1, 2); -Status InternalError(const char* format, ...) TF_PRINTF_ATTRIBUTE(1, 2); -Status FailedPrecondition(const char* format, ...) TF_PRINTF_ATTRIBUTE(1, 2); -Status Cancelled(const char* format, ...) TF_PRINTF_ATTRIBUTE(1, 2); -Status ResourceExhausted(const char* format, ...) TF_PRINTF_ATTRIBUTE(1, 2); -Status NotFound(const char* format, ...) 
TF_PRINTF_ATTRIBUTE(1, 2); -Status Unavailable(const char* format, ...) TF_PRINTF_ATTRIBUTE(1, 2); - -// Passed-varargs variant of the InvalidArgument factory above. -Status InvalidArgumentV(const char* format, va_list args); +// Status error shorthands -- StrFormat's the arguments to be used as an error +// message and returns a status in the canonical error space. +template +Status InvalidArgument(const absl::FormatSpec& format, + const Args&... args) { + return WithLogBacktrace( + tensorflow::errors::InvalidArgument(absl::StrFormat(format, args...))); +} +template +Status Unimplemented(const absl::FormatSpec& format, + const Args&... args) { + return WithLogBacktrace( + tensorflow::errors::Unimplemented(absl::StrFormat(format, args...))); +} +template +Status InternalError(const absl::FormatSpec& format, + const Args&... args) { + return WithLogBacktrace( + tensorflow::errors::Internal(absl::StrFormat(format, args...))); +} +template +Status FailedPrecondition(const absl::FormatSpec& format, + const Args&... args) { + return WithLogBacktrace( + tensorflow::errors::FailedPrecondition(absl::StrFormat(format, args...))); +} +template +Status Cancelled(const absl::FormatSpec& format, const Args&... args) { + return WithLogBacktrace( + tensorflow::errors::Cancelled(absl::StrFormat(format, args...))); +} +template +Status ResourceExhausted(const absl::FormatSpec& format, + const Args&... args) { + return WithLogBacktrace( + tensorflow::errors::ResourceExhausted(absl::StrFormat(format, args...))); +} +template +Status NotFound(const absl::FormatSpec& format, const Args&... args) { + return WithLogBacktrace( + tensorflow::errors::NotFound(absl::StrFormat(format, args...))); +} +template +Status Unavailable(const absl::FormatSpec& format, + const Args&... args) { + return WithLogBacktrace( + tensorflow::errors::Unavailable(absl::StrFormat(format, args...))); +} template Status InvalidArgumentStrCat(Args&&... 
concat) { - return InvalidArgument("%s", - absl::StrCat(std::forward(concat)...).c_str()); + return InvalidArgument("%s", absl::StrCat(std::forward(concat)...)); } template Status UnimplementedStrCat(Args&&... concat) { - return Unimplemented("%s", - absl::StrCat(std::forward(concat)...).c_str()); + return Unimplemented("%s", absl::StrCat(std::forward(concat)...)); } template Status InternalErrorStrCat(Args&&... concat) { - return InternalError("%s", - absl::StrCat(std::forward(concat)...).c_str()); + return InternalError("%s", absl::StrCat(std::forward(concat)...)); } template Status ResourceExhaustedStrCat(Args&&... concat) { - return ResourceExhausted("%s", - absl::StrCat(std::forward(concat)...).c_str()); + return ResourceExhausted("%s", absl::StrCat(std::forward(concat)...)); } // Splits the lines of the original, replaces leading whitespace with the prefix diff --git a/tensorflow/compiler/xla/window_util.cc b/tensorflow/compiler/xla/window_util.cc index 44fb1bdc38..268dc5db01 100644 --- a/tensorflow/compiler/xla/window_util.cc +++ b/tensorflow/compiler/xla/window_util.cc @@ -20,7 +20,6 @@ limitations under the License. #include "absl/strings/str_cat.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/strings/stringprintf.h" namespace xla { namespace window_util { -- GitLab From 599bb5522d622b2f4feaee06b5dd194b04424345 Mon Sep 17 00:00:00 2001 From: Priya Gupta Date: Mon, 27 Aug 2018 14:55:33 -0700 Subject: [PATCH 180/598] Handle the case where input_fn dataset may not contain labels. 
PiperOrigin-RevId: 210436691 --- tensorflow/python/estimator/estimator.py | 10 ++++++---- tensorflow/python/estimator/util.py | 8 ++++++-- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py index 3849188c58..f55ca93c0d 100644 --- a/tensorflow/python/estimator/estimator.py +++ b/tensorflow/python/estimator/estimator.py @@ -1257,7 +1257,7 @@ class Estimator(object): if is_tpu_strategy: # Create a step_fn from the train_op of grouped_estimator_spec - def step_fn(ctx, features, labels): + def step_fn(ctx, features, labels=None): """A single step that is passed to run_on_dataset.""" estimator_spec = self._train_distribution.call_for_each_tower( self._call_model_fn, @@ -1282,7 +1282,8 @@ class Estimator(object): loss = ctx.last_step_outputs['loss'] grouped_estimator_spec = ctx.non_tensor_outputs['estimator_spec'] else: - features, labels = iterator.get_next() + features, labels = estimator_util.parse_iterator_result( + iterator.get_next()) grouped_estimator_spec = self._train_distribution.call_for_each_tower( self._call_model_fn, features, @@ -1471,7 +1472,7 @@ class Estimator(object): self._eval_distribution.__class__.__name__ == 'TPUStrategy') if is_tpu_strategy: - def step_fn(ctx, features, labels): + def step_fn(ctx, features, labels=None): """Runs one step of the eval computation and captures outputs.""" estimator_spec = self._eval_distribution.call_for_each_tower( self._call_model_fn, features, labels, model_fn_lib.ModeKeys.EVAL, @@ -1492,7 +1493,8 @@ class Estimator(object): eval_dict = ctx.non_tensor_outputs['eval_dict'] grouped_estimator_spec = ctx.non_tensor_outputs['estimator_spec'] else: - features, labels = iterator.get_next() + features, labels = estimator_util.parse_iterator_result( + iterator.get_next()) grouped_estimator_spec = self._eval_distribution.call_for_each_tower( self._call_model_fn, features, labels, model_fn_lib.ModeKeys.EVAL, config) diff --git 
a/tensorflow/python/estimator/util.py b/tensorflow/python/estimator/util.py index d4a75478d5..31e4778e72 100644 --- a/tensorflow/python/estimator/util.py +++ b/tensorflow/python/estimator/util.py @@ -109,13 +109,17 @@ def parse_input_fn_result(result): else: input_hooks.append(_DatasetInitializerHook(iterator)) result = iterator.get_next() + return parse_iterator_result(result) + (input_hooks,) + +def parse_iterator_result(result): + """Gets features, labels from result.""" if isinstance(result, (list, tuple)): if len(result) != 2: raise ValueError( 'input_fn should return (features, labels) as a len 2 tuple.') - return result[0], result[1], input_hooks - return result, None, input_hooks + return result[0], result[1] + return result, None class _DatasetInitializerHook(training.SessionRunHook): -- GitLab From ab8e195d2e0978c21234a5632d4fabf47535eda1 Mon Sep 17 00:00:00 2001 From: Yao Zhang Date: Mon, 27 Aug 2018 15:11:13 -0700 Subject: [PATCH 181/598] Open source graph analyzer. PiperOrigin-RevId: 210439649 --- tensorflow/core/grappler/graph_analyzer/BUILD | 139 ++ .../core/grappler/graph_analyzer/gen_node.cc | 148 ++ .../core/grappler/graph_analyzer/gen_node.h | 167 +++ .../grappler/graph_analyzer/gen_node_test.cc | 491 +++++++ .../grappler/graph_analyzer/graph_analyzer.cc | 341 +++++ .../grappler/graph_analyzer/graph_analyzer.h | 154 ++ .../graph_analyzer/graph_analyzer_test.cc | 569 ++++++++ .../graph_analyzer/graph_analyzer_tool.cc | 98 ++ .../graph_analyzer/graph_analyzer_tool.h | 31 + .../core/grappler/graph_analyzer/hash_tools.h | 47 + .../graph_analyzer/hash_tools_test.cc | 46 + .../core/grappler/graph_analyzer/map_tools.h | 46 + .../core/grappler/graph_analyzer/sig_node.cc | 453 ++++++ .../core/grappler/graph_analyzer/sig_node.h | 304 ++++ .../grappler/graph_analyzer/sig_node_test.cc | 1235 +++++++++++++++++ .../core/grappler/graph_analyzer/subgraph.cc | 235 ++++ .../core/grappler/graph_analyzer/subgraph.h | 189 +++ 
.../grappler/graph_analyzer/subgraph_test.cc | 348 +++++ .../grappler/graph_analyzer/test_tools.cc | 296 ++++ .../core/grappler/graph_analyzer/test_tools.h | 120 ++ tensorflow/python/BUILD | 14 + tensorflow/python/grappler/graph_analyzer.i | 26 + tensorflow/python/grappler/graph_analyzer.py | 46 + tensorflow/python/tensorflow.i | 1 + 24 files changed, 5544 insertions(+) create mode 100644 tensorflow/core/grappler/graph_analyzer/BUILD create mode 100644 tensorflow/core/grappler/graph_analyzer/gen_node.cc create mode 100644 tensorflow/core/grappler/graph_analyzer/gen_node.h create mode 100644 tensorflow/core/grappler/graph_analyzer/gen_node_test.cc create mode 100644 tensorflow/core/grappler/graph_analyzer/graph_analyzer.cc create mode 100644 tensorflow/core/grappler/graph_analyzer/graph_analyzer.h create mode 100644 tensorflow/core/grappler/graph_analyzer/graph_analyzer_test.cc create mode 100644 tensorflow/core/grappler/graph_analyzer/graph_analyzer_tool.cc create mode 100644 tensorflow/core/grappler/graph_analyzer/graph_analyzer_tool.h create mode 100644 tensorflow/core/grappler/graph_analyzer/hash_tools.h create mode 100644 tensorflow/core/grappler/graph_analyzer/hash_tools_test.cc create mode 100644 tensorflow/core/grappler/graph_analyzer/map_tools.h create mode 100644 tensorflow/core/grappler/graph_analyzer/sig_node.cc create mode 100644 tensorflow/core/grappler/graph_analyzer/sig_node.h create mode 100644 tensorflow/core/grappler/graph_analyzer/sig_node_test.cc create mode 100644 tensorflow/core/grappler/graph_analyzer/subgraph.cc create mode 100644 tensorflow/core/grappler/graph_analyzer/subgraph.h create mode 100644 tensorflow/core/grappler/graph_analyzer/subgraph_test.cc create mode 100644 tensorflow/core/grappler/graph_analyzer/test_tools.cc create mode 100644 tensorflow/core/grappler/graph_analyzer/test_tools.h create mode 100644 tensorflow/python/grappler/graph_analyzer.i create mode 100644 tensorflow/python/grappler/graph_analyzer.py diff --git 
a/tensorflow/core/grappler/graph_analyzer/BUILD b/tensorflow/core/grappler/graph_analyzer/BUILD new file mode 100644 index 0000000000..d56a08d3c8 --- /dev/null +++ b/tensorflow/core/grappler/graph_analyzer/BUILD @@ -0,0 +1,139 @@ +licenses(["notice"]) # Apache 2.0 + +load("//tensorflow:tensorflow.bzl", "tf_cc_test") + +cc_library( + name = "graph_analyzer_lib", + srcs = [ + "gen_node.cc", + "graph_analyzer.cc", + "sig_node.cc", + "subgraph.cc", + ], + hdrs = [ + "gen_node.h", + "graph_analyzer.h", + "hash_tools.h", + "map_tools.h", + "sig_node.h", + "subgraph.h", + ], + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:tensorflow", + "//tensorflow/core/grappler:op_types", + "//tensorflow/core/grappler:utils", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + ], +) + +cc_library( + name = "graph_analyzer_tool", + srcs = ["graph_analyzer_tool.cc"], + hdrs = ["graph_analyzer_tool.h"], + visibility = ["//visibility:public"], + deps = [ + ":graph_analyzer_lib", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:tensorflow", + "//tensorflow/core/grappler:grappler_item", + "@com_google_absl//absl/strings", + ], +) + +cc_library( + name = "test_tools_lib", + testonly = 1, + srcs = [ + "test_tools.cc", + ], + hdrs = [ + "test_tools.h", + ], + visibility = ["//visibility:public"], + deps = [ + ":graph_analyzer_lib", + "//tensorflow/core:framework", + "//tensorflow/core:tensorflow", + "//tensorflow/core/grappler:op_types", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + ], +) + +tf_cc_test( + name = "hash_tools_test", + testonly = 1, + srcs = [ + "hash_tools_test.cc", + ], + deps = [ + ":graph_analyzer_lib", + "@com_google_googletest//:gtest_main", + ], +) + +tf_cc_test( + name = "gen_node_test", + testonly = 1, + srcs = [ 
+ "gen_node_test.cc", + ], + deps = [ + ":graph_analyzer_lib", + ":test_tools_lib", + "@com_google_absl//absl/memory", + "@com_google_googletest//:gtest_main", + ], +) + +tf_cc_test( + name = "sig_node_test", + testonly = 1, + srcs = [ + "sig_node_test.cc", + ], + deps = [ + ":graph_analyzer_lib", + ":test_tools_lib", + "//tensorflow/core/grappler:utils", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings:str_format", + "@com_google_googletest//:gtest_main", + ], +) + +tf_cc_test( + name = "graph_analyzer_test", + testonly = 1, + srcs = [ + "graph_analyzer_test.cc", + ], + deps = [ + ":graph_analyzer_lib", + ":test_tools_lib", + "@com_google_absl//absl/memory", + "@com_google_googletest//:gtest_main", + ], +) + +tf_cc_test( + name = "subgraph_test", + testonly = 1, + srcs = [ + "subgraph_test.cc", + ], + deps = [ + ":graph_analyzer_lib", + ":test_tools_lib", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings:str_format", + "@com_google_googletest//:gtest_main", + ], +) diff --git a/tensorflow/core/grappler/graph_analyzer/gen_node.cc b/tensorflow/core/grappler/graph_analyzer/gen_node.cc new file mode 100644 index 0000000000..f8c15fd50e --- /dev/null +++ b/tensorflow/core/grappler/graph_analyzer/gen_node.cc @@ -0,0 +1,148 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/grappler/graph_analyzer/gen_node.h" +#include "absl/memory/memory.h" +#include "absl/strings/str_format.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/grappler/graph_analyzer/hash_tools.h" +#include "tensorflow/core/grappler/op_types.h" +#include "tensorflow/core/grappler/utils.h" + +namespace tensorflow { +namespace grappler { +namespace graph_analyzer { + +GenNode::GenNode(const NodeDef* node) : node_(node), op_(nullptr) {} + +Status GenNode::BuildGraphInMap(const GraphDef& source, GenNodeMap* map) { + for (const auto& n : source.node()) { + const string& name = n.name(); + if (map->find(name) != map->end()) { + // This error code looks more meaningful than ALREADY_EXISTS. + return Status(error::INVALID_ARGUMENT, + "Duplicate node name '" + name + "'."); + } + (*map)[name] = absl::make_unique(&n); + } + // Now parse the links. + for (const auto& mapit : *map) { + Status st = mapit.second->ParseInputs(map); + if (!st.ok()) { + return st; + } + } + return Status::OK(); +} + +Status GenNode::ParseInputs(const GenNodeMap* map) { + all_inputs_or_none_ = false; + Status st = OpRegistry::Global()->LookUpOpDef(opcode(), &op_); + if (!st.ok()) { + return Status( + error::INVALID_ARGUMENT, + absl::StrFormat("Node '%s' contains an undefined operation '%s': %s", + name(), opcode(), st.error_message())); + } + + int n_inputs = node_->input_size(); + + int n_named_inputs = op_->input_arg_size(); + + int n_multi_inputs = 0; + for (const auto& inarg : op_->input_arg()) { + if (!inarg.number_attr().empty() || !inarg.type_list_attr().empty()) { + ++n_multi_inputs; + } + } + bool is_commutative = grappler::IsCommutative(*node_); + + if (n_multi_inputs > 1 || (n_multi_inputs > 0 && n_named_inputs > 1)) { + // Can't handle more than one multi-input at a time. 
+ // And can't handle the commutativeness of only some arguments + // rather than all of them. + is_commutative = false; + } + + if (is_commutative) { + // If truly commutative, can treat all the inputs as one multi-input. + // It's possible to just treat the commutative nodes as AllInputsOrNone + // but (1) this way is a bit more efficient and (2) I want to preserve this + // more efficient code path that does all-or-none by a single input and + // perhaps extend its use in the future. + n_named_inputs = 1; + all_inputs_or_none_ = false; + } else if (n_multi_inputs > 0) { + all_inputs_or_none_ = true; + } + + for (int i = 0; i < n_inputs; ++i) { + int other_position; + string other_name = ParseNodeName(node_->input(i), &other_position); + auto other_it = map->find(other_name); + if (other_it == map->end()) { + return Status( + error::INVALID_ARGUMENT, + absl::StrFormat( + "Node '%s' input %d refers to a non-existing node '%s'.", name(), + i, other_name)); + } + GenNode* other_node = other_it->second.get(); + + int this_position = other_position < 0 ? -1 : (is_commutative ? 0 : i); + + if (this_position >= 0 && n_multi_inputs == 0 && + this_position >= n_named_inputs) { + return Status( + error::INVALID_ARGUMENT, + absl::StrFormat( + "Node '%s' has a non-control input from '%s' at index %d but its " + "operation '%s' defines only %d inputs.", + name(), other_name, this_position, op_->name(), n_named_inputs)); + } + + Port this_port(/*inbound=*/true, this_position); + Port other_port(/*inbound=*/false, other_position); + + links_[this_port].emplace_back(LinkTarget(other_node, other_port)); + other_node->links_[other_port].emplace_back(LinkTarget(this, this_port)); + } + return Status::OK(); +} + +bool GenNode::IsMultiInput(Port port) const { + if (!port.IsInbound()) { + return false; + } + auto it = links_.find(port); + if (it == links_.end()) { + return false; // Shouldn't happen. 
+  }
+  return (it->second.size() > 1);
+}
+
+// Renders the port for printing: "i" (inbound) or "o" (outbound), followed by
+// "C" for a control port or by the decimal port id otherwise, e.g. "i0", "oC".
+GenNode::Port::operator string() const {
+  string result = this->IsInbound() ? "i" : "o";
+  if (this->IsControl()) {
+    result.append("C");
+  } else {
+    result.append(absl::StrFormat("%d", this->Id()));
+  }
+  return result;
+}
+
+}  // end namespace graph_analyzer
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/graph_analyzer/gen_node.h b/tensorflow/core/grappler/graph_analyzer/gen_node.h
new file mode 100644
index 0000000000..faec9ecad8
--- /dev/null
+++ b/tensorflow/core/grappler/graph_analyzer/gen_node.h
@@ -0,0 +1,167 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_GEN_NODE_H_
+#define TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_GEN_NODE_H_
+
+#include
+#include
+#include
+
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/op_def.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/protobuf/meta_graph.pb.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace graph_analyzer {
+
+class GenNode;
+
+// To find nodes by name.
+using GenNodeMap = std::unordered_map>;
+
+// One node in the graph, in the form convenient for traversal and generation of
+// subgraphs. It refers to the original NodeDef protobuf for most information
+// and adds the extra enrichment.
+//
+// The graph building is 2-stage: first match a GenNode with each NodeDef and
+// collect them into a map that finds them by name, then process the map,
+// deep-parse the underlying NodeDefs and connect the GenNodes together.
+class GenNode {
+ public:
+  // Will keep the pointer, so the underlying object must not be deleted while
+  // GenNode is alive.
+  explicit GenNode(const NodeDef* node);
+
+  // Access wrappers that forward to the underlying NodeDef.
+  const string& name() const { return node_->name(); }
+  const string& opcode() const { return node_->op(); }
+  const NodeDef* node_def() const { return node_; }
+
+  // Parse the inputs of this node and update the map accordingly, creating the
+  // links (i.e. edges, connections between nodes) in itself and in the nodes
+  // it's linked to (the map itself is unchanged, only the nodes in it are
+  // updated).
+  Status ParseInputs(const GenNodeMap* map);
+
+  // Does the full 2-stage build of the graph. The map should be initially
+  // empty. The map keeps pointers to the nodes in source, so the source must
+  // not be destroyed before the map.
+  static Status BuildGraphInMap(const GraphDef& source, GenNodeMap* map);
+
+  // The enrichment that constitutes the point of this class.
+
+  // Representation of a connection on a node.
+  //
+  // The port is stored as a single integer: twice the port id, plus 1 if the
+  // port is inbound. A negative id therefore yields a negative encoded value,
+  // which is what IsControl() tests.
+  class Port {
+   public:
+    // A port may be inbound or outbound.
+    // Negative ids (canonically -1) mean a control port.
+    //
+    // The id is doubled by multiplication rather than with `id << 1`:
+    // left-shifting a negative signed value is undefined behavior before
+    // C++20, and control ports always have a negative id.
+    Port(bool inbound, int32_t id) : value_(id * 2) {
+      if (inbound) {
+        value_ |= 1;
+      }
+    }
+    Port(const Port&) = default;
+    Port& operator=(const Port&) = default;
+
+    // True if the low (direction) bit is set.
+    bool IsInbound() const { return (value_ & 0x1); }
+
+    // True if the port was built from a negative id.
+    bool IsControl() const { return (value_ < 0); }
+
+    // Returns the port id, dropping the direction bit.
+    int32_t Id() const {
+      // Arithmetic shift preserves the sign.
+      return (value_ >> 1);
+    }
+
+    // Integer type used to represent the encoded port value.
+    using IntPort = int32_t;
+
+    // Returns the encoded form of this port, so that it can be used
+    // as various map indexes.
+    IntPort Encoded() const { return value_; }
+
+    // Rebuilds a port from a value previously returned by Encoded().
+    static Port Decode(IntPort encoded) { return Port(encoded); }
+
+    // Ports compare by their encoded value.
+    bool operator==(const Port& other) const { return value_ == other.value_; }
+    bool operator<(const Port& other) const { return value_ < other.value_; }
+
+    // Hash functor that hashes the encoded value, for use in unordered
+    // containers keyed by Port.
+    struct Hasher {
+      size_t operator()(const Port& port) const noexcept {
+        return hasher(port.Encoded());
+      }
+      std::hash hasher;
+    };
+
+    // Convenient for printing. I've really wanted it to be implicit but
+    // ClangTidy insists on making it explicit.
+    explicit operator string() const;
+
+   private:
+    // Wraps an already-encoded value; used by Decode().
+    explicit Port(IntPort value) : value_(value) {}
+
+    IntPort value_;
+  };
+
+  struct LinkTarget {
+    GenNode* node;  // Node where this link points.
+    Port port;      // Port on the remote side of this link.
+
+    LinkTarget(GenNode* a_node, Port a_port) : node(a_node), port(a_port) {}
+  };
+  // All the links that are connected to the same port of this node
+  // are collected in one vector. A link is an edge of the graph that connects
+  // 2 nodes. Each of the connected nodes has its own perspective on the link,
+  // seeing its local port, remote port and the remote node. The direction of
+  // the link is encoded in the ports, one port is always incoming and another
+  // one outgoing.
+  using LinkTargetVector = std::vector;
+  // Both inputs and outputs are stored in the same map.
+  using LinkMap = std::unordered_map;
+
+  // Access to the link map.
+  const LinkMap& links() const { return links_; }
+
+  // Check whether the port is an input (including the controls) with multiple
+  // connections. Such inputs get handled in a special way when building the
+  // subgraphs, in an "all or nothing" fashion. Always false for outbound
+  // ports.
+  bool IsMultiInput(Port port) const;
+
+  // When building the subgraphs, must include either all non-control inputs of
+  // this node into the subgraph or none of them. ParseInputs() sets this for
+  // an op that has multi-input arguments and is not treated as commutative.
+  // A commutative op instead gets all its inputs linked to port 0 (one
+  // multi-input port) and reports false here.
+  bool AllInputsOrNone() const { return all_inputs_or_none_; }
+
+ private:
+  const NodeDef* node_;
+  // Becomes valid only after ParseInputs().
+  const OpDef* op_;
+
+  // The opcode has a complicated structure of input args, with multi-input args
+  // that are not commutative. This means that to make sense, the subgraphs that
+  // include this node must also include either all its inputs or none of them.
+  bool all_inputs_or_none_ = false;
+
+  LinkMap links_;
+};
+
+}  // end namespace graph_analyzer
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_GEN_NODE_H_
diff --git a/tensorflow/core/grappler/graph_analyzer/gen_node_test.cc b/tensorflow/core/grappler/graph_analyzer/gen_node_test.cc
new file mode 100644
index 0000000000..d77daf7849
--- /dev/null
+++ b/tensorflow/core/grappler/graph_analyzer/gen_node_test.cc
@@ -0,0 +1,491 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/graph_analyzer/gen_node.h"
+
+#include
+#include
+#include "absl/memory/memory.h"
+#include "tensorflow/core/grappler/graph_analyzer/test_tools.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace graph_analyzer {
+namespace test {
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::Ne;
+
+// Checks that a Port reports the direction, control-ness and id it was built
+// with, and that all three survive an Encoded()/Decode() round trip.
+TEST(GenNodeTest, Port) {
+  {
+    // Inbound data port.
+    GenNode::Port p(true, 100);
+    EXPECT_THAT(p.IsInbound(), Eq(true));
+    EXPECT_THAT(p.IsControl(), Eq(false));
+    EXPECT_THAT(p.Id(), Eq(100));
+    GenNode::Port p2 = GenNode::Port::Decode(p.Encoded());
+    EXPECT_THAT(p2.IsInbound(), Eq(true));
+    EXPECT_THAT(p2.IsControl(), Eq(false));
+    EXPECT_THAT(p2.Id(), Eq(100));
+  }
+  {
+    // Outbound data port with id 0.
+    GenNode::Port p(false, 0);
+    EXPECT_THAT(p.IsInbound(), Eq(false));
+    EXPECT_THAT(p.IsControl(), Eq(false));
+    EXPECT_THAT(p.Id(), Eq(0));
+    GenNode::Port p2 = GenNode::Port::Decode(p.Encoded());
+    EXPECT_THAT(p2.IsInbound(), Eq(false));
+    EXPECT_THAT(p2.IsControl(), Eq(false));
+    EXPECT_THAT(p2.Id(), Eq(0));
+  }
+  {
+    // Inbound control port with a non-canonical negative id.
+    GenNode::Port p(true, -100);
+    EXPECT_THAT(p.IsInbound(), Eq(true));
+    EXPECT_THAT(p.IsControl(), Eq(true));
+    EXPECT_THAT(p.Id(), Eq(-100));
+    GenNode::Port p2 = GenNode::Port::Decode(p.Encoded());
+    EXPECT_THAT(p2.IsInbound(), Eq(true));
+    EXPECT_THAT(p2.IsControl(), Eq(true));
+    EXPECT_THAT(p2.Id(), Eq(-100));
+  }
+  {
+    // Outbound control port with the canonical id -1.
+    GenNode::Port p(false, -1);
+    EXPECT_THAT(p.IsInbound(), Eq(false));
+    EXPECT_THAT(p.IsControl(), Eq(true));
+    EXPECT_THAT(p.Id(), Eq(-1));
+    GenNode::Port p2 = GenNode::Port::Decode(p.Encoded());
+    EXPECT_THAT(p2.IsInbound(), Eq(false));
+    EXPECT_THAT(p2.IsControl(), Eq(true));
+    EXPECT_THAT(p2.Id(), Eq(-1));
+  }
+}
+
+// A node without inputs parses successfully and ends up with no links.
+TEST(GenNodeTest, ParseNodeNoInputs) {
+  GenNodeMap map;
+  NodeDef node1 = MakeNodeConst("node1");
+  map["node1"] = absl::make_unique(&node1);
+
+  auto gn1 = map["node1"].get();
+  ASSERT_THAT(gn1->ParseInputs(&map), Eq(Status::OK()));
+  EXPECT_THAT(DumpLinkMap(gn1->links()), ElementsAre());
+}
+
+// A general operation, and a control link.
+TEST(GenNodeTest, ParseNodeWithControl) {
+  GenNodeMap map;
+
+  NodeDef node1 = MakeNodeConst("node1");
+  map["node1"] = absl::make_unique(&node1);
+
+  NodeDef node2 = MakeNodeConst("node2");
+  map["node2"] = absl::make_unique(&node2);
+
+  NodeDef node3 = MakeNodeSub("node3", "node1", "node2");
+  node3.add_input("^node1");  // The control link.
+  node3.add_input("^node2");  // The control link.
+  map["node3"] = absl::make_unique(&node3);
+
+  auto gn1 = map["node1"].get();
+  auto gn2 = map["node2"].get();
+  auto gn3 = map["node3"].get();
+  ASSERT_THAT(gn3->ParseInputs(&map), Eq(Status::OK()));
+  // clang-format off
+  EXPECT_THAT(DumpLinkMap(gn1->links()), ElementsAre(
+      "o0: node3[i0]",
+      "oC: node3[iC]"
+  ));
+  EXPECT_THAT(DumpLinkMap(gn2->links()), ElementsAre(
+      "o0: node3[i1]",
+      "oC: node3[iC]"
+  ));
+  EXPECT_THAT(DumpLinkMap(gn3->links()), ElementsAre(
+      "i0: node1[o0]",
+      "i1: node2[o0]",
+      "iC: node1[oC], node2[oC]"
+  ));
+  // clang-format on
+
+  // Data port 0 has a single link.
+  EXPECT_THAT(gn3->IsMultiInput(GenNode::Port(true, 0)), Eq(false));
+
+  // This is a multi-control-input.
+  EXPECT_THAT(gn3->IsMultiInput(GenNode::Port(true, -1)), Eq(true));
+
+  EXPECT_FALSE(gn1->AllInputsOrNone());
+  EXPECT_FALSE(gn2->AllInputsOrNone());
+  EXPECT_FALSE(gn3->AllInputsOrNone());
+}
+
+// Commutative nodes are treated as having a single input,
+// because their inputs are equivalent.
+TEST(GenNodeTest, ParseNodeCommutative) {
+  GenNodeMap map;
+
+  NodeDef node1 = MakeNodeConst("node1");
+  map["node1"] = absl::make_unique(&node1);
+
+  NodeDef node2 = MakeNodeConst("node2");
+  map["node2"] = absl::make_unique(&node2);
+
+  // TODO(babkin): grappler::IsCommutative() should return true for Add but
+  // apparently doesn't. So use Mul in the meantime.
+  NodeDef node3 = MakeNodeMul("node3", "node1", "node2");
+  map["node3"] = absl::make_unique(&node3);
+
+  auto gn1 = map["node1"].get();
+  auto gn2 = map["node2"].get();
+  auto gn3 = map["node3"].get();
+  ASSERT_THAT(gn3->ParseInputs(&map), Eq(Status::OK()));
+  // Both inputs are expected on port i0 of node3.
+  // clang-format off
+  EXPECT_THAT(DumpLinkMap(gn1->links()), ElementsAre(
+      "o0: node3[i0]"
+  ));
+  EXPECT_THAT(DumpLinkMap(gn2->links()), ElementsAre(
+      "o0: node3[i0]"
+  ));
+  EXPECT_THAT(DumpLinkMap(gn3->links()), ElementsAre(
+      "i0: node1[o0], node2[o0]"
+  ));
+  // clang-format on
+
+  EXPECT_THAT(gn3->IsMultiInput(GenNode::Port(true, 0)), Eq(true));
+
+  EXPECT_FALSE(gn3->AllInputsOrNone());
+}
+
+// A commutative op with a multi-input argument (AddN): all inputs are expected
+// on one multi-input port, and the node is not all-inputs-or-none.
+TEST(GenNodeTest, ParseNodeMultiInputCommutative) {
+  GenNodeMap map;
+
+  NodeDef node1 = MakeNodeConst("node1");
+  map["node1"] = absl::make_unique(&node1);
+
+  NodeDef node2 = MakeNodeConst("node2");
+  map["node2"] = absl::make_unique(&node2);
+
+  NodeDef node3 = MakeNodeAddN("node3", "node1", "node2");
+  map["node3"] = absl::make_unique(&node3);
+
+  auto gn1 = map["node1"].get();
+  auto gn2 = map["node2"].get();
+  auto gn3 = map["node3"].get();
+  ASSERT_THAT(gn3->ParseInputs(&map), Eq(Status::OK()));
+  // clang-format off
+  EXPECT_THAT(DumpLinkMap(gn1->links()), ElementsAre(
+      "o0: node3[i0]"
+  ));
+  EXPECT_THAT(DumpLinkMap(gn2->links()), ElementsAre(
+      "o0: node3[i0]"
+  ));
+  EXPECT_THAT(DumpLinkMap(gn3->links()), ElementsAre(
+      "i0: node1[o0], node2[o0]"
+  ));
+  // clang-format on
+
+  // An outbound port never counts as a multi-input.
+  EXPECT_THAT(gn2->IsMultiInput(GenNode::Port(false, 0)), Eq(false));
+  // This is a multi-input.
+  EXPECT_THAT(gn3->IsMultiInput(GenNode::Port(true, 0)), Eq(true));
+
+  EXPECT_FALSE(gn3->AllInputsOrNone());
+}
+
+// A non-commutative op with a multi-input argument (ShapeN): each input is
+// expected on its own port and the node is all-inputs-or-none.
+TEST(GenNodeTest, ParseNodeMultiInputNotCommutative) {
+  GenNodeMap map;
+
+  NodeDef node1 = MakeNodeConst("node1");
+  map["node1"] = absl::make_unique(&node1);
+
+  NodeDef node2 = MakeNodeConst("node2");
+  map["node2"] = absl::make_unique(&node2);
+
+  NodeDef node3 = MakeNodeShapeN("node3", "node1", "node2");
+  map["node3"] = absl::make_unique(&node3);
+
+  auto gn1 = map["node1"].get();
+  auto gn2 = map["node2"].get();
+  auto gn3 = map["node3"].get();
+  ASSERT_THAT(gn3->ParseInputs(&map), Eq(Status::OK()));
+  // clang-format off
+  EXPECT_THAT(DumpLinkMap(gn1->links()), ElementsAre(
+      "o0: node3[i0]"
+  ));
+  EXPECT_THAT(DumpLinkMap(gn2->links()), ElementsAre(
+      "o0: node3[i1]"
+  ));
+  EXPECT_THAT(DumpLinkMap(gn3->links()), ElementsAre(
+      "i0: node1[o0]",
+      "i1: node2[o0]"
+  ));
+  // clang-format on
+
+  // Non-commutative multi-input doesn't count.
+  EXPECT_THAT(gn3->IsMultiInput(GenNode::Port(true, 0)), Eq(false));
+  EXPECT_TRUE(gn3->AllInputsOrNone());
+}
+
+// Same expectations as above for IdentityN.
+TEST(GenNodeTest, ParseNodeMultiInputList) {
+  GenNodeMap map;
+
+  NodeDef node1 = MakeNodeConst("node1");
+  map["node1"] = absl::make_unique(&node1);
+
+  NodeDef node2 = MakeNodeConst("node2");
+  map["node2"] = absl::make_unique(&node2);
+
+  NodeDef node3 = MakeNodeIdentityN("node3", "node1", "node2");
+  map["node3"] = absl::make_unique(&node3);
+
+  auto gn1 = map["node1"].get();
+  auto gn2 = map["node2"].get();
+  auto gn3 = map["node3"].get();
+  ASSERT_THAT(gn3->ParseInputs(&map), Eq(Status::OK()));
+  // clang-format off
+  EXPECT_THAT(DumpLinkMap(gn1->links()), ElementsAre(
+      "o0: node3[i0]"
+  ));
+  EXPECT_THAT(DumpLinkMap(gn2->links()), ElementsAre(
+      "o0: node3[i1]"
+  ));
+  EXPECT_THAT(DumpLinkMap(gn3->links()), ElementsAre(
+      "i0: node1[o0]",
+      "i1: node2[o0]"
+  ));
+  // clang-format on
+
+  // Non-commutative multi-input doesn't count.
+  EXPECT_THAT(gn3->IsMultiInput(GenNode::Port(true, 0)), Eq(false));
+  EXPECT_TRUE(gn3->AllInputsOrNone());
+}
+
+// QuantizedConcat with four inputs: every input is expected on its own port
+// and the node is all-inputs-or-none.
+TEST(GenNodeTest, ParseNodeMultiMultiInput) {
+  GenNodeMap map;
+
+  NodeDef node1 = MakeNodeConst("node1");
+  map["node1"] = absl::make_unique(&node1);
+
+  NodeDef node2 = MakeNodeConst("node2");
+  map["node2"] = absl::make_unique(&node2);
+
+  NodeDef node3 = MakeNodeConst("node3");
+  map["node3"] = absl::make_unique(&node3);
+
+  NodeDef node4 = MakeNodeConst("node4");
+  map["node4"] = absl::make_unique(&node4);
+
+  NodeDef node5 =
+      MakeNodeQuantizedConcat("node5", "node1", "node2", "node3", "node4");
+  map["node5"] = absl::make_unique(&node5);
+
+  auto gn1 = map["node1"].get();
+  auto gn2 = map["node2"].get();
+  auto gn3 = map["node3"].get();
+  auto gn4 = map["node4"].get();
+  auto gn5 = map["node5"].get();
+  ASSERT_THAT(gn5->ParseInputs(&map), Eq(Status::OK()));
+  // clang-format off
+  EXPECT_THAT(DumpLinkMap(gn1->links()), ElementsAre(
+      "o0: node5[i0]"
+  ));
+  EXPECT_THAT(DumpLinkMap(gn2->links()), ElementsAre(
+      "o0: node5[i1]"
+  ));
+  EXPECT_THAT(DumpLinkMap(gn3->links()), ElementsAre(
+      "o0: node5[i2]"
+  ));
+  EXPECT_THAT(DumpLinkMap(gn4->links()), ElementsAre(
+      "o0: node5[i3]"
+  ));
+  EXPECT_THAT(DumpLinkMap(gn5->links()), ElementsAre(
+      "i0: node1[o0]",
+      "i1: node2[o0]",
+      "i2: node3[o0]",
+      "i3: node4[o0]"
+  ));
+  // clang-format on
+
+  // Non-commutative multi-input doesn't count.
+  EXPECT_THAT(gn5->IsMultiInput(GenNode::Port(true, 1)), Eq(false));
+  EXPECT_THAT(gn5->IsMultiInput(GenNode::Port(true, 2)), Eq(false));
+  EXPECT_TRUE(gn5->AllInputsOrNone());
+}
+
+// Inputs that name a specific output ("node3:1") are linked to that output
+// port of the source node.
+TEST(GenNodeTest, ParseNodeMultiOutput) {
+  GenNodeMap map;
+
+  NodeDef node1 = MakeNodeConst("node1");
+  map["node1"] = absl::make_unique(&node1);
+
+  NodeDef node2 = MakeNodeConst("node2");
+  map["node2"] = absl::make_unique(&node2);
+
+  NodeDef node3 = MakeNodeBroadcastGradientArgs("node3", "node1", "node2");
+  map["node3"] = absl::make_unique(&node3);
+
+  NodeDef node4 = MakeNodeSub("node4", "node3:1", "node3:0");
+  map["node4"] = absl::make_unique(&node4);
+
+  auto gn4 = map["node4"].get();
+  ASSERT_THAT(gn4->ParseInputs(&map), Eq(Status::OK()));
+  // clang-format off
+  EXPECT_THAT(DumpLinkMap(gn4->links()), ElementsAre(
+      "i0: node3[o1]",
+      "i1: node3[o0]"
+  ));
+  // clang-format on
+}
+
+// An op name that is not in the global op registry is reported as an error
+// that embeds the registry's own lookup error message.
+TEST(GenNodeTest, ParseNodeUndefinedOp) {
+  GenNodeMap map;
+  NodeDef node1;
+  node1.set_name("node1");
+  node1.set_op("Zzzx");
+
+  map["node1"] = absl::make_unique(&node1);
+
+  const OpDef* opdef;
+  Status nested_error = OpRegistry::Global()->LookUpOpDef("Zzzx", &opdef);
+
+  auto gn = map["node1"].get();
+  ASSERT_THAT(
+      gn->ParseInputs(&map),
+      Eq(Status(error::INVALID_ARGUMENT,
+                "Node 'node1' contains an undefined operation 'Zzzx': " +
+                    nested_error.error_message())));
+}
+
+// A non-control input at an index beyond the number of inputs that the op
+// defines is rejected.
+TEST(GenNodeTest, ParseNodeUnexpectedInputs) {
+  GenNodeMap map;
+
+  // Const defines no inputs, so any data input is one too many.
+  NodeDef node1 = MakeNodeConst("node1");
+  map["node1"] = absl::make_unique(&node1);
+  node1.add_input("node1");
+
+  auto gn1 = map["node1"].get();
+  EXPECT_THAT(gn1->ParseInputs(&map),
+              Eq(Status(error::INVALID_ARGUMENT,
+                        "Node 'node1' has a non-control "
+                        "input from 'node1' at index 0 but its operation "
+                        "'Const' defines only 0 inputs.")));
+
+  NodeDef node2 = MakeNodeConst("node2");
+  map["node2"] = absl::make_unique(&node2);
+
+  // Sub defines two inputs; a third one is rejected.
+  NodeDef node3 = MakeNodeSub("node3", "node1", "node2");
+  map["node3"] = absl::make_unique(&node3);
+  node3.add_input("node1");
+
+  auto gn3 = map["node3"].get();
+  EXPECT_THAT(gn3->ParseInputs(&map),
+              Eq(Status(error::INVALID_ARGUMENT,
+                        "Node 'node3' has a non-control "
+                        "input from 'node1' at index 2 but its operation "
+                        "'Sub' defines only 2 inputs.")));
+}
+
+// Even if an opcode defines no inputs, the node may still accept the control
+// inputs.
+TEST(GenNodeTest, ParseNodeControlInputsAlwaysOk) {
+  GenNodeMap map;
+  NodeDef node1 = MakeNodeConst("node1");
+  map["node1"] = absl::make_unique(&node1);
+  node1.add_input("^node1");
+  auto gn1 = map["node1"].get();
+  ASSERT_THAT(gn1->ParseInputs(&map), Eq(Status::OK()));
+  // clang-format off
+  EXPECT_THAT(DumpLinkMap(gn1->links()), ElementsAre(
+      "iC: node1[oC]",
+      "oC: node1[iC]"
+  ));
+  // clang-format on
+}
+
+// An input that names a node absent from the map is rejected; node2 (input 0)
+// is the first such input here.
+TEST(GenNodeTest, ParseNodeInvalidInput) {
+  GenNodeMap map;
+  NodeDef node1 = MakeNodeAddN("node1", "node2", "node3");
+  map["node1"] = absl::make_unique(&node1);
+  node1.add_input("node1");
+  auto gn1 = map["node1"].get();
+  ASSERT_THAT(
+      gn1->ParseInputs(&map),
+      Eq(Status(
+          error::INVALID_ARGUMENT,
+          "Node 'node1' input 0 refers to a non-existing node 'node2'.")));
+}
+
+// Builds the whole map from a GraphDef and checks the links of every node.
+TEST(GenNodeTest, BuildGraphInMap) {
+  GraphDef graph;
+  // A topology with a loop.
+  (*graph.add_node()) = MakeNodeConst("node1");
+  (*graph.add_node()) = MakeNodeSub("node2", "node3:1", "node3:0");
+  (*graph.add_node()) =
+      MakeNodeBroadcastGradientArgs("node3", "node1", "node2");
+
+  GenNodeMap map;
+  ASSERT_THAT(GenNode::BuildGraphInMap(graph, &map), Eq(Status::OK()));
+  ASSERT_THAT(map.find("node1"), Ne(map.end()));
+  ASSERT_THAT(map.find("node2"), Ne(map.end()));
+  ASSERT_THAT(map.find("node3"), Ne(map.end()));
+
+  EXPECT_THAT(map["node1"]->name(), Eq("node1"));
+  EXPECT_THAT(map["node2"]->name(), Eq("node2"));
+  EXPECT_THAT(map["node3"]->name(), Eq("node3"));
+
+  // clang-format off
+  EXPECT_THAT(DumpLinkMap(map["node1"]->links()), ElementsAre(
+      "o0: node3[i0]"
+  ));
+  EXPECT_THAT(DumpLinkMap(map["node2"]->links()), ElementsAre(
+      "i0: node3[o1]",
+      "i1: node3[o0]",
+      "o0: node3[i1]"
+  ));
+  EXPECT_THAT(DumpLinkMap(map["node3"]->links()), ElementsAre(
+      "i0: node1[o0]",
+      "i1: node2[o0]",
+      "o0: node2[i1]",
+      "o1: node2[i0]"
+  ));
+  // clang-format on
+}
+
+// Two nodes with the same name in one graph are rejected.
+TEST(GenNodeTest, BuildGraphInMapDuplicateNode) {
+  GraphDef graph;
+  (*graph.add_node()) = MakeNodeConst("node1");
+  (*graph.add_node()) = MakeNodeConst("node1");
+  GenNodeMap map;
+  ASSERT_THAT(
+      GenNode::BuildGraphInMap(graph, &map),
+      Eq(Status(error::INVALID_ARGUMENT, "Duplicate node name 'node1'.")));
+}
+
+// An error from parsing a node's inputs is returned by BuildGraphInMap().
+TEST(GenNodeTest, BuildGraphInMapParseError) {
+  GraphDef graph;
+  // node2 refers to node3, which is not defined in this graph.
+  (*graph.add_node()) = MakeNodeConst("node1");
+  (*graph.add_node()) = MakeNodeSub("node2", "node3:1", "node3:0");
+
+  GenNodeMap map;
+  ASSERT_THAT(
+      GenNode::BuildGraphInMap(graph, &map),
+      Eq(Status(
+          error::INVALID_ARGUMENT,
+          "Node 'node2' input 0 refers to a non-existing node 'node3'.")));
+}
+
+}  // end namespace
+}  // end namespace test
+}  // end namespace graph_analyzer
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/graph_analyzer/graph_analyzer.cc b/tensorflow/core/grappler/graph_analyzer/graph_analyzer.cc
new file mode 100644
index 0000000000..f3796fcf86
--- /dev/null
+++ b/tensorflow/core/grappler/graph_analyzer/graph_analyzer.cc
@@ -0,0 +1,341 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include
+#include
+
+#include "absl/memory/memory.h"
+#include "absl/strings/str_format.h"
+#include "tensorflow/core/grappler/graph_analyzer/gen_node.h"
+#include "tensorflow/core/grappler/graph_analyzer/graph_analyzer.h"
+#include "tensorflow/core/grappler/graph_analyzer/sig_node.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace graph_analyzer {
+
+// Stores the graph and the size of the subgraphs to look for.
+GraphAnalyzer::GraphAnalyzer(const GraphDef& graph, int subgraph_size)
+    : graph_(graph), subgraph_size_(subgraph_size) {}
+
+GraphAnalyzer::~GraphAnalyzer() {}
+
+// Runs the whole analysis: builds the node map, finds the subgraphs of the
+// requested size, drops the invalid ones and collates the rest by signature.
+// Returns the first error encountered, or OK.
+Status GraphAnalyzer::Run() {
+  // The signature computation code would detect this too, but better
+  // to report it up front than spend time computing all the graphs first.
+  if (subgraph_size_ > Signature::kMaxGraphSize) {
+    return Status(error::INVALID_ARGUMENT,
+                  absl::StrFormat("Subgraphs of %d nodes are not supported, "
+                                  "the maximal supported node count is %d.",
+                                  subgraph_size_, Signature::kMaxGraphSize));
+  }
+
+  Status st = BuildMap();
+  if (!st.ok()) {
+    return st;
+  }
+
+  FindSubgraphs();
+  DropInvalidSubgraphs();
+  st = CollateResult();
+  if (!st.ok()) {
+    return st;
+  }
+
+  return Status::OK();
+}
+
+// Rebuilds nodes_ from scratch out of graph_.
+Status GraphAnalyzer::BuildMap() {
+  nodes_.clear();
+  return GenNode::BuildGraphInMap(graph_, &nodes_);
+}
+
+// Fills result_ with the subgraphs of subgraph_size_ nodes. It starts from
+// every single node and keeps extending the partial subgraphs queued in todo_
+// until the queue is empty. Leaves result_ empty if subgraph_size_ < 1.
+void GraphAnalyzer::FindSubgraphs() {
+  result_.clear();
+
+  if (subgraph_size_ < 1) {
+    return;
+  }
+
+  partial_.clear();
+  todo_.clear();  // Just in case.
+
+  // Start with all subgraphs of size 1.
+  const Subgraph::Identity empty_parent;
+  for (const auto& node : nodes_) {
+    if (subgraph_size_ == 1) {
+      result_.ExtendParent(empty_parent, node.second.get());
+    } else {
+      // At this point ExtendParent() is guaranteed to not return nullptr.
+      todo_.push_back(partial_.ExtendParent(empty_parent, node.second.get()));
+    }
+  }
+
+  // Then extend the subgraphs until no more extensions are possible.
+  while (!todo_.empty()) {
+    ExtendSubgraph(todo_.front());
+    todo_.pop_front();
+  }
+
+  partial_.clear();
+}
+
+// Walks all the links of the nodes in the parent subgraph and tries to grow
+// the subgraph across each of them. Links that touch an all-inputs-or-none
+// node or a multi-input port are grown as a group through the *AllOrNone()
+// helpers; every other link adds just its neighbor node.
+void GraphAnalyzer::ExtendSubgraph(Subgraph* parent) {
+  // True if adding one more node brings the subgraph to the requested size, in
+  // which case single-node extensions go to result_ instead of partial_.
+  bool will_complete = (parent->id().size() + 1 == subgraph_size_);
+  SubgraphPtrSet& sg_set = will_complete ? result_ : partial_;
+
+  // Remembers the node most recently extended all-or-none, so that the
+  // extension is not repeated for each inbound port of that node.
+  const GenNode* last_all_or_none_node = nullptr;
+  for (SubgraphIterator sit(parent); !sit.AtEnd(); sit.Next()) {
+    const GenNode* node = sit.GetNode();
+    GenNode::Port port = sit.GetPort();
+    const GenNode::LinkTarget& neighbor = sit.GetNeighbor();
+
+    if (node->AllInputsOrNone() && port.IsInbound() && !port.IsControl()) {
+      // A data input of an all-or-none node inside the subgraph: pull in all
+      // the data inputs of that node at once.
+      if (node != last_all_or_none_node) {
+        ExtendSubgraphAllOrNone(parent, node);
+        last_all_or_none_node = node;
+      }
+      sit.SkipPort();
+    } else if (neighbor.node->AllInputsOrNone() && !port.IsInbound() &&
+               !port.IsControl()) {
+      // A data output that feeds an all-or-none neighbor: the neighbor can
+      // only be added together with all its data inputs.
+      if (parent->id().find(neighbor.node) == parent->id().end()) {
+        // Not added yet.
+        ExtendSubgraphAllOrNone(parent, neighbor.node);
+      }
+    } else if (node->IsMultiInput(port)) {
+      // A multi-input port of a node inside the subgraph: add every link at
+      // this port at once.
+      ExtendSubgraphPortAllOrNone(parent, node, port);
+      sit.SkipPort();
+    } else if (neighbor.node->IsMultiInput(neighbor.port)) {
+      // Would need to add all inputs of the neighbor node at this port at
+      // once.
+      if (parent->id().find(neighbor.node) != parent->id().end()) {
+        continue;  // Already added.
+      }
+      ExtendSubgraphPortAllOrNone(parent, neighbor.node, neighbor.port);
+    } else {
+      // An ordinary link: extend by the single neighbor node. A subgraph that
+      // is still incomplete is queued for further extension.
+      Subgraph* sg = sg_set.ExtendParent(parent->id(), neighbor.node);
+      if (!will_complete && sg != nullptr) {
+        todo_.push_back(sg);
+      }
+    }
+  }
+}
+
+// Extends the parent by the given node plus the nodes on the far side of all
+// its non-control inbound links. The extension is dropped if the result would
+// have more than subgraph_size_ nodes.
+void GraphAnalyzer::ExtendSubgraphAllOrNone(Subgraph* parent,
+                                            const GenNode* node) {
+  Subgraph::Identity id = parent->id();
+  id.insert(node);
+
+  auto range_end = node->links().end();
+
+  for (auto nbit = node->links().begin(); nbit != range_end; ++nbit) {
+    auto port = nbit->first;
+    if (!port.IsInbound() || port.IsControl()) {
+      continue;
+    }
+
+    // Since there might be multiple links to the same nodes,
+    // have to add all links one-by-one to check whether the subgraph
+    // would grow too large. But if it does grow too large, there is no
+    // point in growing it more, can just skip over the rest of the links.
+    for (const auto& link : nbit->second) {
+      id.insert(link.node);
+      if (id.size() > subgraph_size_) {
+        return;  // Too big.
+      }
+    }
+  }
+
+  AddExtendedSubgraph(parent, id);
+}
+
+// Extends the parent by the given node plus the nodes on the far side of all
+// the links at one port of that node. The extension is dropped if the result
+// would have more than subgraph_size_ nodes.
+void GraphAnalyzer::ExtendSubgraphPortAllOrNone(Subgraph* parent,
+                                                const GenNode* node,
+                                                GenNode::Port port) {
+  auto nbit = node->links().find(port);
+  if (nbit == node->links().end()) {
+    return;  // Should never happen.
+  }
+
+  Subgraph::Identity id = parent->id();
+  id.insert(node);
+
+  // Since there might be multiple links to the same nodes,
+  // have to add all links one-by-one to check whether the subgraph
+  // would grow too large. But if it does grow too large, there is no
+  // point in growing it more, can just skip over the rest of the links.
+  for (const auto& link : nbit->second) {
+    id.insert(link.node);
+    if (id.size() > subgraph_size_) {
+      return;  // Too big.
+    }
+  }
+
+  AddExtendedSubgraph(parent, id);
+}
+
+// Records the subgraph with the given identity unless it is no larger than
+// the parent or was already recorded. A subgraph of exactly subgraph_size_
+// nodes goes to result_; a smaller one goes to partial_ and is queued in todo_
+// for further extension.
+void GraphAnalyzer::AddExtendedSubgraph(Subgraph* parent,
+                                        const Subgraph::Identity& id) {
+  if (id.size() == parent->id().size()) {
+    return;  // Nothing new was added.
+  }
+
+  auto sg = absl::make_unique(id);
+  SubgraphPtrSet& spec_sg_set =
+      (id.size() == subgraph_size_) ? result_ : partial_;
+  if (spec_sg_set.find(sg) != spec_sg_set.end()) {
+    // This subgraph was already found by extending from a different path.
+    return;
+  }
+
+  if (id.size() != subgraph_size_) {
+    todo_.push_back(sg.get());
+  }
+  spec_sg_set.insert(std::move(sg));
+}
+
+// Erases from result_ every subgraph for which HasInvalidMultiInputs() is
+// true.
+void GraphAnalyzer::DropInvalidSubgraphs() {
+  auto resit = result_.begin();
+  while (resit != result_.end()) {
+    if (HasInvalidMultiInputs(resit->get())) {
+      // Step past the element before erasing it, so that the loop iterator
+      // does not refer to the erased element.
+      auto delit = resit;
+      ++resit;
+      result_.erase(delit);
+    } else {
+      ++resit;
+    }
+  }
+}
+
+// Returns true if the subgraph contains a group of inputs that must be taken
+// all-or-none but is only partially included: some of the peers are inside
+// the subgraph and some are outside.
+bool GraphAnalyzer::HasInvalidMultiInputs(Subgraph* sg) {
+  // Do the all-or-none-input nodes.
+  for (auto const& node : sg->id()) {
+    if (!node->AllInputsOrNone()) {
+      continue;
+    }
+
+    bool anyIn = false;
+    bool anyOut = false;
+
+    auto range_end = node->links().end();
+    for (auto nbit = node->links().begin(); nbit != range_end; ++nbit) {
+      auto port = nbit->first;
+      if (!port.IsInbound() || port.IsControl()) {
+        continue;
+      }
+
+      // Classify the peer of every non-control inbound link as being inside
+      // or outside of the subgraph.
+      for (const auto& link : nbit->second) {
+        if (sg->id().find(link.node) == sg->id().end()) {
+          anyOut = true;
+        } else {
+          anyIn = true;
+        }
+      }
+    }
+
+    if (anyIn && anyOut) {
+      return true;
+    }
+  }
+
+  // Do the multi-input ports.
+  for (SubgraphIterator sit(sg); !sit.AtEnd(); sit.Next()) {
+    if (sit.GetNode()->IsMultiInput(sit.GetPort())) {
+      bool anyIn = false;
+      bool anyOut = false;
+      // Classify the peers of all the links at this port.
+      do {
+        GenNode* peer = sit.GetNeighbor().node;
+        if (sg->id().find(peer) == sg->id().end()) {
+          anyOut = true;
+        } else {
+          anyIn = true;
+        }
+      } while (sit.NextIfSamePort());
+
+      if (anyIn && anyOut) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+// Groups the subgraphs from result_ by their signature, counting how many
+// subgraphs share each signature, then orders the groups by the count.
+// Clears result_ at the end. Returns the error from the signature computation
+// if it fails for any subgraph.
+Status GraphAnalyzer::CollateResult() {
+  ordered_collation_.clear();
+  collation_map_.clear();
+
+  // Collate by the signatures of the graphs.
+  for (const auto& it : result_) {
+    auto sig = absl::make_unique();
+    it->ExtractForSignature(&sig->map);
+    Status status = sig->Compute();
+    if (!status.ok()) {
+      return status;
+    }
+
+    auto& coll_entry = collation_map_[sig.get()];
+    // Only the first subgraph with this signature hands its signature object
+    // over to the map entry.
+    if (coll_entry.sig == nullptr) {
+      coll_entry.sig = std::move(sig);
+    }
+    ++coll_entry.count;
+  }
+
+  // Then order them by the count.
+  for (auto& entry : collation_map_) {
+    ordered_collation_.insert(&entry.second);
+  }
+
+  result_.clear();  // Not needed after collation.
+
+  return Status::OK();
+}
+
+// Returns the text dump of every subgraph currently in result_.
+std::vector GraphAnalyzer::DumpRawSubgraphs() {
+  std::vector result;
+  for (const auto& it : result_) {
+    result.emplace_back(it->Dump());
+  }
+  return result;
+}
+
+// Returns one "<count> <signature>" string per collated group, in the order
+// of ordered_collation_.
+std::vector GraphAnalyzer::DumpSubgraphs() {
+  std::vector result;
+  for (auto ptr : ordered_collation_) {
+    result.emplace_back(
+        absl::StrFormat("%d %s", ptr->count, ptr->sig->ToString()));
+  }
+  return result;
+}
+
+// Prints one "<count> <signature>" line per collated group to stdout,
+// followed by a "Total: " line with the sum of the counts. Returns DATA_LOSS
+// if the stream reports a failure.
+Status GraphAnalyzer::OutputSubgraphs() {
+  size_t total = 0;
+  for (auto ptr : ordered_collation_) {
+    std::cout << ptr->count << ' ' << ptr->sig->ToString() << '\n';
+    total += ptr->count;
+  }
+  std::cout << "Total: " << total << '\n';
+  if (std::cout.fail()) {
+    return Status(error::DATA_LOSS, "Failed to write to stdout");
+  } else {
+    return Status::OK();
+  }
+}
+
+}  // end namespace graph_analyzer
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/graph_analyzer/graph_analyzer.h b/tensorflow/core/grappler/graph_analyzer/graph_analyzer.h
new file mode 100644
index 0000000000..26d38a4931
--- /dev/null
+++ b/tensorflow/core/grappler/graph_analyzer/graph_analyzer.h
@@ -0,0 +1,154 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_GRAPH_ANALYZER_H_
+#define TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_GRAPH_ANALYZER_H_
+
+#include
+#include
+
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/grappler/graph_analyzer/map_tools.h"
+#include "tensorflow/core/grappler/graph_analyzer/sig_node.h"
+#include "tensorflow/core/grappler/graph_analyzer/subgraph.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace graph_analyzer {
+
+namespace test {
+class GraphAnalyzerTest;
+}  // end namespace test
+
+// Finds all the subgraphs of a given size and groups them by equivalence.
+class GraphAnalyzer {
+ public:
+  // Makes a copy of the graph.
+  GraphAnalyzer(const GraphDef& graph, int subgraph_size);
+
+  virtual ~GraphAnalyzer();
+
+  // Performs the analysis and collects the subgraphs.
+  Status Run();
+
+  // Returns the subgraphs found in Run() printed to text.
+  std::vector DumpSubgraphs();
+
+  // Prints the subgraphs found in Run() to stdout.
+  Status OutputSubgraphs();
+
+  // TODO(babkin): add a way to extract the subgraphs as direct data
+  // structures and as protobufs, and to write protobufs to a RecordIO.
+
+ private:
+  GraphAnalyzer() = delete;
+  GraphAnalyzer(const GraphAnalyzer&) = delete;
+  void operator=(const GraphAnalyzer&) = delete;
+
+  // The test fixture calls the private steps of the analysis one by one.
+  friend class tensorflow::grappler::graph_analyzer::test::GraphAnalyzerTest;
+
+  // Builds the map of nodes from the original graph definition.
+  Status BuildMap();
+
+  // Using nodes_, finds all the subgraphs of size subgraph_size_ and places
+  // them into result_.
+  void FindSubgraphs();
+
+  // Deletes from result_ the unacceptable subgraphs. Those include the
+  // subgraphs where not all the inputs at a multi-input port are included (this
+  // could happen if some of these inputs were reached and included through
+  // different paths).
+  void DropInvalidSubgraphs();
+
+  // Groups the subgraphs in result_ by signature into collation_map_, counting
+  // the subgraphs that share each signature, orders the groups by count in
+  // ordered_collation_, and clears result_.
+  Status CollateResult();
+
+  // Returns the raw subgraphs found in FindSubgraphs() printed to text.
+  std::vector DumpRawSubgraphs();
+
+  // Finds and adds appropriately to either partial_ or result_ all the
+  // subgraphs that can be created by extending the parent subgraph by one node.
+  // Ignores the duplicates.
+  void ExtendSubgraph(Subgraph* parent);
+
+  // Extends the parent subgraph by adding another node (if it wasn't already
+  // added) and all its non-control inputs in the link map range at once.
+  // If the subgraph would grow over subgraph_size_, it gets ignored.
+  void ExtendSubgraphAllOrNone(Subgraph* parent, const GenNode* node);
+  // Same but adds one specific inbound port (even control) all-or-none.
+  void ExtendSubgraphPortAllOrNone(Subgraph* parent, const GenNode* node,
+                                   GenNode::Port port);
+  // The common final step called by ExtendSubgraph*AllOrNone() methods.
+  void AddExtendedSubgraph(Subgraph* parent, const Subgraph::Identity& id);
+
+  // Returns true if this subgraph has any multi-inputs that aren't all-in or
+  // all-out.
+  bool HasInvalidMultiInputs(Subgraph* sg);
+
+  // Graph to run the analysis on.
+  GraphDef graph_;
+  // The number of nodes in the subgraphs to find.
+  int subgraph_size_;
+
+  // The enriched graph of parsed nodes and connections.
+  GenNodeMap nodes_;
+  // The resulting set of subgraphs.
+  SubgraphPtrSet result_;
+  // The subgraphs of partial size, stored while finding the result.
+  SubgraphPtrSet partial_;
+  // The subgraphs of partial size (stored in partial_) that are still waiting
+  // to be extended.
+  //
+  // TODO(babkin): This is rather simple-minded, each subgraph is examined from
+  // scratch, which means that all its internal links get iterated too. But it's
+  // OK for the small subgraphs. This can be improved by keeping not just
+  // subgraphs but iterators on the list, each of them having the list not-yet
+  // examined nodes (and the link position of the next link to be examined for
+  // the first node). This would add extra constant overhead, so the break-even
+  // subgraph size is not clear yet.
+  std::deque todo_;
+
+  // The collation map by signature is designed to allow the removal of entries
+  // and moving of the signature references from the keys of this map to the
+  // outside world. Must be careful at inserting and removal: make sure that
+  // when a new entry is inserted, its signature reference gets populated with
+  // the same data as the key of the map, and that if a reference is moved out,
+  // the map entry gets removed before that reference gets destroyed.
+  struct CollationEntry {
+    std::shared_ptr sig;
+    size_t count = 0;
+  };
+  using CollationMap =
+      std::unordered_map,
+                         EqAtPtr >;
+  CollationMap collation_map_;
+
+  // The entries are owned by collation_map_, so must be removed from
+  // ordered_collation_ before removing them from collation_map_.
+  struct ReverseLessByCount {
+    // Orders the entries with the larger counts first. Declared const because
+    // an associative container may invoke its comparator through a const
+    // object.
+    bool operator()(CollationEntry* left, CollationEntry* right) const {
+      return left->count > right->count;  // Reverse order.
+    }
+  };
+  using CollationOrderByCount =
+      std::multiset;
+  CollationOrderByCount ordered_collation_;
+};
+
+}  // end namespace graph_analyzer
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_GRAPH_ANALYZER_H_
diff --git a/tensorflow/core/grappler/graph_analyzer/graph_analyzer_test.cc b/tensorflow/core/grappler/graph_analyzer/graph_analyzer_test.cc
new file mode 100644
index 0000000000..e94c472056
--- /dev/null
+++ b/tensorflow/core/grappler/graph_analyzer/graph_analyzer_test.cc
@@ -0,0 +1,569 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/graph_analyzer/graph_analyzer.h"
+
+#include <algorithm>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "absl/memory/memory.h"
+#include "tensorflow/core/grappler/graph_analyzer/test_tools.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace graph_analyzer {
+namespace test {
+
+using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::Ne;
+using ::testing::SizeIs;
+using ::testing::UnorderedElementsAre;
+
+// Test fixture that forwards to the private methods and members of the
+// GraphAnalyzer under test (gran_), which each test creates for itself.
+class GraphAnalyzerTest : public ::testing::Test, protected TestGraphs {
+ protected:
+  Status BuildMap() { return gran_->BuildMap(); }
+
+  void FindSubgraphs() { gran_->FindSubgraphs(); }
+
+  void DropInvalidSubgraphs() { gran_->DropInvalidSubgraphs(); }
+
+  Status CollateResult() { return gran_->CollateResult(); }
+
+  void ExtendSubgraph(Subgraph* parent) { gran_->ExtendSubgraph(parent); }
+
+  void ExtendSubgraphPortAllOrNone(Subgraph* parent, GenNode* node,
+                                   GenNode::Port port) {
+    gran_->ExtendSubgraphPortAllOrNone(parent, node, port);
+  }
+
+  void ExtendSubgraphAllOrNone(Subgraph* parent, GenNode* node) {
+    gran_->ExtendSubgraphAllOrNone(parent, node);
+  }
+
+  std::vector<string> DumpRawSubgraphs() { return gran_->DumpRawSubgraphs(); }
+
+  // Returns the text dumps of all the subgraphs currently in partial_.
+  std::vector<string> DumpPartials() {
+    std::vector<string> result;
+    for (const auto& it : gran_->partial_) {
+      result.emplace_back(it->Dump());
+    }
+    return result;
+  }
+
+  const GenNodeMap& GetNodes() { return gran_->nodes_; }
+
+  GenNode* GetNode(const string& name) { return gran_->nodes_.at(name).get(); }
+
+  SubgraphPtrSet& GetResult() { return gran_->result_; }
+  SubgraphPtrSet& GetPartial() { return gran_->partial_; }
+  std::deque<Subgraph*>& GetTodo() { return gran_->todo_; }
+
+  // Gets initialized by a particular test from a suitable GraphDef.
+  std::unique_ptr<GraphAnalyzer> gran_;
+};
+
+TEST_F(GraphAnalyzerTest, BuildMap) {
+  gran_ = absl::make_unique<GraphAnalyzer>(graph_3n_self_control_, 1);
+  Status st = BuildMap();
+  EXPECT_THAT(st, Eq(Status::OK()));
+
+  auto& map = GetNodes();
+  EXPECT_THAT(map.find("node1"), Ne(map.end()));
+  EXPECT_THAT(map.find("node2"), Ne(map.end()));
+  EXPECT_THAT(map.find("node3"), Ne(map.end()));
+}
+
+TEST_F(GraphAnalyzerTest, BuildMapError) {
+  // A duplicate node.
+  (*graph_3n_self_control_.add_node()) = MakeNodeConst("node1");
+  gran_ = absl::make_unique<GraphAnalyzer>(graph_3n_self_control_, 1);
+  Status st = BuildMap();
+  ASSERT_THAT(
+      st, Eq(Status(error::INVALID_ARGUMENT, "Duplicate node name 'node1'.")));
+}
+
+TEST_F(GraphAnalyzerTest, FindSubgraphs0) {
+  gran_ = absl::make_unique<GraphAnalyzer>(graph_3n_self_control_, 0);
+  Status st = BuildMap();
+  ASSERT_THAT(st, Eq(Status::OK()));
+
+  FindSubgraphs();
+  auto& subgraphs = GetResult();
+  EXPECT_THAT(subgraphs, SizeIs(0));
+  EXPECT_THAT(DumpRawSubgraphs(), ElementsAre());
+  EXPECT_THAT(DumpPartials(), UnorderedElementsAre());
+  EXPECT_THAT(GetTodo(), SizeIs(0));
+}
+
+TEST_F(GraphAnalyzerTest, FindSubgraphs1) {
+  gran_ = absl::make_unique<GraphAnalyzer>(graph_3n_self_control_, 1);
+  Status st = BuildMap();
+  ASSERT_THAT(st, Eq(Status::OK()));
+
+  FindSubgraphs();
+  auto& subgraphs = GetResult();
+  EXPECT_THAT(subgraphs, SizeIs(3));
+
+  // clang-format off
+  EXPECT_THAT(DumpRawSubgraphs(), UnorderedElementsAre(
+      "1: BroadcastGradientArgs(node3)",
+      "1: Const(node1)",
+      "1: Sub(node2)"
+      ));
+  // clang-format on
+  EXPECT_THAT(DumpPartials(), UnorderedElementsAre());
+  EXPECT_THAT(GetTodo(), SizeIs(0));
+}
+
+// The required subgraphs are larger
+// than the graph.
+TEST_F(GraphAnalyzerTest, FindSubgraphsTooLarge) {
+  gran_ = absl::make_unique<GraphAnalyzer>(graph_3n_self_control_, 4);
+  Status st = BuildMap();
+  ASSERT_THAT(st, Eq(Status::OK()));
+
+  FindSubgraphs();
+  EXPECT_THAT(DumpRawSubgraphs(), ElementsAre());
+  EXPECT_THAT(DumpPartials(), UnorderedElementsAre());
+  EXPECT_THAT(GetTodo(), SizeIs(0));
+}
+
+//===
+
+// Successfully propagate backwards through a multi-input link,
+// with the base (currently-extending) node already in the graph.
+TEST_F(GraphAnalyzerTest, MultiInputSuccessBackwardsBaseIn) {
+  gran_ = absl::make_unique<GraphAnalyzer>(graph_multi_input_, 4);
+  Status st = BuildMap();
+  ASSERT_THAT(st, Eq(Status::OK()));
+
+  auto root =
+      absl::make_unique<Subgraph>(Subgraph::Identity({GetNode("add2")}));
+
+  ExtendSubgraphPortAllOrNone(root.get(), GetNode("add2"),
+                              GenNode::Port(true, 0));
+
+  // clang-format off
+  EXPECT_THAT(DumpRawSubgraphs(), UnorderedElementsAre(
+      "1: AddN(add2), Const(const2_1), Const(const2_2), Const(const2_3)"
+      ));
+  // clang-format on
+  EXPECT_THAT(DumpPartials(), UnorderedElementsAre());
+  EXPECT_THAT(GetTodo(), SizeIs(0));
+}
+
+// Successfully propagate backwards through a multi-input link,
+// with the base (currently-extending) node not in the graph yet.
+TEST_F(GraphAnalyzerTest, MultiInputSuccessBackwardsBaseOut) {
+  gran_ = absl::make_unique<GraphAnalyzer>(graph_multi_input_, 4);
+  Status st = BuildMap();
+  ASSERT_THAT(st, Eq(Status::OK()));
+
+  // The parent is empty, so the base node "add2" is not in it yet.
+  auto parent = absl::make_unique<Subgraph>(Subgraph::Identity());
+
+  ExtendSubgraphPortAllOrNone(parent.get(), GetNode("add2"),
+                              GenNode::Port(true, 0));
+
+  // clang-format off
+  EXPECT_THAT(DumpRawSubgraphs(), UnorderedElementsAre(
+      "1: AddN(add2), Const(const2_1), Const(const2_2), Const(const2_3)"
+      ));
+  // clang-format on
+  EXPECT_THAT(DumpPartials(), UnorderedElementsAre());
+  EXPECT_THAT(GetTodo(), SizeIs(0));
+}
+
+// Successfully propagate backwards through a multi-input link,
+// where the target subgraph size is larger.
+TEST_F(GraphAnalyzerTest, MultiInputSuccessBackwardsIncomplete) {
+  gran_ = absl::make_unique<GraphAnalyzer>(graph_multi_input_, 5);
+  Status st = BuildMap();
+  ASSERT_THAT(st, Eq(Status::OK()));
+
+  auto root =
+      absl::make_unique<Subgraph>(Subgraph::Identity({GetNode("add2")}));
+
+  ExtendSubgraphPortAllOrNone(root.get(), GetNode("add2"),
+                              GenNode::Port(true, 0));
+
+  EXPECT_THAT(DumpRawSubgraphs(), UnorderedElementsAre());
+  // clang-format off
+  EXPECT_THAT(DumpPartials(), UnorderedElementsAre(
+      "1: AddN(add2), Const(const2_1), Const(const2_2), Const(const2_3)"
+      ));
+  // clang-format on
+  EXPECT_THAT(GetTodo(), SizeIs(1));
+}
+
+// Propagate backwards through a multi-input link, finding that the
+// resulting subgraph would be too large.
+TEST_F(GraphAnalyzerTest, MultiInputTooLargeBackwards) {
+  gran_ = absl::make_unique<GraphAnalyzer>(graph_multi_input_, 3);
+  Status st = BuildMap();
+  ASSERT_THAT(st, Eq(Status::OK()));
+
+  auto root =
+      absl::make_unique<Subgraph>(Subgraph::Identity({GetNode("add2")}));
+
+  ExtendSubgraphPortAllOrNone(root.get(), GetNode("add2"),
+                              GenNode::Port(true, 0));
+
+  EXPECT_THAT(DumpRawSubgraphs(), UnorderedElementsAre());
+  EXPECT_THAT(DumpPartials(), UnorderedElementsAre());
+  EXPECT_THAT(GetTodo(), SizeIs(0));
+}
+
+// Propagate backwards through a multi-input link, finding that nothing
+// would be added to the parent subgraph.
+TEST_F(GraphAnalyzerTest, MultiInputNothingAddedBackwards) {
+  gran_ = absl::make_unique<GraphAnalyzer>(graph_multi_input_, 4);
+  Status st = BuildMap();
+  ASSERT_THAT(st, Eq(Status::OK()));
+
+  auto root = absl::make_unique<Subgraph>(
+      Subgraph::Identity({GetNode("add2"), GetNode("const2_1"),
+                          GetNode("const2_2"), GetNode("const2_3")}));
+
+  ExtendSubgraphPortAllOrNone(root.get(), GetNode("add2"),
+                              GenNode::Port(true, 0));
+
+  EXPECT_THAT(DumpRawSubgraphs(), UnorderedElementsAre());
+  EXPECT_THAT(DumpPartials(), UnorderedElementsAre());
+  EXPECT_THAT(GetTodo(), SizeIs(0));
+}
+
+// Successfully propagate forwards through a multi-input link,
+// with the base (currently-extending) node not in the subgraph yet.
+TEST_F(GraphAnalyzerTest, MultiInputSuccessForwardsBaseOut) {
+  gran_ = absl::make_unique<GraphAnalyzer>(graph_multi_input_, 4);
+  Status st = BuildMap();
+  ASSERT_THAT(st, Eq(Status::OK()));
+
+  auto root =
+      absl::make_unique<Subgraph>(Subgraph::Identity({GetNode("const2_1")}));
+
+  ExtendSubgraphPortAllOrNone(root.get(), GetNode("add2"),
+                              GenNode::Port(true, 0));
+
+  // clang-format off
+  EXPECT_THAT(DumpRawSubgraphs(), UnorderedElementsAre(
+      "1: AddN(add2), Const(const2_1), Const(const2_2), Const(const2_3)"
+      ));
+  // clang-format on
+  EXPECT_THAT(DumpPartials(), UnorderedElementsAre());
+  EXPECT_THAT(GetTodo(), SizeIs(0));
+}
+
+// Successfully propagate backwards through a multi-input link.
+TEST_F(GraphAnalyzerTest, MultiInputSuccessBackwardsFull) {
+  gran_ = absl::make_unique<GraphAnalyzer>(graph_multi_input_, 4);
+  Status st = BuildMap();
+  ASSERT_THAT(st, Eq(Status::OK()));
+
+  auto root =
+      absl::make_unique<Subgraph>(Subgraph::Identity({GetNode("add2")}));
+
+  ExtendSubgraph(root.get());
+
+  // clang-format off
+  EXPECT_THAT(DumpRawSubgraphs(), UnorderedElementsAre(
+      "1: AddN(add2), Const(const2_1), Const(const2_2), Const(const2_3)"
+      ));
+  EXPECT_THAT(DumpPartials(), UnorderedElementsAre(
+      "1: AddN(add2), Sub(sub)"
+      ));
+  // clang-format on
+  EXPECT_THAT(GetTodo(), SizeIs(1));
+}
+
+// Successfully propagate forwards through a multi-input link.
+TEST_F(GraphAnalyzerTest, MultiInputSuccessForwardsFull) {
+  gran_ = absl::make_unique<GraphAnalyzer>(graph_multi_input_, 4);
+  Status st = BuildMap();
+  ASSERT_THAT(st, Eq(Status::OK()));
+
+  auto root =
+      absl::make_unique<Subgraph>(Subgraph::Identity({GetNode("const2_1")}));
+
+  ExtendSubgraph(root.get());
+
+  // clang-format off
+  EXPECT_THAT(DumpRawSubgraphs(), UnorderedElementsAre(
+      "1: AddN(add2), Const(const2_1), Const(const2_2), Const(const2_3)"
+      ));
+  // clang-format on
+  EXPECT_THAT(DumpPartials(), UnorderedElementsAre());
+  EXPECT_THAT(GetTodo(), SizeIs(0));
+}
+
+TEST_F(GraphAnalyzerTest, DropInvalidSubgraphsMulti) {
+  gran_ = absl::make_unique<GraphAnalyzer>(graph_multi_input_, 3);
+  Status st = BuildMap();
+  ASSERT_THAT(st, Eq(Status::OK()));
+
+  // A good one, multi-input is all-in.
+  GetResult().insert(absl::make_unique<Subgraph>(Subgraph::Identity({
+      GetNode("const1_1"),
+      GetNode("const1_2"),
+      GetNode("add1"),
+  })));
+  // A good one, multi-input is all-out
+  GetResult().insert(absl::make_unique<Subgraph>(Subgraph::Identity({
+      GetNode("add1"),
+      GetNode("add2"),
+      GetNode("sub"),
+  })));
+  // A bad one, multi-input is partially in.
+  GetResult().insert(absl::make_unique<Subgraph>(Subgraph::Identity({
+      GetNode("const1_1"),
+      GetNode("add1"),
+      GetNode("sub"),
+  })));
+  // A bad one, multi-input is partially in.
+  GetResult().insert(absl::make_unique<Subgraph>(Subgraph::Identity({
+      GetNode("add2"),
+      GetNode("const2_1"),
+      GetNode("const2_2"),
+  })));
+
+  DropInvalidSubgraphs();
+
+  // clang-format off
+  EXPECT_THAT(DumpRawSubgraphs(), UnorderedElementsAre(
+      "1: AddN(add1), AddN(add2), Sub(sub)",
+      "1: AddN(add1), Const(const1_1), Const(const1_2)"
+      ));
+  // clang-format on
+  EXPECT_THAT(DumpPartials(), UnorderedElementsAre());
+  EXPECT_THAT(GetTodo(), SizeIs(0));
+}
+
+//===
+
+// Successfully propagate backwards from an all-or-none-input node,
+// with the base (currently-extending) node already in the subgraph.
+TEST_F(GraphAnalyzerTest, AllOrNoneInputSuccessBackwards) {
+  gran_ = absl::make_unique<GraphAnalyzer>(graph_all_or_none_, 4);
+  Status st = BuildMap();
+  ASSERT_THAT(st, Eq(Status::OK()));
+
+  auto root =
+      absl::make_unique<Subgraph>(Subgraph::Identity({GetNode("pass2")}));
+
+  ExtendSubgraphAllOrNone(root.get(), GetNode("pass2"));
+
+  // clang-format off
+  EXPECT_THAT(DumpRawSubgraphs(), UnorderedElementsAre(
+      "1: Const(const2_1), Const(const2_2), Const(const2_3), IdentityN(pass2)"
+      ));
+  EXPECT_THAT(DumpPartials(), UnorderedElementsAre());
+  // clang-format on
+  EXPECT_THAT(GetTodo(), SizeIs(0));
+}
+
+// Successfully propagate backwards from an all-or-none-input node,
+// but no control links propagate. It also tests the situation
+// where the target subgraph size is larger.
+TEST_F(GraphAnalyzerTest, AllOrNoneInputSuccessBackwardsNoControl) {
+  gran_ = absl::make_unique<GraphAnalyzer>(graph_all_or_none_, 5);
+  Status st = BuildMap();
+  ASSERT_THAT(st, Eq(Status::OK()));
+
+  auto root =
+      absl::make_unique<Subgraph>(Subgraph::Identity({GetNode("pass1")}));
+
+  ExtendSubgraphAllOrNone(root.get(), GetNode("pass1"));
+
+  // clang-format off
+  EXPECT_THAT(DumpRawSubgraphs(), UnorderedElementsAre());
+  EXPECT_THAT(DumpPartials(), UnorderedElementsAre(
+      "1: Const(const1_1), Const(const1_2), IdentityN(pass1)"
+      ));
+  // clang-format on
+  EXPECT_THAT(GetTodo(), SizeIs(1));
+}
+
+// The control links propagate separately as all-or-none, even on the nodes
+// that are all-or-none for the normal inputs.
+TEST_F(GraphAnalyzerTest, AllOrNoneInputSeparateControl) {
+  gran_ = absl::make_unique<GraphAnalyzer>(graph_all_or_none_, 5);
+  Status st = BuildMap();
+  ASSERT_THAT(st, Eq(Status::OK()));
+
+  auto root =
+      absl::make_unique<Subgraph>(Subgraph::Identity({GetNode("pass1")}));
+
+  // Port index -1 is the inbound port exercised here for the control links.
+  ExtendSubgraphPortAllOrNone(root.get(), GetNode("pass1"),
+                              GenNode::Port(true, -1));
+
+  // clang-format off
+  EXPECT_THAT(DumpRawSubgraphs(), UnorderedElementsAre());
+  EXPECT_THAT(DumpPartials(), UnorderedElementsAre(
+      "1: Const(const2_1), Const(const2_2), Const(const2_3), IdentityN(pass1)"
+      ));
+  // clang-format on
+  EXPECT_THAT(GetTodo(), SizeIs(1));
+}
+
+// Propagate backwards from all-or-none-input node, finding that the
+// resulting subgraph would be too large.
+TEST_F(GraphAnalyzerTest, AllOrNoneInputTooLargeBackwards) {
+  gran_ = absl::make_unique<GraphAnalyzer>(graph_all_or_none_, 3);
+  Status st = BuildMap();
+  ASSERT_THAT(st, Eq(Status::OK()));
+
+  auto root =
+      absl::make_unique<Subgraph>(Subgraph::Identity({GetNode("pass2")}));
+
+  ExtendSubgraphAllOrNone(root.get(), GetNode("pass2"));
+
+  EXPECT_THAT(DumpRawSubgraphs(), UnorderedElementsAre());
+  EXPECT_THAT(DumpPartials(), UnorderedElementsAre());
+  EXPECT_THAT(GetTodo(), SizeIs(0));
+}
+
+// Propagate backwards from all-or-none-input node, finding that nothing
+// would be added to the parent subgraph.
+TEST_F(GraphAnalyzerTest, AllOrNoneInputNothingAddedBackwards) {
+  gran_ = absl::make_unique<GraphAnalyzer>(graph_all_or_none_, 4);
+  Status st = BuildMap();
+  ASSERT_THAT(st, Eq(Status::OK()));
+
+  auto root = absl::make_unique<Subgraph>(
+      Subgraph::Identity({GetNode("pass2"), GetNode("const2_1"),
+                          GetNode("const2_2"), GetNode("const2_3")}));
+
+  ExtendSubgraphAllOrNone(root.get(), GetNode("pass2"));
+
+  EXPECT_THAT(DumpRawSubgraphs(), UnorderedElementsAre());
+  EXPECT_THAT(DumpPartials(), UnorderedElementsAre());
+  EXPECT_THAT(GetTodo(), SizeIs(0));
+}
+
+// Successfully propagate forwards to all-or-none-input node,
+// with the base (currently-extending) node not in the subgraph yet.
+TEST_F(GraphAnalyzerTest, AllOrNoneInputSuccessForwardsBaseOut) {
+  gran_ = absl::make_unique<GraphAnalyzer>(graph_all_or_none_, 4);
+  Status st = BuildMap();
+  ASSERT_THAT(st, Eq(Status::OK()));
+
+  auto root =
+      absl::make_unique<Subgraph>(Subgraph::Identity({GetNode("const2_1")}));
+
+  ExtendSubgraphAllOrNone(root.get(), GetNode("pass2"));
+
+  // clang-format off
+  EXPECT_THAT(DumpRawSubgraphs(), UnorderedElementsAre(
+      "1: Const(const2_1), Const(const2_2), Const(const2_3), IdentityN(pass2)"
+      ));
+  EXPECT_THAT(DumpPartials(), UnorderedElementsAre());
+  // clang-format on
+  EXPECT_THAT(GetTodo(), SizeIs(0));
+}
+
+// Successfully propagate backwards from all-or-none-input node.
+TEST_F(GraphAnalyzerTest, AllOrNoneInputSuccessBackwardsFull) {
+  gran_ = absl::make_unique<GraphAnalyzer>(graph_all_or_none_, 4);
+  Status st = BuildMap();
+  ASSERT_THAT(st, Eq(Status::OK()));
+
+  auto root =
+      absl::make_unique<Subgraph>(Subgraph::Identity({GetNode("pass2")}));
+
+  ExtendSubgraph(root.get());
+
+  // clang-format off
+  EXPECT_THAT(DumpRawSubgraphs(), UnorderedElementsAre(
+      "1: Const(const2_1), Const(const2_2), Const(const2_3), IdentityN(pass2)"
+      ));
+  EXPECT_THAT(DumpPartials(), UnorderedElementsAre(
+      "1: IdentityN(pass2), Sub(sub)"
+      ));
+  // clang-format on
+  EXPECT_THAT(GetTodo(), SizeIs(1));
+}
+
+// Successfully propagate forwards to all-or-none-input node. This includes
+// both all-or-none-input for the normal inputs, and multi-input by the
+// control path.
+TEST_F(GraphAnalyzerTest, AllOrNoneInputSuccessForwardsFull) {
+  gran_ = absl::make_unique<GraphAnalyzer>(graph_all_or_none_, 4);
+  Status st = BuildMap();
+  ASSERT_THAT(st, Eq(Status::OK()));
+
+  auto root =
+      absl::make_unique<Subgraph>(Subgraph::Identity({GetNode("const2_1")}));
+
+  ExtendSubgraph(root.get());
+
+  // clang-format off
+  EXPECT_THAT(DumpRawSubgraphs(), UnorderedElementsAre(
+      "1: Const(const2_1), Const(const2_2), Const(const2_3), IdentityN(pass2)",
+      "1: Const(const2_1), Const(const2_2), Const(const2_3), IdentityN(pass1)"
+      ));
+  EXPECT_THAT(DumpPartials(), UnorderedElementsAre());
+  // clang-format on
+  EXPECT_THAT(GetTodo(), SizeIs(0));
+}
+
+TEST_F(GraphAnalyzerTest, DropInvalidSubgraphsAllOrNone) {
+  gran_ = absl::make_unique<GraphAnalyzer>(graph_all_or_none_, 3);
+  Status st = BuildMap();
+  ASSERT_THAT(st, Eq(Status::OK()));
+
+  // A good one, all-or-none is all-in.
+  GetResult().insert(absl::make_unique<Subgraph>(Subgraph::Identity({
+      GetNode("const1_1"),
+      GetNode("const1_2"),
+      GetNode("pass1"),
+  })));
+  // A good one, all-or-none is all-out
+  GetResult().insert(absl::make_unique<Subgraph>(Subgraph::Identity({
+      GetNode("pass1"),
+      GetNode("pass2"),
+      GetNode("sub"),
+  })));
+  // A bad one, all-or-none is partially in.
+  GetResult().insert(absl::make_unique<Subgraph>(Subgraph::Identity({
+      GetNode("const1_1"),
+      GetNode("pass1"),
+      GetNode("sub"),
+  })));
+  // A bad one, all-or-none is partially in.
+  GetResult().insert(absl::make_unique<Subgraph>(Subgraph::Identity({
+      GetNode("pass2"),
+      GetNode("const2_1"),
+      GetNode("const2_2"),
+  })));
+
+  DropInvalidSubgraphs();
+
+  // clang-format off
+  EXPECT_THAT(DumpRawSubgraphs(), UnorderedElementsAre(
+      "1: IdentityN(pass1), IdentityN(pass2), Sub(sub)",
+      "1: Const(const1_1), Const(const1_2), IdentityN(pass1)"
+      ));
+  // clang-format on
+  EXPECT_THAT(DumpPartials(), UnorderedElementsAre());
+  EXPECT_THAT(GetTodo(), SizeIs(0));
+}
+
+}  // end namespace test
+}  // end namespace graph_analyzer
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/graph_analyzer/graph_analyzer_tool.cc b/tensorflow/core/grappler/graph_analyzer/graph_analyzer_tool.cc
new file mode 100644
index 0000000000..924ca11e61
--- /dev/null
+++ b/tensorflow/core/grappler/graph_analyzer/graph_analyzer_tool.cc
@@ -0,0 +1,98 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/graph_analyzer/graph_analyzer.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/protobuf/meta_graph.pb.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace graph_analyzer {
+
+// Reads a metagraph from the file, trying the binary format first and then
+// falling back to the text format. Dies on failure.
+static void LoadModel(const string& filename,
+                      tensorflow::MetaGraphDef* metagraph) {
+  LOG(INFO) << "Loading model from " << filename;
+  Status st;
+  st = ReadBinaryProto(Env::Default(), filename, metagraph);
+  if (!st.ok()) {
+    LOG(WARNING) << "Failed to read a binary metagraph: " << st;
+    st = ReadTextProto(Env::Default(), filename, metagraph);
+    if (!st.ok()) {
+      LOG(FATAL) << "Failed to read a text metagraph: " << st;
+    }
+  }
+}
+
+// Prune the graph to only keep the transitive fanin part with respect to a set
+// of train ops (if provided).
+//
+// The train ops are taken from the "train_op" collection of the metagraph.
+// If that collection is absent or lists no nodes, the whole graph is copied.
+void MaybePruneGraph(const tensorflow::MetaGraphDef& metagraph,
+                     tensorflow::GraphDef* graph) {
+  std::vector<string> fetch_nodes;
+  // Look the collection up with find() rather than at(): at() aborts on a
+  // missing key, which would turn "no train ops provided" into a crash.
+  const auto& collections = metagraph.collection_def();
+  const auto train_op = collections.find("train_op");
+  if (train_op != collections.end()) {
+    for (const auto& fetch : train_op->second.node_list().value()) {
+      LOG(INFO) << "Fetch node: " << fetch;
+      fetch_nodes.push_back(fetch);
+    }
+  }
+  if (fetch_nodes.empty()) {
+    *graph = metagraph.graph_def();
+  } else {
+    std::vector<const tensorflow::NodeDef*> fanin_nodes =
+        tensorflow::grappler::ComputeTransitiveFanin(metagraph.graph_def(),
+                                                     fetch_nodes);
+    for (const tensorflow::NodeDef* node : fanin_nodes) {
+      *(graph->add_node()) = *node;
+    }
+    LOG(INFO) << "Pruned "
+              << metagraph.graph_def().node_size() - graph->node_size()
+              << " nodes. Original graph size: "
+              << metagraph.graph_def().node_size()
+              << ". New graph size: " << graph->node_size() << ".";
+  }
+}
+
+// Loads the metagraph from file_name, prunes it, finds the subgraphs of size n
+// and prints them. Dies if n is less than 1 or if any step fails.
+void GraphAnalyzerTool(const string& file_name, int n) {
+  if (n < 1) {
+    LOG(FATAL) << "Invalid subgraph size " << n << ", must be at least 1";
+  }
+
+  tensorflow::MetaGraphDef metagraph;
+  LoadModel(file_name, &metagraph);
+  tensorflow::GraphDef graph;
+  MaybePruneGraph(metagraph, &graph);
+  tensorflow::grappler::graph_analyzer::GraphAnalyzer analyzer(graph, n);
+  LOG(INFO) << "Running the analysis";
+  tensorflow::Status st = analyzer.Run();
+  if (!st.ok()) {
+    LOG(FATAL) << "Analysis failed: " << st;
+  }
+
+  LOG(INFO) << "Printing the result";
+  st = analyzer.OutputSubgraphs();
+  if (!st.ok()) {
+    LOG(FATAL) << "Failed to print the result: " << st;
+  }
+
+  LOG(INFO) << "Completed";
+}
+
+}  // end namespace graph_analyzer
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/graph_analyzer/graph_analyzer_tool.h b/tensorflow/core/grappler/graph_analyzer/graph_analyzer_tool.h
new file mode 100644
index 0000000000..5a91fe7dc8
--- /dev/null
+++ b/tensorflow/core/grappler/graph_analyzer/graph_analyzer_tool.h
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_GRAPH_ANALYZER_TOOL_H_
+#define TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_GRAPH_ANALYZER_TOOL_H_
+
+#include "tensorflow/core/lib/strings/str_util.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace graph_analyzer {
+
+// Entry point of the graph analyzer tool: analyzes the model stored in
+// file_name for subgraphs of size n.
+void GraphAnalyzerTool(const string& file_name, int n);
+
+}  // end namespace graph_analyzer
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_GRAPH_ANALYZER_TOOL_H_
diff --git a/tensorflow/core/grappler/graph_analyzer/hash_tools.h b/tensorflow/core/grappler/graph_analyzer/hash_tools.h
new file mode 100644
index 0000000000..b0e79f9a68
--- /dev/null
+++ b/tensorflow/core/grappler/graph_analyzer/hash_tools.h
@@ -0,0 +1,47 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_HASH_TOOLS_H_
+#define TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_HASH_TOOLS_H_
+
+#include <cstddef>
+
+namespace tensorflow {
+namespace grappler {
+namespace graph_analyzer {
+
+// Unfortunately, std::hash provides no way to combine hashes, so everyone
+// is copying boost::hash_combine. This is a version that follows Google's
+// guidelines on the arguments, and contains only the combination, without
+// hashing.
+inline void CombineHash(size_t from, size_t* to) {
+  // Mix the incoming hash with shifted copies of the accumulated value.
+  const size_t seed = *to;
+  *to = seed ^ (from + 0x9e3779b9 + (seed << 6) + (seed >> 2));
+}
+
+// Order-independent combination: the hashes are simply summed (with a
+// constant), so folding a set of hashes gives the same result whatever the
+// order they are folded in. The quality of the resulting hash is modest, but
+// it suits the case where the order of sub-elements must not affect the
+// following comparison. The alternative is to sort the hashes of the
+// sub-elements and combine them normally in the sorted order.
+inline void CombineHashCommutative(size_t from, size_t* to) {
+  *to += from + 0x9e3779b9;
+}
+
+}  // end namespace graph_analyzer
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_HASH_TOOLS_H_
diff --git a/tensorflow/core/grappler/graph_analyzer/hash_tools_test.cc b/tensorflow/core/grappler/graph_analyzer/hash_tools_test.cc
new file mode 100644
index 0000000000..b5e9ce6b8e
--- /dev/null
+++ b/tensorflow/core/grappler/graph_analyzer/hash_tools_test.cc
@@ -0,0 +1,46 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/graph_analyzer/hash_tools.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+namespace tensorflow {
+namespace grappler {
+namespace graph_analyzer {
+namespace test {
+namespace {
+
+using ::testing::Eq;
+
+// Combining a into b must give the same hash as combining b into a.
+TEST(HashToolsTest, CombineHashCommutative) {
+  size_t a = 0;
+  size_t b = 999;
+
+  size_t c = a;
+  CombineHashCommutative(b, &c);
+
+  size_t d = b;
+  CombineHashCommutative(a, &d);
+
+  EXPECT_THAT(c, Eq(d));
+}
+
+}  // namespace
+}  // end namespace test
+}  // end namespace graph_analyzer
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/graph_analyzer/map_tools.h b/tensorflow/core/grappler/graph_analyzer/map_tools.h
new file mode 100644
index 0000000000..584062c5f2
--- /dev/null
+++ b/tensorflow/core/grappler/graph_analyzer/map_tools.h
@@ -0,0 +1,46 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_MAP_TOOLS_H_
+#define TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_MAP_TOOLS_H_
+
+#include <functional>
+
+namespace tensorflow {
+namespace grappler {
+namespace graph_analyzer {
+
+// Helpers for building maps of pointers.
+ +template +struct LessAtPtr : std::binary_function { + bool operator()(const Ptr& x, const Ptr& y) const { return *x < *y; } +}; + +template +struct EqAtPtr : std::binary_function { + bool operator()(const Ptr& x, const Ptr& y) const { return *x == *y; } +}; + +template +struct HashAtPtr : std::unary_function { + size_t operator()(const Ptr& x) const { return x->Hash(); } +}; + +} // end namespace graph_analyzer +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_MAP_TOOLS_H_ diff --git a/tensorflow/core/grappler/graph_analyzer/sig_node.cc b/tensorflow/core/grappler/graph_analyzer/sig_node.cc new file mode 100644 index 0000000000..b5cca6a512 --- /dev/null +++ b/tensorflow/core/grappler/graph_analyzer/sig_node.cc @@ -0,0 +1,453 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/grappler/graph_analyzer/sig_node.h" + +#include + +#include "absl/strings/str_format.h" + +namespace tensorflow { +namespace grappler { +namespace graph_analyzer { + +static constexpr bool debug = false; + +//=== SigNode + +SigNode::SigNode(const NodeDef* node) : node_(node) {} + +void SigNode::CopyLinks(const GenNode& from, const TranslationMap& map) { + hash_to_link_.clear(); + hashed_peers_.clear(); + + std::map link_map; + CopyLinksPass1(from, map, &link_map); + CopyLinksPass2(&link_map); +} + +void SigNode::CopyLinksPass1(const GenNode& from, const TranslationMap& map, + std::map* link_map) { + LinkTag::Hasher link_hasher; + + for (const auto& entry : from.links()) { + for (const auto& target : entry.second) { + auto nodeit = map.find(target.node); + if (nodeit == map.end()) { + // Node is not in the subgraph, ignore. + continue; + } + + LinkTag tag(entry.first, target.port); + size_t hval = link_hasher(tag); + + // This instantiates the entry if it was not present. + Link& map_entry = (*link_map)[tag]; + if (map_entry.peers.empty()) { + map_entry.tag = tag; + map_entry.unique_hash = hval; + } + map_entry.peers.push_back(nodeit->second); + } + } +} + +void SigNode::CopyLinksPass2(std::map* link_map) { + for (auto& entry : *link_map) { + Link* hl_entry_ptr = &hash_to_link_[entry.second.unique_hash]; + // In case of a conflict, rehash. This should almost never happen. + // Because the order of iteration is predictable, the rehashed values + // will also be predictable. 
+ while (!hl_entry_ptr->peers.empty()) { + CombineHash(1, &entry.second.unique_hash); + hl_entry_ptr = &hash_to_link_[entry.second.unique_hash]; + } + + for (const auto& peer : entry.second.peers) { + hashed_peers_.emplace_back(HashedPeer(entry.second.unique_hash, peer)); + } + + hl_entry_ptr->tag = entry.second.tag; + hl_entry_ptr->unique_hash = entry.second.unique_hash; + hl_entry_ptr->peers.swap(entry.second.peers); + } +} + +void SigNode::ComputeTopoHash0() { + topo_hash_.clear(); + last_hashed_nodes_ = next_hashed_nodes_ = node_mask_; + + // TODO(babkin): include the attributes too, as an option. + size_t hval = std::hash()(opcode()); + + // Getting the topology of the links into the hash early should get more + // conflicts resolved early. + for (const auto& entry : hashed_peers_) { + CombineHash(entry.link_hash, &hval); + } + + topo_hash_.push_back(hval); +} + +void SigNode::ComputeTopoHash(int distance) { + // The new starting point. + next_hashed_nodes_ = last_hashed_nodes_; + if (debug) { + LOG(INFO) << "DEBUG node " << name() << " mask=" << std::hex + << next_hashed_nodes_; + } + + if (hash_is_final_) { + return; + } + + CHECK(topo_hash_.size() == distance); + + int prev = distance - 1; + + // Start with the node's own local topology hash. This value is stable, so + // if the hashes of the surrounding nodes don't change on the following + // distances, the hash of this node won't change either. + size_t hval = topo_hash_[0]; + + if (!hashed_peers_.empty()) { + size_t last_link_hash = hashed_peers_[0].link_hash; + size_t comm_hash = 0; + + for (const auto& entry : hashed_peers_) { + if (entry.link_hash != last_link_hash) { + CombineHash(last_link_hash, &hval); + CombineHash(comm_hash, &hval); + comm_hash = 0; + last_link_hash = entry.link_hash; + } + + // The links in the same vector are commutative, so combine their + // hashes in a commutative way. 
+ CombineHashCommutative(entry.peer->GetTopoHash(prev), &comm_hash); + next_hashed_nodes_ |= entry.peer->last_hashed_nodes_; + if (debug) { + LOG(INFO) << "DEBUG node " << name() << " += " << entry.peer->name() + << " mask=" << std::hex << next_hashed_nodes_; + } + } + + // The last commutative group. + CombineHash(last_link_hash, &hval); + CombineHash(comm_hash, &hval); + } + + topo_hash_.push_back(hval); +} + +size_t SigNode::GetTopoHash(int distance) const { + CHECK(!topo_hash_.empty()); + if (distance >= topo_hash_.size()) { + CHECK(hash_is_final_); + return topo_hash_.back(); + } else { + return topo_hash_[distance]; + } +} + +bool SigNode::operator==(const SigNode& other) const { + // TODO(babkin): add attributes too. + if (opcode() != other.opcode()) { + return false; + } + + // Normally the caller is expected to compare the nodes + // at the same rank in different graphs, but just in case... + if (unique_rank_ != other.unique_rank_) { + return false; + } + + if (hashed_peers_.size() != other.hashed_peers_.size()) { + return false; + } + + for (auto it1 = hashed_peers_.begin(), it2 = other.hashed_peers_.begin(); + it1 != hashed_peers_.end(); ++it1, ++it2) { + // TODO(babkin): might compare the actual values too + // but the hash is probably just as good. + if (it1->link_hash != it2->link_hash) { + return false; + } + if (it1->peer->unique_rank_ != it2->peer->unique_rank_) { + return false; + } + } + + return true; +} + +//=== Signature + +constexpr int Signature::kMaxGraphSize; + +string Signature::ToString() const { + string result; + for (size_t n = 0; n < nodes.size(); ++n) { + // TODO(babkin): add attributes too. + result += absl::StrFormat("%d:%s", n, nodes[n]->opcode()); + for (const auto& entry : nodes[n]->hashed_peers_) { + const auto& link = nodes[n]->hash_to_link_[entry.link_hash]; + + // The link entries are already sorted, by tags and then by the + // node ranks. 
+ if (link.tag.local.IsInbound()) { + result += + absl::StrFormat("[%s:%s:%d]", string(link.tag.local), + string(link.tag.remote), entry.peer->unique_rank_); + } + } + result.push_back(','); + } + return result; +} + +Status Signature::Compute() { + if (map.size() > kMaxGraphSize) { + return Status( + error::INVALID_ARGUMENT, + absl::StrFormat( + "A graph of %d nodes is too big for signature computation, " + "the maximal supported node count is %d.", + map.size(), kMaxGraphSize)); + } + + // The value that will be assigned next as the unique node id. + // This also means that all the entries in nodes at indexes less than this + // have been finalized and don't need to be touched any more. + size_t next_node_id = 0; + + sig_short = 0; + sig_full.resize(0); // Keep the storage. + + // The main signature generation. + PrepareNodes(); + FindUniqueHashes(&next_node_id); + while (next_node_id < map.size()) { + ComputeOneRound(next_node_id); + FindUniqueHashes(&next_node_id); + } + + OrderLinks(); + + return Status::OK(); +} + +void Signature::PrepareNodes() { + nodes.resize(0); // Keep the storage. + + // Initialize the nodes. + int64_t mask = 1; + for (const auto& entry : map) { + SigNode* node = entry.second.get(); + node->last_hashed_nodes_ = node->node_mask_ = mask; + mask <<= 1; + node->unique_rank_ = ~0; + node->hash_is_final_ = false; + node->ComputeTopoHash0(); + if (node->GetHighTopoHash() <= map.size()) { + // Would conflict with one of the reserved values. + node->ReHighTopoHash(); + } + + // The initial order is random. + nodes.emplace_back(node); + } +} + +void Signature::FindUniqueHashes(size_t* next_node_id_p) { + // Start by sorting by the hash value. + std::sort(nodes.begin() + *next_node_id_p, nodes.end(), + SigNode::NodeOrderLess()); + + // At each call, if no nodes have unique hashes, one node that has a + // non-unique (shared) hash can be made unique by assigning a unique id. + // This node gets picked predictably by taking the last node. 
+ // TODO(babkin): Technically, more than one node can be unshared, + // as long as their last_hashed_nodes_ overlap only by the nodes that + // already had the assigned ids before the current round. But it's not clear + // yet how often this would be beneficial, because it looks like for many + // subgraphs unsharing one node should be enough to untangle them. This + // would need more measurement before implementing. + bool found_unique = false; + for (size_t n = *next_node_id_p; n < nodes.size(); ++n) { + size_t cur_hash = nodes[n]->GetHighTopoHash(); + if (n + 1 < nodes.size() && nodes[n + 1]->GetHighTopoHash() == cur_hash) { + // A sequence of nodes sharing the same hash. Skip over it. + // TODO(babkin): check here for the arbitrary hash conflicts and resolve + // them. + for (++n; + n + 1 < nodes.size() && nodes[n + 1]->GetHighTopoHash() == cur_hash; + ++n) { + } + if (found_unique || n != nodes.size() - 1) { + // Either some unique nodes have already been found, or this is + // not the last chance, keep trying to find the unique nodes. + continue; + } + // Here we're at the last node and haven't found any unique ones. + // So fall through and make this last node unique. + } + + found_unique = true; + size_t id = (*next_node_id_p)++; + nodes[n]->unique_rank_ = id; + + size_t last_hash = nodes[n]->GetHighTopoHash(); + CombineHash(last_hash, &sig_short); + sig_full.push_back(last_hash); + + // Take the hash at 0 and mix the unique rank into it. After that it will + // stay fixed. + nodes[n]->topo_hash_.resize(1); + nodes[n]->topo_hash_[0] = id + 1; // Avoid the value of 0. + + nodes[n]->hash_is_final_ = true; + nodes[n]->last_hashed_nodes_ = nodes[n]->node_mask_; + if (n != id) { + std::swap(nodes[id], nodes[n]); + } + } +} + +void Signature::ComputeOneRound(size_t next_node_id) { + // Reset the state of the nodes. 
+ int debug_i = 0; + for (auto it = nodes.begin() + next_node_id; it != nodes.end(); ++it) { + auto node = *it; + // The hash at distance 0 never changes, so preserve it. + node->topo_hash_.resize(1); + node->last_hashed_nodes_ = node->node_mask_; + node->hash_is_final_ = false; + if (debug) { + LOG(INFO) << "DEBUG distance=" << 0 << " node " << debug_i++ << " " + << node->name() << " mask=" << std::hex + << node->last_hashed_nodes_; + } + } + + bool stop = false; + // The distance can reach up to nodes.size()+1, to include not only all the + // nodes but also all the redundant paths. + for (int distance = 1; !stop; ++distance) { + for (auto it = nodes.begin() + next_node_id; it != nodes.end(); ++it) { + auto node = *it; + if (node->hash_is_final_) { + continue; + } + node->ComputeTopoHash(distance); + if (node->GetHighTopoHash() <= nodes.size()) { + // Would conflict with one of the reserved values. + node->ReHighTopoHash(); + } + } + + // Will be looking for the indications to not stop. + stop = true; + + debug_i = 0; + // The bitmasks get moved after all the hash computations are done. + for (auto it = nodes.begin() + next_node_id; it != nodes.end(); ++it) { + auto node = *it; + if (debug) { + LOG(INFO) << "DEBUG distance=" << distance << " node " << debug_i++ + << " " << node->name() << " oldmask=" << std::hex + << node->last_hashed_nodes_ << " mask=" << std::hex + << node->next_hashed_nodes_; + } + if (node->last_hashed_nodes_ == node->next_hashed_nodes_) { + // Stopped growing, this part of the graph must be fully + // surrounded by nodes that already have the unique ids. 
+ node->hash_is_final_ = true; + } else { + node->last_hashed_nodes_ = node->next_hashed_nodes_; + stop = false; + } + } + } +} + +void Signature::OrderLinks() { + for (const auto& node : nodes) { + if (node->hashed_peers_.empty()) { + continue; + } + + size_t cur_link_hash = node->hashed_peers_[0].link_hash + 1; + int first_idx = -1; + + int idx; + for (idx = 0; idx < node->hashed_peers_.size(); ++idx) { + auto& entry = node->hashed_peers_[idx]; + if (entry.link_hash == cur_link_hash) { + continue; + } + if (idx - first_idx > 1) { + // Need to sort. + std::sort(node->hashed_peers_.begin() + first_idx, + node->hashed_peers_.begin() + idx, + SigNode::HashedPeer::LessByRank()); + } + + cur_link_hash = entry.link_hash; + first_idx = idx; + } + if (idx - first_idx > 1) { + // Sort the last bunch. + std::sort(node->hashed_peers_.begin() + first_idx, + node->hashed_peers_.begin() + idx, + SigNode::HashedPeer::LessByRank()); + } + } +} + +bool Signature::operator==(const Signature& other) const { + // Tries to find the differences as early as possible by + // comparing the hashes first. 
+ + if (sig_short != other.sig_short) { + return false; + } + if (sig_full.size() != other.sig_full.size()) { + return false; + } + + for (auto it1 = sig_full.begin(), it2 = other.sig_full.begin(); + it1 != sig_full.end(); ++it1, ++it2) { + if (*it1 != *it2) { + return false; + } + } + + if (nodes.size() != other.nodes.size()) { + return false; + } + for (auto it1 = nodes.begin(), it2 = other.nodes.begin(); it1 != nodes.end(); + ++it1, ++it2) { + if (**it1 != **it2) { + return false; + } + } + + return true; +} + +} // end namespace graph_analyzer +} // end namespace grappler +} // end namespace tensorflow diff --git a/tensorflow/core/grappler/graph_analyzer/sig_node.h b/tensorflow/core/grappler/graph_analyzer/sig_node.h new file mode 100644 index 0000000000..45c0ed3162 --- /dev/null +++ b/tensorflow/core/grappler/graph_analyzer/sig_node.h @@ -0,0 +1,304 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_SIG_NODE_H_ +#define TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_SIG_NODE_H_ + +#include +#include +#include + +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/grappler/graph_analyzer/gen_node.h" +#include "tensorflow/core/grappler/graph_analyzer/hash_tools.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" + +namespace tensorflow { +namespace grappler { +namespace graph_analyzer { + +namespace test { +class SigBaseTest; +} // end namespace test + +class SigNode; + +// To find nodes by name. Having the map ordered makes the tests easier, +// and it isn't used in production code often enough to get any win from +// using an unordered map. +using SigNodeMap = std::map>; + +// One node in the graph, in the form convenient for generation of the signature +// of the graph, and comparison of two (sub)graphs for equivalence. It refers to +// the original NodeDef protobuf for most information and adds the extra +// enrichment. +// +// The graph building is 2-stage: first match a SigNode with each NodeDef and +// collect them into a map that finds them by name, then process the map, +// deep-parse the underlying NodeDefs and connect the SigNodes together. +class SigNode { + public: + friend struct Signature; + + // Will keep the pointer to the underlying NodeDef, so that + // underlying object must not be deleted while SigNode is alive. + explicit SigNode(const NodeDef* node); + + // Access wrappers. 
+ const string& name() const { return node_->name(); } + const string& opcode() const { return node_->op(); } + const NodeDef* node_def() const { return node_; } + + // For extraction of subgraphs into a separate SigNodeMap, copies the links + // that point inside the subgraph from a full-graph SigNode to a subgraph + // SigNode. The translation map defines the subgraph and gives the mapping + // from the nodes in the full graph to the matching nodes in subgraph. + using TranslationMap = + std::unordered_map; + void CopyLinks(const GenNode& from, const TranslationMap& map); + + // A link is an edge of the graph that connects 2 nodes. Each of the connected + // nodes has its own perspective on the link, seeing its local port, remote + // port and the remote node. The direction of the link is encoded in the + // ports, one port is always incoming and another one outgoing. + // + // The link tag here contains both ports of the link viewed from the + // perspective of this node; consisting of both the local port (i.e. at this + // node) and remote port (i.e. on the other node), the local one going first. + struct LinkTag { + struct Hasher { + size_t operator()(const LinkTag& tag) const noexcept { + size_t hval = port_hasher(tag.local); + CombineHash(port_hasher(tag.remote), &hval); + return hval; + } + GenNode::Port::Hasher port_hasher; + }; + + LinkTag(GenNode::Port a_local, GenNode::Port a_remote) + : local(a_local), remote(a_remote) {} + + // The default constructor is used for the default values in maps. + // (false, 99) is an arbitrary value that makes the uninitialized + // links easy to tell when debugging (they should never happen). + LinkTag() : local(false, 99), remote(false, 99) {} + + // Port of the link on the local node. + GenNode::Port local; + // Port of the link on the remote node. 
+ GenNode::Port remote; + + bool operator==(const LinkTag& other) const { + return local == other.local && remote == other.remote; + } + bool operator<(const LinkTag& other) const { + return local < other.local || + (local == other.local && remote < other.remote); + } + }; + + // Since the signature logic doesn't differentiate between the links + // with the same tag (other than by the "peer" nodes on their other ends), + // all the links with the same tag are grouped into a single structure. + struct Link { + LinkTag tag; + size_t unique_hash; // Hash of the tag after conflict resolution. + // The remote node(s) on the other side on the link(s). + using PeerVector = std::vector; + PeerVector peers; + }; + + // A way to look up the link description by its hash. + using LinkHashMap = std::map; + const LinkHashMap& hash_to_link() const { return hash_to_link_; } + + // The enumeration of all the peer nodes in a predictable order. + // Before the signature generation, only the link values determine the + // order, after the signature generation the entries at the same + // links get further sorted by their peer node ranks. + struct HashedPeer { + HashedPeer(size_t l, SigNode* p) : link_hash(l), peer(p) {} + + struct LessByRank { + bool operator()(const SigNode::HashedPeer& left, + const SigNode::HashedPeer& right) { + return left.peer->unique_rank_ < right.peer->unique_rank_; + } + }; + + size_t link_hash; + SigNode* peer; + }; + using HashedPeerVector = std::vector; + const HashedPeerVector& hashed_peers() const { return hashed_peers_; } + + // Compares two nodes in two different graphs for equivalence (two nodes in + // the same graph would never be equivalent). Expects that the signatures of + // the graphs have already been computed, so unique_rank_ is filled in and + // the hashed_peers_ properly ordered. 
+ bool operator==(const SigNode& other) const; + + bool operator!=(const SigNode& other) const { return !(*this == other); } + + private: + friend class test::SigBaseTest; + + // The CopyLinks code is split into 2 parts for testability. + // The first pass builds a map ordered by LinkTag for predictability. + void CopyLinksPass1(const GenNode& from, const TranslationMap& map, + std::map* link_map); + // The second pass converts to the map by hash value, + // resolves any hash conflicts, and builds the hashed peer vector. + void CopyLinksPass2(std::map* link_map); + + // Computes the topological hash at distance 0. Resets the topo_hash_ vector + // and the last_hashed_nodes_/next_hashed_nodes_ masks. + void ComputeTopoHash0(); + + // Compute the topological hash at the given distance. The hashes for all the + // lower distances must be already computed for all the nodes in the graph. + // Also computes next_hashed_nodes_ from last_hashed_nodes_. + void ComputeTopoHash(int distance); + + // Get the hash value for a particular distance. It must be previously + // computed. + size_t GetTopoHash(int distance) const; + + // Get the hash value for the highest computed distance. It must be previously + // computed. + size_t GetHighTopoHash() const { + CHECK(!topo_hash_.empty()); + return topo_hash_.back(); + } + + // Rehash the topmost hash, to avoid conflicts. + void ReHighTopoHash() { + CHECK(!topo_hash_.empty()); + CombineHash(1, &topo_hash_.back()); + } + + // Ordering by node order and highest available hash (it must be + // previously computed). + struct NodeOrderLess { + bool operator()(const SigNode* left, const SigNode* right) { + return left->topo_hash_.back() < right->topo_hash_.back(); + } + }; + + private: + const NodeDef* node_; + + // The bitmap mask with 1 bit set that represents this node in the set + // during the computation of the signature. + uint64_t node_mask_ = 0; + + // The code that populates this map makes sure that there are no hash + // conflicts, rehashing if necessary. 
+ LinkHashMap hash_to_link_; + + // The enumeration of all the direct peers in the predictable order (which + // happens to be the order of their link tags, but the order of the hashes + // would do too). It is used for the quick enumeration during the signature + // computation. After the signature building is completed, the entries that + // have the same link tag get further sorted in the order of the ranks of + // their nodes. + HashedPeerVector hashed_peers_; + + // The unique rank represents the order in which the node will be included + // into the signature. It gets assigned in order either when the topo_hash_ of + // this node becomes unique in the graph, or when the nodes are completely + // equivalent, one of them is picked at random to assign the next rank, and + // then the rest of the nodes attempt to disambiguate based on that + // information. + size_t unique_rank_ = ~0; + // When hash_is_final_ is set, the topo_hash_ vector stops growing, and the + // last value from it is used for all the further hashes. + bool hash_is_final_ = false; + // The hashes that include the topology of the nodes up to the distance N. The + // hash for distance 0 is produced from the attributes of this node itself and + // its general connectivity properties but no information about the + // neighboring nodes. The hash for distance D+1 is built from hashes at level + // D of this node and of all its immediate neighbors. The neighbors that are + // connected by equivalent links are included in a commutative way. + std::vector topo_hash_; + // The set of nodes that got included into the computation of the + // last topo_hash_ entry. + uint64_t last_hashed_nodes_ = 0; + // The next set of nodes that gets used for the current topo_hash entry. + uint64_t next_hashed_nodes_ = 0; +}; + +// Signature of a graph. The computation is intertwined with the private methods +// of SigNode, so keeping both in the same file looks more convenient. 
+struct Signature { + friend class test::SigBaseTest; + + // Maximal size of the graphs for which the signature can be computed. + // Changing this constant won't magically add the support for a larger size, + // the rest of implementation would have to be extended. The value of 64 is + // driven by the size of a bitset in a uint64_t, and should be enough for our + // purposes, while having a high efficiency of implementation. + static constexpr int kMaxGraphSize = 64; + + // Using the map, computes the rest of the fields of a signature. + // Returns an error if the graph is too big. + Status Compute(); + + // Convert the computed signature to a string representation. + string ToString() const; + + SigNodeMap map; // The nodes in the graph, accessible by name. + size_t sig_short = 0; // Hash of the signature, for the quick equality check. + // The full signature: hashes of the nodes in a predictable order. + std::vector sig_full; + // The nodes in the same order as they go in the signature. + std::vector nodes; + + // For building the unordered maps. + size_t Hash() const { return sig_short; } + + // Returns true if the graphs are equivalent. The signature must be already + // computed. + bool operator==(const Signature& other) const; + + private: + // Populates the nodes vector from the map and initializes the state of the + // nodes for the signature computation. + void PrepareNodes(); + + // Finds the nodes with the hashes that are unique and assigns the unique ids + // to them. If none of the remaining nodes has a unique hash, exactly one node + // (the last one in the order of hash values) will be picked and + // assigned a unique id. Assumes that the nodes[0...(next_node_id-1)] have + // been already assigned the unique ids. Advances next_node_id by at least 1. + void FindUniqueHashes(size_t* next_node_id_p); + + // One round of the signature computation. 
Assumes that the + // nodes[0...(next_node_id-1)] have been already assigned the fixed + // positions, and thus computes the hashes only for the remaining nodes. + void ComputeOneRound(size_t next_node_id); + + // Additional ordering of the hashed_peers_ links in the nodes, so that they + // can be compared and printed in a predictable order. + void OrderLinks(); +}; + +} // end namespace graph_analyzer +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_SIG_NODE_H_ diff --git a/tensorflow/core/grappler/graph_analyzer/sig_node_test.cc b/tensorflow/core/grappler/graph_analyzer/sig_node_test.cc new file mode 100644 index 0000000000..4c6a9ba9e0 --- /dev/null +++ b/tensorflow/core/grappler/graph_analyzer/sig_node_test.cc @@ -0,0 +1,1235 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/grappler/graph_analyzer/sig_node.h" + +#include +#include +#include "absl/memory/memory.h" +#include "absl/strings/str_format.h" +#include "tensorflow/core/grappler/graph_analyzer/subgraph.h" +#include "tensorflow/core/grappler/graph_analyzer/test_tools.h" +#include "tensorflow/core/grappler/utils.h" + +namespace tensorflow { +namespace grappler { +namespace graph_analyzer { +namespace test { + +using ::testing::ElementsAre; +using ::testing::Eq; +using ::testing::Gt; +using ::testing::Ne; +using ::testing::SizeIs; + +//=== + +TEST(SigNodeLinkTag, Compare) { + SigNode::LinkTag a(GenNode::Port(false, 1), GenNode::Port(false, 2)); + SigNode::LinkTag b(GenNode::Port(false, 1), GenNode::Port(false, 2)); + SigNode::LinkTag c(GenNode::Port(false, 2), GenNode::Port(false, 1)); + SigNode::LinkTag d(GenNode::Port(false, 1), GenNode::Port(false, 3)); + SigNode::LinkTag e(GenNode::Port(false, 2), GenNode::Port(false, 2)); + + EXPECT_TRUE(a == b); + EXPECT_FALSE(a == c); + EXPECT_FALSE(a == e); + + EXPECT_FALSE(a < b); + EXPECT_FALSE(b < a); + + EXPECT_TRUE(a < c); + EXPECT_FALSE(c < a); + + EXPECT_TRUE(a < d); + EXPECT_FALSE(d < a); +} + +//=== + +class SigBaseTest : public ::testing::Test, protected TestGraphs { + protected: + void BuildSigMap(const GraphDef& graph) { + gen_map_.clear(); + sig_.map.clear(); + CHECK(GenNode::BuildGraphInMap(graph, &gen_map_).ok()); + Subgraph::Identity id; + for (const auto& entry : gen_map_) { + id.insert(entry.second.get()); + } + Subgraph sg(id); + sg.ExtractForSignature(&sig_.map); + } + + static void CopyLinksPass2( + std::map* link_map, SigNode* node) { + node->CopyLinksPass2(link_map); + } + + static void ComputeTopoHash0(SigNode* node) { node->ComputeTopoHash0(); } + + static void ComputeTopoHash(int distance, SigNode* node) { + node->ComputeTopoHash(distance); + } + + static size_t GetTopoHash(int distance, SigNode* node) { 
+ return node->GetTopoHash(distance); + } + + static size_t GetHighTopoHash(SigNode* node) { + return node->GetHighTopoHash(); + } + + static void ReHighTopoHash(SigNode* node) { node->ReHighTopoHash(); } + + static SigNode::HashedPeerVector& RefHashedPeers(SigNode* node) { + return node->hashed_peers_; + } + static size_t& RefUniqueRank(SigNode* node) { return node->unique_rank_; } + static bool& RefHashIsFinal(SigNode* node) { return node->hash_is_final_; } + static std::vector& RefTopoHash(SigNode* node) { + return node->topo_hash_; + } + static uint64_t& RefNodeMask(SigNode* node) { return node->node_mask_; } + static uint64_t& RefLastHashedNodes(SigNode* node) { + return node->last_hashed_nodes_; + } + static uint64_t& RefNextHashedNodes(SigNode* node) { + return node->next_hashed_nodes_; + } + + static void PrepareNodes(Signature* signature) { signature->PrepareNodes(); } + + static void FindUniqueHashes(size_t* next_node_id_p, Signature* signature) { + signature->FindUniqueHashes(next_node_id_p); + } + + static void ComputeOneRound(size_t next_node_id, Signature* signature) { + signature->ComputeOneRound(next_node_id); + } + + static void OrderLinks(Signature* signature) { signature->OrderLinks(); } + + // These get initialized in BuildSigMap(). + GenNodeMap gen_map_; + Signature sig_; +}; + +//=== + +class SigNodeTest : public SigBaseTest {}; + +// Tests that the duplicate hashes get resolved by rehashing. 
+TEST_F(SigNodeTest, DuplicateHash) { + NodeDef node1 = MakeNodeConst("node1"); + NodeDef node2 = MakeNodeConst("node2"); + NodeDef node3 = MakeNodeShapeN("node3", "node1", "node2"); + + SigNode sn1(&node1); + SigNode sn2(&node2); + SigNode sn3(&node3); + + constexpr size_t kSameHash = 999; + + SigNode::Link link1; + link1.tag = SigNode::LinkTag(GenNode::Port(true, 0), GenNode::Port(false, 0)); + link1.unique_hash = kSameHash; + link1.peers.emplace_back(&sn1); + + SigNode::Link link2; + link2.tag = SigNode::LinkTag(GenNode::Port(true, 1), GenNode::Port(false, 0)); + link2.unique_hash = kSameHash; + link2.peers.emplace_back(&sn2); + + SigNode::Link link3; + link3.tag = SigNode::LinkTag(GenNode::Port(true, 2), GenNode::Port(false, 0)); + link3.unique_hash = kSameHash; + link3.peers.emplace_back(&sn3); + + std::map link_map; + link_map[link1.tag] = link1; + link_map[link2.tag] = link2; + link_map[link3.tag] = link3; + + CopyLinksPass2(&link_map, &sn3); + auto& hl = sn3.hash_to_link(); + EXPECT_THAT(hl, SizeIs(3)); + + // Check that the hashes are self_consistent, and put the entries into + // another map with a known order. + std::map rehashed; + auto hlit = hl.begin(); + ASSERT_THAT(hlit, Ne(hl.end())); + EXPECT_THAT(hlit->second.unique_hash, Eq(hlit->first)); + rehashed[hlit->second.tag] = hlit->second; + ++hlit; + ASSERT_THAT(hlit, Ne(hl.end())); + EXPECT_THAT(hlit->second.unique_hash, Eq(hlit->first)); + rehashed[hlit->second.tag] = hlit->second; + ++hlit; + ASSERT_THAT(hlit, Ne(hl.end())); + EXPECT_THAT(hlit->second.unique_hash, Eq(hlit->first)); + rehashed[hlit->second.tag] = hlit->second; + + // Just in case. 
+ ASSERT_THAT(rehashed, SizeIs(3)); + + auto rhit = rehashed.begin(); + ASSERT_THAT(rhit, Ne(rehashed.end())); + EXPECT_TRUE(rhit->second.tag == link1.tag); + EXPECT_THAT(rhit->second.unique_hash, Eq(kSameHash)); + EXPECT_THAT(rhit->second.peers, ElementsAre(&sn1)); + + ++rhit; + ASSERT_THAT(rhit, Ne(rehashed.end())); + EXPECT_TRUE(rhit->second.tag == link2.tag); + // This hash must be rehashed. + EXPECT_THAT(rhit->second.unique_hash, Ne(kSameHash)); + size_t hash2 = rhit->second.unique_hash; + EXPECT_THAT(rhit->second.peers, ElementsAre(&sn2)); + + ++rhit; + ASSERT_THAT(rhit, Ne(rehashed.end())); + EXPECT_TRUE(rhit->second.tag == link3.tag); + // This hash must be rehashed. + EXPECT_THAT(rhit->second.unique_hash, Ne(kSameHash)); + EXPECT_THAT(rhit->second.unique_hash, Ne(hash2)); + size_t hash3 = rhit->second.unique_hash; + EXPECT_THAT(rhit->second.peers, ElementsAre(&sn3)); + + auto& peers = sn3.hashed_peers(); + EXPECT_THAT(peers, SizeIs(3)); + + auto peerit = peers.begin(); + ASSERT_THAT(peerit, Ne(peers.end())); + EXPECT_THAT(peerit->link_hash, Eq(kSameHash)); + EXPECT_THAT(peerit->peer, Eq(&sn1)); + + ++peerit; + ASSERT_THAT(peerit, Ne(peers.end())); + EXPECT_THAT(peerit->link_hash, Eq(hash2)); + EXPECT_THAT(peerit->peer, Eq(&sn2)); + + ++peerit; + ASSERT_THAT(peerit, Ne(peers.end())); + EXPECT_THAT(peerit->link_hash, Eq(hash3)); + EXPECT_THAT(peerit->peer, Eq(&sn3)); +} + +// The full CopyLinks() is tested in (SubgraphTest, ExtractForSignature). + +TEST_F(SigNodeTest, GetTopoHash) { + NodeDef node1 = MakeNodeConst("node1"); + SigNode sn1(&node1); + + // Fake some hash values. 
+ RefTopoHash(&sn1).emplace_back(123); + RefTopoHash(&sn1).emplace_back(456); + + EXPECT_THAT(GetTopoHash(0, &sn1), Eq(123)); + EXPECT_THAT(GetTopoHash(1, &sn1), Eq(456)); + + RefHashIsFinal(&sn1) = true; + + EXPECT_THAT(GetTopoHash(0, &sn1), Eq(123)); + EXPECT_THAT(GetTopoHash(1, &sn1), Eq(456)); + EXPECT_THAT(GetTopoHash(2, &sn1), Eq(456)); + + EXPECT_THAT(GetHighTopoHash(&sn1), Eq(456)); +} + +TEST_F(SigNodeTest, ReTopoHash) { + NodeDef node1 = MakeNodeConst("node1"); + SigNode sn1(&node1); + + // Fake some hash values. + RefTopoHash(&sn1).emplace_back(123); + RefTopoHash(&sn1).emplace_back(456); + + EXPECT_THAT(GetTopoHash(0, &sn1), Eq(123)); + EXPECT_THAT(GetTopoHash(1, &sn1), Eq(456)); + + ReHighTopoHash(&sn1); + + size_t expected_hash = 456; + CombineHash(1, &expected_hash); + + EXPECT_THAT(GetTopoHash(0, &sn1), Eq(123)); + EXPECT_THAT(GetTopoHash(1, &sn1), Eq(expected_hash)); +} + +TEST_F(SigNodeTest, ComputeTopoHash0) { + NodeDef node1 = MakeNodeConst("node1"); + SigNode sn1(&node1); + + // Fake a topology. + RefUniqueRank(&sn1) = 10; + RefNodeMask(&sn1) = 0x02; + + RefTopoHash(&sn1).emplace_back(123); + RefTopoHash(&sn1).emplace_back(456); + + // Fake a state. + RefLastHashedNodes(&sn1) = 0xFF; + RefNextHashedNodes(&sn1) = 0xFF; + + RefHashedPeers(&sn1).emplace_back(SigNode::HashedPeer(1, nullptr)); + RefHashedPeers(&sn1).emplace_back(SigNode::HashedPeer(1, nullptr)); + RefHashedPeers(&sn1).emplace_back(SigNode::HashedPeer(2, nullptr)); + RefHashedPeers(&sn1).emplace_back(SigNode::HashedPeer(3, nullptr)); + RefHashedPeers(&sn1).emplace_back(SigNode::HashedPeer(3, nullptr)); + + // Run the test. 
+ ComputeTopoHash0(&sn1); + + EXPECT_THAT(RefLastHashedNodes(&sn1), Eq(0x02)); + EXPECT_THAT(RefNextHashedNodes(&sn1), Eq(0x02)); + EXPECT_THAT(RefTopoHash(&sn1), SizeIs(1)); + + size_t exp_hval = std::hash()(sn1.opcode()); + CombineHash(1, &exp_hval); + CombineHash(1, &exp_hval); + CombineHash(2, &exp_hval); + CombineHash(3, &exp_hval); + CombineHash(3, &exp_hval); + + EXPECT_THAT(GetTopoHash(0, &sn1), Eq(exp_hval)); +} + +TEST_F(SigNodeTest, ComputeTopoHashNotFinal) { + NodeDef node1 = MakeNodeConst("node1"); + SigNode sn1(&node1); + NodeDef node2 = MakeNodeConst("node2"); + SigNode sn2(&node2); + NodeDef node3 = MakeNodeConst("node3"); + SigNode sn3(&node3); + + // Fake a topology. + RefUniqueRank(&sn1) = 0; + RefNodeMask(&sn1) = 0x01; + RefUniqueRank(&sn2) = 0; + RefNodeMask(&sn2) = 0x02; + RefUniqueRank(&sn3) = 0; + RefNodeMask(&sn3) = 0x04; + + RefHashedPeers(&sn1).emplace_back(SigNode::HashedPeer(10, &sn2)); + RefHashedPeers(&sn1).emplace_back(SigNode::HashedPeer(10, &sn3)); + RefHashedPeers(&sn1).emplace_back(SigNode::HashedPeer(20, &sn2)); + RefHashedPeers(&sn1).emplace_back(SigNode::HashedPeer(30, &sn3)); + RefHashedPeers(&sn1).emplace_back(SigNode::HashedPeer(30, &sn2)); + + // Fake a state. + RefTopoHash(&sn1).emplace_back(123); + RefTopoHash(&sn1).emplace_back(321); + + RefTopoHash(&sn2).emplace_back(456); + RefTopoHash(&sn2).emplace_back(654); + + RefTopoHash(&sn3).emplace_back(789); + RefTopoHash(&sn3).emplace_back(987); + + // These values are not realistic in the way that they don't include the bits + // from the mask of nodes themselves, but that's the point of this test: only + // the previous nodes' node sets are used in the computation, not their own + // masks directly. + RefLastHashedNodes(&sn1) = 0x8; + RefLastHashedNodes(&sn2) = 0x10; + RefLastHashedNodes(&sn3) = 0x20; + + // A scratch value to get overwritten. + RefNextHashedNodes(&sn1) = 0x100; + + ComputeTopoHash(2, &sn1); + + EXPECT_THAT(RefLastHashedNodes(&sn1), Eq(0x8)); // Unchanged. 
+ EXPECT_THAT(RefNextHashedNodes(&sn1), Eq(0x38)); + + // This computes the hash form the explicit numbers above. + size_t exp_hash = 123; // The 0th hash is the starting point. + size_t comm_hash; + + comm_hash = 0; + CombineHashCommutative(654, &comm_hash); + CombineHashCommutative(987, &comm_hash); + + CombineHash(10, &exp_hash); + CombineHash(comm_hash, &exp_hash); + + comm_hash = 0; + CombineHashCommutative(654, &comm_hash); + + CombineHash(20, &exp_hash); + CombineHash(comm_hash, &exp_hash); + + comm_hash = 0; + CombineHashCommutative(654, &comm_hash); + CombineHashCommutative(987, &comm_hash); + + CombineHash(30, &exp_hash); + CombineHash(comm_hash, &exp_hash); + + EXPECT_THAT(GetTopoHash(2, &sn1), Eq(exp_hash)); + EXPECT_THAT(RefTopoHash(&sn1), SizeIs(3)); +} + +TEST_F(SigNodeTest, ComputeTopoHashFinal) { + NodeDef node1 = MakeNodeConst("node1"); + SigNode sn1(&node1); + NodeDef node2 = MakeNodeConst("node2"); + SigNode sn2(&node2); + NodeDef node3 = MakeNodeConst("node3"); + SigNode sn3(&node3); + + // Fake a topology - same as for ComputeTopoHashNotFinal. + RefUniqueRank(&sn1) = 0; + RefNodeMask(&sn1) = 0x01; + RefUniqueRank(&sn2) = 0; + RefNodeMask(&sn2) = 0x02; + RefUniqueRank(&sn3) = 0; + RefNodeMask(&sn3) = 0x04; + + RefHashedPeers(&sn1).emplace_back(SigNode::HashedPeer(10, &sn2)); + RefHashedPeers(&sn1).emplace_back(SigNode::HashedPeer(10, &sn3)); + RefHashedPeers(&sn1).emplace_back(SigNode::HashedPeer(20, &sn2)); + RefHashedPeers(&sn1).emplace_back(SigNode::HashedPeer(30, &sn3)); + RefHashedPeers(&sn1).emplace_back(SigNode::HashedPeer(30, &sn2)); + + // Fake a state - mostly same as for ComputeTopoHashNotFinal. 
+ RefTopoHash(&sn1).emplace_back(123); + RefTopoHash(&sn1).emplace_back(321); + + RefTopoHash(&sn2).emplace_back(456); + RefTopoHash(&sn2).emplace_back(654); + + RefTopoHash(&sn3).emplace_back(789); + RefTopoHash(&sn3).emplace_back(987); + + // These values are not realistic in the way that they don't include the bits + // from the mask of nodes themselves, but that's the point of this test: only + // the previous nodes' node sets are used in the computation, not their own + // masks directly. + RefLastHashedNodes(&sn1) = 0x8; + RefLastHashedNodes(&sn2) = 0x10; + RefLastHashedNodes(&sn3) = 0x20; + + // A scratch value to get overwritten. + RefNextHashedNodes(&sn1) = 0x100; + + // This is the difference in configuration. + RefHashIsFinal(&sn1) = true; + + ComputeTopoHash(2, &sn1); + + EXPECT_THAT(RefLastHashedNodes(&sn1), Eq(0x8)); // Unchanged. + EXPECT_THAT(RefNextHashedNodes(&sn1), Eq(0x8)); + EXPECT_THAT(RefTopoHash(&sn1), SizeIs(2)); + EXPECT_THAT(GetTopoHash(2, &sn1), Eq(321)); +} + +TEST_F(SigNodeTest, EqualsOpcode) { + NodeDef node1 = MakeNodeConst("node1"); + SigNode sn1(&node1); + + NodeDef node2 = MakeNodeConst("node2"); + SigNode sn2(&node2); + + EXPECT_TRUE(sn1 == sn2); + EXPECT_FALSE(sn1 != sn2); + + node2.set_op("Mul"); + + EXPECT_TRUE(sn1 != sn2); + EXPECT_FALSE(sn1 == sn2); +} + +TEST_F(SigNodeTest, EqualsRank) { + NodeDef node1 = MakeNodeConst("node1"); + SigNode sn1(&node1); + + NodeDef node2 = MakeNodeConst("node2"); + SigNode sn2(&node2); + + EXPECT_TRUE(sn1 == sn2); + EXPECT_FALSE(sn1 != sn2); + + RefUniqueRank(&sn1) = 1; + RefUniqueRank(&sn2) = 2; + + EXPECT_TRUE(sn1 != sn2); + EXPECT_FALSE(sn1 == sn2); +} + +// Checks that if the nodes have a different number of links, +// they will be considered unequal. 
+TEST_F(SigNodeTest, EqualsLinkSize) { + GraphDef graph1; + (*graph1.add_node()) = MakeNodeConst("node1"); + (*graph1.add_node()) = MakeNodeMul("node2", "node1", "node1"); + + GenNodeMap gen_map1; + ASSERT_THAT(GenNode::BuildGraphInMap(graph1, &gen_map1), Eq(Status::OK())); + + Subgraph::Identity id1; + id1.insert(gen_map1["node1"].get()); + id1.insert(gen_map1["node2"].get()); + Subgraph sg1(id1); + + SigNodeMap sig_map1; + sg1.ExtractForSignature(&sig_map1); + + GraphDef graph2; + (*graph2.add_node()) = MakeNodeConst("node1"); + // The difference between graph1 and graph2: one more input. + auto node22 = graph2.add_node(); + *node22 = MakeNodeMul("node2", "node1", "node1"); + node22->add_input("node2"); + + GenNodeMap gen_map2; + ASSERT_THAT(GenNode::BuildGraphInMap(graph2, &gen_map2), Eq(Status::OK())); + + Subgraph::Identity id2; + id2.insert(gen_map2["node1"].get()); + id2.insert(gen_map2["node2"].get()); + Subgraph sg2(id2); + + SigNodeMap sig_map2; + sg2.ExtractForSignature(&sig_map2); + + EXPECT_TRUE(*sig_map1["node1"] == *sig_map2["node1"]); + EXPECT_FALSE(*sig_map1["node2"] == *sig_map2["node2"]); + EXPECT_FALSE(*sig_map2["node2"] == *sig_map1["node2"]); +} + +TEST_F(SigNodeTest, EqualsLinks) { + // Start with 2 copies of the same graph. 
+ GraphDef graph1; + (*graph1.add_node()) = MakeNodeConst("node1"); + (*graph1.add_node()) = MakeNodeMul("node2", "node1", "node1"); + + GenNodeMap gen_map1; + ASSERT_THAT(GenNode::BuildGraphInMap(graph1, &gen_map1), Eq(Status::OK())); + + Subgraph::Identity id1; + id1.insert(gen_map1["node1"].get()); + id1.insert(gen_map1["node2"].get()); + Subgraph sg1(id1); + + SigNodeMap sig_map1; + sg1.ExtractForSignature(&sig_map1); + + GenNodeMap gen_map2; + ASSERT_THAT(GenNode::BuildGraphInMap(graph1, &gen_map2), Eq(Status::OK())); + + Subgraph::Identity id2; + id2.insert(gen_map2["node1"].get()); + id2.insert(gen_map2["node2"].get()); + Subgraph sg2(id2); + + SigNodeMap sig_map2; + sg2.ExtractForSignature(&sig_map2); + + EXPECT_TRUE(*sig_map1["node1"] == *sig_map2["node1"]); + EXPECT_TRUE(*sig_map1["node2"] == *sig_map2["node2"]); + + // Alter the link hash of one of the nodes. + SigNode* sn2 = sig_map2["node2"].get(); + ++RefHashedPeers(sn2)[0].link_hash; + + EXPECT_FALSE(*sig_map1["node2"] == *sig_map2["node2"]); + + // Restore back. + --RefHashedPeers(sn2)[0].link_hash; + EXPECT_TRUE(*sig_map1["node2"] == *sig_map2["node2"]); + + // Alter the unique rank of a referenced node. + ++RefUniqueRank(sig_map2["node1"].get()); + + EXPECT_FALSE(*sig_map1["node2"] == *sig_map2["node2"]); +} + +//=== + +class SignatureTest : public SigBaseTest { + protected: + // Initializeds the state used to generate the permutations of a given size. + static void InitPermutation(size_t size, + std::vector* plain_permutation, + std::vector* countdown) { + plain_permutation->clear(); + countdown->clear(); + for (size_t i = 0; i < size; ++i) { + plain_permutation->emplace_back(i); + countdown->emplace_back(size - 1 - i); + } + } + + // Builds a permutation guided by the count-down value. 
+ static void BuildPermutation(const std::vector& plain_permutation, + const std::vector& countdown, + std::vector* result) { + *result = plain_permutation; + for (int i = 0; i < result->size(); ++i) { + std::swap((*result)[i], (*result)[i + countdown[i]]); + } + } + + // Returns false when the count-down is finished. + static bool CountDown(std::vector* countdown) { + // The last position always contains 0, so skip it. + int pos; + for (pos = countdown->size() - 2; pos >= 0; --pos) { + if ((*countdown)[pos] > 0) { + --(*countdown)[pos]; + break; + } + (*countdown)[pos] = (countdown->size() - 1 - pos); + } + + return pos >= 0; + } + + // Permutes the nodes every which way and checks that all the signatures + // produced are the same. This is reasonable for the graphs up to the + // size 5, maybe 6 at the stretch. After that the number of permutation grows + // huge and the test becomes very slow. + void TestGraphEveryWay(const GraphDef& graph) { + size_t graph_size = graph.node_size(); + + gen_map_.clear(); + sig_.map.clear(); + Status result = GenNode::BuildGraphInMap(graph, &gen_map_); + ASSERT_THAT(result, Eq(Status::OK())); + Subgraph::Identity id; + for (const auto& entry : gen_map_) { + id.insert(entry.second.get()); + } + Subgraph sg(id); + sg.ExtractForSignature(&sig_.map); + + std::vector plain_permutation; + std::vector countdown; + InitPermutation(graph_size, &plain_permutation, &countdown); + + std::set signatures; + std::vector permutation; + do { + BuildPermutation(plain_permutation, countdown, &permutation); + + constexpr bool kDebugPermutation = false; + if (kDebugPermutation) { + string p; + for (int i = 0; i < permutation.size(); ++i) { + p.push_back('0' + permutation[i]); + } + LOG(INFO) << "Permutation: " << p; + } + + std::vector> hold(graph_size); + int idx; + + // Permute the nodes. 
+ sig_.nodes.clear(); + idx = 0; + if (kDebugPermutation) { + LOG(INFO) << " nodes before permutation:"; + } + for (auto& entry : sig_.map) { + if (kDebugPermutation) { + LOG(INFO) << " " << entry.second.get(); + } + hold[idx++] = std::move(entry.second); + } + idx = 0; + if (kDebugPermutation) { + LOG(INFO) << " nodes after permutation:"; + } + for (auto& entry : sig_.map) { + entry.second = std::move(hold[permutation[idx++]]); + if (kDebugPermutation) { + LOG(INFO) << " " << entry.second.get(); + } + // This is used to order the links per permutation. + sig_.nodes.emplace_back(entry.second.get()); + RefUniqueRank(entry.second.get()) = idx; + } + // Order the links with the same tags per permutation. + OrderLinks(&sig_); + + // The test as such. + ASSERT_THAT(sig_.Compute(), Eq(Status::OK())); + + signatures.insert(sig_.ToString()); + + EXPECT_THAT(sig_.sig_full, SizeIs(graph_size)); + size_t hval = 0; + for (size_t ih : sig_.sig_full) { + // The space 1..graph_size is reserved. + EXPECT_THAT(ih, Gt(graph_size)); + CombineHash(ih, &hval); + } + EXPECT_THAT(sig_.sig_short, Eq(hval)); + + // Un-permute the nodes for the next iteration. + idx = 0; + for (auto& entry : sig_.map) { + hold[permutation[idx++]] = std::move(entry.second); + } + idx = 0; + if (kDebugPermutation) { + LOG(INFO) << " nodes after un-permutation:"; + } + for (auto& entry : sig_.map) { + entry.second = std::move(hold[idx++]); + if (kDebugPermutation) { + LOG(INFO) << " " << entry.second.get(); + } + } + } while (CountDown(&countdown)); + + for (const auto& s : signatures) { + LOG(INFO) << "Signature: " << s; + } + + // All the permutations should produce the same signature. 
+ EXPECT_THAT(signatures, SizeIs(1)); + } +}; + +TEST_F(SignatureTest, PrepareNodes) { + NodeDef node1 = MakeNodeConst("node1"); + sig_.map["node1"] = absl::make_unique(&node1); + NodeDef node2 = MakeNodeConst("node2"); + sig_.map["node2"] = absl::make_unique(&node2); + NodeDef node3 = MakeNodeConst("node3"); + sig_.map["node3"] = absl::make_unique(&node3); + + PrepareNodes(&sig_); + + ASSERT_THAT(sig_.nodes, SizeIs(3)); + + int idx = 0; + for (const auto& entry : sig_.map) { + EXPECT_THAT(RefNodeMask(entry.second.get()), Eq(1 << idx)) + << " at index " << idx; + EXPECT_THAT(RefUniqueRank(entry.second.get()), Eq(static_cast(~0))) + << " at index " << idx; + EXPECT_THAT(RefHashIsFinal(entry.second.get()), false) + << " at index " << idx; + EXPECT_THAT(RefTopoHash(entry.second.get()), SizeIs(1)) + << " at index " << idx; + ++idx; + } +} + +TEST_F(SignatureTest, FindUniqueHashesAllDifferent) { + NodeDef node1 = MakeNodeConst("node1"); + SigNode sn1(&node1); + NodeDef node2 = MakeNodeConst("node2"); + SigNode sn2(&node2); + NodeDef node3 = MakeNodeConst("node3"); + SigNode sn3(&node3); + NodeDef node4 = MakeNodeConst("node4"); + SigNode sn4(&node4); + + // The last values in the arrays values go in the backwards order. + RefTopoHash(&sn1).emplace_back(100); + RefTopoHash(&sn1).emplace_back(900); + + RefTopoHash(&sn2).emplace_back(200); + RefTopoHash(&sn2).emplace_back(800); + + RefTopoHash(&sn3).emplace_back(300); + RefTopoHash(&sn3).emplace_back(700); + + RefTopoHash(&sn4).emplace_back(400); + RefTopoHash(&sn4).emplace_back(600); + + sig_.nodes.emplace_back(&sn1); + sig_.nodes.emplace_back(&sn2); + sig_.nodes.emplace_back(&sn3); + sig_.nodes.emplace_back(&sn4); + + size_t next = 1; // Skips over sn1. + + FindUniqueHashes(&next, &sig_); + EXPECT_THAT(next, Eq(4)); + + EXPECT_THAT(sig_.nodes[0], Eq(&sn1)); + // The nodes after first one get sorted by the high hash. 
+ EXPECT_THAT(sig_.nodes[1], Eq(&sn4)); + EXPECT_THAT(sig_.nodes[2], Eq(&sn3)); + EXPECT_THAT(sig_.nodes[3], Eq(&sn2)); + + EXPECT_THAT(RefHashIsFinal(&sn1), Eq(false)); + // Nodes that get finalized are marked as such. + EXPECT_THAT(RefHashIsFinal(&sn2), Eq(true)); + EXPECT_THAT(RefHashIsFinal(&sn3), Eq(true)); + EXPECT_THAT(RefHashIsFinal(&sn4), Eq(true)); + + EXPECT_THAT(RefTopoHash(&sn1), SizeIs(2)); + ASSERT_THAT(RefTopoHash(&sn2), SizeIs(1)); + ASSERT_THAT(RefTopoHash(&sn3), SizeIs(1)); + ASSERT_THAT(RefTopoHash(&sn4), SizeIs(1)); + + EXPECT_THAT(RefTopoHash(&sn2)[0], Eq(4)); + EXPECT_THAT(RefTopoHash(&sn3)[0], Eq(3)); + EXPECT_THAT(RefTopoHash(&sn4)[0], Eq(2)); + + EXPECT_THAT(sig_.sig_full, ElementsAre(600, 700, 800)); + + size_t exp_short_hash = 0; + CombineHash(600, &exp_short_hash); + CombineHash(700, &exp_short_hash); + CombineHash(800, &exp_short_hash); + EXPECT_THAT(sig_.sig_short, Eq(exp_short_hash)); +} + +TEST_F(SignatureTest, FindUniqueHashesDuplicatesExceptOne) { + NodeDef node1 = MakeNodeConst("node1"); + SigNode sn1(&node1); + NodeDef node2 = MakeNodeConst("node2"); + SigNode sn2(&node2); + NodeDef node3 = MakeNodeConst("node3"); + SigNode sn3(&node3); + NodeDef node4 = MakeNodeConst("node4"); + SigNode sn4(&node4); + NodeDef node5 = MakeNodeConst("node5"); + SigNode sn5(&node5); + + RefTopoHash(&sn1).emplace_back(100); + RefTopoHash(&sn1).emplace_back(600); + + RefTopoHash(&sn2).emplace_back(200); + RefTopoHash(&sn2).emplace_back(600); + + RefTopoHash(&sn3).emplace_back(300); + RefTopoHash(&sn3).emplace_back(700); + + RefTopoHash(&sn4).emplace_back(400); + RefTopoHash(&sn4).emplace_back(800); + + RefTopoHash(&sn5).emplace_back(500); + RefTopoHash(&sn5).emplace_back(800); + + sig_.nodes.emplace_back(&sn1); + sig_.nodes.emplace_back(&sn2); + sig_.nodes.emplace_back(&sn3); + sig_.nodes.emplace_back(&sn4); + sig_.nodes.emplace_back(&sn5); + + size_t next = 0; + + FindUniqueHashes(&next, &sig_); + EXPECT_THAT(next, Eq(1)); + + // The unique node 
goes first. + EXPECT_THAT(sig_.nodes[0], Eq(&sn3)); + + // The rest of the nodes are assumed to be sorted in a stable order. + EXPECT_THAT(sig_.nodes[1], Eq(&sn2)); + // Node 1 gets swapped with node 3. + EXPECT_THAT(sig_.nodes[2], Eq(&sn1)); + EXPECT_THAT(sig_.nodes[3], Eq(&sn4)); + EXPECT_THAT(sig_.nodes[4], Eq(&sn5)); + + EXPECT_THAT(RefHashIsFinal(&sn1), Eq(false)); + EXPECT_THAT(RefHashIsFinal(&sn2), Eq(false)); + EXPECT_THAT(RefHashIsFinal(&sn3), Eq(true)); + EXPECT_THAT(RefHashIsFinal(&sn4), Eq(false)); + EXPECT_THAT(RefHashIsFinal(&sn5), Eq(false)); + + EXPECT_THAT(RefTopoHash(&sn1), SizeIs(2)); + EXPECT_THAT(RefTopoHash(&sn2), SizeIs(2)); + EXPECT_THAT(RefTopoHash(&sn3), SizeIs(1)); + EXPECT_THAT(RefTopoHash(&sn4), SizeIs(2)); + EXPECT_THAT(RefTopoHash(&sn5), SizeIs(2)); + + EXPECT_THAT(RefTopoHash(&sn3)[0], Eq(1)); +} + +TEST_F(SignatureTest, FindUniqueHashesDuplicates) { + NodeDef node1 = MakeNodeConst("node1"); + SigNode sn1(&node1); + NodeDef node2 = MakeNodeConst("node2"); + SigNode sn2(&node2); + NodeDef node3 = MakeNodeConst("node3"); + SigNode sn3(&node3); + NodeDef node4 = MakeNodeConst("node4"); + SigNode sn4(&node4); + NodeDef node5 = MakeNodeConst("node5"); + SigNode sn5(&node5); + + RefTopoHash(&sn1).emplace_back(100); + RefTopoHash(&sn1).emplace_back(600); + + RefTopoHash(&sn2).emplace_back(200); + RefTopoHash(&sn2).emplace_back(600); + + RefTopoHash(&sn3).emplace_back(300); + RefTopoHash(&sn3).emplace_back(700); + + RefTopoHash(&sn4).emplace_back(400); + RefTopoHash(&sn4).emplace_back(700); + + RefTopoHash(&sn5).emplace_back(500); + RefTopoHash(&sn5).emplace_back(700); + + sig_.nodes.emplace_back(&sn1); + sig_.nodes.emplace_back(&sn2); + sig_.nodes.emplace_back(&sn3); + sig_.nodes.emplace_back(&sn4); + sig_.nodes.emplace_back(&sn5); + + size_t next = 0; + + FindUniqueHashes(&next, &sig_); + EXPECT_THAT(next, Eq(1)); + + // The last copy of the last duplicate wins. 
+ EXPECT_THAT(sig_.nodes[0], Eq(&sn5)); + + // The rest of the nodes are assumed to be sorted in a stable order. + // Node 1 gets swapped. + EXPECT_THAT(sig_.nodes[1], Eq(&sn2)); + EXPECT_THAT(sig_.nodes[2], Eq(&sn3)); + EXPECT_THAT(sig_.nodes[3], Eq(&sn4)); + EXPECT_THAT(sig_.nodes[4], Eq(&sn1)); + + EXPECT_THAT(RefHashIsFinal(&sn1), Eq(false)); + EXPECT_THAT(RefHashIsFinal(&sn2), Eq(false)); + EXPECT_THAT(RefHashIsFinal(&sn3), Eq(false)); + EXPECT_THAT(RefHashIsFinal(&sn4), Eq(false)); + EXPECT_THAT(RefHashIsFinal(&sn5), Eq(true)); + + EXPECT_THAT(RefTopoHash(&sn1), SizeIs(2)); + EXPECT_THAT(RefTopoHash(&sn2), SizeIs(2)); + EXPECT_THAT(RefTopoHash(&sn3), SizeIs(2)); + EXPECT_THAT(RefTopoHash(&sn4), SizeIs(2)); + EXPECT_THAT(RefTopoHash(&sn5), SizeIs(1)); + + EXPECT_THAT(RefTopoHash(&sn5)[0], Eq(1)); +} + +// On a circular topology. +TEST_F(SignatureTest, ComputeOneRoundCircular) { + BuildSigMap(graph_circular_onedir_); + PrepareNodes(&sig_); + + ASSERT_THAT(sig_.nodes, SizeIs(5)); + + // This skips FindUniqueHashes() which would pick one node, so that + // all the nodes are equivalent for ComputeOneRound(). + + ComputeOneRound(0, &sig_); + + // All the nodes are the same, so the computed hashes will also be the same. + size_t hval = GetHighTopoHash(sig_.nodes[0]); + for (int i = 0; i < 5; ++i) { + EXPECT_THAT(GetHighTopoHash(sig_.nodes[i]), Eq(hval)) << " at index " << i; + EXPECT_THAT(RefHashIsFinal(sig_.nodes[i]), Eq(true)) << " at index " << i; + EXPECT_THAT(RefLastHashedNodes(sig_.nodes[i]), Eq(0x1F)) + << " at index " << i; + EXPECT_THAT(RefNextHashedNodes(sig_.nodes[i]), Eq(0x1F)) + << " at index " << i; + // The sets of hashed nodes go like this: + // Step 0: self. + // Step 1: self, previous (-1) and next (+1) node. + // Step 2: self, (-1), (-2), (+1), (+2): all 5 nodes in the graph + // Step 3: still all 5 nodes in the graph + EXPECT_THAT(RefTopoHash(sig_.nodes[i]), SizeIs(4)) << " at index " << i; + } +} + +// On a linear topology. 
+TEST_F(SignatureTest, ComputeOneRoundLinear) { + BuildSigMap(graph_linear_); + PrepareNodes(&sig_); + + ASSERT_THAT(sig_.nodes, SizeIs(5)); + + // This skips FindUniqueHashes() which would pick one node, so that + // all the nodes are equivalent for ComputeOneRound(). + + ComputeOneRound(0, &sig_); + + std::vector hash_size; + for (int i = 0; i < 5; ++i) { + EXPECT_THAT(RefHashIsFinal(sig_.nodes[i]), Eq(true)) << " at index " << i; + EXPECT_THAT(RefLastHashedNodes(sig_.nodes[i]), Eq(0x1F)) + << " at index " << i; + EXPECT_THAT(RefNextHashedNodes(sig_.nodes[i]), Eq(0x1F)) + << " at index " << i; + hash_size.emplace_back(RefTopoHash(sig_.nodes[i]).size()); + } + + // The sets of hashed nodes for the central node go like this: + // Step 0: self. + // Step 1: self, previous (-1) and next (+1) node. + // Step 2: self, (-1), (-2), (+1), (+2): all 5 nodes in the graph + // Step 3: still all 5 nodes in the graph + // + // The nodes one step closer to the ends require one more step. The end nodes + // require one more step yet. + std::sort(hash_size.begin(), hash_size.end()); + EXPECT_THAT(hash_size, ElementsAre(4, 5, 5, 6, 6)); +} + +// On a linear topology where the cental node has been already marked as unique +// (yeah, not a very realistic case but tests the situations when the +// disconnected subgraphs get created). +TEST_F(SignatureTest, ComputeOneRoundSplitLinear) { + BuildSigMap(graph_linear_); + PrepareNodes(&sig_); + + ASSERT_THAT(sig_.nodes, SizeIs(5)); + + // This test relies on the order of SigNodeMap imposed on sig_.nodes. + + // The middle node gets separated by moving it to the front. + std::swap(sig_.nodes[0], sig_.nodes[2]); + ASSERT_THAT(RefNodeMask(sig_.nodes[0]), Eq(0x04)); + ASSERT_THAT(RefLastHashedNodes(sig_.nodes[0]), Eq(0x04)); + ASSERT_THAT(RefNextHashedNodes(sig_.nodes[0]), Eq(0x04)); + RefHashIsFinal(sig_.nodes[0]) = true; + + ComputeOneRound(1, &sig_); + + // These should stay unchanged. 
+ EXPECT_THAT(RefLastHashedNodes(sig_.nodes[0]), Eq(0x04)); + EXPECT_THAT(RefNextHashedNodes(sig_.nodes[0]), Eq(0x04)); + + std::vector hash_size; + for (int i = 1; i < 5; ++i) { + EXPECT_THAT(RefHashIsFinal(sig_.nodes[i]), Eq(true)) << " at index " << i; + hash_size.emplace_back(RefTopoHash(sig_.nodes[i]).size()); + } + + std::sort(hash_size.begin(), hash_size.end()); + // The end nodes take 4 steps, closer to the center 3 steps. + EXPECT_THAT(hash_size, ElementsAre(3, 3, 4, 4)); + + EXPECT_THAT(RefLastHashedNodes(sig_.nodes[1]), Eq(0x07)); + EXPECT_THAT(RefNextHashedNodes(sig_.nodes[1]), Eq(0x07)); + EXPECT_THAT(RefLastHashedNodes(sig_.nodes[2]), Eq(0x07)); + EXPECT_THAT(RefNextHashedNodes(sig_.nodes[2]), Eq(0x07)); + + EXPECT_THAT(RefLastHashedNodes(sig_.nodes[3]), Eq(0x1C)); + EXPECT_THAT(RefNextHashedNodes(sig_.nodes[3]), Eq(0x1C)); + EXPECT_THAT(RefLastHashedNodes(sig_.nodes[4]), Eq(0x1C)); + EXPECT_THAT(RefNextHashedNodes(sig_.nodes[4]), Eq(0x1C)); +} + +TEST_F(SignatureTest, OrderLinks) { + gen_map_.clear(); + sig_.map.clear(); + Status result = GenNode::BuildGraphInMap(graph_for_link_order_, &gen_map_); + ASSERT_THAT(result, Eq(Status::OK())); + Subgraph::Identity id; + for (const auto& entry : gen_map_) { + id.insert(entry.second.get()); + } + Subgraph sg(id); + sg.ExtractForSignature(&sig_.map); + + // Populate the fake signature and assign the ranks in the backwards order. + for (auto it = sig_.map.rbegin(); it != sig_.map.rend(); ++it) { + auto& entry = *it; + RefUniqueRank(entry.second.get()) = sig_.nodes.size(); + sig_.nodes.emplace_back(entry.second.get()); + } + + // How it was ordered in the original graph. 
+ string before = sig_.ToString(); + // clang-format off + EXPECT_THAT(before, Eq( + "0:Mul[i0:o0:5][i0:o0:4][i0:o1:4][i0:o2:3][i0:o2:2][i0:o3:2]," + "1:Mul[i0:o0:5][i0:o0:4][i0:o0:3][i0:o0:2]," + "2:Const," + "3:Const," + "4:Const," + "5:Const," + )); + // clang-format on + + OrderLinks(&sig_); + + string after = sig_.ToString(); + // clang-format off + EXPECT_THAT(after, Eq( + "0:Mul[i0:o0:4][i0:o0:5][i0:o1:4][i0:o2:2][i0:o2:3][i0:o3:2]," + "1:Mul[i0:o0:2][i0:o0:3][i0:o0:4][i0:o0:5]," + "2:Const," + "3:Const," + "4:Const," + "5:Const," + )); + // clang-format on +} + +TEST_F(SignatureTest, GraphTooBig) { + GraphDef graph; + for (int i = 0; i <= Signature::kMaxGraphSize; ++i) { + (*graph.add_node()) = MakeNodeConst(absl::StrFormat("node%d", i)); + } + + ASSERT_THAT(GenNode::BuildGraphInMap(graph, &gen_map_), Eq(Status::OK())); + + Subgraph::Identity id; + for (const auto& entry : gen_map_) { + id.insert(entry.second.get()); + } + Subgraph sg(id); + sg.ExtractForSignature(&sig_.map); + + ASSERT_THAT(sig_.Compute(), + Eq(Status(error::INVALID_ARGUMENT, + "A graph of 65 nodes is too big for signature " + "computation, the maximal supported node count is " + "64."))); +} + +TEST_F(SignatureTest, ToString) { + BuildSigMap(graph_circular_onedir_); + PrepareNodes(&sig_); + + ASSERT_THAT(sig_.nodes, SizeIs(5)); + + // Fake the works by assigning unique ranks as they go in the initial order. + for (int i = 0; i < 5; ++i) { + RefUniqueRank(sig_.nodes[i]) = i; + RefHashIsFinal(sig_.nodes[i]) = true; + } + + string result = sig_.ToString(); + + // clang-format off + ASSERT_THAT(result, Eq( + "0:Mul[i0:o0:4][i0:o0:4]," + "1:Mul[i0:o0:0][i0:o0:0]," + "2:Mul[i0:o0:1][i0:o0:1]," + "3:Mul[i0:o0:2][i0:o0:2]," + "4:Mul[i0:o0:3][i0:o0:3]," + )); + // clang-format on +} + +// This is a test of the permutation logic itself. 
+TEST_F(SignatureTest, Permutation) { + std::vector plain_permutation; + std::vector countdown; + InitPermutation(5, &plain_permutation, &countdown); + + std::set results; + + std::vector permutation; + do { + BuildPermutation(plain_permutation, countdown, &permutation); + EXPECT_THAT(permutation, SizeIs(5)); + + string p; + for (int i = 0; i < permutation.size(); ++i) { + p.push_back('0' + permutation[i]); + } + LOG(INFO) << "Permutation: " << p; + results.insert(p); + } while (CountDown(&countdown)); + + EXPECT_THAT(results, SizeIs(5 * 4 * 3 * 2 * 1)); +} + +TEST_F(SignatureTest, ComputeCircularOneDir) { + TestGraphEveryWay(graph_circular_onedir_); +} + +TEST_F(SignatureTest, ComputeCircularBiDir) { + TestGraphEveryWay(graph_circular_bidir_); +} + +TEST_F(SignatureTest, ComputeLinear) { TestGraphEveryWay(graph_linear_); } + +TEST_F(SignatureTest, ComputeMultiInput) { + TestGraphEveryWay(graph_multi_input_); +} + +TEST_F(SignatureTest, ComputeAllOrNone) { + TestGraphEveryWay(graph_all_or_none_); +} + +TEST_F(SignatureTest, ComputeCross) { TestGraphEveryWay(graph_small_cross_); } + +TEST_F(SignatureTest, Equals) { + // Start with 2 copies of the same graph. + GenNodeMap gen_map1; + ASSERT_THAT(GenNode::BuildGraphInMap(graph_circular_bidir_, &gen_map1), + Eq(Status::OK())); + + Subgraph::Identity id1; + id1.insert(gen_map1["node1"].get()); + id1.insert(gen_map1["node2"].get()); + Subgraph sg1(id1); + + Signature sig1; + sg1.ExtractForSignature(&sig1.map); + ASSERT_THAT(sig1.Compute(), Eq(Status::OK())); + + GenNodeMap gen_map2; + ASSERT_THAT(GenNode::BuildGraphInMap(graph_circular_bidir_, &gen_map2), + Eq(Status::OK())); + + Subgraph::Identity id2; + id2.insert(gen_map2["node1"].get()); + id2.insert(gen_map2["node2"].get()); + Subgraph sg2(id2); + + Signature sig2; + sg2.ExtractForSignature(&sig2.map); + ASSERT_THAT(sig2.Compute(), Eq(Status::OK())); + + EXPECT_TRUE(sig1 == sig2); + + // Change the short hash. 
+ ++sig2.sig_short; + EXPECT_FALSE(sig1 == sig2); + + // Restore back. + --sig2.sig_short; + EXPECT_TRUE(sig1 == sig2); + + // Change the full hash. + ++sig2.sig_full[0]; + EXPECT_FALSE(sig1 == sig2); + + // Restore back. + --sig2.sig_full[0]; + EXPECT_TRUE(sig1 == sig2); + + // Make the nodes different. + std::swap(sig2.nodes[0], sig2.nodes[1]); + EXPECT_FALSE(sig1 == sig2); + + // Restore back. + std::swap(sig2.nodes[0], sig2.nodes[1]); + EXPECT_TRUE(sig1 == sig2); + + // Different number of nodes. + sig2.nodes.emplace_back(sig2.nodes[0]); + EXPECT_FALSE(sig1 == sig2); + EXPECT_FALSE(sig2 == sig1); +} + +} // end namespace test +} // end namespace graph_analyzer +} // end namespace grappler +} // end namespace tensorflow diff --git a/tensorflow/core/grappler/graph_analyzer/subgraph.cc b/tensorflow/core/grappler/graph_analyzer/subgraph.cc new file mode 100644 index 0000000000..28a91e0f84 --- /dev/null +++ b/tensorflow/core/grappler/graph_analyzer/subgraph.cc @@ -0,0 +1,235 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/grappler/graph_analyzer/subgraph.h" + +#include + +#include "absl/memory/memory.h" +#include "absl/strings/str_format.h" +#include "absl/strings/str_join.h" +#include "tensorflow/core/grappler/graph_analyzer/hash_tools.h" + +namespace tensorflow { +namespace grappler { +namespace graph_analyzer { + +//=== Subgraph::Identity + +Subgraph::Identity::Identity(InitializerList init) { + for (auto element : init) { + insert(element); + } +} + +bool Subgraph::Identity::operator<(const Identity& other) const { + // Shorter sets go first. + if (this->size() < other.size()) { + return true; + } + if (this->size() > other.size()) { + return false; + } + for (auto lit = this->begin(), rit = other.begin(); lit != this->end(); + ++lit, ++rit) { + if (*lit < *rit) { + return true; + } + if (*lit > *rit) { + return false; + } + } + return false; // Equal. +} + +bool Subgraph::Identity::operator==(const Identity& other) const { + if (this->size() != other.size()) { + return false; + } + for (auto lit = this->begin(), rit = other.begin(); lit != this->end(); + ++lit, ++rit) { + if (*lit != *rit) { + return false; + } + } + return true; // Equal. +} + +size_t Subgraph::Identity::Hash() const { + std::hash hasher; + size_t result = 0; + for (auto ptr : *this) { + CombineHash(hasher(ptr), &result); + } + return result; +} + +string Subgraph::Dump() { + // TODO(babkin): this is simplified for now. + std::vector nodes; + for (const auto& n : id_) { + if (specific_) { + nodes.emplace_back(absl::StrFormat("%s(%s)", n->opcode(), n->name())); + } else { + nodes.emplace_back(n->opcode()); + } + } + std::sort(nodes.begin(), nodes.end()); + + return absl::StrFormat("%d: ", collation_count_) + absl::StrJoin(nodes, ", "); +} + +void Subgraph::ExtractForSignature(SigNodeMap* result) { + // Mapping of nodes from the original graph to the new one. 
+ SigNode::TranslationMap full_to_new; + + for (auto node : id_) { + auto newnode_ref = absl::make_unique(node->node_def()); + auto newnode = newnode_ref.get(); + (*result)[node->name()] = std::move(newnode_ref); + full_to_new[node] = newnode; + } + + for (const auto& mapping : full_to_new) { + mapping.second->CopyLinks(*mapping.first, full_to_new); + } +} + +//=== Subgraph + +Subgraph::Subgraph(const Identity& parent_id, GenNode* add_node) + : id_(parent_id) { + id_.insert(add_node); + hash_ = id_.Hash(); +} + +//=== SubgraphIterator + +SubgraphIterator::SubgraphIterator(const Subgraph::Identity* id) + : id_(id), id_it_(id_->begin()) { + if (!id_->empty()) { + link_map_it_ = (*id_it_)->links().begin(); + // In case if the node has no links. + while (link_map_it_ == (*id_it_)->links().end()) { + if (++id_it_ == id_->end()) { + return; + } + link_map_it_ = (*id_it_)->links().begin(); + } + link_idx_ = 0; + // The LinkTargetVector should never be empty but just in case safeguard + // against that too. + PropagateNext(); + } +} + +bool SubgraphIterator::Next() { + if (AtEnd()) { + return false; + } + ++link_idx_; + return PropagateNext(); +} + +bool SubgraphIterator::NextIfSamePort() { + if (AtEnd()) { + return false; + } + if (link_idx_ + 1 < link_map_it_->second.size()) { + ++link_idx_; + return true; + } else { + return false; + } +} + +void SubgraphIterator::SkipPort() { + if (AtEnd()) { + return; + } + link_idx_ = link_map_it_->second.size() - 1; +} + +void SubgraphIterator::SkipNode() { + if (AtEnd()) { + return; + } + for (auto next = link_map_it_; next != (*id_it_)->links().end(); ++next) { + link_map_it_ = next; + } + link_idx_ = link_map_it_->second.size() - 1; +} + +bool SubgraphIterator::PropagateNext() { + // Loops are used to skip over the empty entries. 
+ while (link_idx_ >= link_map_it_->second.size()) { + ++link_map_it_; + while (link_map_it_ == (*id_it_)->links().end()) { + if (++id_it_ == id_->end()) { + return false; + } + link_map_it_ = (*id_it_)->links().begin(); + } + link_idx_ = 0; + } + return true; +} + +bool SubgraphIterator::operator==(const SubgraphIterator& other) const { + if (id_ != other.id_) { + return false; + } + if (id_it_ != other.id_it_) { + return false; + } + // When AtEnd(), the rest of the fields are not valid. + if (AtEnd()) { + return true; + } + if (link_map_it_ != other.link_map_it_) { + return false; + } + if (link_idx_ != other.link_idx_) { + return false; + } + return true; +} + +//=== SubgraphPtrSet + +Subgraph* SubgraphPtrSet::ExtendParent(const Subgraph::Identity& parent_id, + GenNode* node) { + if (parent_id.find(node) != parent_id.end()) { + // This was another link to the node that is already in the parent. + return nullptr; + } + + // Constructing an object just to check that an equivalent one is already + // present is kind of ugly but storing the references rather than the objects + // in the set avoids the need to make the object copyable. + auto sg = absl::make_unique(parent_id, node); + if (find(sg) != end()) { + // This subgraph was already found by extending from a different path. + return nullptr; + } + + Subgraph* ptr = sg.get(); + insert(std::move(sg)); + return ptr; +} + +} // end namespace graph_analyzer +} // end namespace grappler +} // end namespace tensorflow diff --git a/tensorflow/core/grappler/graph_analyzer/subgraph.h b/tensorflow/core/grappler/graph_analyzer/subgraph.h new file mode 100644 index 0000000000..4de31d5dfa --- /dev/null +++ b/tensorflow/core/grappler/graph_analyzer/subgraph.h @@ -0,0 +1,189 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_SUBGRAPH_H_ +#define TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_SUBGRAPH_H_ + +#include +#include + +#include "tensorflow/core/grappler/graph_analyzer/gen_node.h" +#include "tensorflow/core/grappler/graph_analyzer/map_tools.h" +#include "tensorflow/core/grappler/graph_analyzer/sig_node.h" +#include "tensorflow/core/lib/gtl/flatset.h" + +namespace tensorflow { +namespace grappler { +namespace graph_analyzer { + +// The description of a single subgraph for processing. +class Subgraph { + public: + // Identity of a single subgraph as a set of nodes. + class Identity : public gtl::FlatSet { + public: + using InitializerList = std::initializer_list; + + Identity() = default; + Identity(InitializerList init); + bool operator<(const Identity& other) const; + bool operator==(const Identity& other) const; + + // Compute the hash. + size_t Hash() const; + }; + + explicit Subgraph(Identity id) : id_(std::move(id)), hash_(id_.Hash()) {} + + // Construct by extending the parent identity with an extra node. + Subgraph(const Identity& parent_id, GenNode* add_node); + + Subgraph() = delete; + Subgraph(const Subgraph& other) = delete; + void operator=(const Subgraph& other) = delete; + + // Order for building sets of subgraphs. + bool operator<(const Subgraph& other) const { return this->id_ < other.id_; } + // Support for hashed sets. 
+ bool operator==(const Subgraph& other) const { + return this->id_ == other.id_; + } + size_t Hash() const { return hash_; } + + // Dump the subgraph information to a string. + string Dump(); + + // Extract this subgraph into a separate graph representation for signature + // building, that includes only the links between the nodes in the subgraph + // and drops all the external links. The result map should be clear before the + // call. + void ExtractForSignature(SigNodeMap* result); + + const Identity& id() const { return id_; } + bool specific() const { return specific_; } + void SetSpecific(bool value) { specific_ = value; } + int32_t collation_count() const { return collation_count_; } + void AddCollation(int32_t n = 1) { collation_count_ += n; } + void ResetCollation() { collation_count_ = 1; } + void MergeCollation(const Subgraph& other) { + collation_count_ += other.collation_count_; + } + + private: + // Identity also serves as the list of nodes. It never changes throughout the + // life of subgraph. + Identity id_; + size_t hash_; // Cached from the identity. + // Whether the dump should include the specific names of the nodes. The + // non-specific (i.e. generic) subgraphs represent a collation of multiple + // subgraphs. + bool specific_ = true; + // How many collated subgraphs are represented by this subgraph. + int32_t collation_count_ = 1; +}; + +// Iteration of all links in a subgraph. This is more like Java iterators than +// the normal C++ iterators. It's simpler this way and there seems to be no +// major reason to make it a proper C++ iterator. +class SubgraphIterator { + public: + // Obviously an iterator is valid only until the original object + // gets destroyed. + explicit SubgraphIterator(const Subgraph::Identity* id); + explicit SubgraphIterator(const Subgraph* sg) : SubgraphIterator(&sg->id()) {} + + // Check whether the built-in iterator is at the end. 
+ bool AtEnd() const { return id_it_ == id_->end(); } + + // Get the neighbor at the current iterator. + // MUST NOT be called when AtEnd(); + const GenNode::LinkTarget& GetNeighbor() const { + return link_map_it_->second[link_idx_]; + } + + // Get the node at the current iterator. + // MUST NOT be called when AtEnd(); + const GenNode* GetNode() const { return *id_it_; } + + // Get the port leading to the neighbor at the current iterator. + // MUST NOT be called when AtEnd(); + GenNode::Port GetPort() const { return link_map_it_->first; } + + // Increases the iterator. + // Returns true if NOT AtEnd() after increasing the iterator. + // Safe to call if already AtEnd(). + bool Next(); + + // If there are more links at the same port, increases the iterator and + // returns true. Otherwise leaves the iterator unchanged and returns false. + bool NextIfSamePort(); + + // Increases the iterator directly to the last position on the current port + // (or if already there then doesn't increase). Equivalent to calling + // NextIfSamePort() while it returns true, but faster. + // Safe to call if already AtEnd(). + void SkipPort(); + + // Increases the iterator directly to the last position on the current node. + // Safe to call if already AtEnd(). + void SkipNode(); + + // Returns true if the iterators are exactly the same. + bool operator==(const SubgraphIterator& other) const; + bool operator!=(const SubgraphIterator& other) const { + return !(*this == other); + } + + private: + // After link_idx_ has been increased, make sure that it points to the + // next valid element (or end) by increasing the higher levels of iteration if + // needed. + // Returns true if NOT AtEnd() after increasing the iterator. + // NOT safe to call if already AtEnd(). + bool PropagateNext(); + + // Identity of the subgraph being iterated over. + const Subgraph::Identity* id_; + + // The current position, allowing to iterate through the links (see the + // reasoning for it in the public section). 
+ // + // (1) Iterator of the nodes in the subgraph. + Subgraph::Identity::const_iterator id_it_; + // (2) Iterator in the link map of the node. + GenNode::LinkMap::const_iterator link_map_it_; + // (3) Index in the vector of the links. + int32_t link_idx_; +}; + +// A convenient way to store subgraphs: in a set of unique_ptrs. This way the +// addresses of subgraph objects will stay stable, and the objects themselves +// won't be copied. +class SubgraphPtrSet + : public std::unordered_set, + HashAtPtr>, + EqAtPtr>> { + public: + // Attempts to extend the set by adding a new subgraph that gets created by + // adding one node to the parent subgraph. If such a subgraph already exists, + // returns nullptr, otherwise returns the pointer to the new subgraph. + Subgraph* ExtendParent(const Subgraph::Identity& parent_id, GenNode* node); +}; + +} // end namespace graph_analyzer +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_SUBGRAPH_H_ diff --git a/tensorflow/core/grappler/graph_analyzer/subgraph_test.cc b/tensorflow/core/grappler/graph_analyzer/subgraph_test.cc new file mode 100644 index 0000000000..0f90dc8f0d --- /dev/null +++ b/tensorflow/core/grappler/graph_analyzer/subgraph_test.cc @@ -0,0 +1,348 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/grappler/graph_analyzer/subgraph.h" + +#include +#include +#include + +#include +#include +#include "absl/memory/memory.h" +#include "absl/strings/str_format.h" +#include "tensorflow/core/grappler/graph_analyzer/test_tools.h" + +namespace tensorflow { +namespace grappler { +namespace graph_analyzer { +namespace test { +namespace { + +using ::testing::ElementsAre; +using ::testing::Eq; +using ::testing::Ne; + +TEST(SubgraphTest, Comparison) { + GraphDef graph; + // A topology with a loop. + (*graph.add_node()) = MakeNodeConst("node1"); + (*graph.add_node()) = MakeNodeConst("node2"); + GenNodeMap map; + ASSERT_THAT(GenNode::BuildGraphInMap(graph, &map), Eq(Status::OK())); + auto gn1 = map["node1"].get(); + auto gn2 = map["node2"].get(); + ASSERT_THAT(gn1, Ne(nullptr)); + ASSERT_THAT(gn2, Ne(nullptr)); + + Subgraph::Identity id1; + Subgraph::Identity id2; + + id1.insert(gn1); + id2.insert(gn2); + + Subgraph sg1(id1); + Subgraph sg2(id2); + + EXPECT_TRUE(id1 == sg1.id()); + EXPECT_TRUE(id2 == sg2.id()); + + EXPECT_THAT(sg1 < sg2, Eq(id1 < id2)); +} + +TEST(SubgraphTest, EmptyIteration) { + NodeDef node1 = MakeNodeConst("node1"); + auto gn1 = absl::make_unique(&node1); + Subgraph::Identity id1; + id1.insert(gn1.get()); + Subgraph sg1(id1); + SubgraphIterator sit(&sg1); + + EXPECT_TRUE(sit.AtEnd()); + EXPECT_FALSE(sit.Next()); + EXPECT_TRUE(sit.AtEnd()); + + SubgraphIterator sit2(&sg1); + EXPECT_TRUE(sit == sit2); +} + +TEST(SubgraphTest, Iteration) { + GraphDef graph; + // A topology with a loop. + (*graph.add_node()) = MakeNodeConst("node1"); + (*graph.add_node()) = MakeNodeSub("node2", "node3:1", "node3:0"); + auto node3 = graph.add_node(); + *node3 = MakeNodeBroadcastGradientArgs("node3", "node1", "node2"); + node3->add_input("^node3"); // The control link goes back to self. 
+ + GenNodeMap map; + ASSERT_THAT(GenNode::BuildGraphInMap(graph, &map), Eq(Status::OK())); + ASSERT_THAT(map.find("node3"), Ne(map.end())); + + Subgraph::Identity id; + id.insert(map["node3"].get()); + Subgraph sg(id); + + // node3 has 2 incoming data links, 2 outgoing data , 1 control incoming, 1 + // control outgoing = total of 6 + { + SubgraphIterator sit(&sg); + EXPECT_FALSE(sit.AtEnd()); // 1 + EXPECT_TRUE(sit.Next()); + EXPECT_FALSE(sit.AtEnd()); // 2 + EXPECT_TRUE(sit.Next()); + EXPECT_FALSE(sit.AtEnd()); // 3 + EXPECT_TRUE(sit.Next()); + EXPECT_FALSE(sit.AtEnd()); // 4 + EXPECT_TRUE(sit.Next()); + EXPECT_FALSE(sit.AtEnd()); // 5 + EXPECT_TRUE(sit.Next()); + EXPECT_FALSE(sit.AtEnd()); // 6 + EXPECT_FALSE(sit.Next()); + EXPECT_TRUE(sit.AtEnd()); + } + + // Now get the values out. And more equality testing along the way. + { + SubgraphIterator sit(&sg); + SubgraphIterator sit2(&sg); + std::vector links; + for (; !sit.AtEnd(); sit.Next()) { + EXPECT_TRUE(sit == sit2); + sit2.Next(); + EXPECT_FALSE(sit == sit2); + + links.push_back(absl::StrFormat("[%s,%s,%s]", string(sit.GetPort()), + sit.GetNeighbor().node->name(), + string(sit.GetNeighbor().port))); + } + EXPECT_TRUE(sit == sit2); + + std::sort(links.begin(), links.end()); + // clang-format off + EXPECT_THAT(links, ElementsAre( + "[i0,node1,o0]", + "[i1,node2,o0]", + "[iC,node3,oC]", + "[o0,node2,i1]", + "[o1,node2,i0]", + "[oC,node3,iC]" + )); + // clang-format on + } +} + +TEST(SubgraphTest, IterationSamePort) { + GraphDef graph; + (*graph.add_node()) = MakeNodeConst("node1"); + (*graph.add_node()) = MakeNodeSub("node2", "node3", "node3"); + (*graph.add_node()) = MakeNodeAddN("node3", "node1", "node2"); + + GenNodeMap map; + ASSERT_THAT(GenNode::BuildGraphInMap(graph, &map), Eq(Status::OK())); + ASSERT_THAT(map.find("node3"), Ne(map.end())); + + Subgraph::Identity id; + id.insert(map["node3"].get()); + Subgraph sg(id); + + int total_links = 0; + for (SubgraphIterator sit(&sg); !sit.AtEnd(); sit.Next()) { + 
++total_links; + } + + // Initialize the port as control, which doesn't occur in this graph. + GenNode::Port last_port(false, -1); + int steps_total_same_port = 0; + int steps_with_same_port = 0; + for (SubgraphIterator sit(&sg); !sit.AtEnd(); sit.Next()) { + GenNode::Port new_port = sit.GetPort(); + EXPECT_THAT(last_port.Encoded(), Ne(new_port.Encoded())) + << "At step " << steps_total_same_port; + last_port = new_port; + + ++steps_total_same_port; + + SubgraphIterator sit2(sit); + sit2.SkipPort(); + + while (sit.NextIfSamePort()) { + new_port = sit.GetPort(); + EXPECT_THAT(last_port.Encoded(), Eq(new_port.Encoded())) + << "At step " << steps_total_same_port; + ++steps_total_same_port; + ++steps_with_same_port; + } + + EXPECT_TRUE(sit == sit2); + } + + EXPECT_THAT(steps_total_same_port, Eq(total_links)); + // There is one 2-way input and one 2-way output. + EXPECT_THAT(steps_with_same_port, Eq(2)); +} + +TEST(SubgraphTest, IterationSameNode) { + GraphDef graph; + (*graph.add_node()) = MakeNodeConst("node1"); + (*graph.add_node()) = MakeNodeSub("node2", "node3", "node3"); + (*graph.add_node()) = MakeNodeAddN("node3", "node1", "node2"); + + GenNodeMap map; + ASSERT_THAT(GenNode::BuildGraphInMap(graph, &map), Eq(Status::OK())); + ASSERT_THAT(map.find("node3"), Ne(map.end())); + + Subgraph::Identity id; + id.insert(map["node3"].get()); + Subgraph sg(id); + + const GenNode* last_node = nullptr; + SubgraphIterator sit(&sg); + while (!sit.AtEnd()) { + const GenNode* new_node = sit.GetNode(); + + EXPECT_THAT(new_node, Ne(last_node)) << "At node " << new_node->name(); + + SubgraphIterator sit2(sit); + sit2.SkipNode(); + + ASSERT_FALSE(sit2.AtEnd()); + EXPECT_THAT(sit2.GetNode(), Eq(new_node)) + << "At expected node " << new_node->name() << ", got " + << sit2.GetNode()->name(); + + while (sit != sit2 && !sit.AtEnd()) { + sit.Next(); + } + + ASSERT_FALSE(sit.AtEnd()); + EXPECT_THAT(sit.GetNode(), Eq(new_node)) + << "At expected node " << new_node->name() << ", got " + << 
sit2.GetNode()->name(); + + sit.Next(); + + last_node = new_node; + } + + // Check that it doesn't fail if already at end. + sit.SkipNode(); + EXPECT_TRUE(sit.AtEnd()); +} + +TEST(SubgraphTest, ExtendSet) { + GraphDef graph; + // A topology with a loop. + (*graph.add_node()) = MakeNodeConst("node1"); + (*graph.add_node()) = MakeNodeSub("node2", "node3:1", "node3:0"); + auto node3 = graph.add_node(); + *node3 = MakeNodeBroadcastGradientArgs("node3", "node1", "node2"); + node3->add_input("^node3"); // The control link goes back to self. + + GenNodeMap map; + ASSERT_THAT(GenNode::BuildGraphInMap(graph, &map), Eq(Status::OK())); + ASSERT_THAT(map.find("node2"), Ne(map.end())); + ASSERT_THAT(map.find("node3"), Ne(map.end())); + + Subgraph::Identity id_empty; + + Subgraph::Identity id3; + id3.insert(map["node3"].get()); + + Subgraph::Identity id23 = id3; + id23.insert(map["node2"].get()); + + Subgraph* sg; + SubgraphPtrSet set; + + // Extend an empty identity. + sg = set.ExtendParent(id_empty, map["node3"].get()); + EXPECT_THAT(set.size(), Eq(1)); + ASSERT_THAT(sg, Ne(nullptr)); + EXPECT_TRUE(sg->id() == id3); + + // Extend with a node that is already in the parent. + sg = set.ExtendParent(id3, map["node3"].get()); + EXPECT_THAT(set.size(), Eq(1)); + EXPECT_THAT(sg, Eq(nullptr)); + + // Extend to a 2-node subgraph. + sg = set.ExtendParent(id3, map["node2"].get()); + EXPECT_THAT(set.size(), Eq(2)); + ASSERT_THAT(sg, Ne(nullptr)); + EXPECT_TRUE(sg->id() == id23); + + // The second insert of the same node gets ignored. 
+ sg = set.ExtendParent(id3, map["node2"].get()); + EXPECT_THAT(set.size(), Eq(2)); + EXPECT_THAT(sg, Eq(nullptr)); +} + +TEST(SubgraphTest, ExtractForSignature) { + GraphDef graph; + (*graph.add_node()) = MakeNodeConst("node1"); + (*graph.add_node()) = MakeNodeSub("node2", "node3:1", "node3:0"); + auto node3 = graph.add_node(); + *node3 = MakeNodeBroadcastGradientArgs("node3", "node1", "node2"); + node3->add_input("^node1"); + node3->add_input("^node2"); + node3->add_input("^node3"); // The control link goes back to self. + + GenNodeMap map; + ASSERT_THAT(GenNode::BuildGraphInMap(graph, &map), Eq(Status::OK())); + ASSERT_THAT(map.find("node1"), Ne(map.end())); + ASSERT_THAT(map.find("node2"), Ne(map.end())); + ASSERT_THAT(map.find("node3"), Ne(map.end())); + + Subgraph::Identity id; + id.insert(map["node1"].get()); + id.insert(map["node3"].get()); + + Subgraph sg(id); + + SigNodeMap map2; + sg.ExtractForSignature(&map2); + ASSERT_THAT(map2.find("node1"), Ne(map2.end())); + ASSERT_THAT(map2.find("node2"), Eq(map2.end())); + ASSERT_THAT(map2.find("node3"), Ne(map2.end())); + + // clang-format off + EXPECT_THAT(DumpLinkHashMap(map2["node1"]->hash_to_link()), ElementsAre( + "oC:iC: node3", + "o0:i0: node3" + )); + EXPECT_THAT(DumpHashedPeerVector(map2["node1"]->hashed_peers()), ElementsAre( + "node3", + "node3" + )); + EXPECT_THAT(DumpLinkHashMap(map2["node3"]->hash_to_link()), ElementsAre( + "oC:iC: node3", + "iC:oC: node1, node3", + "i0:o0: node1" + )); + EXPECT_THAT(DumpHashedPeerVector(map2["node3"]->hashed_peers()), ElementsAre( + "node3", + "node1", + "node3", + "node1" + )); + // clang-format on +} + +} // end namespace +} // end namespace test +} // end namespace graph_analyzer +} // end namespace grappler +} // end namespace tensorflow diff --git a/tensorflow/core/grappler/graph_analyzer/test_tools.cc b/tensorflow/core/grappler/graph_analyzer/test_tools.cc new file mode 100644 index 0000000000..fc9495bc7d --- /dev/null +++ 
b/tensorflow/core/grappler/graph_analyzer/test_tools.cc @@ -0,0 +1,296 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/grappler/graph_analyzer/test_tools.h" + +#include "absl/strings/str_format.h" +#include "absl/strings/str_join.h" + +namespace tensorflow { +namespace grappler { +namespace graph_analyzer { +namespace test { + +//=== Helper methods to construct the nodes. + +NodeDef MakeNodeConst(const string& name) { + NodeDef n; + n.set_name(name); + n.set_op("Const"); + return n; +} + +NodeDef MakeNode2Arg(const string& name, const string& opcode, + const string& arg1, const string& arg2) { + NodeDef n; + n.set_name(name); + n.set_op(opcode); + n.add_input(arg1); + n.add_input(arg2); + return n; +} + +NodeDef MakeNode4Arg(const string& name, const string& opcode, + const string& arg1, const string& arg2, const string& arg3, + const string& arg4) { + NodeDef n; + n.set_name(name); + n.set_op(opcode); + n.add_input(arg1); + n.add_input(arg2); + n.add_input(arg3); + n.add_input(arg4); + return n; +} + +// Not really a 2-argument but convenient to construct. +NodeDef MakeNodeShapeN(const string& name, const string& arg1, + const string& arg2) { + // This opcode is multi-input but not commutative. + return MakeNode2Arg(name, "ShapeN", arg1, arg2); +} + +// Not really a 2-argument but convenient to construct. 
+NodeDef MakeNodeIdentityN(const string& name, const string& arg1, + const string& arg2) { + // The argument is of a list type. + return MakeNode2Arg(name, "IdentityN", arg1, arg2); +} + +NodeDef MakeNodeQuantizedConcat(const string& name, const string& arg1, + const string& arg2, const string& arg3, + const string& arg4) { + // This opcode has multiple multi-inputs. + return MakeNode4Arg(name, "QuantizedConcat", arg1, arg2, arg3, arg4); +} + +//=== Helper methods for analysing the structures. + +std::vector DumpLinkMap(const GenNode::LinkMap& link_map) { + // This will order the entries first. + std::map ordered; + for (const auto& link : link_map) { + string key = string(link.first); + + // Order the other sides too. They may be repeating, so store them + // in a multiset. + std::multiset others; + for (const auto& other : link.second) { + others.emplace( + absl::StrFormat("%s[%s]", other.node->name(), string(other.port))); + } + ordered[key] = absl::StrJoin(others, ", "); + } + // Now dump the result in a predictable order. + std::vector result; + result.reserve(ordered.size()); + for (const auto& link : ordered) { + result.emplace_back(link.first + ": " + link.second); + } + return result; +} + +std::vector DumpLinkHashMap(const SigNode::LinkHashMap& link_hash_map) { + // The entries in this map are ordered by hash value which might change + // at any point. Re-order them by the link tag. + std::map tags; + for (const auto& entry : link_hash_map) { + tags[entry.second.tag] = entry.first; + } + + std::vector result; + for (const auto& id : tags) { + // For predictability, the nodes need to be sorted. 
+ std::vector nodes; + for (const auto& peer : link_hash_map.at(id.second).peers) { + nodes.emplace_back(peer->name()); + } + std::sort(nodes.begin(), nodes.end()); + result.emplace_back(string(id.first.local) + ":" + string(id.first.remote) + + ": " + absl::StrJoin(nodes, ", ")); + } + return result; +} + +std::vector DumpHashedPeerVector( + const SigNode::HashedPeerVector& hashed_peers) { + std::vector result; + + // Each subset of nodes with the same hash has to be sorted by name. + // Other than that, the vector is already ordered by full tags. + size_t last_hash = 0; + // Index, since iterators may get invalidated on append. + size_t subset_start = 0; + + for (const auto& entry : hashed_peers) { + if (entry.link_hash != last_hash) { + std::sort(result.begin() + subset_start, result.end()); + subset_start = result.size(); + } + result.emplace_back(entry.peer->name()); + } + std::sort(result.begin() + subset_start, result.end()); + + return result; +} + +TestGraphs::TestGraphs() { + { + GraphDef& graph = graph_3n_self_control_; + // The topology includes a loop and a link to self. + (*graph.add_node()) = MakeNodeConst("node1"); + (*graph.add_node()) = MakeNodeSub("node2", "node3:1", "node3:0"); + auto node3 = graph.add_node(); + *node3 = MakeNodeBroadcastGradientArgs("node3", "node1", "node2"); + node3->add_input("^node3"); // The control link goes back to self. + } + { + GraphDef& graph = graph_multi_input_; + // The topology includes a loop and a link to self. + (*graph.add_node()) = MakeNodeConst("const1_1"); + (*graph.add_node()) = MakeNodeConst("const1_2"); + (*graph.add_node()) = MakeNodeAddN("add1", "const1_1", "const1_2"); + + (*graph.add_node()) = MakeNodeConst("const2_1"); + (*graph.add_node()) = MakeNodeConst("const2_2"); + (*graph.add_node()) = MakeNodeConst("const2_3"); + + auto add2 = graph.add_node(); + *add2 = MakeNodeAddN("add2", "const2_1", "const2_2"); + // The 3rd node is connected twice, to 4 links total. 
+ add2->add_input("const2_3"); + add2->add_input("const2_3"); + + (*graph.add_node()) = MakeNodeSub("sub", "add1", "add2"); + } + { + GraphDef& graph = graph_all_or_none_; + // The topology includes a loop and a link to self. + (*graph.add_node()) = MakeNodeConst("const1_1"); + (*graph.add_node()) = MakeNodeConst("const1_2"); + auto pass1 = graph.add_node(); + *pass1 = MakeNodeIdentityN("pass1", "const1_1", "const1_2"); + + (*graph.add_node()) = MakeNodeConst("const2_1"); + (*graph.add_node()) = MakeNodeConst("const2_2"); + (*graph.add_node()) = MakeNodeConst("const2_3"); + + auto pass2 = graph.add_node(); + *pass2 = MakeNodeIdentityN("pass2", "const2_1", "const2_2"); + // The 3rd node is connected twice, to 4 links total. + pass2->add_input("const2_3"); + pass2->add_input("const2_3"); + + // Add the control links, they get handled separately than the normal + // links. + pass1->add_input("^const2_1"); + pass1->add_input("^const2_2"); + pass1->add_input("^const2_3"); + + (*graph.add_node()) = MakeNodeSub("sub", "pass1", "pass2"); + } + { + GraphDef& graph = graph_circular_onedir_; + (*graph.add_node()) = MakeNodeMul("node1", "node5", "node5"); + (*graph.add_node()) = MakeNodeMul("node2", "node1", "node1"); + (*graph.add_node()) = MakeNodeMul("node3", "node2", "node2"); + (*graph.add_node()) = MakeNodeMul("node4", "node3", "node3"); + (*graph.add_node()) = MakeNodeMul("node5", "node4", "node4"); + } + { + GraphDef& graph = graph_circular_bidir_; + // The left and right links are intentionally mixed up. 
+ (*graph.add_node()) = MakeNodeMul("node1", "node5", "node2"); + (*graph.add_node()) = MakeNodeMul("node2", "node3", "node1"); + (*graph.add_node()) = MakeNodeMul("node3", "node2", "node4"); + (*graph.add_node()) = MakeNodeMul("node4", "node5", "node3"); + (*graph.add_node()) = MakeNodeMul("node5", "node4", "node1"); + } + { + GraphDef& graph = graph_linear_; + (*graph.add_node()) = MakeNodeConst("node1"); + (*graph.add_node()) = MakeNodeMul("node2", "node1", "node1"); + (*graph.add_node()) = MakeNodeMul("node3", "node2", "node2"); + (*graph.add_node()) = MakeNodeMul("node4", "node3", "node3"); + (*graph.add_node()) = MakeNodeMul("node5", "node4", "node4"); + } + { + GraphDef& graph = graph_cross_; + (*graph.add_node()) = MakeNodeConst("node1"); + (*graph.add_node()) = MakeNodeMul("node2", "node1", "node1"); + (*graph.add_node()) = MakeNodeConst("node3"); + (*graph.add_node()) = MakeNodeMul("node4", "node3", "node3"); + (*graph.add_node()) = MakeNodeConst("node5"); + (*graph.add_node()) = MakeNodeMul("node6", "node5", "node5"); + (*graph.add_node()) = MakeNodeConst("node7"); + (*graph.add_node()) = MakeNodeMul("node8", "node7", "node7"); + + auto center = graph.add_node(); + *center = MakeNodeMul("node9", "node2", "node4"); + center->add_input("node6"); + center->add_input("node8"); + } + { + GraphDef& graph = graph_small_cross_; + (*graph.add_node()) = MakeNodeConst("node1"); + (*graph.add_node()) = MakeNodeConst("node2"); + (*graph.add_node()) = MakeNodeConst("node3"); + (*graph.add_node()) = MakeNodeConst("node4"); + + auto center = graph.add_node(); + *center = MakeNodeMul("node5", "node1", "node2"); + center->add_input("node3"); + center->add_input("node4"); + } + { + GraphDef& graph = graph_for_link_order_; + (*graph.add_node()) = MakeNodeConst("node1"); + (*graph.add_node()) = MakeNodeConst("node2"); + (*graph.add_node()) = MakeNodeConst("node3"); + (*graph.add_node()) = MakeNodeConst("node4"); + + // One group of equivalent links. 
+ auto center = graph.add_node(); + *center = MakeNodeMul("node5", "node1", "node2"); + center->add_input("node3"); + center->add_input("node4"); + + // Multiple groups, separated by unique links. + auto center2 = graph.add_node(); + *center2 = MakeNodeMul("node6", "node1", "node2"); + center2->add_input("node2:1"); + center2->add_input("node3:2"); + center2->add_input("node4:2"); + center2->add_input("node4:3"); + } + { + GraphDef& graph = graph_sun_; + (*graph.add_node()) = MakeNodeConst("node1"); + (*graph.add_node()) = MakeNodeConst("node2"); + (*graph.add_node()) = MakeNodeConst("node3"); + (*graph.add_node()) = MakeNodeConst("node4"); + (*graph.add_node()) = MakeNodeConst("node5"); + (*graph.add_node()) = MakeNodeSub("node6", "node1", "node10"); + (*graph.add_node()) = MakeNodeSub("node7", "node2", "node6"); + (*graph.add_node()) = MakeNodeSub("node8", "node3", "node7"); + (*graph.add_node()) = MakeNodeSub("node9", "node4", "node8"); + (*graph.add_node()) = MakeNodeSub("node10", "node5", "node9"); + } +} + +} // end namespace test +} // end namespace graph_analyzer +} // end namespace grappler +} // end namespace tensorflow diff --git a/tensorflow/core/grappler/graph_analyzer/test_tools.h b/tensorflow/core/grappler/graph_analyzer/test_tools.h new file mode 100644 index 0000000000..98e269d57e --- /dev/null +++ b/tensorflow/core/grappler/graph_analyzer/test_tools.h @@ -0,0 +1,120 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_TEST_TOOLS_H_ +#define TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_TEST_TOOLS_H_ + +#include +#include + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/grappler/graph_analyzer/gen_node.h" +#include "tensorflow/core/grappler/graph_analyzer/sig_node.h" +#include "tensorflow/core/grappler/op_types.h" + +namespace tensorflow { +namespace grappler { +namespace graph_analyzer { +namespace test { + +//=== Helper methods to construct the nodes. + +NodeDef MakeNodeConst(const string& name); + +NodeDef MakeNode2Arg(const string& name, const string& opcode, + const string& arg1, const string& arg2); + +NodeDef MakeNode4Arg(const string& name, const string& opcode, + const string& arg1, const string& arg2, const string& arg3, + const string& arg4); + +inline NodeDef MakeNodeMul(const string& name, const string& arg1, + const string& arg2) { + return MakeNode2Arg(name, "Mul", arg1, arg2); +} + +// Not really a 2-argument but convenient to construct. +inline NodeDef MakeNodeAddN(const string& name, const string& arg1, + const string& arg2) { + return MakeNode2Arg(name, "AddN", arg1, arg2); +} + +inline NodeDef MakeNodeSub(const string& name, const string& arg1, + const string& arg2) { + return MakeNode2Arg(name, "Sub", arg1, arg2); +} + +// Has 2 honest outputs. 
+inline NodeDef MakeNodeBroadcastGradientArgs(const string& name, + const string& arg1, + const string& arg2) { + return MakeNode2Arg(name, "BroadcastGradientArgs", arg1, arg2); +} + +NodeDef MakeNodeShapeN(const string& name, const string& arg1, + const string& arg2); + +NodeDef MakeNodeIdentityN(const string& name, const string& arg1, + const string& arg2); + +NodeDef MakeNodeQuantizedConcat(const string& name, const string& arg1, + const string& arg2, const string& arg3, + const string& arg4); + +//=== A container of pre-constructed graphs. + +class TestGraphs { + public: + TestGraphs(); + + // Graph with 3 nodes and a control link to self (which is not valid in + // reality but adds excitement to the tests). + GraphDef graph_3n_self_control_; + // Graph that has the multi-input links. + GraphDef graph_multi_input_; + // Graph that has the all-or-none nodes. + GraphDef graph_all_or_none_; + // All the nodes are connected in a circle that goes in one direction. + GraphDef graph_circular_onedir_; + // All the nodes are connected in a circle that goes in both directions. + GraphDef graph_circular_bidir_; + // The nodes are connected in a line. + GraphDef graph_linear_; + // The nodes are connected in a cross shape. + GraphDef graph_cross_; + GraphDef graph_small_cross_; + // For testing the ordering of links at the end of signature generation, + // a variation of a cross. + GraphDef graph_for_link_order_; + // Sun-shaped, a ring with "rays". + GraphDef graph_sun_; +}; + +//=== Helper methods for analysing the structures. + +std::vector DumpLinkMap(const GenNode::LinkMap& link_map); + +// Also checks for the consistency of hash values. 
+std::vector DumpLinkHashMap(const SigNode::LinkHashMap& link_hash_map); + +std::vector DumpHashedPeerVector( + const SigNode::HashedPeerVector& hashed_peers); + +} // end namespace test +} // end namespace graph_analyzer +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_GRAPH_ANALYZER_TEST_TOOLS_H_ diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index f62a927925..5af6437c56 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -3777,6 +3777,7 @@ tf_py_wrap_cc( "framework/python_op_gen.i", "grappler/cluster.i", "grappler/cost_analyzer.i", + "grappler/graph_analyzer.i", "grappler/item.i", "grappler/model_analyzer.i", "grappler/tf_optimizer.i", @@ -3835,6 +3836,7 @@ tf_py_wrap_cc( "//tensorflow/core/grappler/clusters:single_machine", "//tensorflow/core/grappler/clusters:virtual_cluster", "//tensorflow/core/grappler/costs:graph_memory", + "//tensorflow/core/grappler/graph_analyzer:graph_analyzer_tool", "//tensorflow/core/grappler/optimizers:meta_optimizer", "//tensorflow/core:lib", "//tensorflow/core:reader_base", @@ -5536,6 +5538,18 @@ py_test( ], ) +py_binary( + name = "graph_analyzer", + srcs = [ + "grappler/graph_analyzer.py", + ], + srcs_version = "PY2AND3", + deps = [ + ":framework_for_generated_wrappers", + ":pywrap_tensorflow_internal", + ], +) + pyx_library( name = "framework_fast_tensor_util", srcs = ["framework/fast_tensor_util.pyx"], diff --git a/tensorflow/python/grappler/graph_analyzer.i b/tensorflow/python/grappler/graph_analyzer.i new file mode 100644 index 0000000000..cc7b5358eb --- /dev/null +++ b/tensorflow/python/grappler/graph_analyzer.i @@ -0,0 +1,26 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +%{ +#include "tensorflow/core/grappler/graph_analyzer/graph_analyzer_tool.h" +%} + +%{ +void GraphAnalyzer(const string& file_path, int n) { + tensorflow::grappler::graph_analyzer::GraphAnalyzerTool(file_path, n); +} +%} + +void GraphAnalyzer(const string& file_path, int n); diff --git a/tensorflow/python/grappler/graph_analyzer.py b/tensorflow/python/grappler/graph_analyzer.py new file mode 100644 index 0000000000..ec5544e38e --- /dev/null +++ b/tensorflow/python/grappler/graph_analyzer.py @@ -0,0 +1,46 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= +"""A tool that finds all subgraphs of a given size in a TF graph. + +The subgraph patterns are sorted by occurrence, and only the transitive fanin +part of the graph with regard to the fetch nodes is considered. 
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import sys + +from tensorflow.python import pywrap_tensorflow as tf_wrap +from tensorflow.python.platform import app + + +def main(_): + tf_wrap.GraphAnalyzer(FLAGS.input, FLAGS.n) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--input", + type=str, + default=None, + help="Input file path for a TensorFlow MetaGraphDef.") + parser.add_argument( + "--n", type=int, default=None, help="The size of the subgraphs.") + FLAGS, unparsed = parser.parse_known_args() + app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/tensorflow/python/tensorflow.i b/tensorflow/python/tensorflow.i index 26e8acd897..39174fa589 100644 --- a/tensorflow/python/tensorflow.i +++ b/tensorflow/python/tensorflow.i @@ -54,4 +54,5 @@ limitations under the License. %include "tensorflow/python/grappler/item.i" %include "tensorflow/python/grappler/tf_optimizer.i" %include "tensorflow/python/grappler/cost_analyzer.i" +%include "tensorflow/python/grappler/graph_analyzer.i" %include "tensorflow/python/grappler/model_analyzer.i" -- GitLab From df6c8721f8be706291a8151d725de4435942f7e2 Mon Sep 17 00:00:00 2001 From: Justin Lebar Date: Mon, 27 Aug 2018 15:20:48 -0700 Subject: [PATCH 182/598] [XLA] Use EXPECT rather than ASSERT where appropriate in literal_test.cc PiperOrigin-RevId: 210441200 --- tensorflow/compiler/xla/literal_test.cc | 74 ++++++++++++------------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/tensorflow/compiler/xla/literal_test.cc b/tensorflow/compiler/xla/literal_test.cc index aef87e46d8..e08a9d6e41 100644 --- a/tensorflow/compiler/xla/literal_test.cc +++ b/tensorflow/compiler/xla/literal_test.cc @@ -99,42 +99,42 @@ class LiteralUtilTest : public ::testing::Test { TEST_F(LiteralUtilTest, LiteralScalarToString) { auto true_lit = LiteralUtil::CreateR0(true); - ASSERT_EQ("true", 
true_lit->ToString()); + EXPECT_EQ("true", true_lit->ToString()); auto false_lit = LiteralUtil::CreateR0(false); - ASSERT_EQ("false", false_lit->ToString()); + EXPECT_EQ("false", false_lit->ToString()); auto u32_lit = LiteralUtil::CreateR0(42); - ASSERT_EQ("42", u32_lit->ToString()); + EXPECT_EQ("42", u32_lit->ToString()); auto s32_lit = LiteralUtil::CreateR0(-999); - ASSERT_EQ("-999", s32_lit->ToString()); + EXPECT_EQ("-999", s32_lit->ToString()); auto f32_lit = LiteralUtil::CreateR0(3.14f); - ASSERT_EQ("3.14", f32_lit->ToString()); + EXPECT_EQ("3.14", f32_lit->ToString()); auto f16_lit = LiteralUtil::CreateR0(static_cast(0.5f)); - ASSERT_EQ("0.5", f16_lit->ToString()); + EXPECT_EQ("0.5", f16_lit->ToString()); auto c64_lit = LiteralUtil::CreateR0({3.14f, 2.78f}); - ASSERT_EQ("(3.14, 2.78)", c64_lit->ToString()); + EXPECT_EQ("(3.14, 2.78)", c64_lit->ToString()); auto bf16_lit = LiteralUtil::CreateR0(static_cast(0.5f)); - ASSERT_EQ("0.5", bf16_lit->ToString()); + EXPECT_EQ("0.5", bf16_lit->ToString()); // 3.14 will be truncated to 3.125 in bfloat16 format. 
auto bf16_lit_truncated = LiteralUtil::CreateR0(static_cast(3.14f)); - ASSERT_EQ("3.125", bf16_lit_truncated->ToString()); + EXPECT_EQ("3.125", bf16_lit_truncated->ToString()); auto bf16_lit_truncated2 = LiteralUtil::CreateR0(static_cast(9.001f)); - ASSERT_EQ("9", bf16_lit_truncated2->ToString()); + EXPECT_EQ("9", bf16_lit_truncated2->ToString()); } TEST_F(LiteralUtilTest, LiteralVectorToString) { auto pred_vec = LiteralUtil::CreateR1({true, false, true}); - ASSERT_EQ("{101}", pred_vec->ToString()); + EXPECT_EQ("{101}", pred_vec->ToString()); } TEST_F(LiteralUtilTest, R2ToString) { @@ -144,7 +144,7 @@ TEST_F(LiteralUtilTest, R2ToString) { { 3, 4 }, { 5, 6 } })"; - ASSERT_EQ(expected, literal->ToString()); + EXPECT_EQ(expected, literal->ToString()); } TEST_F(LiteralUtilTest, R3ToString) { @@ -158,7 +158,7 @@ TEST_F(LiteralUtilTest, R3ToString) { { { 5 }, { 6 } } })"; - ASSERT_EQ(expected, literal->ToString()); + EXPECT_EQ(expected, literal->ToString()); } TEST_F(LiteralUtilTest, TupleToString) { @@ -172,7 +172,7 @@ f32[2,2] { { 3, 4 } } ))"; - ASSERT_EQ(expected, tuple->ToString()); + EXPECT_EQ(expected, tuple->ToString()); } TEST_F(LiteralUtilTest, CreateR3FromArray3d) { @@ -198,7 +198,7 @@ TEST_F(LiteralUtilTest, CreateR3FromArray3d) { { 9, 10 }, { 11, 12 } } })"; - ASSERT_EQ(expected, result); + EXPECT_EQ(expected, result); } TEST_F(LiteralUtilTest, CreateSparse) { @@ -251,7 +251,7 @@ TEST_F(LiteralUtilTest, LiteralR4F32ProjectedStringifies) { } } })"; - ASSERT_EQ(expected, result); + EXPECT_EQ(expected, result); } TEST_F(LiteralUtilTest, LiteralR4F32Stringifies) { @@ -284,7 +284,7 @@ TEST_F(LiteralUtilTest, LiteralR4F32Stringifies) { } } })"; - ASSERT_EQ(expected, result); + EXPECT_EQ(expected, result); } TEST_F(LiteralUtilTest, EachCellR2F32) { @@ -1039,7 +1039,7 @@ TEST_F(LiteralUtilTest, CopyFromDifferentShapes) { auto vector = LiteralUtil::CreateR1({5.0, 7.0}); Status status = matrix->CopyFrom(*vector); ASSERT_FALSE(status.ok()); - 
ASSERT_THAT(status.error_message(), + EXPECT_THAT(status.error_message(), HasSubstr("Destination subshape incompatible")); } @@ -1394,10 +1394,10 @@ TEST_F(LiteralUtilTest, CopyFromProto_f16) { Literal::CreateFromProto(p)); auto r = literal->data(); ASSERT_EQ(4, r.size()); - ASSERT_EQ(h1, r[0]); - ASSERT_EQ(h2, r[1]); - ASSERT_EQ(h2, r[2]); - ASSERT_EQ(h1, r[3]); + EXPECT_EQ(h1, r[0]); + EXPECT_EQ(h2, r[1]); + EXPECT_EQ(h2, r[2]); + EXPECT_EQ(h1, r[3]); } TEST_F(LiteralUtilTest, LiteralSliceTest) { @@ -1580,7 +1580,7 @@ TEST_F(LiteralUtilTest, MoveIntoTuple) { TEST_F(LiteralUtilTest, MoveIntoEmptyTuple) { Literal literal = Literal::MoveIntoTuple({}); ASSERT_TRUE(ShapeUtil::IsTuple(literal.shape())); - ASSERT_EQ(ShapeUtil::TupleElementCount(literal.shape()), 0); + EXPECT_EQ(ShapeUtil::TupleElementCount(literal.shape()), 0); } TEST_F(LiteralUtilTest, LiteralMoveAssignment) { @@ -1693,7 +1693,7 @@ TEST_F(LiteralUtilTest, InvalidProtoNoValues) { *proto.mutable_shape() = ShapeUtil::MakeShape(F32, {3}); Status status = Literal::CreateFromProto(proto).status(); ASSERT_FALSE(status.ok()); - ASSERT_THAT(status.error_message(), + EXPECT_THAT(status.error_message(), HasSubstr("Expected 3 elements in LiteralProto")); } @@ -1705,7 +1705,7 @@ TEST_F(LiteralUtilTest, InvalidProtoNoShape) { proto.add_preds(false); Status status = Literal::CreateFromProto(proto).status(); ASSERT_FALSE(status.ok()); - ASSERT_THAT(status.error_message(), HasSubstr("LiteralProto has no shape")); + EXPECT_THAT(status.error_message(), HasSubstr("LiteralProto has no shape")); } TEST_F(LiteralUtilTest, InvalidProtoWrongContainer) { @@ -1717,7 +1717,7 @@ TEST_F(LiteralUtilTest, InvalidProtoWrongContainer) { proto.add_preds(false); Status status = Literal::CreateFromProto(proto).status(); ASSERT_FALSE(status.ok()); - ASSERT_THAT(status.error_message(), + EXPECT_THAT(status.error_message(), HasSubstr("Expected 3 elements in LiteralProto")); } @@ -1730,7 +1730,7 @@ TEST_F(LiteralUtilTest, 
InvalidProtoTooFewValues) { proto.add_f32s(3.0); Status status = Literal::CreateFromProto(proto).status(); ASSERT_FALSE(status.ok()); - ASSERT_THAT(status.error_message(), + EXPECT_THAT(status.error_message(), HasSubstr("Expected 84 elements in LiteralProto")); } @@ -1743,7 +1743,7 @@ TEST_F(LiteralUtilTest, InvalidProtoTooManyValues) { proto.add_s32s(100); Status status = Literal::CreateFromProto(proto).status(); ASSERT_FALSE(status.ok()); - ASSERT_THAT(status.error_message(), + EXPECT_THAT(status.error_message(), HasSubstr("Expected 2 elements in LiteralProto")); } @@ -1758,7 +1758,7 @@ TEST_F(LiteralUtilTest, InvalidProtoMissingLayout) { proto.add_preds(false); Status status = Literal::CreateFromProto(proto).status(); ASSERT_FALSE(status.ok()); - ASSERT_THAT(status.error_message(), HasSubstr("LiteralProto has no layout")); + EXPECT_THAT(status.error_message(), HasSubstr("LiteralProto has no layout")); } TEST_F(LiteralUtilTest, InvalidProtoTooFewTupleElements) { @@ -1774,7 +1774,7 @@ TEST_F(LiteralUtilTest, InvalidProtoTooFewTupleElements) { Status status = Literal::CreateFromProto(proto).status(); ASSERT_FALSE(status.ok()); - ASSERT_THAT(status.error_message(), HasSubstr("Expected 2 tuple elements")); + EXPECT_THAT(status.error_message(), HasSubstr("Expected 2 tuple elements")); } TEST_F(LiteralUtilTest, InvalidProtoTooManyTupleElements) { @@ -1797,7 +1797,7 @@ TEST_F(LiteralUtilTest, InvalidProtoTooManyTupleElements) { Status status = Literal::CreateFromProto(proto).status(); ASSERT_FALSE(status.ok()); - ASSERT_THAT(status.error_message(), HasSubstr("Expected 2 tuple elements")); + EXPECT_THAT(status.error_message(), HasSubstr("Expected 2 tuple elements")); } TEST_F(LiteralUtilTest, SortSparseElements) { @@ -1807,7 +1807,7 @@ TEST_F(LiteralUtilTest, SortSparseElements) { literal->AppendSparseElement({3, 4, 5}, 3.0); literal->AppendSparseElement({1, 2, 3}, 1.0); literal->SortSparseElements(); - ASSERT_EQ(literal->ToString(false), + 
EXPECT_EQ(literal->ToString(false), "f32[10,10,10]{[1, 2, 3]: 1, [2, 3, 4]: 2, [3, 4, 5]: 3}"); } @@ -1815,22 +1815,22 @@ TEST_F(LiteralUtilTest, GetSparseElementAsString) { std::vector dimensions = {10, 10, 10}; SparseIndexArray indices(10, {{1, 2, 3}, {2, 3, 4}, {3, 4, 5}}); - ASSERT_EQ( + EXPECT_EQ( LiteralUtil::CreateSparse(dimensions, indices, {true, false, true}) ->GetSparseElementAsString(1), "false"); - ASSERT_EQ(LiteralUtil::CreateSparse(dimensions, indices, {1, 2, 3}) + EXPECT_EQ(LiteralUtil::CreateSparse(dimensions, indices, {1, 2, 3}) ->GetSparseElementAsString(1), absl::StrCat(int64{2})); - ASSERT_EQ( + EXPECT_EQ( LiteralUtil::CreateSparse(dimensions, indices, {1.0, 2.0, 3.0}) ->GetSparseElementAsString(1), absl::StrCat(double{2.0})); - ASSERT_EQ(LiteralUtil::CreateSparse(dimensions, indices, + EXPECT_EQ(LiteralUtil::CreateSparse(dimensions, indices, {half{1.0}, half{2.0}, half{3.0}}) ->GetSparseElementAsString(1), absl::StrCat(static_cast(half{2.0}))); - ASSERT_EQ(LiteralUtil::CreateSparse( + EXPECT_EQ(LiteralUtil::CreateSparse( dimensions, indices, std::vector{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}}) ->GetSparseElementAsString(1), -- GitLab From fc492c08d64f05b1d68beedf11a3b55bf8066a8b Mon Sep 17 00:00:00 2001 From: Igor Ganichev Date: Mon, 27 Aug 2018 15:29:56 -0700 Subject: [PATCH 183/598] Support returning resource handles from function in XLA There are a couple of reasons to do this: - resource handle are regular tensors part of a public API that can potentially be returned from a function. - When tfe.defun is executed under GradientTape, it generates a function returning resource handles in certain cases. This CL adds support for returning resource handles from an XLA compiled function. These resource handles must have been passed as arguments to the function. In other words, we don't yet support returning resources created inside the function. tfe.defun never makes functions that create resources. 
PiperOrigin-RevId: 210442856 --- .../compiler/jit/create_xla_launch_op.cc | 7 +- tensorflow/compiler/jit/xla_launch_util.cc | 47 ++++---- tensorflow/compiler/tests/eager_test.py | 98 +++++++++++++++++ tensorflow/compiler/tf2xla/graph_compiler.cc | 1 + .../compiler/tf2xla/kernels/retval_op.cc | 13 ++- tensorflow/compiler/tf2xla/xla_compiler.cc | 9 +- tensorflow/compiler/tf2xla/xla_compiler.h | 6 + .../compiler/tf2xla/xla_compiler_test.cc | 103 ++++++++++++++++-- tensorflow/compiler/tf2xla/xla_context.cc | 13 +++ tensorflow/compiler/tf2xla/xla_context.h | 3 + 10 files changed, 263 insertions(+), 37 deletions(-) diff --git a/tensorflow/compiler/jit/create_xla_launch_op.cc b/tensorflow/compiler/jit/create_xla_launch_op.cc index a7f8a5613c..56b034a30b 100644 --- a/tensorflow/compiler/jit/create_xla_launch_op.cc +++ b/tensorflow/compiler/jit/create_xla_launch_op.cc @@ -209,8 +209,13 @@ Status CreateXlaLaunchOp(FunctionLibraryRuntime* flr, const NodeDef& node_def, // device memory. // XlaLaunch kernel keeps all outputs (including constants, which it copies), - // in device memory + // in device memory except for resources. MemoryTypeVector output_memory_types(fbody->ret_types.size(), DEVICE_MEMORY); + for (int i = 0; i < fbody->ret_types.size(); ++i) { + if (fbody->ret_types[i] == DT_RESOURCE) { + output_memory_types[i] = HOST_MEMORY; + } + } // Create the kernel. 
NameAttrList function; diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc index 2ffce9298d..affeab4a8c 100644 --- a/tensorflow/compiler/jit/xla_launch_util.cc +++ b/tensorflow/compiler/jit/xla_launch_util.cc @@ -271,31 +271,36 @@ Status XlaComputationLaunchContext::PopulateOutputs( } } else { const TensorShape& shape = kernel->outputs[i].shape; - VLOG(2) << "Retval " << i << " shape " << shape.DebugString(); - - se::DeviceMemoryBase buffer = output.buffer({output_num}); - if (allocate_xla_tensors_) { - Tensor* output_tensor; - TF_RETURN_IF_ERROR(ctx->allocate_output(i, shape, &output_tensor)); - XlaTensor* xla_tensor = XlaTensor::FromTensor(output_tensor); - if (xla_tensor) { - xla_tensor->set_shaped_buffer(ScopedShapedBuffer( - ExtractSubShapedBuffer(&output, output_num, xla_allocator_))); - if (use_multiple_streams_) { - xla_tensor->SetDefinedOn(stream, definition_event); + const DataType& type = kernel->outputs[i].type; + VLOG(2) << "Retval " << i << " shape " << shape.DebugString() << " type " + << DataTypeString(type); + if (type == DT_RESOURCE) { + ctx->set_output(i, ctx->input(kernel->outputs[i].input_index)); + } else { + se::DeviceMemoryBase buffer = output.buffer({output_num}); + if (allocate_xla_tensors_) { + Tensor* output_tensor; + TF_RETURN_IF_ERROR(ctx->allocate_output(i, shape, &output_tensor)); + XlaTensor* xla_tensor = XlaTensor::FromTensor(output_tensor); + if (xla_tensor) { + xla_tensor->set_shaped_buffer(ScopedShapedBuffer( + ExtractSubShapedBuffer(&output, output_num, xla_allocator_))); + if (use_multiple_streams_) { + xla_tensor->SetDefinedOn(stream, definition_event); + } + } else { + // xla_tensor wasn't valid, which must mean this is a zero-element + // tensor. + CHECK_EQ(output_tensor->TotalBytes(), 0); } } else { - // xla_tensor wasn't valid, which must mean this is a zero-element - // tensor. 
- CHECK_EQ(output_tensor->TotalBytes(), 0); + Tensor output_tensor = XlaTensorBuffer::MakeTensor( + ctx->expected_output_dtype(i), shape, buffer, allocator); + output.set_buffer(xla::OwningDeviceMemory(), {output_num}); + ctx->set_output(i, output_tensor); } - } else { - Tensor output_tensor = XlaTensorBuffer::MakeTensor( - ctx->expected_output_dtype(i), shape, buffer, allocator); - output.set_buffer(xla::OwningDeviceMemory(), {output_num}); - ctx->set_output(i, output_tensor); + ++output_num; } - ++output_num; } if (VLOG_IS_ON(3)) { diff --git a/tensorflow/compiler/tests/eager_test.py b/tensorflow/compiler/tests/eager_test.py index e32f3d4b7f..63cee550fd 100644 --- a/tensorflow/compiler/tests/eager_test.py +++ b/tensorflow/compiler/tests/eager_test.py @@ -351,6 +351,38 @@ class EagerFunctionTest(xla_test.XLATestCase): var = f(v) self.assertEqual(2.0, var.numpy()) + def testReturnResourceHandle(self): + with self.test_scope(): + v = resource_variable_ops.ResourceVariable([[1.0, 2.0], [3.0, 4.0]]) + + def f(v): + return v.handle + + f = function.defun(f) + handle = f(v) + self.assertAllEqual(v.numpy(), + resource_variable_ops.read_variable_op( + handle, dtypes.float32).numpy()) + + def testReturnMultipleResourceHandles(self): + with self.test_scope(): + v1 = resource_variable_ops.ResourceVariable(1.25) + v2 = resource_variable_ops.ResourceVariable(2.0) + + def f(v): + return v.handle, 3.0 * v, v2.handle, v + v2 + + f = function.defun(f) + v1_handle, v1_times_3, v2_handle, variable_sum = f(v1) + self.assertAllEqual(v1.numpy(), + resource_variable_ops.read_variable_op( + v1_handle, dtypes.float32).numpy()) + self.assertEqual(3.75, v1_times_3.numpy()) + self.assertAllEqual(v2.numpy(), + resource_variable_ops.read_variable_op( + v2_handle, dtypes.float32).numpy()) + self.assertEqual(3.25, variable_sum.numpy()) + def testAllArgumentKinds(self): """Test a complex function that takes different argument kinds. 
@@ -457,6 +489,72 @@ class EagerFunctionTest(xla_test.XLATestCase): y = two_x_plus_1(x) self.assertAllEqual([5, 7, 9], y.numpy()) + def testNestedDefunWithVariable(self): + with self.test_scope(): + v0 = resource_variable_ops.ResourceVariable(5.0) + + @function.defun + def g(x): + x = v0 * x + return x + + @function.defun + def f(x): + x = g(v0 * x) + return x + + x = constant_op.constant(3.0) + y = f(x) + + self.assertEqual(75, y.numpy()) + + def testNestedDefunInGradientTape(self): + with self.test_scope(): + v0 = resource_variable_ops.ResourceVariable(5.0) + + @function.defun + def g(x): + x = v0 * x + return x + + @function.defun + def f(x): + x = g(v0 * x) + return x + + x = constant_op.constant(3.0) + with backprop.GradientTape() as tape: + y = f(x) + dy = tape.gradient(y, v0) + + self.assertEqual(75, y.numpy()) + self.assertEqual(30, dy.numpy()) + + def testNestedDefunInGradientTapeDifferentVars(self): + with self.test_scope(): + v0 = resource_variable_ops.ResourceVariable(5.0) + v1 = resource_variable_ops.ResourceVariable(3.0) + + @function.defun + def g(x): + x = v1 * x + return x + + @function.defun + def f(x): + x = g(v0 * x) + return x + + x = constant_op.constant(3.0) + with backprop.GradientTape(persistent=True) as tape: + y = f(x) + dy_v0 = tape.gradient(y, v0) + dy_v1 = tape.gradient(y, v1) + + self.assertEqual(45, y.numpy()) + self.assertEqual(9, dy_v0.numpy()) + self.assertEqual(15, dy_v1.numpy()) + class ExcessivePaddingTest(xla_test.XLATestCase): """Test that eager execution works with TPU flattened tensors. 
diff --git a/tensorflow/compiler/tf2xla/graph_compiler.cc b/tensorflow/compiler/tf2xla/graph_compiler.cc index ba37ed3337..1ed1fb3b02 100644 --- a/tensorflow/compiler/tf2xla/graph_compiler.cc +++ b/tensorflow/compiler/tf2xla/graph_compiler.cc @@ -146,6 +146,7 @@ Status GraphCompiler::Compile() { } OpKernelContext op_context(¶ms, n->num_outputs()); + VLOG(3) << "Translating " << params.op_kernel->name(); if (IsFunctional(n)) { TF_RETURN_IF_ERROR(CompileFunctionalNode(n, &op_context)); } else { diff --git a/tensorflow/compiler/tf2xla/kernels/retval_op.cc b/tensorflow/compiler/tf2xla/kernels/retval_op.cc index 64900e4709..e172c64932 100644 --- a/tensorflow/compiler/tf2xla/kernels/retval_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/retval_op.cc @@ -48,6 +48,15 @@ class RetvalOp : public XlaOpKernel { } else { xla::XlaOp input = ctx->Input(0); const TensorShape input_shape = ctx->InputShape(0); + DataType input_type = ctx->input_type(0); + XlaContext& tc = XlaContext::Get(ctx); + + if (input_type == DT_RESOURCE) { + XlaResource* resource; + OP_REQUIRES_OK(ctx, ctx->GetResourceInput(0, &resource)); + ctx->SetStatus(tc.AddResourceRetval(index_, resource)); + return; + } auto is_constant = ctx->builder()->IsConstant(input); if (!is_constant.ok()) { @@ -55,7 +64,6 @@ class RetvalOp : public XlaOpKernel { return; } - XlaContext& tc = XlaContext::Get(ctx); if (tc.resolve_compile_time_constants() && (input_shape.num_elements() == 0 || is_constant.ValueOrDie())) { xla::Literal literal; @@ -104,7 +112,8 @@ class RetvalOp : public XlaOpKernel { TF_DISALLOW_COPY_AND_ASSIGN(RetvalOp); }; -REGISTER_XLA_OP(Name("_Retval").CompilationOnly(), RetvalOp); +REGISTER_XLA_OP(Name("_Retval").AllowResourceTypes().CompilationOnly(), + RetvalOp); } // anonymous namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc index eabfc6b6e2..aa2a521d98 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.cc +++ 
b/tensorflow/compiler/tf2xla/xla_compiler.cc @@ -361,6 +361,9 @@ Status BuildComputation( if (retval.has_constant_value()) { output.is_constant = true; output.constant_value = retval.constant_value(); + } else if (retval.resource() != nullptr) { + output.is_constant = false; + output.input_index = retval.resource()->arg_num(); } else { output.is_constant = false; elems.push_back(retval.handle()); @@ -495,7 +498,8 @@ Status XlaCompiler::BuildArguments( arg_expression.set_constant_value(arg.constant_value); break; case XlaCompiler::Argument::kInvalid: - return errors::Internal("Unreachable case in BuildArguments()"); + return errors::Internal( + "Unreachable case in BuildArguments() while filling constant args"); } } @@ -615,7 +619,8 @@ Status XlaCompiler::BuildArguments( break; case XlaCompiler::Argument::kConstant: case XlaCompiler::Argument::kInvalid: - return errors::Internal("Unreachable case in BuildArguments()"); + return errors::Internal( + "Unreachable case in BuildArguments() while filling handles"); } } diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h index da1ae02f32..9e2c64fd42 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.h +++ b/tensorflow/compiler/tf2xla/xla_compiler.h @@ -183,6 +183,8 @@ class XlaCompiler { struct OutputDescription { // Type and shape of the output. The shape is the unflattened shape. + // When `type` is DT_RESOURCE, `shape` is the shape of the resource + // variable's value. DataType type; TensorShape shape; @@ -190,6 +192,10 @@ class XlaCompiler { // 'Tensor' is in host memory. bool is_constant = false; Tensor constant_value; + + // When this output is a resource, i.e. `type == DT_RESOURCE`, this is + // the index of the input that contains the resource. + int input_index; }; // Describes a variable write side effect of the computation. 
diff --git a/tensorflow/compiler/tf2xla/xla_compiler_test.cc b/tensorflow/compiler/tf2xla/xla_compiler_test.cc index 740f6dc25c..be3c93ae47 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler_test.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler_test.cc @@ -861,6 +861,33 @@ TEST_F(XlaCompilerTest, LocalFunctionWithWrongArgumentsFail) { << status.error_message(); } +void RunAndCheckVariablesComputation( + xla::Client* client, const XlaCompiler::CompilationResult& result) { + std::unique_ptr param0_literal = + xla::LiteralUtil::CreateR1({7, 42}); + std::unique_ptr param1_literal = + xla::LiteralUtil::CreateR1({-3, 101}); + std::unique_ptr param0_data = + client->TransferToServer(*param0_literal).ConsumeValueOrDie(); + std::unique_ptr param1_data = + client->TransferToServer(*param1_literal).ConsumeValueOrDie(); + + std::unique_ptr actual = + client + ->Execute(*result.computation, {param0_data.get(), param1_data.get()}) + .ConsumeValueOrDie(); + std::unique_ptr actual_literal = + client->Transfer(*actual).ConsumeValueOrDie(); + + std::unique_ptr expected0 = + xla::LiteralUtil::CreateR1({5, 144}); + std::unique_ptr expected1 = + xla::LiteralUtil::CreateR1({4, 143}); + std::unique_ptr expected_literal = + xla::LiteralUtil::MakeTuple({expected0.get(), expected1.get()}); + EXPECT_TRUE(xla::LiteralTestUtil::Equal(*expected_literal, *actual_literal)); +} + // Tests a simple graph that reads and writes a variable. TEST_F(XlaCompilerTest, Variables) { Scope scope = Scope::NewRootScope().ExitOnError(); @@ -892,36 +919,90 @@ TEST_F(XlaCompilerTest, Variables) { // Compiles the graph. XlaCompiler compiler(DefaultOptions()); + XlaCompiler::CompilationResult result; + TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(), "add", + std::move(graph), args, &result)); + RunAndCheckVariablesComputation(client_, result); +} + +// Tests a simple graph that reads and writes a variable. 
+TEST_F(XlaCompilerTest, ReturnResourceHandleOnly) { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto var = ops::_Arg(scope.WithOpName("V"), DT_RESOURCE, 0); + auto d = ops::_Retval(scope.WithOpName("D"), var, 0); + std::unique_ptr graph(new Graph(OpRegistry::Global())); + TF_ASSERT_OK(scope.ToGraph(graph.get())); + + // Builds a description of the arguments. + std::vector args(1); + args[0].kind = XlaCompiler::Argument::kResource; + args[0].resource_kind = XlaResource::kVariable; + args[0].initialized = true; + args[0].type = DT_INT32; + args[0].shape = TensorShape({2}); + + // Compiles the graph. + XlaCompiler compiler(DefaultOptions()); + XlaCompiler::CompilationResult result; TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(), "add", std::move(graph), args, &result)); // Tests that the generated computation works. - std::unique_ptr param0_literal = - xla::LiteralUtil::CreateR1({7, 42}); std::unique_ptr param1_literal = xla::LiteralUtil::CreateR1({-3, 101}); - std::unique_ptr param0_data = - client_->TransferToServer(*param0_literal).ConsumeValueOrDie(); std::unique_ptr param1_data = client_->TransferToServer(*param1_literal).ConsumeValueOrDie(); std::unique_ptr actual = - client_ - ->Execute(*result.computation, {param0_data.get(), param1_data.get()}) + client_->Execute(*result.computation, {param1_data.get()}) .ConsumeValueOrDie(); std::unique_ptr actual_literal = client_->Transfer(*actual).ConsumeValueOrDie(); - std::unique_ptr expected0 = - xla::LiteralUtil::CreateR1({5, 144}); - std::unique_ptr expected1 = - xla::LiteralUtil::CreateR1({4, 143}); std::unique_ptr expected_literal = - xla::LiteralUtil::MakeTuple({expected0.get(), expected1.get()}); + xla::LiteralUtil::MakeTuple({}); EXPECT_TRUE(xla::LiteralTestUtil::Equal(*expected_literal, *actual_literal)); } +TEST_F(XlaCompilerTest, ReturnResourceHandle) { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto a = ops::_Arg(scope.WithOpName("A"), DT_INT32, 0); + auto var = 
ops::_Arg(scope.WithOpName("V"), DT_RESOURCE, 1); + // Adds an identity op around the resource to make sure identity ops propagate + // resources correctly. + auto identity = ops::Identity(scope.WithOpName("VIdentity"), var); + auto write = ops::AssignAddVariableOp(scope, identity, a); + auto read = ops::ReadVariableOp( + scope.WithControlDependencies(std::vector{write}), var, + DT_INT32); + auto read_plus_one = ops::Add(scope, read, ops::Const(scope, 1)); + auto r = ops::_Retval(scope.WithOpName("R"), var, 0); + auto d = ops::_Retval(scope.WithOpName("D"), read_plus_one, 1); + + std::unique_ptr graph(new Graph(OpRegistry::Global())); + TF_ASSERT_OK(scope.ToGraph(graph.get())); + + // Builds a description of the arguments. + std::vector args(2); + args[0].kind = XlaCompiler::Argument::kParameter; + args[0].type = DT_INT32; + args[0].shape = TensorShape({2}); + args[1].kind = XlaCompiler::Argument::kResource; + args[1].resource_kind = XlaResource::kVariable; + args[1].initialized = true; + args[1].type = DT_INT32; + args[1].shape = TensorShape({2}); + + // Compiles the graph. 
+ XlaCompiler compiler(DefaultOptions()); + + XlaCompiler::CompilationResult result; + TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(), "add", + std::move(graph), args, &result)); + RunAndCheckVariablesComputation(client_, result); +} + xla::StatusOr> BuildTestGraph() { Scope scope = Scope::NewRootScope().ExitOnError(); auto a = ops::_Arg(scope.WithOpName("A"), DT_INT32, 0); diff --git a/tensorflow/compiler/tf2xla/xla_context.cc b/tensorflow/compiler/tf2xla/xla_context.cc index b24e3aabbe..e36039ada5 100644 --- a/tensorflow/compiler/tf2xla/xla_context.cc +++ b/tensorflow/compiler/tf2xla/xla_context.cc @@ -107,6 +107,19 @@ Status XlaContext::AddConstRetval(int retval_index, DataType dtype, return Status::OK(); } +Status XlaContext::AddResourceRetval(int retval_index, XlaResource* resource) { + VLOG(1) << "Adding retval index " << retval_index << " with resource " + << resource->name() << ":" << resource->shape().DebugString() + << " to XLA computation"; + if (retvals_.size() <= retval_index) { + retvals_.resize(retval_index + 1); + } + XlaExpression e; + e.set_resource(resource); + retvals_[retval_index] = Retval{DT_RESOURCE, resource->shape(), e}; + return Status::OK(); +} + xla::XlaBuilder* XlaContext::builder() { return builder_; } Status XlaContext::CreateResource( diff --git a/tensorflow/compiler/tf2xla/xla_context.h b/tensorflow/compiler/tf2xla/xla_context.h index 3db37afdba..4da891634e 100644 --- a/tensorflow/compiler/tf2xla/xla_context.h +++ b/tensorflow/compiler/tf2xla/xla_context.h @@ -86,6 +86,9 @@ class XlaContext : public ResourceBase { Status AddConstRetval(int retval_index, DataType dtype, const xla::LiteralSlice& literal); + // As for Retval, but for return values that are resource handles. + Status AddResourceRetval(int retval_index, XlaResource* resource); + // Creates a resource with resource `kind` and initial value `handle`. `name` // is a descriptive name for use in error messages. 
See the `XlaResource` // constructor for a description of the remaining arguments. -- GitLab From 4e36393006f8462a3ef516a6ca3010542213f352 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 27 Aug 2018 15:38:17 -0700 Subject: [PATCH 184/598] Updating the URL to the paper on leaky_relu. PiperOrigin-RevId: 210444225 --- tensorflow/python/ops/nn_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py index edc6e04b48..474e0bb295 100644 --- a/tensorflow/python/ops/nn_ops.py +++ b/tensorflow/python/ops/nn_ops.py @@ -1586,7 +1586,7 @@ def leaky_relu(features, alpha=0.2, name=None): "Rectifier Nonlinearities Improve Neural Network Acoustic Models" AL Maas, AY Hannun, AY Ng - Proc. ICML, 2013 - http://web.stanford.edu/~awni/papers/relu_hybrid_icml2013_final.pdf + https://ai.stanford.edu/~amaas/papers/relu_hybrid_icml2013_final.pdf Args: features: A `Tensor` representing preactivation values. Must be one of -- GitLab From 9500c1d80de70dabd1b538287a667c6fda0c394d Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Mon, 27 Aug 2018 16:38:40 -0700 Subject: [PATCH 185/598] Fix link. PiperOrigin-RevId: 210454123 --- tensorflow/docs_src/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/docs_src/README.md b/tensorflow/docs_src/README.md index bcd896c5ba..5b824f1150 100644 --- a/tensorflow/docs_src/README.md +++ b/tensorflow/docs_src/README.md @@ -1,3 +1,3 @@ # This directory has moved -The new location is: https://github.com/tensorflow/docs/site/en +The new location is: https://github.com/tensorflow/docs/ -- GitLab From f577ae972f457cd7ba8dc8be14a80d8d6e27b8cb Mon Sep 17 00:00:00 2001 From: Allen Lavoie Date: Mon, 27 Aug 2018 16:47:28 -0700 Subject: [PATCH 186/598] Checkpointable: Fix the ignore-missing logic for name-based checkpoint restores Restore previously checked if a key existed, but didn't quite ignore that value properly if it was missing. 
PiperOrigin-RevId: 210455409 --- tensorflow/python/training/checkpointable/util.py | 13 ++++++++++--- .../python/training/checkpointable/util_test.py | 6 ++++++ 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/training/checkpointable/util.py b/tensorflow/python/training/checkpointable/util.py index d1b50d1362..45d217e8b1 100644 --- a/tensorflow/python/training/checkpointable/util.py +++ b/tensorflow/python/training/checkpointable/util.py @@ -199,6 +199,7 @@ class _NameBasedRestoreCoordinator(object): for saveable in self.globally_named_object_attributes( checkpointable): restored_tensors = [] + tensor_missing = False for spec in saveable.specs: if spec.name in self.dtype_map: with ops.device("cpu:0"): @@ -209,9 +210,15 @@ class _NameBasedRestoreCoordinator(object): dtypes=[self.dtype_map[spec.name]], name="%s_checkpoint_read" % (spec.name,)) restored_tensors.append(array_ops.identity(restored)) - - saveable.restore(restored_tensors=restored_tensors, - restored_shapes=None) + else: + tensor_missing = True + + if not tensor_missing: + # Ignores values missing from the checkpoint, as with object-based + # restore. Status assertions can be used to check exact matches, + # although it's unlikely to ever happen for name-based checkpoints. + saveable.restore(restored_tensors=restored_tensors, + restored_shapes=None) # TODO(allenl): If this ends up in a public API, consider adding LINT.IfChange diff --git a/tensorflow/python/training/checkpointable/util_test.py b/tensorflow/python/training/checkpointable/util_test.py index 697b44c3ff..bef4bf2a16 100644 --- a/tensorflow/python/training/checkpointable/util_test.py +++ b/tensorflow/python/training/checkpointable/util_test.py @@ -1482,6 +1482,12 @@ class CheckpointCompatibilityTests(test.TestCase): status = object_saver.restore(save_path) status.initialize_or_restore() self._check_sentinels(root) + # Check that there is no error when keys are missing from the name-based + # checkpoint. 
+ root.not_in_name_checkpoint = resource_variable_ops.ResourceVariable([1.]) + status = object_saver.restore(save_path) + with self.assertRaises(AssertionError): + status.assert_existing_objects_matched() def testSaveGraphLoadEager(self): checkpoint_directory = self.get_temp_dir() -- GitLab From f481d2bed293d8791069711cd08084be3b079222 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 27 Aug 2018 16:55:15 -0700 Subject: [PATCH 187/598] Make embedding lookup to always return a colocation constraint free result. Currently, whenever embedding variable has < 2 partitions the returned result may have a constraint on embedding var. This in turn may force the ops that use the embedding lookup result onto ps tasks. PiperOrigin-RevId: 210456452 --- tensorflow/python/kernel_tests/embedding_ops_test.py | 2 +- tensorflow/python/ops/embedding_ops.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/kernel_tests/embedding_ops_test.py b/tensorflow/python/kernel_tests/embedding_ops_test.py index 0e83726760..dcd435e1ff 100644 --- a/tensorflow/python/kernel_tests/embedding_ops_test.py +++ b/tensorflow/python/kernel_tests/embedding_ops_test.py @@ -480,7 +480,7 @@ class EmbeddingLookupTest(test.TestCase): id_vals, shape=ids_shape, dtype=dtypes.int32) x, params, _ = _EmbeddingParams(num_shards, vocab_size, shape=[2]) y = embedding_ops.embedding_lookup(x, ids) - y_shape = [num_ids] + list(params[_PName(0) + ":0"].shape[1:]) + y_shape = ids_shape + tuple(params[_PName(0) + ":0"].shape[1:]) x_name = [_PName(i) for i in range(num_shards)] x_init_value = [params[x_n + ":0"] for x_n in x_name] x_shape = [i.shape for i in x_init_value] diff --git a/tensorflow/python/ops/embedding_ops.py b/tensorflow/python/ops/embedding_ops.py index f97fca47ea..6263041b8d 100644 --- a/tensorflow/python/ops/embedding_ops.py +++ b/tensorflow/python/ops/embedding_ops.py @@ -134,7 +134,10 @@ def _embedding_lookup_and_transform(params, ids, max_norm) if 
transform_fn: result = transform_fn(result) - return result + # Make sure the final result does not have colocation contraints on the + # params. Similar to the case np > 1 where parallel_dynamic_stitch is + # outside the scioe of all with ops.colocate_with(params[p]). + return array_ops.identity(result) else: # Flatten the ids. There are two cases where we need to do this. # - There is more than one params tensor. -- GitLab From 34870d54c0e1b505212da6ace722c6d9f9e8891b Mon Sep 17 00:00:00 2001 From: Alan Chiao Date: Mon, 27 Aug 2018 17:01:33 -0700 Subject: [PATCH 188/598] Update RNN to support state API. PiperOrigin-RevId: 210457365 --- .../delegates/nnapi/nnapi_delegate_test.cc | 2 +- tensorflow/contrib/lite/kernels/basic_rnn.cc | 29 +++++++++---------- .../contrib/lite/kernels/basic_rnn_test.cc | 21 ++++---------- 3 files changed, 20 insertions(+), 32 deletions(-) diff --git a/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate_test.cc b/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate_test.cc index 73c27fb3a0..048ec15984 100644 --- a/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate_test.cc +++ b/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate_test.cc @@ -1829,7 +1829,7 @@ class RNNOpModel : public SingleOpModelWithNNAPI { int input_size_; }; -TEST(NNAPIDelegate, RnnBlackBoxTest) { +TEST(NNAPIDelegate, DISABLED_RnnBlackBoxTest) { RNNOpModel rnn(2, 16, 8); rnn.SetWeights(rnn_weights); rnn.SetBias(rnn_bias); diff --git a/tensorflow/contrib/lite/kernels/basic_rnn.cc b/tensorflow/contrib/lite/kernels/basic_rnn.cc index c09b15b3d2..c5a5c0182f 100644 --- a/tensorflow/contrib/lite/kernels/basic_rnn.cc +++ b/tensorflow/contrib/lite/kernels/basic_rnn.cc @@ -31,8 +31,10 @@ constexpr int kInputTensor = 0; constexpr int kWeightsTensor = 1; constexpr int kRecurrentWeightsTensor = 2; constexpr int kBiasTensor = 3; -constexpr int kHiddenStateTensor = 0; -constexpr int kOutputTensor = 1; +constexpr int kHiddenStateTensor = 4; + +// Output tensor. 
+constexpr int kOutputTensor = 0; void* Init(TfLiteContext* context, const char* buffer, size_t length) { auto* scratch_tensor_index = new int; @@ -46,14 +48,16 @@ void Free(TfLiteContext* context, void* buffer) { TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { // Check we have all the inputs and outputs we need. - TF_LITE_ENSURE_EQ(context, node->inputs->size, 4); - TF_LITE_ENSURE_EQ(context, node->outputs->size, 2); + TF_LITE_ENSURE_EQ(context, node->inputs->size, 5); + TF_LITE_ENSURE_EQ(context, node->outputs->size, 1); const TfLiteTensor* input = GetInput(context, node, kInputTensor); const TfLiteTensor* input_weights = GetInput(context, node, kWeightsTensor); const TfLiteTensor* recurrent_weights = GetInput(context, node, kRecurrentWeightsTensor); const TfLiteTensor* bias = GetInput(context, node, kBiasTensor); + const TfLiteTensor* hidden_state = + GetInput(context, node, kHiddenStateTensor); // Check all the parameters of tensor match within themselves and match the // input configuration. @@ -65,20 +69,12 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ASSERT_EQ(recurrent_weights->dims->data[1], bias->dims->data[0]); TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32); TF_LITE_ENSURE_EQ(context, input_weights->type, recurrent_weights->type); + TF_LITE_ENSURE_EQ(context, NumDimensions(hidden_state), 2); + TF_LITE_ENSURE_EQ(context, hidden_state->dims->data[0], batch_size); + TF_LITE_ENSURE_EQ(context, hidden_state->dims->data[1], num_units); - TfLiteTensor* hidden_state = GetOutput(context, node, kHiddenStateTensor); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - // Resize state. - TfLiteIntArray* hidden_state_size_array = TfLiteIntArrayCreate(2); - hidden_state_size_array->data[0] = batch_size; - hidden_state_size_array->data[1] = num_units; - TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, hidden_state, - hidden_state_size_array)); - - // Mark hidden state as a persistent tensor. 
- hidden_state->allocation_type = kTfLiteArenaRwPersistent; - // Resize output. TfLiteIntArray* output_size_array = TfLiteIntArrayCreate(2); output_size_array->data[0] = batch_size; @@ -205,7 +201,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* recurrent_weights = GetInput(context, node, kRecurrentWeightsTensor); const TfLiteTensor* bias = GetInput(context, node, kBiasTensor); - TfLiteTensor* hidden_state = GetOutput(context, node, kHiddenStateTensor); + TfLiteTensor* hidden_state = + &context->tensors[node->inputs->data[kHiddenStateTensor]]; TfLiteTensor* output = GetOutput(context, node, kOutputTensor); // We already checked that weight types are consistent, so branch on one. diff --git a/tensorflow/contrib/lite/kernels/basic_rnn_test.cc b/tensorflow/contrib/lite/kernels/basic_rnn_test.cc index 96465fcaf0..d179735404 100644 --- a/tensorflow/contrib/lite/kernels/basic_rnn_test.cc +++ b/tensorflow/contrib/lite/kernels/basic_rnn_test.cc @@ -181,15 +181,16 @@ class RNNOpModel : public SingleOpModel { weights_ = AddInput(weights); recurrent_weights_ = AddInput(recurrent_weights); bias_ = AddInput(TensorType_FLOAT32); - hidden_state_ = AddOutput(TensorType_FLOAT32); + hidden_state_ = AddInput(TensorType_FLOAT32, true); output_ = AddOutput(TensorType_FLOAT32); SetBuiltinOp( BuiltinOperator_RNN, BuiltinOptions_RNNOptions, CreateRNNOptions(builder_, ActivationFunctionType_RELU).Union()); - BuildInterpreter({{batches_, input_size_}, - {units_, input_size_}, - {units_, units_}, - {units_}}); + BuildInterpreter({{batches_, input_size_}, // input tensor + {units_, input_size_}, // weights tensor + {units_, units_}, // recurrent weights tensor + {units_}, // bias tensor + {batches_, units_}}); // hidden state tensor } void SetBias(std::initializer_list f) { PopulateTensor(bias_, f); } @@ -210,14 +211,6 @@ class RNNOpModel : public SingleOpModel { PopulateTensor(input_, offset, begin, end); } - void ResetHiddenState() { - const int 
zero_buffer_size = units_ * batches_; - std::unique_ptr zero_buffer(new float[zero_buffer_size]); - memset(zero_buffer.get(), 0, zero_buffer_size * sizeof(float)); - PopulateTensor(hidden_state_, 0, zero_buffer.get(), - zero_buffer.get() + zero_buffer_size); - } - std::vector GetOutput() { return ExtractVector(output_); } int input_size() { return input_size_; } @@ -258,7 +251,6 @@ TEST(RnnOpTest, BlackBoxTest) { rnn.SetBias(rnn_bias); rnn.SetRecurrentWeights(rnn_recurrent_weights); - rnn.ResetHiddenState(); const int input_sequence_size = sizeof(rnn_input) / sizeof(float) / (rnn.input_size() * rnn.num_batches()); @@ -286,7 +278,6 @@ TEST(HybridRnnOpTest, BlackBoxTest) { rnn.SetBias(rnn_bias); rnn.SetRecurrentWeights(rnn_recurrent_weights); - rnn.ResetHiddenState(); const int input_sequence_size = sizeof(rnn_input) / sizeof(float) / (rnn.input_size() * rnn.num_batches()); -- GitLab From a37798a39807e5c9be3d26b3fb8ac628d1d9385f Mon Sep 17 00:00:00 2001 From: Akshay Modi Date: Mon, 27 Aug 2018 17:13:25 -0700 Subject: [PATCH 189/598] Correctly expose connect_to_remote_host PiperOrigin-RevId: 210459088 --- tensorflow/contrib/eager/python/tfe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/contrib/eager/python/tfe.py b/tensorflow/contrib/eager/python/tfe.py index fe7f1b72fc..f5b8d95e4f 100644 --- a/tensorflow/contrib/eager/python/tfe.py +++ b/tensorflow/contrib/eager/python/tfe.py @@ -74,7 +74,7 @@ To use, at program startup, call `tf.enable_eager_execution()`. @@TensorSpec -@@connect_to_cloud_tpu +@@connect_to_remote_host @@DEVICE_PLACEMENT_EXPLICIT @@DEVICE_PLACEMENT_WARN -- GitLab From 572e8efcb5b51c77aeabd75b3258d0be9d067452 Mon Sep 17 00:00:00 2001 From: Akshay Agrawal Date: Mon, 27 Aug 2018 17:15:22 -0700 Subject: [PATCH 190/598] Add the colocation stack to defun's cache key. This makes defun respect call-site `colcoate_with` contexts. 
PiperOrigin-RevId: 210459321 --- tensorflow/python/eager/function.py | 11 +++++++++-- tensorflow/python/eager/function_test.py | 23 +++++++++++++++++++++++ 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py index 9b50f54eb8..6c87dccaf1 100644 --- a/tensorflow/python/eager/function.py +++ b/tensorflow/python/eager/function.py @@ -238,6 +238,7 @@ class FuncGraph(CapturingGraph): self.seed = graph.seed self._xla_compile = getattr(graph, "_xla_compile", False) self._device_function_stack = graph._device_function_stack.copy() # pylint: disable=protected-access + self._colocation_stack = graph._colocation_stack.copy() # pylint: disable=protected-access # TODO(b/112165328, b/112906995): summaries depend on inheriting collections # from the default graph even in eager mode. It'd be nice to not have a @@ -1018,13 +1019,19 @@ class PolymorphicFunction(object): # The graph, or whether we're executing eagerly, should be a part of the # cache key so we don't improperly capture tensors such as variables. - execution_context = ctx.executing_eagerly() or graph + executing_eagerly = ctx.executing_eagerly() + execution_context = executing_eagerly or graph # Putting the device in the cache key ensures that call-site device # annotations are respected. device_functions = _get_device_functions(ctx, graph) - return cache_key + (execution_context, device_functions) + # `ops.colocate_with` directives translate into `ops.device` directives when + # eager execution is enabled. + colocation_stack = (None if executing_eagerly else + tuple(graph._colocation_stack.peek_objs())) # pylint: disable=protected-access + + return cache_key + (execution_context, device_functions, colocation_stack) def _canonicalize_function_inputs(self, *args, **kwds): """Canonicalizes `args` and `kwds`. 
diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py index 8381d2f55c..3c79099d87 100644 --- a/tensorflow/python/eager/function_test.py +++ b/tensorflow/python/eager/function_test.py @@ -1101,6 +1101,29 @@ class FunctionTest(test.TestCase): with ops.device('cpu:1'): default_graph_function() + @test_util.run_in_graph_and_eager_modes + def testColocateWithRespected(self): + # TODO(b/113291792): Use multiple CPUs instead of a GPU. + if not context.context().num_gpus(): + self.skipTest('No GPUs found.') + + with ops.device('cpu:0'): + x = constant_op.constant(1.0) + + with ops.device('gpu:0'): + y = constant_op.constant(1.0) + + @function.defun + def foo(): + return iterator_ops.Iterator.from_structure( + (dtypes.float32,)).string_handle() + + with ops.colocate_with(x): + self.assertIn(compat.as_bytes('CPU:0'), self.evaluate(foo())) + + with ops.colocate_with(y): + self.assertIn(compat.as_bytes('GPU:0'), self.evaluate(foo())) + def testVariablesAreTracked(self): v = resource_variable_ops.ResourceVariable(1.0) -- GitLab From 1e1cf3ea4fa161727ba5fed5fad05ed1b243fcef Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 27 Aug 2018 17:22:57 -0700 Subject: [PATCH 191/598] Switch to tf_kernel_library to make sure always_linked is set to 1. 
PiperOrigin-RevId: 210460236 --- tensorflow/core/kernels/data/BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD index 7716043055..8d867455e7 100644 --- a/tensorflow/core/kernels/data/BUILD +++ b/tensorflow/core/kernels/data/BUILD @@ -232,7 +232,7 @@ cc_library( ], ) -cc_library( +tf_kernel_library( name = "parse_example_dataset_op", srcs = ["parse_example_dataset_op.cc"], deps = [ -- GitLab From c7173ca08a06145439362280517bd1e741ee8c7b Mon Sep 17 00:00:00 2001 From: Tim Shen Date: Mon, 27 Aug 2018 17:29:40 -0700 Subject: [PATCH 192/598] Remove tensorflow::gtl::{Mutable,}ArraySlice's old API that don't show up in absl::Span, so that transitioning to the latter is easier. PiperOrigin-RevId: 210461037 --- tensorflow/compiler/tf2xla/kernels/diag_op.cc | 2 +- tensorflow/compiler/tf2xla/kernels/select_op.cc | 2 +- tensorflow/compiler/tf2xla/lib/scatter.cc | 2 +- .../compiler/xla/service/elemental_ir_emitter.cc | 2 +- tensorflow/compiler/xla/shape_util.h | 4 ++-- .../xla/tools/dumped_computation_to_graphviz.cc | 2 +- .../tools/dumped_computation_to_operation_list.cc | 2 +- .../xla/tools/dumped_computation_to_text.cc | 2 +- .../xla/tools/dumped_computation_to_tf_graphdef.cc | 2 +- .../compiler/xla/tools/replay_computation.cc | 2 +- tensorflow/compiler/xla/tools/show_signature.cc | 2 +- tensorflow/core/lib/gtl/array_slice.h | 7 ------- tensorflow/core/lib/gtl/array_slice_test.cc | 14 ++++++-------- 13 files changed, 18 insertions(+), 27 deletions(-) diff --git a/tensorflow/compiler/tf2xla/kernels/diag_op.cc b/tensorflow/compiler/tf2xla/kernels/diag_op.cc index ed44ad218b..70c3eaf66b 100644 --- a/tensorflow/compiler/tf2xla/kernels/diag_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/diag_op.cc @@ -178,7 +178,7 @@ class MatrixDiagOp : public XlaOpKernel { int last_dim = dims.size() - 1; int64 last_dim_size = input_shape.dim_size(last_dim); tensorflow::gtl::ArraySlice other_dims(dims); 
- other_dims.pop_back(); + other_dims.remove_suffix(1); xla::XlaOp input = ctx->Input(0); xla::XlaOp diag = CreateDiagonal(input, last_dim_size, other_dims, diff --git a/tensorflow/compiler/tf2xla/kernels/select_op.cc b/tensorflow/compiler/tf2xla/kernels/select_op.cc index 6ce50efb4a..d9578eca5b 100644 --- a/tensorflow/compiler/tf2xla/kernels/select_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/select_op.cc @@ -67,7 +67,7 @@ class SelectOp : public XlaOpKernel { // to get the dimensions in the right order. const auto dim_sizes = then_shape.dim_sizes(); gtl::ArraySlice bdims = dim_sizes; - bdims.pop_front(); + bdims.remove_prefix(1); cond_handle = xla::Broadcast(cond_handle, bdims); std::vector dim_order(then_shape.dims()); diff --git a/tensorflow/compiler/tf2xla/lib/scatter.cc b/tensorflow/compiler/tf2xla/lib/scatter.cc index ba22eff73a..bafe5099f2 100644 --- a/tensorflow/compiler/tf2xla/lib/scatter.cc +++ b/tensorflow/compiler/tf2xla/lib/scatter.cc @@ -58,7 +58,7 @@ xla::StatusOr XlaScatter( ") must be <= the rank of the buffer (shape: ", xla::ShapeUtil::HumanString(buffer_shape), ")"); } - indices_dims.pop_back(); + indices_dims.remove_suffix(1); } int64 num_indices = 1; diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc index 2e5930fb70..52faaab25c 100644 --- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc @@ -899,7 +899,7 @@ StatusOr ElementalIrEmitter::EmitErfInv(PrimitiveType prim_type, auto multiply_add = [&](tensorflow::gtl::ArraySlice coefficients, llvm::Value* w) { llvm::Value* p = getFloat(coefficients.front()); - coefficients.pop_front(); + coefficients.remove_prefix(1); for (float coefficient : coefficients) { p = b_->CreateFAdd(b_->CreateFMul(p, w), getFloat(coefficient)); } diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h index 84f36e48a0..83e58545bf 100644 --- 
a/tensorflow/compiler/xla/shape_util.h +++ b/tensorflow/compiler/xla/shape_util.h @@ -131,12 +131,12 @@ class ShapeIndexView { } ShapeIndexView ConsumeFront() const { ShapeIndexView result = *this; - result.indices_.pop_front(); + result.indices_.remove_prefix(1); return result; } ShapeIndexView ConsumeBack() const { ShapeIndexView result = *this; - result.indices_.pop_back(); + result.indices_.remove_suffix(1); return result; } ShapeIndex ToShapeIndex() const { return ShapeIndex(begin(), end()); } diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc index f20dcef382..d15b71b792 100644 --- a/tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc +++ b/tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc @@ -78,7 +78,7 @@ int main(int argc, char** argv) { tensorflow::port::InitMain(argv[0], &argc, &argv); tensorflow::gtl::ArraySlice args(argv, argc); - args.pop_front(); // Pop off the binary name, argv[0] + args.remove_prefix(1); // Pop off the binary name, argv[0] xla::tools::RealMain(args); return 0; } diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc index 72e5abd274..c446b27a04 100644 --- a/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc +++ b/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc @@ -105,7 +105,7 @@ int main(int argc, char** argv) { tensorflow::port::InitMain(argv[0], &argc, &argv); tensorflow::gtl::ArraySlice args(argv, argc); - args.pop_front(); // Pop off the binary name, argv[0] + args.remove_prefix(1); // Pop off the binary name, argv[0] xla::tools::RealMain(args); return 0; } diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc index f03e1b1f96..d86a4474b3 100644 --- 
a/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc +++ b/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc @@ -103,7 +103,7 @@ int main(int argc, char** argv) { QCHECK(argc > 1) << "\nERROR: must specify at least one module\n" << usage; tensorflow::gtl::ArraySlice args(argv, argc); - args.pop_front(); // Pop off the binary name, argv[0] + args.remove_prefix(1); // Pop off the binary name, argv[0] xla::tools::RealMain(args, compile); return 0; } diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc index dc5c106d02..bd8b89542f 100644 --- a/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc +++ b/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc @@ -79,7 +79,7 @@ int main(int argc, char** argv) { tensorflow::port::InitMain(argv[0], &argc, &argv); tensorflow::gtl::ArraySlice args(argv, argc); - args.pop_front(); // Pop off the binary name, argv[0] + args.remove_prefix(1); // Pop off the binary name, argv[0] xla::tools::RealMain(args); return 0; } diff --git a/tensorflow/compiler/xla/tools/replay_computation.cc b/tensorflow/compiler/xla/tools/replay_computation.cc index e776e6a4eb..e826d6fa93 100644 --- a/tensorflow/compiler/xla/tools/replay_computation.cc +++ b/tensorflow/compiler/xla/tools/replay_computation.cc @@ -345,6 +345,6 @@ int main(int argc, char** argv) { } tensorflow::gtl::ArraySlice args(argv, argc); - args.pop_front(); // Pop off the binary name, argv[0] + args.remove_prefix(1); // Pop off the binary name, argv[0] return xla::tools::RealMain(args, opts); } diff --git a/tensorflow/compiler/xla/tools/show_signature.cc b/tensorflow/compiler/xla/tools/show_signature.cc index 4e53fafcc9..10e7202acf 100644 --- a/tensorflow/compiler/xla/tools/show_signature.cc +++ b/tensorflow/compiler/xla/tools/show_signature.cc @@ -67,7 +67,7 @@ int main(int argc, char** argv) { tensorflow::port::InitMain(argv[0], &argc, &argv); 
tensorflow::gtl::ArraySlice args(argv, argc); - args.pop_front(); // Pop off the binary name, argv[0] + args.remove_prefix(1); // Pop off the binary name, argv[0] xla::tools::RealMain(args); return 0; } diff --git a/tensorflow/core/lib/gtl/array_slice.h b/tensorflow/core/lib/gtl/array_slice.h index 4ecc96ee79..b773a65569 100644 --- a/tensorflow/core/lib/gtl/array_slice.h +++ b/tensorflow/core/lib/gtl/array_slice.h @@ -187,8 +187,6 @@ class ArraySlice { void remove_prefix(size_type n) { impl_.remove_prefix(n); } void remove_suffix(size_type n) { impl_.remove_suffix(n); } - void pop_back() { remove_suffix(1); } - void pop_front() { remove_prefix(1); } // These relational operators have the same semantics as the // std::vector relational operators: they do deep (element-wise) @@ -286,8 +284,6 @@ class MutableArraySlice { void remove_prefix(size_type n) { impl_.remove_prefix(n); } void remove_suffix(size_type n) { impl_.remove_suffix(n); } - void pop_back() { remove_suffix(1); } - void pop_front() { remove_prefix(1); } bool operator==(ArraySlice other) const { return ArraySlice(*this) == other; @@ -296,9 +292,6 @@ class MutableArraySlice { return ArraySlice(*this) != other; } - // DEPRECATED(jacobsa): Please use data() instead. 
- pointer mutable_data() const { return impl_.data(); } - private: Impl impl_; }; diff --git a/tensorflow/core/lib/gtl/array_slice_test.cc b/tensorflow/core/lib/gtl/array_slice_test.cc index 4d3da85b88..c798a488cb 100644 --- a/tensorflow/core/lib/gtl/array_slice_test.cc +++ b/tensorflow/core/lib/gtl/array_slice_test.cc @@ -73,13 +73,13 @@ static void TestHelper(const IntSlice& vorig, const IntVec& vec) { if (len > 0) { EXPECT_EQ(0, v.front()); EXPECT_EQ(len - 1, v.back()); - v.pop_back(); + v.remove_suffix(1); EXPECT_EQ(len - 1, v.size()); for (size_t i = 0; i < v.size(); ++i) { EXPECT_EQ(i, v[i]); } if (len > 1) { - v.pop_front(); + v.remove_prefix(1); EXPECT_EQ(len - 2, v.size()); for (size_t i = 0; i < v.size(); ++i) { EXPECT_EQ(i + 1, v[i]); @@ -128,7 +128,7 @@ static void MutableTestHelper(const MutableIntSlice& vorig, int* ptr, MutableIntSlice other; // To test the assignment return value. MutableIntSlice v = other = vorig; - EXPECT_EQ(ptr, v.mutable_data()); + EXPECT_EQ(ptr, v.data()); int counter = 0; for (MutableIntSlice::iterator it = v.begin(); it != v.end(); ++it) { @@ -142,17 +142,17 @@ static void MutableTestHelper(const MutableIntSlice& vorig, int* ptr, v[0] = 1; v.front() = 2; v.back() = 5; - *v.mutable_data() = 4; + *v.data() = 4; std::fill(v.begin(), v.end(), 5); std::fill(v.rbegin(), v.rend(), 6); // Test size-changing methods. 
- v.pop_back(); + v.remove_suffix(1); EXPECT_EQ(len - 1, v.size()); for (size_t i = 0; i < v.size(); ++i) { EXPECT_EQ(ptr + i, &v[i]); } if (len > 1) { - v.pop_front(); + v.remove_prefix(1); EXPECT_EQ(len - 2, v.size()); for (size_t i = 0; i < v.size(); ++i) { EXPECT_EQ(ptr + i + 1, &v[i]); @@ -605,7 +605,6 @@ TEST(MutableIntSlice, IteratorsAndReferences) { MutableIntSlice s = a; accept_pointer(s.data()); - accept_pointer(s.mutable_data()); accept_iterator(s.begin()); accept_iterator(s.end()); accept_reverse_iterator(s.rbegin()); @@ -627,7 +626,6 @@ TEST(MutableIntSlice, IteratorsAndReferences_Const) { const MutableIntSlice s = a; accept_pointer(s.data()); - accept_pointer(s.mutable_data()); accept_iterator(s.begin()); accept_iterator(s.end()); accept_reverse_iterator(s.rbegin()); -- GitLab From 4a596512be206b28120acd3253d022042fa2ce6d Mon Sep 17 00:00:00 2001 From: Suyog Gupta Date: Mon, 27 Aug 2018 17:30:27 -0700 Subject: [PATCH 193/598] Use nbins as given in hparams when pruning on TPUs PiperOrigin-RevId: 210461150 --- tensorflow/contrib/model_pruning/README.md | 2 +- tensorflow/contrib/model_pruning/python/pruning_utils.py | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/tensorflow/contrib/model_pruning/README.md b/tensorflow/contrib/model_pruning/README.md index a5267fd904..15d95896d9 100644 --- a/tensorflow/contrib/model_pruning/README.md +++ b/tensorflow/contrib/model_pruning/README.md @@ -53,7 +53,7 @@ The pruning library allows for specification of the following hyper parameters: | weight_sparsity_map | list of strings | [""] | list of weight variable name (or layer name):target sparsity pairs. Eg. [conv1:0.9,conv2/kernel:0.8]. For layers/weights not in this list, sparsity as specified by the target_sparsity hyperparameter is used. | | threshold_decay | float | 0.9 | The decay factor to use for exponential decay of the thresholds | | pruning_frequency | integer | 10 | How often should the masks be updated? 
(in # of global_steps) | -| nbins | integer | 256 | Number of bins to use for histogram computation | +| nbins | integer | 256 | Number of bins to use for histogram computation. Note: When running on TPUs, a large (>1024) value for `nbins` may adversely affect the training time. | | block_height|integer | 1 | Number of rows in a block for block sparse matrices| | block_width |integer | 1 | Number of cols in a block for block sparse matrices| | block_pooling_function| string | AVG | The function to use to pool weight values in a block: average (AVG) or max (MAX)| diff --git a/tensorflow/contrib/model_pruning/python/pruning_utils.py b/tensorflow/contrib/model_pruning/python/pruning_utils.py index b50a372e9d..91b0bb7f60 100644 --- a/tensorflow/contrib/model_pruning/python/pruning_utils.py +++ b/tensorflow/contrib/model_pruning/python/pruning_utils.py @@ -235,19 +235,18 @@ def compute_cdf_from_histogram(values, value_range, **kwargs): def compute_cdf(values, value_range, **kwargs): """Returns the normalized cumulative distribution of the given values tensor. - Uses tf.while_loop to directly compute the cdf of the values. Number of bins - for histogram is fixed at _NBINS=255 + Uses tf.while_loop to directly compute the cdf of the values. Args: values: Numeric `Tensor`. value_range: Shape [2] `Tensor` of same `dtype` as `values` - **kwargs: keyword arguments: name + **kwargs: keyword arguments: nbins, name Returns: A 1-D `Tensor` holding normalized cdf of values. 
""" - nbins = _NBINS + nbins = kwargs.get('nbins', _NBINS) name = kwargs.get('name', None) with ops.name_scope(name, 'cdf', [values, value_range, nbins]): values = ops.convert_to_tensor(values, name='values') @@ -281,7 +280,7 @@ def compute_cdf(values, value_range, **kwargs): cdf = math_ops.add( cdf, array_ops.one_hot( - loop_count, depth=_NBINS, on_value=temp, off_value=0.0)) + loop_count, depth=nbins, on_value=temp, off_value=0.0)) return [loop_count + 1, cdf] _, cdf = control_flow_ops.while_loop( -- GitLab From 11122413de40f0813ef180ed70e70ce605afef34 Mon Sep 17 00:00:00 2001 From: Sanjoy Das Date: Mon, 27 Aug 2018 18:29:17 -0700 Subject: [PATCH 194/598] Print cross-cluster edges; NFC PiperOrigin-RevId: 210467779 --- .../compiler/jit/mark_for_compilation_pass.cc | 76 +++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc index 518c39ec15..4e4abade32 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc @@ -668,6 +668,82 @@ static void VLogClusteringSummary(const Graph& g) { VLOG(3) << " " << pair.first << ": " << pair.second << " instances"; } } + + struct EdgeInfo { + StringPiece node_name; + absl::optional cluster_name; + + StringPiece GetClusterName() const { + return cluster_name ? 
*cluster_name : "[none]"; + } + + std::pair> AsPair() const { + return {node_name, cluster_name}; + } + + bool operator<(const EdgeInfo& other) const { + return AsPair() < other.AsPair(); + } + }; + + using EdgeInfoMap = std::map>; + + EdgeInfoMap incoming_edge_infos; + EdgeInfoMap outgoing_edge_infos; + + std::set cluster_names_to_print; + + for (const Edge* e : g.edges()) { + const Node* from = e->src(); + absl::optional from_cluster_name = GetXlaClusterForNode(*from); + + const Node* to = e->dst(); + absl::optional to_cluster_name = GetXlaClusterForNode(*to); + + if (to_cluster_name == from_cluster_name) { + continue; + } + + if (to_cluster_name) { + incoming_edge_infos[*to_cluster_name] + [EdgeInfo{from->name(), from_cluster_name}]++; + cluster_names_to_print.insert(*to_cluster_name); + } + + if (from_cluster_name) { + outgoing_edge_infos[*from_cluster_name][{to->name(), to_cluster_name}]++; + cluster_names_to_print.insert(*from_cluster_name); + } + } + + VLOG(2) << "*** Inter-Cluster edges:"; + if (cluster_names_to_print.empty()) { + VLOG(2) << " [none]"; + } + + auto print_edge_info_set_for_cluster = [&](StringPiece cluster_name, + const EdgeInfoMap& edge_info_map, + StringPiece desc) { + auto it = edge_info_map.find(cluster_name); + if (it != edge_info_map.end()) { + VLOG(2) << " " << it->second.size() << " " << desc << " edges"; + for (const auto& edge_info_count_pair : it->second) { + VLOG(2) << " " << edge_info_count_pair.first.GetClusterName() << " " + << edge_info_count_pair.first.node_name << " # " + << edge_info_count_pair.second; + } + } else { + VLOG(2) << " No " << desc << " edges."; + } + }; + + for (StringPiece cluster_name : cluster_names_to_print) { + VLOG(2) << " ** Cluster " << cluster_name; + print_edge_info_set_for_cluster(cluster_name, incoming_edge_infos, + "incoming"); + print_edge_info_set_for_cluster(cluster_name, outgoing_edge_infos, + "outgoing"); + } } // Is 'node' an operator that consumes only the shape of its input, not the -- 
GitLab From 9422d3d57c62399e425f63a769dcfa6ebd163bdc Mon Sep 17 00:00:00 2001 From: Sanjoy Das Date: Mon, 27 Aug 2018 18:41:44 -0700 Subject: [PATCH 195/598] [TF:XLA] Bump open source absl revision to f0f15c2778b0e4959244dd25e63f445a455870f5 PiperOrigin-RevId: 210469962 --- tensorflow/workspace.bzl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index c1b38d920e..941b27cb59 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -106,11 +106,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): tf_http_archive( name = "com_google_absl", urls = [ - "https://mirror.bazel.build/github.com/abseil/abseil-cpp/archive/fefc83638fb69395d259ed245699310610429064.tar.gz", - "https://github.com/abseil/abseil-cpp/archive/fefc83638fb69395d259ed245699310610429064.tar.gz", + "https://mirror.bazel.build/github.com/abseil/abseil-cpp/archive/f0f15c2778b0e4959244dd25e63f445a455870f5.tar.gz", + "https://github.com/abseil/abseil-cpp/archive/f0f15c2778b0e4959244dd25e63f445a455870f5.tar.gz", ], - sha256 = "e5f94a6fcc42cb3f312987a1f8c1a62a915bab4df993cf6cde95f64f2d264259", - strip_prefix = "abseil-cpp-fefc83638fb69395d259ed245699310610429064", + sha256 = "4ee36dacb75846eaa209ce8060bb269a42b7b3903612ca6d9e86a692659fe8c1", + strip_prefix = "abseil-cpp-f0f15c2778b0e4959244dd25e63f445a455870f5", build_file = clean_dep("//third_party:com_google_absl.BUILD"), ) -- GitLab From a053d7ba69ce9c42f6c854f20aa565407de8c3f7 Mon Sep 17 00:00:00 2001 From: "karl@kubx.ca" Date: Mon, 27 Aug 2018 21:51:23 -0400 Subject: [PATCH 196/598] Add mention about default_value lifetime constraint --- tensorflow/java/src/gen/cc/op_specs.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tensorflow/java/src/gen/cc/op_specs.h b/tensorflow/java/src/gen/cc/op_specs.h index 7ad19af562..4adcfca96a 100644 --- a/tensorflow/java/src/gen/cc/op_specs.h +++ b/tensorflow/java/src/gen/cc/op_specs.h @@ -94,7 +94,10 
@@ class AttributeSpec { // jni_type: the type of this attribute in JNI layer (see OperationBuilder) // description: a description of this attribute, in javadoc // iterable: true if this attribute is a list - // default_value: default value for this attribute or nullptr if none + // default_value: default value for this attribute or nullptr if none. Any + // value referenced by this pointer must outlive the lifetime + // of the AttributeSpec. This is guaranteed if the value is + // issued by an OpDef of the global OpRegistry. AttributeSpec(const string& op_def_name, const Variable& var, const Type& type, const Type& jni_type, const string& description, bool iterable, -- GitLab From fa607e7e9224b4d88ead0a81fc65c7884d25950a Mon Sep 17 00:00:00 2001 From: Sanjoy Das Date: Mon, 27 Aug 2018 18:50:25 -0700 Subject: [PATCH 197/598] Use a mixin to reduce llvm::IRBuilder<> related boilerplate. PiperOrigin-RevId: 210472260 --- tensorflow/compiler/xla/service/BUILD | 1 + tensorflow/compiler/xla/service/cpu/BUILD | 1 + .../xla/service/cpu/elemental_ir_emitter.cc | 25 +- .../xla/service/cpu/elemental_ir_emitter.h | 6 +- .../compiler/xla/service/cpu/ir_emitter.cc | 376 +++++----- .../compiler/xla/service/cpu/ir_emitter.h | 7 +- .../xla/service/elemental_ir_emitter.cc | 688 ++++++++---------- .../xla/service/elemental_ir_emitter.h | 126 ++-- tensorflow/compiler/xla/service/gpu/BUILD | 1 + .../xla/service/gpu/elemental_ir_emitter.cc | 120 ++- .../xla/service/gpu/elemental_ir_emitter.h | 40 +- .../compiler/xla/service/gpu/ir_emitter.cc | 103 ++- .../compiler/xla/service/gpu/ir_emitter.h | 6 +- .../xla/service/gpu/ir_emitter_unnested.cc | 264 ++++--- tensorflow/compiler/xla/service/llvm_ir/BUILD | 9 + .../xla/service/llvm_ir/ir_builder_mixin.h | 400 ++++++++++ 16 files changed, 1242 insertions(+), 931 deletions(-) create mode 100644 tensorflow/compiler/xla/service/llvm_ir/ir_builder_mixin.h diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD 
index f164a614f1..025f0b0195 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -2689,6 +2689,7 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service/llvm_ir:ir_array", + "//tensorflow/compiler/xla/service/llvm_ir:ir_builder_mixin", "//tensorflow/compiler/xla/service/llvm_ir:llvm_loop", "//tensorflow/compiler/xla/service/llvm_ir:llvm_util", "//tensorflow/compiler/xla/service/llvm_ir:loop_emitter", diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD index f0adfc5d45..4cd192873f 100644 --- a/tensorflow/compiler/xla/service/cpu/BUILD +++ b/tensorflow/compiler/xla/service/cpu/BUILD @@ -278,6 +278,7 @@ cc_library( "//tensorflow/compiler/xla/service/llvm_ir:dynamic_update_slice_util", "//tensorflow/compiler/xla/service/llvm_ir:fused_ir_emitter", "//tensorflow/compiler/xla/service/llvm_ir:ir_array", + "//tensorflow/compiler/xla/service/llvm_ir:ir_builder_mixin", "//tensorflow/compiler/xla/service/llvm_ir:llvm_loop", "//tensorflow/compiler/xla/service/llvm_ir:llvm_util", "//tensorflow/compiler/xla/service/llvm_ir:loop_emitter", diff --git a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc index db54454707..c8312d80bd 100644 --- a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc @@ -30,15 +30,16 @@ limitations under the License. 
namespace xla { namespace cpu { -StatusOr CpuElementalIrEmitter::EmitAtan2( - PrimitiveType prim_type, llvm::Value* lhs, llvm::Value* rhs) const { +StatusOr CpuElementalIrEmitter::EmitAtan2(PrimitiveType prim_type, + llvm::Value* lhs, + llvm::Value* rhs) { string function_name; bool cast_result_to_fp16 = false; switch (prim_type) { case F16: cast_result_to_fp16 = true; - lhs = b_->CreateFPCast(lhs, b_->getFloatTy()); - rhs = b_->CreateFPCast(rhs, b_->getFloatTy()); + lhs = FPCast(lhs, b_->getFloatTy()); + rhs = FPCast(rhs, b_->getFloatTy()); TF_FALLTHROUGH_INTENDED; case F32: function_name = "atan2f"; @@ -58,21 +59,21 @@ StatusOr CpuElementalIrEmitter::EmitAtan2( function->setDoesNotThrow(); function->setDoesNotAccessMemory(); // Create an instruction to call the function. - llvm::Value* result = b_->CreateCall(function, {lhs, rhs}); + llvm::Value* result = Call(function, {lhs, rhs}); if (cast_result_to_fp16) { - result = b_->CreateFPCast(result, b_->getHalfTy()); + result = FPCast(result, b_->getHalfTy()); } return result; } -StatusOr CpuElementalIrEmitter::EmitTanh( - PrimitiveType prim_type, llvm::Value* value) const { +StatusOr CpuElementalIrEmitter::EmitTanh(PrimitiveType prim_type, + llvm::Value* value) { bool cast_result_to_fp16 = false; string function_name; switch (prim_type) { case F16: cast_result_to_fp16 = true; - value = b_->CreateFPCast(value, b_->getFloatTy()); + value = FPCast(value, b_->getFloatTy()); TF_FALLTHROUGH_INTENDED; case F32: function_name = "tanhf"; @@ -91,16 +92,16 @@ StatusOr CpuElementalIrEmitter::EmitTanh( function->setDoesNotThrow(); function->setDoesNotAccessMemory(); // Create an instruction to call the function. 
- llvm::Value* result = b_->CreateCall(function, value); + llvm::Value* result = Call(function, value); if (cast_result_to_fp16) { - result = b_->CreateFPCast(result, b_->getHalfTy()); + result = FPCast(result, b_->getHalfTy()); } return result; } llvm_ir::ElementGenerator CpuElementalIrEmitter::MakeElementGenerator( const HloInstruction* hlo, - const HloToElementGeneratorMap& operand_to_generator) const { + const HloToElementGeneratorMap& operand_to_generator) { if (hlo->opcode() == HloOpcode::kMap) { return [this, hlo, &operand_to_generator]( const llvm_ir::IrArray::Index& index) -> StatusOr { diff --git a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h index 76833e765d..e3fba9306b 100644 --- a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h +++ b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h @@ -36,13 +36,13 @@ class CpuElementalIrEmitter : public ElementalIrEmitter { llvm_ir::ElementGenerator MakeElementGenerator( const HloInstruction* hlo, - const HloToElementGeneratorMap& operand_to_generator) const override; + const HloToElementGeneratorMap& operand_to_generator) override; protected: StatusOr EmitAtan2(PrimitiveType prim_type, llvm::Value* lhs, - llvm::Value* rhs) const override; + llvm::Value* rhs) override; StatusOr EmitTanh(PrimitiveType prim_type, - llvm::Value* value) const override; + llvm::Value* value) override; IrEmitter* ir_emitter_; }; diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc index 321c2e9896..903e73f606 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc @@ -170,9 +170,9 @@ IrEmitter::~IrEmitter() {} Status IrEmitter::HandleBitcast(HloInstruction* bitcast) { VLOG(2) << "HandleBitcast: " << bitcast->ToString(); emitted_value_[bitcast] = - b_.CreateBitCast(GetEmittedValueFor(bitcast->operand(0)), - 
IrShapeType(bitcast->shape())->getPointerTo(), - AsStringRef(IrName(bitcast))); + BitCast(GetEmittedValueFor(bitcast->operand(0)), + IrShapeType(bitcast->shape())->getPointerTo(), + AsStringRef(IrName(bitcast))); return Status::OK(); } @@ -439,22 +439,22 @@ Status IrEmitter::EmitXfeedTransfer(XfeedKind kind, const Shape& shape, // of size exactly 'length_32', and the runtime is responsible for // check-failing the process if there is a mismatch, versus passing us back a // buffer that we might overrun. - llvm::Value* acquired_pointer = b_.CreateCall( - acquire_func, - {b_.getInt32(length_32), shape_ptr, b_.getInt32(shape_length)}); + llvm::Value* acquired_pointer = + Call(acquire_func, + {b_.getInt32(length_32), shape_ptr, b_.getInt32(shape_length)}); if (kind == XfeedKind::kInfeed) { // Copy to the program buffer address from the acquired buffer. - b_.CreateMemCpy(program_buffer_address, /*DstAlign=*/1, acquired_pointer, - /*SrcAlign=*/1, length_32); + MemCpy(program_buffer_address, /*DstAlign=*/1, acquired_pointer, + /*SrcAlign=*/1, length_32); } else { // Outfeed -- copy from the in-program address to the acquired buffer. 
- b_.CreateMemCpy(acquired_pointer, /*DstAlign=*/1, program_buffer_address, - /*SrcAlign=*/1, length_32); + MemCpy(acquired_pointer, /*DstAlign=*/1, program_buffer_address, + /*SrcAlign=*/1, length_32); } - b_.CreateCall(release_func, {b_.getInt32(length_32), acquired_pointer, - shape_ptr, b_.getInt32(shape_length)}); + Call(release_func, {b_.getInt32(length_32), acquired_pointer, shape_ptr, + b_.getInt32(shape_length)}); return Status::OK(); } @@ -518,8 +518,8 @@ StatusOr IrEmitter::EmitTargetElementLoopBodyForReduceWindow( llvm_ir::PrimitiveTypeToIrType(operand_element_type, module_), "reduce_window_accumulator_address", &b_, MinimumAlignmentForPrimitiveType(operand_element_type)); - b_.CreateStore(b_.CreateLoad(GetEmittedValueFor(reduce_window->operand(1))), - accumulator_address); + Store(Load(GetEmittedValueFor(reduce_window->operand(1))), + accumulator_address); llvm_ir::ForLoopNest loops(IrName(reduce_window, "inner"), &b_); std::vector window_size; @@ -536,22 +536,21 @@ StatusOr IrEmitter::EmitTargetElementLoopBodyForReduceWindow( llvm::Value* in_bounds_condition = nullptr; for (size_t i = 0; i < index.size(); ++i) { llvm::Value* strided_index = - b_.CreateNSWMul(index[i], b_.getInt64(window.dimensions(i).stride())); - input_index[i] = - b_.CreateNSWSub(b_.CreateNSWAdd(strided_index, window_index[i]), - b_.getInt64(window.dimensions(i).padding_low())); + NSWMul(index[i], b_.getInt64(window.dimensions(i).stride())); + input_index[i] = NSWSub(NSWAdd(strided_index, window_index[i]), + b_.getInt64(window.dimensions(i).padding_low())); // We need to check if 0 <= input_index[i] < bound, as otherwise we are in // the padding so that we can skip the computation. That is equivalent to // input_index[i] < bound as an *unsigned* comparison, since a negative // value will wrap to a large positive value. 
- llvm::Value* index_condition = b_.CreateICmpULT( - input_index[i], - b_.getInt64(ShapeUtil::GetDimension(operand->shape(), i))); + llvm::Value* index_condition = + ICmpULT(input_index[i], + b_.getInt64(ShapeUtil::GetDimension(operand->shape(), i))); if (in_bounds_condition == nullptr) { in_bounds_condition = index_condition; } else { - in_bounds_condition = b_.CreateAnd(in_bounds_condition, index_condition); + in_bounds_condition = And(in_bounds_condition, index_condition); } } CHECK(in_bounds_condition != nullptr); @@ -564,12 +563,12 @@ StatusOr IrEmitter::EmitTargetElementLoopBodyForReduceWindow( llvm_ir::IrArray input_array(GetIrArrayFor(operand)); llvm::Value* input_value = input_array.EmitReadArrayElement(input_index, &b_); llvm::Value* result = EmitThreadLocalCall( - *reduce_window->to_apply(), - {b_.CreateLoad(accumulator_address), input_value}, "reducer_function"); - b_.CreateStore(result, accumulator_address); + *reduce_window->to_apply(), {Load(accumulator_address), input_value}, + "reducer_function"); + Store(result, accumulator_address); SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &b_); - return b_.CreateLoad(accumulator_address); + return Load(accumulator_address); } Status IrEmitter::HandleReduceWindow(HloInstruction* reduce_window) { @@ -646,7 +645,7 @@ Status IrEmitter::HandleSelectAndScatter(HloInstruction* select_and_scatter) { select_and_scatter, /*desc=*/IrName(select_and_scatter, "init"), [this, init_value](const llvm_ir::IrArray::Index& target_index) { llvm::Value* init_value_addr = GetEmittedValueFor(init_value); - return b_.CreateLoad(init_value_addr); + return Load(init_value_addr); })); // Create a loop to iterate over the source array to scatter to the output. 
@@ -666,7 +665,7 @@ Status IrEmitter::HandleSelectAndScatter(HloInstruction* select_and_scatter) { b_.getInt64Ty(), b_.getInt32(rank), "selected_index_address", &b_); llvm::Value* initialized_flag_address = llvm_ir::EmitAllocaAtFunctionEntry( b_.getInt1Ty(), "initialized_flag_address", &b_); - b_.CreateStore(b_.getInt1(false), initialized_flag_address); + Store(b_.getInt1(false), initialized_flag_address); // Create the inner loop to iterate over the window. llvm_ir::ForLoopNest window_loops(IrName(select_and_scatter, "window"), &b_); @@ -684,15 +683,14 @@ Status IrEmitter::HandleSelectAndScatter(HloInstruction* select_and_scatter) { llvm_ir::IrArray::Index operand_index(b_.getInt64Ty(), source_index.size()); llvm::Value* in_bounds_condition = b_.getTrue(); for (int64 i = 0; i < rank; ++i) { - llvm::Value* strided_index = b_.CreateNSWMul( - source_index[i], b_.getInt64(window.dimensions(i).stride())); - operand_index[i] = - b_.CreateNSWSub(b_.CreateNSWAdd(strided_index, window_index[i]), - b_.getInt64(window.dimensions(i).padding_low())); - llvm::Value* index_condition = b_.CreateICmpULT( - operand_index[i], - b_.getInt64(ShapeUtil::GetDimension(operand->shape(), i))); - in_bounds_condition = b_.CreateAnd(in_bounds_condition, index_condition); + llvm::Value* strided_index = + NSWMul(source_index[i], b_.getInt64(window.dimensions(i).stride())); + operand_index[i] = NSWSub(NSWAdd(strided_index, window_index[i]), + b_.getInt64(window.dimensions(i).padding_low())); + llvm::Value* index_condition = + ICmpULT(operand_index[i], + b_.getInt64(ShapeUtil::GetDimension(operand->shape(), i))); + in_bounds_condition = And(in_bounds_condition, index_condition); } CHECK(in_bounds_condition != nullptr); @@ -702,7 +700,7 @@ Status IrEmitter::HandleSelectAndScatter(HloInstruction* select_and_scatter) { llvm_ir::EmitIfThenElse(in_bounds_condition, "in-bounds", &b_); SetToFirstInsertPoint(if_in_bounds.true_block, &b_); llvm_ir::LlvmIfData if_initialized = llvm_ir::EmitIfThenElse( - 
b_.CreateLoad(initialized_flag_address), "initialized", &b_); + Load(initialized_flag_address), "initialized", &b_); // If the initialized_flag is false, initialize the selected value and index // with the currently visiting operand. @@ -711,38 +709,37 @@ Status IrEmitter::HandleSelectAndScatter(HloInstruction* select_and_scatter) { [&](const llvm_ir::IrArray::Index& operand_index) { for (int64 i = 0; i < rank; ++i) { llvm::Value* selected_index_address_slot = - b_.CreateInBoundsGEP(selected_index_address, {b_.getInt32(i)}); - b_.CreateStore(operand_index[i], selected_index_address_slot); + InBoundsGEP(selected_index_address, {b_.getInt32(i)}); + Store(operand_index[i], selected_index_address_slot); } }; llvm_ir::IrArray operand_array(GetIrArrayFor(operand)); llvm::Value* operand_data = operand_array.EmitReadArrayElement(operand_index, &b_); - b_.CreateStore(operand_data, selected_value_address); + Store(operand_data, selected_value_address); save_operand_index(operand_index); - b_.CreateStore(b_.getInt1(true), initialized_flag_address); + Store(b_.getInt1(true), initialized_flag_address); // If the initialized_flag is true, call the `select` function to potentially // update the selected value and index with the currently visiting operand. SetToFirstInsertPoint(if_initialized.true_block, &b_); llvm::Value* operand_address = operand_array.EmitArrayElementAddress(operand_index, &b_); - llvm::Value* operand_element = b_.CreateLoad(operand_address); + llvm::Value* operand_element = Load(operand_address); llvm::Value* result = EmitThreadLocalCall( *select_and_scatter->select(), - {b_.CreateLoad(selected_value_address), operand_element}, - "select_function"); + {Load(selected_value_address), operand_element}, "select_function"); // If the 'select' function returns false, update the selected value and the // index to the currently visiting operand. 
- llvm::Value* cond = b_.CreateICmpNE( + llvm::Value* cond = ICmpNE( result, llvm::ConstantInt::get(llvm_ir::PrimitiveTypeToIrType(PRED, module_), 0), "boolean_predicate"); llvm_ir::LlvmIfData if_select_lhs = llvm_ir::EmitIfThenElse(cond, "if-select-lhs", &b_); SetToFirstInsertPoint(if_select_lhs.false_block, &b_); - b_.CreateStore(b_.CreateLoad(operand_address), selected_value_address); + Store(Load(operand_address), selected_value_address); save_operand_index(operand_index); // After iterating over the window elements, scatter the source element to @@ -753,8 +750,8 @@ Status IrEmitter::HandleSelectAndScatter(HloInstruction* select_and_scatter) { llvm_ir::IrArray::Index selected_index(source_index.GetType()); for (int64 i = 0; i < rank; ++i) { llvm::Value* selected_index_address_slot = - b_.CreateInBoundsGEP(selected_index_address, {b_.getInt32(i)}); - selected_index.push_back(b_.CreateLoad(selected_index_address_slot)); + InBoundsGEP(selected_index_address, {b_.getInt32(i)}); + selected_index.push_back(Load(selected_index_address_slot)); } llvm_ir::IrArray source_array(GetIrArrayFor(source)); llvm::Value* source_value = @@ -836,7 +833,7 @@ StatusOr IrEmitter::EmitTargetElementLoopBodyForConvolution( lhs_llvm_type, "convolution_sum_address", &b_, MinimumAlignmentForPrimitiveType(lhs_element_type)); llvm::Value* constant_zero = llvm::Constant::getNullValue(lhs_llvm_type); - b_.CreateStore(constant_zero, sum_address); + Store(constant_zero, sum_address); llvm_ir::ForLoopNest loops(IrName(convolution, "inner"), &b_); std::vector kernel_spatial(num_spatial_dims); @@ -863,11 +860,11 @@ StatusOr IrEmitter::EmitTargetElementLoopBodyForConvolution( llvm::Value* kernel_index, const WindowDimension& window_dim) { llvm::Value* strided_index = - b_.CreateNSWMul(output_index, b_.getInt64(window_dim.stride())); - llvm::Value* dilated_kernel_index = b_.CreateNSWMul( - kernel_index, b_.getInt64(window_dim.window_dilation())); - return 
b_.CreateNSWSub(b_.CreateNSWAdd(strided_index, dilated_kernel_index), - b_.getInt64(window_dim.padding_low())); + NSWMul(output_index, b_.getInt64(window_dim.stride())); + llvm::Value* dilated_kernel_index = + NSWMul(kernel_index, b_.getInt64(window_dim.window_dilation())); + return NSWSub(NSWAdd(strided_index, dilated_kernel_index), + b_.getInt64(window_dim.padding_low())); }; std::vector input_spatial(num_spatial_dims); for (int i = 0; i < num_spatial_dims; ++i) { @@ -884,9 +881,8 @@ StatusOr IrEmitter::EmitTargetElementLoopBodyForConvolution( // Also need to check that the input coordinates are not in one of the // holes created by base dilation. const auto not_in_hole = [&](llvm::Value* input_index, int64 base_dilation) { - llvm::Value* remainder = - b_.CreateSRem(input_index, b_.getInt64(base_dilation)); - return b_.CreateICmpEQ(remainder, b_.getInt64(0)); + llvm::Value* remainder = SRem(input_index, b_.getInt64(base_dilation)); + return ICmpEQ(remainder, b_.getInt64(0)); }; llvm::Value* in_bounds_condition = b_.getInt1(true); @@ -894,17 +890,17 @@ StatusOr IrEmitter::EmitTargetElementLoopBodyForConvolution( llvm::ConstantInt* input_bound = b_.getInt64(window_util::DilatedBound( lhs->shape().dimensions(dnums.input_spatial_dimensions(i)), window.dimensions(i).base_dilation())); - llvm::Value* dim_in_bound = b_.CreateICmpULT(input_spatial[i], input_bound); + llvm::Value* dim_in_bound = ICmpULT(input_spatial[i], input_bound); llvm::Value* dim_not_in_hole = not_in_hole(input_spatial[i], window.dimensions(i).base_dilation()); - llvm::Value* dim_ok = b_.CreateAnd(dim_in_bound, dim_not_in_hole); - in_bounds_condition = b_.CreateAnd(in_bounds_condition, dim_ok); + llvm::Value* dim_ok = And(dim_in_bound, dim_not_in_hole); + in_bounds_condition = And(in_bounds_condition, dim_ok); } // Now we need to map the dilated base coordinates back to the actual // data indices on the lhs. 
const auto undilate = [&](llvm::Value* input_index, int64 base_dilation) { - return b_.CreateSDiv(input_index, b_.getInt64(base_dilation)); + return SDiv(input_index, b_.getInt64(base_dilation)); }; for (int i = 0; i < num_spatial_dims; ++i) { input_spatial[i] = @@ -929,8 +925,8 @@ StatusOr IrEmitter::EmitTargetElementLoopBodyForConvolution( for (int i = 0; i < num_spatial_dims; ++i) { kernel_index[dnums.kernel_spatial_dimensions(i)] = window.dimensions(i).window_reversal() - ? b_.CreateNSWSub(b_.getInt64(window.dimensions(i).size() - 1), - kernel_spatial[i]) + ? NSWSub(b_.getInt64(window.dimensions(i).size() - 1), + kernel_spatial[i]) : kernel_spatial[i]; } @@ -939,13 +935,13 @@ StatusOr IrEmitter::EmitTargetElementLoopBodyForConvolution( llvm_ir::IrArray input_array(GetIrArrayFor(lhs)); llvm::Value* product = - b_.CreateFMul(input_array.EmitReadArrayElement(input_index, &b_), - kernel_array.EmitReadArrayElement(kernel_index, &b_)); - llvm::Value* sum = b_.CreateFAdd(b_.CreateLoad(sum_address), product); - b_.CreateStore(sum, sum_address); + FMul(input_array.EmitReadArrayElement(input_index, &b_), + kernel_array.EmitReadArrayElement(kernel_index, &b_)); + llvm::Value* sum = FAdd(Load(sum_address), product); + Store(sum, sum_address); SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &b_); - return b_.CreateLoad(sum_address); + return Load(sum_address); } Status IrEmitter::HandleConvolution(HloInstruction* convolution) { @@ -1071,34 +1067,32 @@ Status IrEmitter::HandleConvolution(HloInstruction* convolution) { conv_func->setCallingConv(llvm::CallingConv::C); conv_func->setDoesNotThrow(); conv_func->setOnlyAccessesArgMemory(); - b_.CreateCall( - conv_func, - { - GetExecutableRunOptionsArgument(), - b_.CreateBitCast(GetEmittedValueFor(convolution), ir_ptr_type), - b_.CreateBitCast(lhs_address, ir_ptr_type), - b_.CreateBitCast(rhs_address, ir_ptr_type), - b_.getInt64(input_batch), - b_.getInt64(input_rows), - b_.getInt64(input_cols), - 
b_.getInt64(input_channels), - b_.getInt64(kernel_rows), - b_.getInt64(kernel_cols), - b_.getInt64(kernel_channels), - b_.getInt64(kernel_filters), - b_.getInt64(output_rows), - b_.getInt64(output_cols), - b_.getInt64(row_stride), - b_.getInt64(col_stride), - b_.getInt64(padding_top), - b_.getInt64(padding_bottom), - b_.getInt64(padding_left), - b_.getInt64(padding_right), - b_.getInt64(lhs_row_dilation), - b_.getInt64(lhs_col_dilation), - b_.getInt64(rhs_row_dilation), - b_.getInt64(rhs_col_dilation), - }); + Call(conv_func, { + GetExecutableRunOptionsArgument(), + BitCast(GetEmittedValueFor(convolution), ir_ptr_type), + BitCast(lhs_address, ir_ptr_type), + BitCast(rhs_address, ir_ptr_type), + b_.getInt64(input_batch), + b_.getInt64(input_rows), + b_.getInt64(input_cols), + b_.getInt64(input_channels), + b_.getInt64(kernel_rows), + b_.getInt64(kernel_cols), + b_.getInt64(kernel_channels), + b_.getInt64(kernel_filters), + b_.getInt64(output_rows), + b_.getInt64(output_cols), + b_.getInt64(row_stride), + b_.getInt64(col_stride), + b_.getInt64(padding_top), + b_.getInt64(padding_bottom), + b_.getInt64(padding_left), + b_.getInt64(padding_right), + b_.getInt64(lhs_row_dilation), + b_.getInt64(lhs_col_dilation), + b_.getInt64(rhs_row_dilation), + b_.getInt64(rhs_col_dilation), + }); return Status::OK(); } @@ -1158,15 +1152,14 @@ Status IrEmitter::HandleFft(HloInstruction* fft) { fft_func->setDoesNotThrow(); fft_func->setOnlyAccessesInaccessibleMemOrArgMem(); const int fft_rank = fft_length.size(); - b_.CreateCall( - fft_func, - {GetExecutableRunOptionsArgument(), - b_.CreateBitCast(GetEmittedValueFor(fft), int8_ptr_type), - b_.CreateBitCast(operand_address, int8_ptr_type), - b_.getInt32(fft->fft_type()), b_.getInt32(fft_rank), - b_.getInt64(input_batch), b_.getInt64(fft_rank > 0 ? fft_length[0] : 0), - b_.getInt64(fft_rank > 1 ? fft_length[1] : 0), - b_.getInt64(fft_rank > 2 ? 
fft_length[2] : 0)}); + Call(fft_func, + {GetExecutableRunOptionsArgument(), + BitCast(GetEmittedValueFor(fft), int8_ptr_type), + BitCast(operand_address, int8_ptr_type), b_.getInt32(fft->fft_type()), + b_.getInt32(fft_rank), b_.getInt64(input_batch), + b_.getInt64(fft_rank > 0 ? fft_length[0] : 0), + b_.getInt64(fft_rank > 1 ? fft_length[1] : 0), + b_.getInt64(fft_rank > 2 ? fft_length[2] : 0)}); return Status::OK(); } @@ -1205,8 +1198,8 @@ Status IrEmitter::HandleCrossReplicaSum(HloInstruction* crs) { operand_ptrs.push_back(EmitTempBufferPointer(out_slice, operand_shape)); // TODO(b/63762267): Be more aggressive about specifying alignment. - b_.CreateMemCpy(operand_ptrs.back(), /*DstAlign=*/1, in_ptr, - /*SrcAlign=*/1, ShapeUtil::ByteSizeOf(operand_shape)); + MemCpy(operand_ptrs.back(), /*DstAlign=*/1, in_ptr, + /*SrcAlign=*/1, ShapeUtil::ByteSizeOf(operand_shape)); } llvm_ir::EmitTuple(GetIrArrayFor(crs), operand_ptrs, &b_, module_); return Status::OK(); @@ -1465,19 +1458,19 @@ IrEmitter::EmitInnerLoopForVectorizedReduction( accumulator_shard_type, "accumulator", &b_, 0)); } - llvm::Value* init_value_ssa = b_.CreateLoad(GetEmittedValueFor(init_value)); + llvm::Value* init_value_ssa = Load(GetEmittedValueFor(init_value)); for (llvm::Value* accumulator_shard : accumulator) { llvm::Value* initial_value; auto shard_type = accumulator_shard->getType()->getPointerElementType(); if (auto vector_type = llvm::dyn_cast(shard_type)) { initial_value = - b_.CreateVectorSplat(vector_type->getNumElements(), init_value_ssa); + VectorSplat(vector_type->getNumElements(), init_value_ssa); } else { initial_value = init_value_ssa; } - b_.CreateAlignedStore(initial_value, accumulator_shard, element_alignment); + AlignedStore(initial_value, accumulator_shard, element_alignment); } llvm_ir::ForLoopNest reduction_loop_nest(IrName(arg, "vectorized_inner"), @@ -1499,24 +1492,24 @@ IrEmitter::EmitInnerLoopForVectorizedReduction( } CHECK(output_index.end() == it); - llvm::Value* 
input_address = b_.CreateBitCast( + llvm::Value* input_address = BitCast( arg_array.EmitArrayElementAddress(input_index, &b_), b_.getInt8PtrTy()); for (int i = 0; i < accumulator.size(); i++) { auto input_address_typed = - b_.CreateBitCast(input_address, accumulator[i]->getType()); + BitCast(input_address, accumulator[i]->getType()); auto current_accumulator_value = - b_.CreateAlignedLoad(accumulator[i], element_alignment); - auto addend = b_.CreateAlignedLoad(input_address_typed, element_alignment); + AlignedLoad(accumulator[i], element_alignment); + auto addend = AlignedLoad(input_address_typed, element_alignment); arg_array.AnnotateLoadStoreInstructionWithMetadata(addend); auto reduced_result = reduction_generator(&b_, current_accumulator_value, addend); - b_.CreateAlignedStore(reduced_result, accumulator[i], element_alignment); + AlignedStore(reduced_result, accumulator[i], element_alignment); if (i != (accumulator.size() - 1)) { - input_address = b_.CreateConstInBoundsGEP1_32(reduced_result->getType(), - input_address_typed, 1); + input_address = ConstInBoundsGEP1_32(reduced_result->getType(), + input_address_typed, 1); } } @@ -1525,8 +1518,7 @@ IrEmitter::EmitInnerLoopForVectorizedReduction( ShardedVector result_ssa; result_ssa.reserve(accumulator.size()); for (auto accumulator_shard : accumulator) { - result_ssa.push_back( - b_.CreateAlignedLoad(accumulator_shard, element_alignment)); + result_ssa.push_back(AlignedLoad(accumulator_shard, element_alignment)); } return result_ssa; } @@ -1535,18 +1527,18 @@ void IrEmitter::EmitShardedVectorStore( llvm::Value* store_address, const std::vector& value_to_store, const int alignment, const llvm_ir::IrArray& containing_array) { for (int i = 0; i < value_to_store.size(); i++) { - auto store_address_typed = b_.CreateBitCast( - store_address, - llvm::PointerType::getUnqual(value_to_store[i]->getType())); + auto store_address_typed = + BitCast(store_address, + llvm::PointerType::getUnqual(value_to_store[i]->getType())); 
- auto store_instruction = b_.CreateAlignedStore( - value_to_store[i], store_address_typed, alignment); + auto store_instruction = + AlignedStore(value_to_store[i], store_address_typed, alignment); containing_array.AnnotateLoadStoreInstructionWithMetadata( store_instruction); if (i != (value_to_store.size() - 1)) { - store_address = b_.CreateConstInBoundsGEP1_32( - value_to_store[i]->getType(), store_address_typed, 1); + store_address = ConstInBoundsGEP1_32(value_to_store[i]->getType(), + store_address_typed, 1); } } } @@ -1711,8 +1703,8 @@ StatusOr IrEmitter::EmitTargetElementLoopBodyForReduce( llvm_ir::PrimitiveTypeToIrType(accumulator_type, module_), "accumulator", &b_, MinimumAlignmentForPrimitiveType(accumulator_type)); llvm::Value* init_value_addr = GetEmittedValueFor(init_value); - llvm::Value* load_init_value = b_.CreateLoad(init_value_addr); - b_.CreateStore(load_init_value, accumulator_addr); + llvm::Value* load_init_value = Load(init_value_addr); + Store(load_init_value, accumulator_addr); // The enclosing loops go over all the target elements. Now we have to compute // the actual target element. For this, we build a new loop nest to iterate @@ -1745,12 +1737,12 @@ StatusOr IrEmitter::EmitTargetElementLoopBodyForReduce( // Apply the reduction function to the loaded value. 
llvm::Value* input_element = arg_array.EmitReadArrayElement(input_index, &b_); llvm::Value* result = EmitThreadLocalCall( - *reduce->to_apply(), {b_.CreateLoad(accumulator_addr), input_element}, + *reduce->to_apply(), {Load(accumulator_addr), input_element}, "reduce_function"); - b_.CreateStore(result, accumulator_addr); + Store(result, accumulator_addr); SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &b_); - return b_.CreateLoad(accumulator_addr); + return Load(accumulator_addr); } Status IrEmitter::HandleReduce(HloInstruction* reduce) { @@ -1988,7 +1980,7 @@ Status IrEmitter::HandlePad(HloInstruction* pad) { [this, pad](const llvm_ir::IrArray::Index& target_index) { const HloInstruction* padding_value = pad->operand(1); llvm::Value* padding_value_addr = GetEmittedValueFor(padding_value); - return b_.CreateLoad(padding_value_addr); + return Load(padding_value_addr); })); // Create a loop to iterate over the operand elements and update the output @@ -2010,10 +2002,10 @@ Status IrEmitter::HandlePad(HloInstruction* pad) { const PaddingConfig& padding_config = pad->padding_config(); llvm_ir::IrArray::Index output_index(operand_index.GetType()); for (size_t i = 0; i < operand_index.size(); ++i) { - llvm::Value* offset = b_.CreateMul( - operand_index[i], - b_.getInt64(padding_config.dimensions(i).interior_padding() + 1)); - llvm::Value* index = b_.CreateAdd( + llvm::Value* offset = + Mul(operand_index[i], + b_.getInt64(padding_config.dimensions(i).interior_padding() + 1)); + llvm::Value* index = Add( offset, b_.getInt64(padding_config.dimensions(i).edge_padding_low())); output_index.push_back(index); } @@ -2124,10 +2116,10 @@ Status IrEmitter::HandleCustomCall(HloInstruction* custom_call) { for (size_t i = 0; i < operands.size(); ++i) { const HloInstruction* operand = operands[i]; llvm::Value* operand_as_i8ptr = - b_.CreatePointerCast(GetEmittedValueFor(operand), i8_ptr_type); + PointerCast(GetEmittedValueFor(operand), i8_ptr_type); llvm::Value* 
slot_in_operands_alloca = - b_.CreateInBoundsGEP(operands_alloca, {b_.getInt64(i)}); - b_.CreateStore(operand_as_i8ptr, slot_in_operands_alloca); + InBoundsGEP(operands_alloca, {b_.getInt64(i)}); + Store(operand_as_i8ptr, slot_in_operands_alloca); } auto* custom_call_ir_function = llvm::cast(module_->getOrInsertFunction( @@ -2139,9 +2131,9 @@ Status IrEmitter::HandleCustomCall(HloInstruction* custom_call) { TF_RETURN_IF_ERROR(EmitTargetAddressForOp(custom_call)); auto* output_address_arg = - b_.CreatePointerCast(GetEmittedValueFor(custom_call), i8_ptr_type); + PointerCast(GetEmittedValueFor(custom_call), i8_ptr_type); - b_.CreateCall(custom_call_ir_function, {output_address_arg, operands_alloca}); + Call(custom_call_ir_function, {output_address_arg, operands_alloca}); return Status::OK(); } @@ -2200,15 +2192,14 @@ Status IrEmitter::HandleWhile(HloInstruction* xla_while) { llvm::BasicBlock* header_bb = llvm::BasicBlock::Create( module_->getContext(), AsStringRef(IrName(xla_while, "header")), compute_function_->function()); - b_.CreateBr(header_bb); + Br(header_bb); b_.SetInsertPoint(header_bb); // Calls the condition function to determine whether to proceed with the // body. It must return a bool, so use the scalar call form. EmitGlobalCall(*xla_while->while_condition(), IrName(xla_while, "cond")); - llvm::Value* while_predicate = b_.CreateICmpNE( - b_.CreateLoad( - GetBufferForGlobalCallReturnValue(*xla_while->while_condition())), + llvm::Value* while_predicate = ICmpNE( + Load(GetBufferForGlobalCallReturnValue(*xla_while->while_condition())), llvm::ConstantInt::get(llvm_ir::PrimitiveTypeToIrType(PRED, module_), 0)); // Branches to the body or to the while exit depending on the condition. 
@@ -2217,7 +2208,7 @@ Status IrEmitter::HandleWhile(HloInstruction* xla_while) { compute_function_->function()); llvm::BasicBlock* exit_bb = llvm::BasicBlock::Create( module_->getContext(), AsStringRef(IrName(xla_while, "exit"))); - b_.CreateCondBr(while_predicate, body_bb, exit_bb); + CondBr(while_predicate, body_bb, exit_bb); // Calls the body function from the body block. b_.SetInsertPoint(body_bb); @@ -2226,7 +2217,7 @@ Status IrEmitter::HandleWhile(HloInstruction* xla_while) { EmitGlobalCall(*xla_while->while_body(), IrName(xla_while, "body")); // Finishes with a branch back to the header. - b_.CreateBr(header_bb); + Br(header_bb); // Adds the exit block to the function and sets the insert point there. compute_function_->function()->getBasicBlockList().push_back(exit_bb); @@ -2273,7 +2264,6 @@ StatusOr IrEmitter::EmitFastConcatenate( output_min2maj.end()); llvm::Type* i8_ptr_type = b_.getInt8PtrTy(); - llvm::Type* i8_type = b_.getInt8Ty(); TF_RETURN_IF_ERROR(EmitTargetAddressForOp(concatenate)); llvm_ir::IrArray target_array = GetIrArrayFor(concatenate); @@ -2296,9 +2286,9 @@ StatusOr IrEmitter::EmitFastConcatenate( // Contiguous subregions from each operand to the concatenate contribute to a // contiguous subregion in the target buffer starting at target_region_begin. 
llvm::Value* target_region_begin = - b_.CreateBitCast(target_array.EmitArrayElementAddress( - outer_dims_index, &b_, "target_region"), - i8_ptr_type); + BitCast(target_array.EmitArrayElementAddress(outer_dims_index, &b_, + "target_region"), + i8_ptr_type); int64 byte_offset_into_target_region = 0; int64 inner_dims_product = @@ -2312,13 +2302,12 @@ StatusOr IrEmitter::EmitFastConcatenate( for (HloInstruction* operand : operands) { const Shape& input_shape = operand->shape(); llvm_ir::IrArray source_array = GetIrArrayFor(operand); - llvm::Value* copy_source_address = b_.CreateBitCast( + llvm::Value* copy_source_address = BitCast( source_array.EmitArrayElementAddress(outer_dims_index, &b_, "src_addr"), i8_ptr_type); llvm::Value* copy_target_address = - b_.CreateGEP(i8_type, target_region_begin, - b_.getInt64(byte_offset_into_target_region)); + GEP(target_region_begin, b_.getInt64(byte_offset_into_target_region)); EmitTransferElements( copy_target_address, copy_source_address, @@ -2350,15 +2339,15 @@ void IrEmitter::EmitTransferElements(llvm::Value* target, llvm::Value* source, llvm_ir::PrimitiveTypeToIrType(primitive_type, module_)); if (element_count == 1) { - auto* load_instruction = b_.CreateAlignedLoad( - b_.CreateBitCast(source, primitive_ptr_type), element_alignment); + auto* load_instruction = + AlignedLoad(BitCast(source, primitive_ptr_type), element_alignment); source_array.AnnotateLoadStoreInstructionWithMetadata(load_instruction); - auto* store_instruction = b_.CreateAlignedStore( - load_instruction, b_.CreateBitCast(target, primitive_ptr_type), - element_alignment); + auto* store_instruction = + AlignedStore(load_instruction, BitCast(target, primitive_ptr_type), + element_alignment); target_array.AnnotateLoadStoreInstructionWithMetadata(store_instruction); } else { - auto* memcpy_instruction = b_.CreateMemCpy( + auto* memcpy_instruction = MemCpy( target, /*DstAlign=*/element_alignment, source, /*SrcAlign=*/element_alignment, element_count * 
primitive_type_size); @@ -2420,9 +2409,9 @@ Status IrEmitter::HandleConditional(HloInstruction* conditional) { // cond_result = true_computation(true_operand) // else // cond_result = false_computation(false_operand) - llvm::LoadInst* pred_value = b_.CreateLoad( - GetIrArrayFor(pred).GetBasePointer(), "load_predicate_value"); - llvm::Value* pred_cond = b_.CreateICmpNE( + llvm::LoadInst* pred_value = + Load(GetIrArrayFor(pred).GetBasePointer(), "load_predicate_value"); + llvm::Value* pred_cond = ICmpNE( pred_value, llvm::ConstantInt::get(llvm_ir::PrimitiveTypeToIrType(PRED, module_), 0), "boolean_predicate"); @@ -2509,8 +2498,8 @@ llvm::Value* IrEmitter::GetProfileCounterCommon( int64 prof_counter_idx = it->second; string counter_name = IrName("prof_counter", hlo.name()); - return b_.CreateGEP(GetProfileCountersArgument(), - b_.getInt64(prof_counter_idx), AsStringRef(counter_name)); + return GEP(GetProfileCountersArgument(), b_.getInt64(prof_counter_idx), + AsStringRef(counter_name)); } void IrEmitter::ProfilingState::UpdateProfileCounter(llvm::IRBuilder<>* b, @@ -2664,8 +2653,7 @@ llvm::Value* IrEmitter::EmitThreadLocalTempBufferPointer( llvm::Value* params = compute_function_->parameters_arg(); llvm::Value* param_address_offset = llvm_ir::EmitBufferIndexingGEP(params, param_number, &b_); - llvm::LoadInst* param_address_untyped = - b_.CreateLoad(param_address_offset); + llvm::LoadInst* param_address_untyped = Load(param_address_offset); if (!ShapeUtil::IsOpaque(target_shape)) { AttachAlignmentMetadataForLoad(param_address_untyped, target_shape); @@ -2693,8 +2681,7 @@ llvm::Value* IrEmitter::EmitThreadLocalTempBufferPointer( } return buf_it->second; }(); - return b_.CreateBitCast(tempbuf_address, - IrShapeType(target_shape)->getPointerTo()); + return BitCast(tempbuf_address, IrShapeType(target_shape)->getPointerTo()); } llvm::Value* IrEmitter::EmitGlobalTempBufferPointer( @@ -2702,7 +2689,7 @@ llvm::Value* IrEmitter::EmitGlobalTempBufferPointer( const 
BufferAllocation& allocation = *slice.allocation(); llvm::Value* tempbuf_address_ptr = llvm_ir::EmitBufferIndexingGEP( GetTempBuffersArgument(), slice.index(), &b_); - llvm::LoadInst* tempbuf_address_base = b_.CreateLoad(tempbuf_address_ptr); + llvm::LoadInst* tempbuf_address_base = Load(tempbuf_address_ptr); if (hlo_module_config_.debug_options() .xla_llvm_enable_invariant_load_metadata()) { tempbuf_address_base->setMetadata( @@ -2716,10 +2703,10 @@ llvm::Value* IrEmitter::EmitGlobalTempBufferPointer( if (slice.offset() > 0) { // Adjust the address to account for the slice offset. tempbuf_address_untyped = - b_.CreateInBoundsGEP(tempbuf_address_base, b_.getInt64(slice.offset())); + InBoundsGEP(tempbuf_address_base, b_.getInt64(slice.offset())); } - return b_.CreateBitCast(tempbuf_address_untyped, - IrShapeType(target_shape)->getPointerTo()); + return BitCast(tempbuf_address_untyped, + IrShapeType(target_shape)->getPointerTo()); } llvm::Value* IrEmitter::EmitTempBufferPointer( @@ -2805,8 +2792,8 @@ Status IrEmitter::EmitMemcpy(const HloInstruction& source, llvm::Value* destination_value = GetEmittedValueFor(&destination); int64 source_size = ByteSizeOf(source.shape()); // TODO(b/63762267): Be more aggressive about specifying alignment. 
- b_.CreateMemCpy(destination_value, /*DstAlign=*/1, source_value, - /*SrcAlign=*/1, source_size); + MemCpy(destination_value, /*DstAlign=*/1, source_value, + /*SrcAlign=*/1, source_size); return Status::OK(); } @@ -2860,7 +2847,7 @@ llvm::Value* IrEmitter::EmitThreadLocalCall( CHECK(!parameter->getType()->isPointerTy()); llvm::Value* parameter_addr = llvm_ir::EmitAllocaAtFunctionEntry( parameter->getType(), "arg_addr", &b_); - b_.CreateStore(parameter, parameter_addr); + Store(parameter, parameter_addr); parameter_addrs.push_back(parameter_addr); } @@ -2869,29 +2856,28 @@ llvm::Value* IrEmitter::EmitThreadLocalCall( absl::StrCat(name, "_retval_addr"), &b_, MinimumAlignmentForPrimitiveType(return_type)); - b_.CreateCall( - FindOrDie(emitted_functions_, &callee), - GetArrayFunctionCallArguments( - parameter_addrs, &b_, name, - /*return_value_buffer=*/return_value_buffer, - /*exec_run_options_arg=*/GetExecutableRunOptionsArgument(), - /*temp_buffers_arg=*/ - llvm::Constant::getNullValue(b_.getInt8PtrTy()->getPointerTo()), - /*profile_counters_arg=*/GetProfileCountersArgument())); + Call(FindOrDie(emitted_functions_, &callee), + GetArrayFunctionCallArguments( + parameter_addrs, &b_, name, + /*return_value_buffer=*/return_value_buffer, + /*exec_run_options_arg=*/GetExecutableRunOptionsArgument(), + /*temp_buffers_arg=*/ + llvm::Constant::getNullValue(b_.getInt8PtrTy()->getPointerTo()), + /*profile_counters_arg=*/GetProfileCountersArgument())); - return b_.CreateLoad(return_value_buffer); + return Load(return_value_buffer); } void IrEmitter::EmitGlobalCall(const HloComputation& callee, absl::string_view name) { - b_.CreateCall(FindOrDie(emitted_functions_, &callee), - GetArrayFunctionCallArguments( - /*parameter_addresses=*/{}, &b_, name, - /*return_value_buffer=*/ - llvm::Constant::getNullValue(b_.getInt8PtrTy()), - /*exec_run_options_arg=*/GetExecutableRunOptionsArgument(), - /*temp_buffers_arg=*/GetTempBuffersArgument(), - 
/*profile_counters_arg=*/GetProfileCountersArgument())); + Call(FindOrDie(emitted_functions_, &callee), + GetArrayFunctionCallArguments( + /*parameter_addresses=*/{}, &b_, name, + /*return_value_buffer=*/ + llvm::Constant::getNullValue(b_.getInt8PtrTy()), + /*exec_run_options_arg=*/GetExecutableRunOptionsArgument(), + /*temp_buffers_arg=*/GetTempBuffersArgument(), + /*profile_counters_arg=*/GetProfileCountersArgument())); } llvm::Value* IrEmitter::GetBufferForGlobalCallReturnValue( diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h index 99c080b3db..ec68710d3f 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.h +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h @@ -40,6 +40,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_module_config.h" #include "tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h" #include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h" +#include "tensorflow/compiler/xla/service/llvm_ir/ir_builder_mixin.h" #include "tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h" #include "tensorflow/compiler/xla/service/name_uniquer.h" #include "tensorflow/compiler/xla/statusor.h" @@ -55,7 +56,8 @@ namespace cpu { // This class is the top-level API for the XLA HLO --> LLVM IR compiler. It // implements the DfsHloVisitor interface and emits HLO computations as LLVM IR // functions. -class IrEmitter : public DfsHloVisitorWithDefault { +class IrEmitter : public DfsHloVisitorWithDefault, + public IrBuilderMixin { public: // Create a new LLVM IR emitter. // @@ -100,6 +102,9 @@ class IrEmitter : public DfsHloVisitorWithDefault { llvm::IRBuilder<>* b() { return &b_; } + // builder() is for IrBuilderMixin. + llvm::IRBuilder<>* builder() { return &b_; } + // Emit an LLVM global variable for every constant buffer allocation. 
Status EmitConstantGlobals(); diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc index 52faaab25c..61f6055fc9 100644 --- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc @@ -204,7 +204,7 @@ llvm::Value* EmitIntegralToFloating(llvm::Value* integer_value, } // namespace StatusOr ElementalIrEmitter::EmitUnaryOp( - const HloInstruction* op, llvm::Value* operand_value) const { + const HloInstruction* op, llvm::Value* operand_value) { if (op->opcode() == HloOpcode::kCopy) { return operand_value; } else if (ShapeUtil::ElementIsIntegral(op->operand(0)->shape()) || @@ -218,7 +218,7 @@ StatusOr ElementalIrEmitter::EmitUnaryOp( } StatusOr ElementalIrEmitter::EmitIntegerUnaryOp( - const HloInstruction* op, llvm::Value* operand_value) const { + const HloInstruction* op, llvm::Value* operand_value) { switch (op->opcode()) { case HloOpcode::kConvert: { PrimitiveType from_type = op->operand(0)->shape().element_type(); @@ -230,14 +230,14 @@ StatusOr ElementalIrEmitter::EmitIntegerUnaryOp( } if (to_type == PRED) { return b_->CreateZExt( - b_->CreateICmpNE(operand_value, llvm::ConstantInt::get( - operand_value->getType(), 0)), + ICmpNE(operand_value, + llvm::ConstantInt::get(operand_value->getType(), 0)), llvm_ir::PrimitiveTypeToIrType(PRED, module_)); } if (primitive_util::IsIntegralType(to_type)) { - return b_->CreateIntCast( - operand_value, llvm_ir::PrimitiveTypeToIrType(to_type, module_), - primitive_util::IsSignedIntegralType(from_type)); + return IntCast(operand_value, + llvm_ir::PrimitiveTypeToIrType(to_type, module_), + primitive_util::IsSignedIntegralType(from_type)); } if (primitive_util::IsFloatingPointType(to_type)) { if (to_type == BF16) { @@ -253,14 +253,12 @@ StatusOr ElementalIrEmitter::EmitIntegerUnaryOp( primitive_util::ComplexComponentType(to_type), module_); if (primitive_util::IsSignedIntegralType(from_type)) { return 
EmitComposeComplex( - op, b_->CreateSIToFP(operand_value, to_ir_component_type), - nullptr); + op, SIToFP(operand_value, to_ir_component_type), nullptr); } if (primitive_util::IsUnsignedIntegralType(from_type) || from_type == PRED) { return EmitComposeComplex( - op, b_->CreateUIToFP(operand_value, to_ir_component_type), - nullptr); + op, UIToFP(operand_value, to_ir_component_type), nullptr); } } return Unimplemented("conversion from primitive type %s to %s", @@ -276,8 +274,8 @@ StatusOr ElementalIrEmitter::EmitIntegerUnaryOp( } if (primitive_util::BitWidth(from_type) == primitive_util::BitWidth(to_type)) { - return b_->CreateBitCast( - operand_value, llvm_ir::PrimitiveTypeToIrType(to_type, module_)); + return BitCast(operand_value, + llvm_ir::PrimitiveTypeToIrType(to_type, module_)); } return InvalidArgument( "bitcast conversion from primitive type %s to %s with unequal " @@ -292,8 +290,8 @@ StatusOr ElementalIrEmitter::EmitIntegerUnaryOp( if (is_signed) { auto type = llvm_ir::PrimitiveTypeToIrType(op->shape().element_type(), module_); - auto cmp = b_->CreateICmpSGE(operand_value, GetZero(type)); - return Select(cmp, operand_value, b_->CreateNeg(operand_value)); + auto cmp = ICmpSGE(operand_value, GetZero(type)); + return Select(cmp, operand_value, Neg(operand_value)); } else { return operand_value; } @@ -309,23 +307,22 @@ StatusOr ElementalIrEmitter::EmitIntegerUnaryOp( << op->shape().element_type(); auto type = llvm_ir::PrimitiveTypeToIrType(op->shape().element_type(), module_); - auto cmp = b_->CreateICmpEQ(operand_value, GetZero(type)); - auto ashr = b_->CreateAShr(operand_value, type->getIntegerBitWidth() - 1); - return Select(cmp, GetZero(type), b_->CreateOr(ashr, 1)); + auto cmp = ICmpEQ(operand_value, GetZero(type)); + auto ashr = AShr(operand_value, type->getIntegerBitWidth() - 1); + return Select(cmp, GetZero(type), Or(ashr, 1)); } case HloOpcode::kNegate: - return b_->CreateNeg(operand_value); + return Neg(operand_value); case HloOpcode::kNot: { auto 
type = op->shape().element_type(); if (type == PRED) { // It is not sufficient to just call CreateNot() here because a PRED // is represented as an i8 and the truth value is stored only in the // bottom bit. - return b_->CreateZExt( - b_->CreateNot(b_->CreateTrunc(operand_value, b_->getInt1Ty())), - llvm_ir::PrimitiveTypeToIrType(PRED, module_)); + return b_->CreateZExt(Not(Trunc(operand_value, b_->getInt1Ty())), + llvm_ir::PrimitiveTypeToIrType(PRED, module_)); } else if (primitive_util::IsIntegralType(type)) { - return b_->CreateNot(operand_value); + return Not(operand_value); } return Unimplemented("unary op Not is not defined for type '%d'", type); } @@ -336,7 +333,7 @@ StatusOr ElementalIrEmitter::EmitIntegerUnaryOp( } StatusOr ElementalIrEmitter::EmitFloatUnaryOp( - const HloInstruction* op, llvm::Value* operand_value) const { + const HloInstruction* op, llvm::Value* operand_value) { switch (op->opcode()) { case HloOpcode::kConvert: { PrimitiveType from_type = op->operand(0)->shape().element_type(); @@ -353,8 +350,8 @@ StatusOr ElementalIrEmitter::EmitFloatUnaryOp( } return EmitComposeComplex( op, - b_->CreateFPCast(operand_value, llvm_ir::PrimitiveTypeToIrType( - to_component_type, module_)), + FPCast(operand_value, + llvm_ir::PrimitiveTypeToIrType(to_component_type, module_)), nullptr); } if (from_type == BF16) { @@ -370,22 +367,21 @@ StatusOr ElementalIrEmitter::EmitFloatUnaryOp( } if (to_type == PRED) { return b_->CreateZExt( - b_->CreateFCmpUNE( - operand_value, - llvm::ConstantFP::get(operand_value->getType(), 0.0)), + FCmpUNE(operand_value, + llvm::ConstantFP::get(operand_value->getType(), 0.0)), llvm_ir::PrimitiveTypeToIrType(PRED, module_)); } if (primitive_util::IsFloatingPointType(to_type)) { - return b_->CreateFPCast( - operand_value, llvm_ir::PrimitiveTypeToIrType(to_type, module_)); + return FPCast(operand_value, + llvm_ir::PrimitiveTypeToIrType(to_type, module_)); } if (primitive_util::IsSignedIntegralType(to_type)) { - return b_->CreateFPToSI( 
- operand_value, llvm_ir::PrimitiveTypeToIrType(to_type, module_)); + return FPToSI(operand_value, + llvm_ir::PrimitiveTypeToIrType(to_type, module_)); } if (primitive_util::IsUnsignedIntegralType(to_type)) { - return b_->CreateFPToUI( - operand_value, llvm_ir::PrimitiveTypeToIrType(to_type, module_)); + return FPToUI(operand_value, + llvm_ir::PrimitiveTypeToIrType(to_type, module_)); } return Unimplemented("unhandled conversion operation: %s => %s", PrimitiveType_Name(from_type), @@ -400,8 +396,8 @@ StatusOr ElementalIrEmitter::EmitFloatUnaryOp( } if (primitive_util::BitWidth(from_type) == primitive_util::BitWidth(to_type)) { - return b_->CreateBitCast( - operand_value, llvm_ir::PrimitiveTypeToIrType(to_type, module_)); + return BitCast(operand_value, + llvm_ir::PrimitiveTypeToIrType(to_type, module_)); } return InvalidArgument( "bitcast conversion from primitive type %s to %s with unequal " @@ -444,8 +440,8 @@ StatusOr ElementalIrEmitter::EmitFloatUnaryOp( // TODO(b/32151903): Ensure consistent sign behavior for -0.0. 
auto type = operand_value->getType(); auto zero = llvm::ConstantFP::get(type, 0.0); - auto oeq = b_->CreateFCmpOEQ(operand_value, zero); - auto olt = b_->CreateFCmpOLT(operand_value, zero); + auto oeq = FCmpOEQ(operand_value, zero); + auto olt = FCmpOLT(operand_value, zero); return Select(oeq, zero, Select(olt, llvm::ConstantFP::get(type, -1.0), llvm::ConstantFP::get(type, 1.0))); @@ -457,12 +453,12 @@ StatusOr ElementalIrEmitter::EmitFloatUnaryOp( auto abs_value = llvm_ir::EmitCallToIntrinsic( llvm::Intrinsic::fabs, {operand_value}, {type}, b_); auto infinity = llvm::ConstantFP::getInfinity(type); - auto not_infinite = b_->CreateFCmpONE(abs_value, infinity); + auto not_infinite = FCmpONE(abs_value, infinity); return b_->CreateZExt(not_infinite, llvm_ir::PrimitiveTypeToIrType(PRED, module_)); } case HloOpcode::kNegate: - return b_->CreateFNeg(operand_value); + return FNeg(operand_value); case HloOpcode::kReal: return operand_value; case HloOpcode::kImag: @@ -474,7 +470,7 @@ StatusOr ElementalIrEmitter::EmitFloatUnaryOp( } StatusOr ElementalIrEmitter::EmitComplexUnaryOp( - const HloInstruction* op, llvm::Value* operand_value) const { + const HloInstruction* op, llvm::Value* operand_value) { PrimitiveType input_type = op->operand(0)->shape().element_type(); PrimitiveType component_type = primitive_util::IsComplexType(input_type) @@ -486,12 +482,11 @@ StatusOr ElementalIrEmitter::EmitComplexUnaryOp( auto a = EmitExtractReal(operand_value); auto b = EmitExtractImag(operand_value); llvm::Type* llvm_ty = a->getType(); - auto sum_sq = b_->CreateFAdd(b_->CreateFMul(a, a), b_->CreateFMul(b, b)); + auto sum_sq = FAdd(FMul(a, a), FMul(b, b)); TF_ASSIGN_OR_RETURN(auto log_sum_sq, EmitLog(component_type, sum_sq)); TF_ASSIGN_OR_RETURN(auto angle, EmitAtan2(component_type, b, a)); auto one_half = llvm::ConstantFP::get(llvm_ty, 0.5); - return EmitComposeComplex(op, b_->CreateFMul(one_half, log_sum_sq), - angle); + return EmitComposeComplex(op, FMul(one_half, log_sum_sq), angle); } 
case HloOpcode::kLog1p: { // log1p(a+bi) = .5*log((a+1)^2+b^2) + i*atan2(b, a + 1) @@ -499,14 +494,12 @@ StatusOr ElementalIrEmitter::EmitComplexUnaryOp( auto b = EmitExtractImag(operand_value); llvm::Type* llvm_ty = a->getType(); auto one = llvm::ConstantFP::get(llvm_ty, 1.0); - auto a_plus_one = b_->CreateFAdd(a, one); - auto sum_sq = b_->CreateFAdd(b_->CreateFMul(a_plus_one, a_plus_one), - b_->CreateFMul(b, b)); + auto a_plus_one = FAdd(a, one); + auto sum_sq = FAdd(FMul(a_plus_one, a_plus_one), FMul(b, b)); TF_ASSIGN_OR_RETURN(auto log_sum_sq, EmitLog(component_type, sum_sq)); TF_ASSIGN_OR_RETURN(auto angle, EmitAtan2(component_type, b, a_plus_one)); auto one_half = llvm::ConstantFP::get(llvm_ty, 0.5); - return EmitComposeComplex(op, b_->CreateFMul(one_half, log_sum_sq), - angle); + return EmitComposeComplex(op, FMul(one_half, log_sum_sq), angle); } case HloOpcode::kConvert: { PrimitiveType from_type = op->operand(0)->shape().element_type(); @@ -520,11 +513,9 @@ StatusOr ElementalIrEmitter::EmitComplexUnaryOp( primitive_util::ComplexComponentType(to_type); auto to_ir_component_type = llvm_ir::PrimitiveTypeToIrType(to_component_type, module_); - return EmitComposeComplex(op, - b_->CreateFPCast(EmitExtractReal(operand_value), - to_ir_component_type), - b_->CreateFPCast(EmitExtractImag(operand_value), - to_ir_component_type)); + return EmitComposeComplex( + op, FPCast(EmitExtractReal(operand_value), to_ir_component_type), + FPCast(EmitExtractImag(operand_value), to_ir_component_type)); } case HloOpcode::kExp: { // e^(a+bi) = e^a*(cos(b)+sin(b)i) @@ -534,8 +525,7 @@ StatusOr ElementalIrEmitter::EmitComplexUnaryOp( auto cos_b, EmitCos(component_type, EmitExtractImag(operand_value))); TF_ASSIGN_OR_RETURN( auto sin_b, EmitSin(component_type, EmitExtractImag(operand_value))); - return EmitComposeComplex(op, b_->CreateFMul(exp_a, cos_b), - b_->CreateFMul(exp_a, sin_b)); + return EmitComposeComplex(op, FMul(exp_a, cos_b), FMul(exp_a, sin_b)); } case HloOpcode::kExpm1: { 
// e^(a+bi)-1 = (e^a*cos(b)-1)+e^a*sin(b)i @@ -546,8 +536,8 @@ StatusOr ElementalIrEmitter::EmitComplexUnaryOp( TF_ASSIGN_OR_RETURN( auto sin_b, EmitSin(component_type, EmitExtractImag(operand_value))); auto one = llvm::ConstantFP::get(exp_a->getType(), 1.0); - auto real_result = b_->CreateFSub(b_->CreateFMul(exp_a, cos_b), one); - auto imag_result = b_->CreateFMul(exp_a, sin_b); + auto real_result = FSub(FMul(exp_a, cos_b), one); + auto imag_result = FMul(exp_a, sin_b); return EmitComposeComplex(op, real_result, imag_result); } case HloOpcode::kCos: { @@ -562,14 +552,13 @@ StatusOr ElementalIrEmitter::EmitComplexUnaryOp( auto b = EmitExtractImag(operand_value); auto type = a->getType(); TF_ASSIGN_OR_RETURN(auto exp_b, EmitExp(component_type, b)); - auto half_exp_b = b_->CreateFMul(llvm::ConstantFP::get(type, 0.5), exp_b); - auto half_exp_neg_b = - b_->CreateFDiv(llvm::ConstantFP::get(type, 0.5), exp_b); + auto half_exp_b = FMul(llvm::ConstantFP::get(type, 0.5), exp_b); + auto half_exp_neg_b = FDiv(llvm::ConstantFP::get(type, 0.5), exp_b); TF_ASSIGN_OR_RETURN(auto cos_a, EmitCos(component_type, a)); TF_ASSIGN_OR_RETURN(auto sin_a, EmitSin(component_type, a)); - return EmitComposeComplex( - op, b_->CreateFMul(cos_a, b_->CreateFAdd(half_exp_neg_b, half_exp_b)), - b_->CreateFMul(sin_a, b_->CreateFSub(half_exp_neg_b, half_exp_b))); + return EmitComposeComplex(op, + FMul(cos_a, FAdd(half_exp_neg_b, half_exp_b)), + FMul(sin_a, FSub(half_exp_neg_b, half_exp_b))); } case HloOpcode::kSin: { // sin(z) = .5i(e^(-iz) - e^(iz)) @@ -585,14 +574,13 @@ StatusOr ElementalIrEmitter::EmitComplexUnaryOp( auto b = EmitExtractImag(operand_value); auto type = a->getType(); TF_ASSIGN_OR_RETURN(auto exp_b, EmitExp(component_type, b)); - auto half_exp_b = b_->CreateFMul(llvm::ConstantFP::get(type, 0.5), exp_b); - auto half_exp_neg_b = - b_->CreateFDiv(llvm::ConstantFP::get(type, 0.5), exp_b); + auto half_exp_b = FMul(llvm::ConstantFP::get(type, 0.5), exp_b); + auto half_exp_neg_b = 
FDiv(llvm::ConstantFP::get(type, 0.5), exp_b); TF_ASSIGN_OR_RETURN(auto cos_a, EmitCos(component_type, a)); TF_ASSIGN_OR_RETURN(auto sin_a, EmitSin(component_type, a)); - return EmitComposeComplex( - op, b_->CreateFMul(sin_a, b_->CreateFAdd(half_exp_b, half_exp_neg_b)), - b_->CreateFMul(cos_a, b_->CreateFSub(half_exp_b, half_exp_neg_b))); + return EmitComposeComplex(op, + FMul(sin_a, FAdd(half_exp_b, half_exp_neg_b)), + FMul(cos_a, FSub(half_exp_b, half_exp_neg_b))); } case HloOpcode::kTanh: { /* @@ -620,61 +608,51 @@ StatusOr ElementalIrEmitter::EmitComplexUnaryOp( TF_ASSIGN_OR_RETURN(auto exp_a, EmitExp(component_type, a)); TF_ASSIGN_OR_RETURN(auto cos_b, EmitCos(component_type, b)); TF_ASSIGN_OR_RETURN(auto sin_b, EmitSin(component_type, b)); - auto exp_neg_a = - b_->CreateFDiv(llvm::ConstantFP::get(exp_a->getType(), 1), exp_a); - auto exp_2a_minus_exp_neg_2a = b_->CreateFSub( - b_->CreateFMul(exp_a, exp_a), b_->CreateFMul(exp_neg_a, exp_neg_a)); - auto cos_b_sq = b_->CreateFMul(cos_b, cos_b); - auto sin_b_sq = b_->CreateFMul(sin_b, sin_b); - auto real_num = - b_->CreateFAdd(b_->CreateFMul(cos_b_sq, exp_2a_minus_exp_neg_2a), - b_->CreateFMul(sin_b_sq, exp_2a_minus_exp_neg_2a)); - auto cos_b_sin_b = b_->CreateFMul(cos_b, sin_b); - auto exp_a_plus_exp_neg_a = b_->CreateFAdd(exp_a, exp_neg_a); + auto exp_neg_a = FDiv(llvm::ConstantFP::get(exp_a->getType(), 1), exp_a); + auto exp_2a_minus_exp_neg_2a = + FSub(FMul(exp_a, exp_a), FMul(exp_neg_a, exp_neg_a)); + auto cos_b_sq = FMul(cos_b, cos_b); + auto sin_b_sq = FMul(sin_b, sin_b); + auto real_num = FAdd(FMul(cos_b_sq, exp_2a_minus_exp_neg_2a), + FMul(sin_b_sq, exp_2a_minus_exp_neg_2a)); + auto cos_b_sin_b = FMul(cos_b, sin_b); + auto exp_a_plus_exp_neg_a = FAdd(exp_a, exp_neg_a); auto exp_a_plus_exp_neg_a_sq = - b_->CreateFMul(exp_a_plus_exp_neg_a, exp_a_plus_exp_neg_a); - auto exp_a_minus_exp_neg_a = b_->CreateFSub(exp_a, exp_neg_a); + FMul(exp_a_plus_exp_neg_a, exp_a_plus_exp_neg_a); + auto exp_a_minus_exp_neg_a = 
FSub(exp_a, exp_neg_a); auto exp_a_minus_exp_neg_a_sq = - b_->CreateFMul(exp_a_minus_exp_neg_a, exp_a_minus_exp_neg_a); - auto imag_num = b_->CreateFMul( - cos_b_sin_b, - b_->CreateFSub(exp_a_plus_exp_neg_a_sq, exp_a_minus_exp_neg_a_sq)); - auto denom = - b_->CreateFAdd(b_->CreateFMul(cos_b_sq, exp_a_plus_exp_neg_a_sq), - b_->CreateFMul(sin_b_sq, exp_a_minus_exp_neg_a_sq)); - return EmitComposeComplex(op, b_->CreateFDiv(real_num, denom), - b_->CreateFDiv(imag_num, denom)); + FMul(exp_a_minus_exp_neg_a, exp_a_minus_exp_neg_a); + auto imag_num = FMul( + cos_b_sin_b, FSub(exp_a_plus_exp_neg_a_sq, exp_a_minus_exp_neg_a_sq)); + auto denom = FAdd(FMul(cos_b_sq, exp_a_plus_exp_neg_a_sq), + FMul(sin_b_sq, exp_a_minus_exp_neg_a_sq)); + return EmitComposeComplex(op, FDiv(real_num, denom), + FDiv(imag_num, denom)); } case HloOpcode::kAbs: { - auto sum_sq = - b_->CreateFAdd(b_->CreateFMul(EmitExtractReal(operand_value), - EmitExtractReal(operand_value)), - b_->CreateFMul(EmitExtractImag(operand_value), - EmitExtractImag(operand_value))); + auto sum_sq = FAdd( + FMul(EmitExtractReal(operand_value), EmitExtractReal(operand_value)), + FMul(EmitExtractImag(operand_value), EmitExtractImag(operand_value))); return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::sqrt, {sum_sq}, {sum_sq->getType()}, b_); } case HloOpcode::kSign: { // Sign(c) = c / |c| - auto sum_sq = - b_->CreateFAdd(b_->CreateFMul(EmitExtractReal(operand_value), - EmitExtractReal(operand_value)), - b_->CreateFMul(EmitExtractImag(operand_value), - EmitExtractImag(operand_value))); + auto sum_sq = FAdd( + FMul(EmitExtractReal(operand_value), EmitExtractReal(operand_value)), + FMul(EmitExtractImag(operand_value), EmitExtractImag(operand_value))); auto cplx_abs = llvm_ir::EmitCallToIntrinsic( llvm::Intrinsic::sqrt, {sum_sq}, {sum_sq->getType()}, b_); auto type = cplx_abs->getType(); auto zero = llvm::ConstantFP::get(type, 0.0); - auto oeq = b_->CreateFCmpOEQ(cplx_abs, zero); + auto oeq = FCmpOEQ(cplx_abs, zero); return 
Select( oeq, EmitComposeComplex(op, zero, zero), - EmitComposeComplex( - op, b_->CreateFDiv(EmitExtractReal(operand_value), cplx_abs), - b_->CreateFDiv(EmitExtractImag(operand_value), cplx_abs))); + EmitComposeComplex(op, FDiv(EmitExtractReal(operand_value), cplx_abs), + FDiv(EmitExtractImag(operand_value), cplx_abs))); } case HloOpcode::kNegate: - return EmitComposeComplex(op, - b_->CreateFNeg(EmitExtractReal(operand_value)), - b_->CreateFNeg(EmitExtractImag(operand_value))); + return EmitComposeComplex(op, FNeg(EmitExtractReal(operand_value)), + FNeg(EmitExtractImag(operand_value))); case HloOpcode::kReal: return EmitExtractReal(operand_value); case HloOpcode::kImag: @@ -686,8 +664,7 @@ StatusOr ElementalIrEmitter::EmitComplexUnaryOp( } StatusOr ElementalIrEmitter::EmitBinaryOp( - const HloInstruction* op, llvm::Value* lhs_value, - llvm::Value* rhs_value) const { + const HloInstruction* op, llvm::Value* lhs_value, llvm::Value* rhs_value) { PrimitiveType operand_type = op->operand(0)->shape().element_type(); if (ShapeUtil::ElementIsIntegral(op->operand(0)->shape()) || operand_type == PRED) { @@ -702,21 +679,20 @@ StatusOr ElementalIrEmitter::EmitBinaryOp( } StatusOr ElementalIrEmitter::EmitFloatBinaryOp( - const HloInstruction* op, llvm::Value* lhs_value, - llvm::Value* rhs_value) const { + const HloInstruction* op, llvm::Value* lhs_value, llvm::Value* rhs_value) { switch (op->opcode()) { case HloOpcode::kComplex: return EmitComposeComplex(op, lhs_value, rhs_value); case HloOpcode::kAdd: - return b_->CreateFAdd(lhs_value, rhs_value); + return FAdd(lhs_value, rhs_value); case HloOpcode::kSubtract: - return b_->CreateFSub(lhs_value, rhs_value); + return FSub(lhs_value, rhs_value); case HloOpcode::kMultiply: - return b_->CreateFMul(lhs_value, rhs_value); + return FMul(lhs_value, rhs_value); case HloOpcode::kDivide: - return b_->CreateFDiv(lhs_value, rhs_value); + return FDiv(lhs_value, rhs_value); case HloOpcode::kRemainder: - return b_->CreateFRem(lhs_value, 
rhs_value); + return FRem(lhs_value, rhs_value); // LLVM comparisons can be "unordered" (U) or "ordered" (O) -- ordered // comparisons always return false when one of the operands is NaN, whereas // unordered comparisons return true. @@ -758,61 +734,47 @@ StatusOr ElementalIrEmitter::EmitFloatBinaryOp( } StatusOr ElementalIrEmitter::EmitComplexBinaryOp( - const HloInstruction* op, llvm::Value* lhs_value, - llvm::Value* rhs_value) const { + const HloInstruction* op, llvm::Value* lhs_value, llvm::Value* rhs_value) { switch (op->opcode()) { case HloOpcode::kAdd: - return EmitComposeComplex(op, - b_->CreateFAdd(EmitExtractReal(lhs_value), - EmitExtractReal(rhs_value)), - b_->CreateFAdd(EmitExtractImag(lhs_value), - EmitExtractImag(rhs_value))); + return EmitComposeComplex( + op, FAdd(EmitExtractReal(lhs_value), EmitExtractReal(rhs_value)), + FAdd(EmitExtractImag(lhs_value), EmitExtractImag(rhs_value))); case HloOpcode::kSubtract: - return EmitComposeComplex(op, - b_->CreateFSub(EmitExtractReal(lhs_value), - EmitExtractReal(rhs_value)), - b_->CreateFSub(EmitExtractImag(lhs_value), - EmitExtractImag(rhs_value))); + return EmitComposeComplex( + op, FSub(EmitExtractReal(lhs_value), EmitExtractReal(rhs_value)), + FSub(EmitExtractImag(lhs_value), EmitExtractImag(rhs_value))); case HloOpcode::kMultiply: return EmitComposeComplex( op, - b_->CreateFSub(b_->CreateFMul(EmitExtractReal(lhs_value), - EmitExtractReal(rhs_value)), - b_->CreateFMul(EmitExtractImag(lhs_value), - EmitExtractImag(rhs_value))), - b_->CreateFAdd(b_->CreateFMul(EmitExtractReal(lhs_value), - EmitExtractImag(rhs_value)), - b_->CreateFMul(EmitExtractImag(lhs_value), - EmitExtractReal(rhs_value)))); + FSub(FMul(EmitExtractReal(lhs_value), EmitExtractReal(rhs_value)), + FMul(EmitExtractImag(lhs_value), EmitExtractImag(rhs_value))), + FAdd(FMul(EmitExtractReal(lhs_value), EmitExtractImag(rhs_value)), + FMul(EmitExtractImag(lhs_value), EmitExtractReal(rhs_value)))); case HloOpcode::kDivide: { // (a+bi) / (c+di) = 
((a+bi)(c-di)) / ((c+di)(c-di)) // = ((ac + bd) + (bc - ad)i) / (c^2 + d^2) auto rhs_sum_sq = - b_->CreateFAdd(b_->CreateFMul(EmitExtractReal(rhs_value), - EmitExtractReal(rhs_value)), - b_->CreateFMul(EmitExtractImag(rhs_value), - EmitExtractImag(rhs_value))); + FAdd(FMul(EmitExtractReal(rhs_value), EmitExtractReal(rhs_value)), + FMul(EmitExtractImag(rhs_value), EmitExtractImag(rhs_value))); auto type = rhs_sum_sq->getType(); auto zero = llvm::ConstantFP::get(type, 0.0); - auto oeq = b_->CreateFCmpOEQ(rhs_sum_sq, zero); - auto real_inf_or_nan = b_->CreateFDiv(EmitExtractReal(lhs_value), zero); - auto imag_inf_or_nan = b_->CreateFDiv(EmitExtractImag(lhs_value), zero); + auto oeq = FCmpOEQ(rhs_sum_sq, zero); + auto real_inf_or_nan = FDiv(EmitExtractReal(lhs_value), zero); + auto imag_inf_or_nan = FDiv(EmitExtractImag(lhs_value), zero); return Select( oeq, EmitComposeComplex(op, real_inf_or_nan, imag_inf_or_nan), - EmitComposeComplex( - op, - b_->CreateFDiv( - b_->CreateFAdd(b_->CreateFMul(EmitExtractReal(lhs_value), - EmitExtractReal(rhs_value)), - b_->CreateFMul(EmitExtractImag(lhs_value), - EmitExtractImag(rhs_value))), - rhs_sum_sq), - b_->CreateFDiv( - b_->CreateFSub(b_->CreateFMul(EmitExtractImag(lhs_value), - EmitExtractReal(rhs_value)), - b_->CreateFMul(EmitExtractReal(lhs_value), - EmitExtractImag(rhs_value))), - rhs_sum_sq))); + EmitComposeComplex(op, + FDiv(FAdd(FMul(EmitExtractReal(lhs_value), + EmitExtractReal(rhs_value)), + FMul(EmitExtractImag(lhs_value), + EmitExtractImag(rhs_value))), + rhs_sum_sq), + FDiv(FSub(FMul(EmitExtractImag(lhs_value), + EmitExtractReal(rhs_value)), + FMul(EmitExtractReal(lhs_value), + EmitExtractImag(rhs_value))), + rhs_sum_sq))); } // LLVM comparisons can be "unordered" (U) or "ordered" (O) -- ordered // comparisons always return false when one of the operands is NaN, whereas @@ -822,21 +784,19 @@ StatusOr ElementalIrEmitter::EmitComplexBinaryOp( // unordered comparison. 
This makes x != y equivalent to !(x == y), and // matches C++'s semantics. case HloOpcode::kEq: - return b_->CreateAnd( - llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OEQ, - EmitExtractReal(lhs_value), - EmitExtractReal(rhs_value), b_), - llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OEQ, - EmitExtractImag(lhs_value), - EmitExtractImag(rhs_value), b_)); + return And(llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OEQ, + EmitExtractReal(lhs_value), + EmitExtractReal(rhs_value), b_), + llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OEQ, + EmitExtractImag(lhs_value), + EmitExtractImag(rhs_value), b_)); case HloOpcode::kNe: - return b_->CreateOr( - llvm_ir::EmitComparison(llvm::CmpInst::FCMP_UNE, - EmitExtractReal(lhs_value), - EmitExtractReal(rhs_value), b_), - llvm_ir::EmitComparison(llvm::CmpInst::FCMP_UNE, - EmitExtractImag(lhs_value), - EmitExtractImag(rhs_value), b_)); + return Or(llvm_ir::EmitComparison(llvm::CmpInst::FCMP_UNE, + EmitExtractReal(lhs_value), + EmitExtractReal(rhs_value), b_), + llvm_ir::EmitComparison(llvm::CmpInst::FCMP_UNE, + EmitExtractImag(lhs_value), + EmitExtractImag(rhs_value), b_)); case HloOpcode::kPower: { // (a+bi)^(c+di) = @@ -848,26 +808,24 @@ StatusOr ElementalIrEmitter::EmitComplexBinaryOp( auto b = EmitExtractImag(lhs_value); auto c = EmitExtractReal(rhs_value); auto d = EmitExtractImag(rhs_value); - auto aa_p_bb = b_->CreateFAdd(b_->CreateFMul(a, a), b_->CreateFMul(b, b)); + auto aa_p_bb = FAdd(FMul(a, a), FMul(b, b)); auto one_half = llvm::ConstantFP::get(a->getType(), 0.5); - auto half_c = b_->CreateFMul(one_half, c); + auto half_c = FMul(one_half, c); TF_ASSIGN_OR_RETURN(auto aa_p_bb_to_half_c, EmitPow(component_type, aa_p_bb, half_c)); - auto neg_d = b_->CreateFNeg(d); + auto neg_d = FNeg(d); TF_ASSIGN_OR_RETURN(auto arg_lhs, EmitAtan2(component_type, b, a)); - auto neg_d_arg_lhs = b_->CreateFMul(neg_d, arg_lhs); + auto neg_d_arg_lhs = FMul(neg_d, arg_lhs); TF_ASSIGN_OR_RETURN(auto e_to_neg_d_arg_lhs, EmitExp(component_type, 
neg_d_arg_lhs)); - auto coeff = b_->CreateFMul(aa_p_bb_to_half_c, e_to_neg_d_arg_lhs); + auto coeff = FMul(aa_p_bb_to_half_c, e_to_neg_d_arg_lhs); TF_ASSIGN_OR_RETURN(auto ln_aa_p_bb, EmitLog(component_type, aa_p_bb)); - auto half_d = b_->CreateFMul(one_half, d); - auto q = b_->CreateFAdd(b_->CreateFMul(c, arg_lhs), - b_->CreateFMul(half_d, ln_aa_p_bb)); + auto half_d = FMul(one_half, d); + auto q = FAdd(FMul(c, arg_lhs), FMul(half_d, ln_aa_p_bb)); TF_ASSIGN_OR_RETURN(auto cos_q, EmitCos(component_type, q)); TF_ASSIGN_OR_RETURN(auto sin_q, EmitSin(component_type, q)); - return EmitComposeComplex(op, b_->CreateFMul(coeff, cos_q), - b_->CreateFMul(coeff, sin_q)); + return EmitComposeComplex(op, FMul(coeff, cos_q), FMul(coeff, sin_q)); } default: return Unimplemented("binary complex op '%s'", @@ -876,17 +834,17 @@ StatusOr ElementalIrEmitter::EmitComplexBinaryOp( } llvm::Value* ElementalIrEmitter::EmitFloatMax(llvm::Value* lhs_value, - llvm::Value* rhs_value) const { + llvm::Value* rhs_value) { return llvm_ir::EmitFloatMax(lhs_value, rhs_value, b_); } llvm::Value* ElementalIrEmitter::EmitFloatMin(llvm::Value* lhs_value, - llvm::Value* rhs_value) const { + llvm::Value* rhs_value) { return llvm_ir::EmitFloatMin(lhs_value, rhs_value, b_); } StatusOr ElementalIrEmitter::EmitErfInv(PrimitiveType prim_type, - llvm::Value* x) const { + llvm::Value* x) { if (prim_type != F32) { // TODO(b/34339814): Implement inverse erf for F64. 
return Unimplemented( @@ -901,7 +859,7 @@ StatusOr ElementalIrEmitter::EmitErfInv(PrimitiveType prim_type, llvm::Value* p = getFloat(coefficients.front()); coefficients.remove_prefix(1); for (float coefficient : coefficients) { - p = b_->CreateFAdd(b_->CreateFMul(p, w), getFloat(coefficient)); + p = FAdd(FMul(p, w), getFloat(coefficient)); } return p; }; @@ -921,25 +879,24 @@ StatusOr ElementalIrEmitter::EmitErfInv(PrimitiveType prim_type, llvm::Function* logf_fn = llvm::Intrinsic::getDeclaration( module_, llvm::Intrinsic::log, {b_->getFloatTy()}); - llvm::Value* w = b_->CreateFNeg(b_->CreateCall( - logf_fn, {b_->CreateFMul(b_->CreateFSub(getFloat(1.0f), x), - b_->CreateFAdd(getFloat(1.0f), x))})); + llvm::Value* w = FNeg( + Call(logf_fn, {FMul(FSub(getFloat(1.0f), x), FAdd(getFloat(1.0f), x))})); llvm::Value* p_addr = llvm_ir::EmitAllocaAtFunctionEntry(b_->getFloatTy(), "p.addr", b_); llvm_ir::LlvmIfData if_data = llvm_ir::EmitIfThenElse( - b_->CreateFCmpOLT(w, getFloat(5.0f)), "w_less_than_five", b_); + FCmpOLT(w, getFloat(5.0f)), "w_less_than_five", b_); // Handle true BB. SetToFirstInsertPoint(if_data.true_block, b_); { - llvm::Value* lw = b_->CreateFSub(w, getFloat(2.5f)); + llvm::Value* lw = FSub(w, getFloat(2.5f)); tensorflow::gtl::ArraySlice lq{ 2.81022636e-08f, 3.43273939e-07f, -3.5233877e-06f, -4.39150654e-06f, 0.00021858087f, -0.00125372503f, -0.00417768164f, 0.246640727f, 1.50140941f}; llvm::Value* p = multiply_add(lq, lw); - b_->CreateStore(p, p_addr); + Store(p, p_addr); } // Handle false BB. 
@@ -948,76 +905,73 @@ StatusOr ElementalIrEmitter::EmitErfInv(PrimitiveType prim_type, llvm::Function* sqrtf_fn = llvm::Intrinsic::getDeclaration( module_, llvm::Intrinsic::sqrt, {b_->getFloatTy()}); - llvm::Value* gw = - b_->CreateFSub(b_->CreateCall(sqrtf_fn, {w}), getFloat(3.0f)); + llvm::Value* gw = FSub(Call(sqrtf_fn, w), getFloat(3.0f)); tensorflow::gtl::ArraySlice gq{ -0.000200214257f, 0.000100950558f, 0.00134934322f, -0.00367342844f, 0.00573950773f, -0.0076224613f, 0.00943887047f, 1.00167406f, 2.83297682f}; llvm::Value* p = multiply_add(gq, gw); - b_->CreateStore(p, p_addr); + Store(p, p_addr); } SetToFirstInsertPoint(if_data.after_block, b_); - llvm::Value* p = b_->CreateLoad(p_addr); - return b_->CreateFMul(p, x); + llvm::Value* p = Load(p_addr); + return FMul(p, x); } -StatusOr ElementalIrEmitter::EmitErfcInv( - PrimitiveType prim_type, llvm::Value* value) const { +StatusOr ElementalIrEmitter::EmitErfcInv(PrimitiveType prim_type, + llvm::Value* value) { // Compute erfcinv(value) by calculating erfinv(1.0 - value). auto type = llvm_ir::PrimitiveTypeToIrType(prim_type, module_); auto one = llvm::ConstantFP::get(type, 1.0); - return EmitErfInv(prim_type, b_->CreateFSub(one, value)); + return EmitErfInv(prim_type, FSub(one, value)); } StatusOr ElementalIrEmitter::EmitLog(PrimitiveType prim_type, - llvm::Value* value) const { + llvm::Value* value) { return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::log, {value}, {value->getType()}, b_); } StatusOr ElementalIrEmitter::EmitLog1p(PrimitiveType prim_type, - llvm::Value* value) const { + llvm::Value* value) { auto x = value; auto type = llvm_ir::PrimitiveTypeToIrType(prim_type, module_); auto one = llvm::ConstantFP::get(type, 1.0); auto negative_half = llvm::ConstantFP::get(type, -0.5); // When x is large, the naive evaluation of ln(x + 1) is more // accurate than the Taylor series. 
- TF_ASSIGN_OR_RETURN(auto for_large_x, - EmitLog(prim_type, b_->CreateFAdd(x, one))); + TF_ASSIGN_OR_RETURN(auto for_large_x, EmitLog(prim_type, FAdd(x, one))); // The Taylor series for ln(x+1) is x - x^2/2 - x^3/3 + …. - auto for_small_x = - b_->CreateFMul(b_->CreateFAdd(b_->CreateFMul(negative_half, x), one), x); + auto for_small_x = FMul(FAdd(FMul(negative_half, x), one), x); const auto kAntilogarithmIsSmallThreshold = 1e-4; auto abs_x = llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::fabs, {value}, {type}, b_); - auto x_is_small = b_->CreateFCmpOLT( + auto x_is_small = FCmpOLT( abs_x, llvm::ConstantFP::get(type, kAntilogarithmIsSmallThreshold)); return Select(x_is_small, for_small_x, for_large_x); } StatusOr ElementalIrEmitter::EmitSin(PrimitiveType prim_type, - llvm::Value* value) const { + llvm::Value* value) { return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::sin, {value}, {value->getType()}, b_); } StatusOr ElementalIrEmitter::EmitCos(PrimitiveType prim_type, - llvm::Value* value) const { + llvm::Value* value) { return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::cos, {value}, {value->getType()}, b_); } StatusOr ElementalIrEmitter::EmitExp(PrimitiveType prim_type, - llvm::Value* value) const { + llvm::Value* value) { return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::exp, {value}, {value->getType()}, b_); } StatusOr ElementalIrEmitter::EmitExpm1(PrimitiveType prim_type, - llvm::Value* value) const { + llvm::Value* value) { auto x = value; auto type = llvm_ir::PrimitiveTypeToIrType(prim_type, module_); auto one = llvm::ConstantFP::get(type, 1.0); @@ -1025,40 +979,40 @@ StatusOr ElementalIrEmitter::EmitExpm1(PrimitiveType prim_type, // When the exponent is large, the naive evaluation of e^(x) - 1 is more // accurate than the Taylor series. TF_ASSIGN_OR_RETURN(auto exp_x, EmitExp(prim_type, value)); - auto for_large_x = b_->CreateFSub(exp_x, one); + auto for_large_x = FSub(exp_x, one); // The Taylor series for exp(x) is 1 + x + x^2/2 + x^3/6 + …. 
// We want exp(x)-1 which is x + x^2/2 + x^3/6 + …. - auto x_squared = b_->CreateFAdd(x, x); - auto x_squared_over_two = b_->CreateFMul(x_squared, half); - auto for_small_x = b_->CreateFAdd(x, x_squared_over_two); + auto x_squared = FAdd(x, x); + auto x_squared_over_two = FMul(x_squared, half); + auto for_small_x = FAdd(x, x_squared_over_two); const auto kExponentIsSmallThreshold = 1e-5; auto abs_x = llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::fabs, {value}, {type}, b_); - auto x_is_small = b_->CreateFCmpOLT( - abs_x, llvm::ConstantFP::get(type, kExponentIsSmallThreshold)); + auto x_is_small = + FCmpOLT(abs_x, llvm::ConstantFP::get(type, kExponentIsSmallThreshold)); return Select(x_is_small, for_small_x, for_large_x); } StatusOr ElementalIrEmitter::EmitPow(PrimitiveType prim_type, llvm::Value* lhs, - llvm::Value* rhs) const { + llvm::Value* rhs) { return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::pow, {lhs, rhs}, {lhs->getType()}, b_); } StatusOr ElementalIrEmitter::EmitAtan2(PrimitiveType prim_type, llvm::Value* lhs, - llvm::Value* rhs) const { + llvm::Value* rhs) { return Unimplemented("atan2"); } StatusOr ElementalIrEmitter::EmitTanh(PrimitiveType prim_type, - llvm::Value* value) const { + llvm::Value* value) { return Unimplemented("tanh"); } StatusOr ElementalIrEmitter::EmitReducePrecision( - const HloInstruction* hlo, llvm::Value* x) const { + const HloInstruction* hlo, llvm::Value* x) { if (hlo->operand(0)->shape().element_type() != F32) { return Unimplemented("reduce-precision only implemented for F32"); } @@ -1089,44 +1043,39 @@ static llvm::Value* SaturateShiftIfNecessary(llvm::IRBuilder<>* b, return b->CreateSelect(shift_amt_in_range, shift_result, saturated_value); } -llvm::Value* ElementalIrEmitter::GetOne(llvm::Type* type) const { +llvm::Value* ElementalIrEmitter::GetOne(llvm::Type* type) { return llvm::ConstantInt::get(llvm::cast(type), 1); } -llvm::Value* ElementalIrEmitter::GetZero(llvm::Type* type) const { +llvm::Value* 
ElementalIrEmitter::GetZero(llvm::Type* type) { return llvm::ConstantInt::get(llvm::cast(type), 0); } -llvm::Value* ElementalIrEmitter::GetIntSMin(llvm::Type* type) const { +llvm::Value* ElementalIrEmitter::GetIntSMin(llvm::Type* type) { auto* integer_type = llvm::cast(type); return llvm::ConstantInt::get(integer_type, llvm::APInt::getSignedMinValue( integer_type->getBitWidth())); } -llvm::Value* ElementalIrEmitter::GetMinusOne(llvm::Type* type) const { +llvm::Value* ElementalIrEmitter::GetMinusOne(llvm::Type* type) { auto* integer_type = llvm::cast(type); return llvm::ConstantInt::get( integer_type, llvm::APInt::getAllOnesValue(integer_type->getBitWidth())); } -llvm::Value* ElementalIrEmitter::IsZero(llvm::Value* v) const { - return b_->CreateICmpEQ(v, llvm::ConstantInt::get(v->getType(), 0)); -} - -llvm::Value* ElementalIrEmitter::IsIntMinDivisionOverflow( - llvm::Value* lhs, llvm::Value* rhs) const { - return b_->CreateAnd(b_->CreateICmpEQ(lhs, GetIntSMin(lhs->getType())), - b_->CreateICmpEQ(rhs, GetMinusOne(rhs->getType()))); +llvm::Value* ElementalIrEmitter::IsZero(llvm::Value* v) { + return ICmpEQ(v, llvm::ConstantInt::get(v->getType(), 0)); } -llvm::Value* ElementalIrEmitter::Select(llvm::Value* cond, llvm::Value* if_true, - llvm::Value* if_false) const { - return b_->CreateSelect(cond, if_true, if_false); +llvm::Value* ElementalIrEmitter::IsIntMinDivisionOverflow(llvm::Value* lhs, + llvm::Value* rhs) { + return And(ICmpEQ(lhs, GetIntSMin(lhs->getType())), + ICmpEQ(rhs, GetMinusOne(rhs->getType()))); } llvm::Value* ElementalIrEmitter::EmitIntegerDivide(llvm::Value* lhs, llvm::Value* rhs, - bool is_signed) const { + bool is_signed) { // Integer division overflow behavior: // // X / 0 == -1 @@ -1135,16 +1084,15 @@ llvm::Value* ElementalIrEmitter::EmitIntegerDivide(llvm::Value* lhs, if (!is_signed) { llvm::Value* udiv_is_unsafe = IsZero(rhs); llvm::Value* safe_rhs = Select(udiv_is_unsafe, GetOne(lhs->getType()), rhs); - llvm::Value* safe_div = 
b_->CreateUDiv(lhs, safe_rhs); + llvm::Value* safe_div = UDiv(lhs, safe_rhs); return Select(udiv_is_unsafe, GetMinusOne(lhs->getType()), safe_div); } llvm::Value* has_zero_divisor = IsZero(rhs); llvm::Value* has_int_min_overflow = IsIntMinDivisionOverflow(lhs, rhs); - llvm::Value* sdiv_is_unsafe = - b_->CreateOr(has_int_min_overflow, has_zero_divisor); + llvm::Value* sdiv_is_unsafe = Or(has_int_min_overflow, has_zero_divisor); llvm::Value* safe_rhs = Select(sdiv_is_unsafe, GetOne(lhs->getType()), rhs); - llvm::Value* safe_div = b_->CreateSDiv(lhs, safe_rhs); + llvm::Value* safe_div = SDiv(lhs, safe_rhs); return Select( has_zero_divisor, GetMinusOne(lhs->getType()), @@ -1153,7 +1101,7 @@ llvm::Value* ElementalIrEmitter::EmitIntegerDivide(llvm::Value* lhs, llvm::Value* ElementalIrEmitter::EmitIntegerRemainder(llvm::Value* lhs, llvm::Value* rhs, - bool is_signed) const { + bool is_signed) { // Integer remainder overflow behavior: // // X % 0 == X @@ -1162,16 +1110,15 @@ llvm::Value* ElementalIrEmitter::EmitIntegerRemainder(llvm::Value* lhs, if (!is_signed) { llvm::Value* urem_is_unsafe = IsZero(rhs); llvm::Value* safe_rhs = Select(urem_is_unsafe, GetOne(lhs->getType()), rhs); - llvm::Value* safe_rem = b_->CreateURem(lhs, safe_rhs); + llvm::Value* safe_rem = URem(lhs, safe_rhs); return Select(urem_is_unsafe, lhs, safe_rem); } llvm::Value* has_zero_divisor = IsZero(rhs); llvm::Value* has_int_min_overflow = IsIntMinDivisionOverflow(lhs, rhs); - llvm::Value* srem_is_unsafe = - b_->CreateOr(has_int_min_overflow, has_zero_divisor); + llvm::Value* srem_is_unsafe = Or(has_int_min_overflow, has_zero_divisor); llvm::Value* safe_rhs = Select(srem_is_unsafe, GetOne(lhs->getType()), rhs); - llvm::Value* safe_rem = b_->CreateSRem(lhs, safe_rhs); + llvm::Value* safe_rem = SRem(lhs, safe_rhs); return Select( has_zero_divisor, lhs, @@ -1180,15 +1127,15 @@ llvm::Value* ElementalIrEmitter::EmitIntegerRemainder(llvm::Value* lhs, StatusOr ElementalIrEmitter::EmitIntegerBinaryOp( const 
HloInstruction* op, llvm::Value* lhs_value, llvm::Value* rhs_value, - bool is_signed) const { + bool is_signed) { switch (op->opcode()) { // TODO(jingyue): add the "nsw" attribute for signed types. case HloOpcode::kAdd: - return b_->CreateAdd(lhs_value, rhs_value); + return Add(lhs_value, rhs_value); case HloOpcode::kSubtract: - return b_->CreateSub(lhs_value, rhs_value); + return Sub(lhs_value, rhs_value); case HloOpcode::kMultiply: - return b_->CreateMul(lhs_value, rhs_value); + return Mul(lhs_value, rhs_value); case HloOpcode::kDivide: return EmitIntegerDivide(lhs_value, rhs_value, is_signed); case HloOpcode::kRemainder: @@ -1220,11 +1167,11 @@ StatusOr ElementalIrEmitter::EmitIntegerBinaryOp( case HloOpcode::kMaximum: return EmitIntegralMax(lhs_value, rhs_value, is_signed); case HloOpcode::kAnd: - return b_->CreateAnd(lhs_value, rhs_value); + return And(lhs_value, rhs_value); case HloOpcode::kOr: - return b_->CreateOr(lhs_value, rhs_value); + return Or(lhs_value, rhs_value); case HloOpcode::kXor: - return b_->CreateXor(lhs_value, rhs_value); + return Xor(lhs_value, rhs_value); // Shifting out bits >= the number of bits in the type being shifted // produces a poison value in LLVM which is basically "deferred undefined @@ -1233,15 +1180,15 @@ StatusOr ElementalIrEmitter::EmitIntegerBinaryOp( // UB. 
case HloOpcode::kShiftRightArithmetic: return SaturateShiftIfNecessary(b_, lhs_value, rhs_value, - b_->CreateAShr(lhs_value, rhs_value), + AShr(lhs_value, rhs_value), /*saturate_to_sign_bit=*/true); case HloOpcode::kShiftLeft: return SaturateShiftIfNecessary(b_, lhs_value, rhs_value, - b_->CreateShl(lhs_value, rhs_value), + Shl(lhs_value, rhs_value), /*saturate_to_sign_bit=*/false); case HloOpcode::kShiftRightLogical: return SaturateShiftIfNecessary(b_, lhs_value, rhs_value, - b_->CreateLShr(lhs_value, rhs_value), + LShr(lhs_value, rhs_value), /*saturate_to_sign_bit=*/false); default: return Unimplemented("binary integer op '%s'", @@ -1251,7 +1198,7 @@ StatusOr ElementalIrEmitter::EmitIntegerBinaryOp( llvm::Value* ElementalIrEmitter::EmitIntegralMax(llvm::Value* lhs_value, llvm::Value* rhs_value, - bool is_signed) const { + bool is_signed) { return Select(b_->CreateICmp(is_signed ? llvm::ICmpInst::ICMP_SGE : llvm::ICmpInst::ICMP_UGE, lhs_value, rhs_value), @@ -1260,7 +1207,7 @@ llvm::Value* ElementalIrEmitter::EmitIntegralMax(llvm::Value* lhs_value, llvm::Value* ElementalIrEmitter::EmitIntegralMin(llvm::Value* lhs_value, llvm::Value* rhs_value, - bool is_signed) const { + bool is_signed) { return Select(b_->CreateICmp(is_signed ? 
llvm::ICmpInst::ICMP_SLE : llvm::ICmpInst::ICMP_ULE, lhs_value, rhs_value), @@ -1269,7 +1216,7 @@ llvm::Value* ElementalIrEmitter::EmitIntegralMin(llvm::Value* lhs_value, llvm_ir::IrArray::Index ElementalIrEmitter::ElementwiseSourceIndex( const llvm_ir::IrArray::Index& target_index, const HloInstruction& hlo, - int64 operand_no) const { + int64 operand_no) { CHECK(hlo.IsElementwise()) << "HLO " << hlo.ToString() << " is not elementwise."; @@ -1310,7 +1257,7 @@ llvm_ir::IrArray::Index ElementalIrEmitter::ElementwiseSourceIndex( StatusOr ElementalIrEmitter::ConvertValueForDistribution( const HloInstruction* hlo, const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator, - const llvm_ir::IrArray::Index& index, llvm::Value* raw_value) const { + const llvm_ir::IrArray::Index& index, llvm::Value* raw_value) { TF_ASSIGN_OR_RETURN(llvm::Value * a_or_mean, operand_to_generator.at(hlo->operand(0))(index)); TF_ASSIGN_OR_RETURN(llvm::Value * b_or_sigma, @@ -1328,17 +1275,17 @@ StatusOr ElementalIrEmitter::ConvertValueForDistribution( // Perform the division using the float type with the same number of bits // as the raw value to avoid overflow. 
if (raw_value_size_in_bits == 32) { - elem_value = b_->CreateUIToFP(elem_value, b_->getFloatTy()); - elem_value = b_->CreateFDiv( - elem_value, llvm::ConstantFP::get(b_->getFloatTy(), std::exp2(32))); + elem_value = UIToFP(elem_value, b_->getFloatTy()); + elem_value = FDiv(elem_value, + llvm::ConstantFP::get(b_->getFloatTy(), std::exp2(32))); } else { - elem_value = b_->CreateUIToFP(elem_value, b_->getDoubleTy()); - elem_value = b_->CreateFDiv( + elem_value = UIToFP(elem_value, b_->getDoubleTy()); + elem_value = FDiv( elem_value, llvm::ConstantFP::get(b_->getDoubleTy(), std::exp2(64))); } if (elem_ir_ty != elem_value->getType()) { - elem_value = b_->CreateFPTrunc(elem_value, elem_ir_ty); + elem_value = FPTrunc(elem_value, elem_ir_ty); } } @@ -1346,9 +1293,7 @@ StatusOr ElementalIrEmitter::ConvertValueForDistribution( switch (hlo->random_distribution()) { case RNG_UNIFORM: { if (elem_ir_ty->isFloatingPointTy()) { - return b_->CreateFAdd( - b_->CreateFMul(b_->CreateFSub(b_or_sigma, a_or_mean), elem_value), - a_or_mean); + return FAdd(FMul(FSub(b_or_sigma, a_or_mean), elem_value), a_or_mean); } else { // To generate a uniform random value in [a, b) from a raw random sample // in range [0, 2^N), we let range = b - a and return @@ -1361,17 +1306,16 @@ StatusOr ElementalIrEmitter::ConvertValueForDistribution( // the same cost as if the whole warp were to re-sample. So an // efficient re-sampling implementation on GPU would need to do // nontrivial work to share entropy between threads in the warp. 
- auto range = b_->CreateSub(b_or_sigma, a_or_mean); - return b_->CreateAdd(a_or_mean, b_->CreateURem(elem_value, range)); + auto range = Sub(b_or_sigma, a_or_mean); + return Add(a_or_mean, URem(elem_value, range)); } } case RNG_NORMAL: { TF_ASSIGN_OR_RETURN( llvm::Value * r, - EmitErfcInv(elem_prim_ty, - b_->CreateFMul(llvm::ConstantFP::get(elem_ir_ty, 2.0), - elem_value))); - return b_->CreateFAdd(b_->CreateFMul(r, b_or_sigma), a_or_mean); + EmitErfcInv(elem_prim_ty, FMul(llvm::ConstantFP::get(elem_ir_ty, 2.0), + elem_value))); + return FAdd(FMul(r, b_or_sigma), a_or_mean); } default: return InvalidArgument( @@ -1491,8 +1435,7 @@ std::array CalculateSampleValues( // Precondition: the RNG instruction is not fused. llvm_ir::ElementGenerator ElementalIrEmitter::MakePhiloxRngElementGenerator( const HloInstruction* hlo, - const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator) - const { + const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator) { VLOG(3) << "Using philox RNG algorithm"; CHECK(!hlo->IsFused()); // A random number generated by the per module random number generator. @@ -1515,7 +1458,7 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakePhiloxRngElementGenerator( // Load the global state variable for the Philox RNG algorithm. llvm::GlobalVariable* rng_state_ptr = llvm_ir::GetOrCreateVariableForPhiloxRngState(module_, b_); - llvm::Value* rng_state = b_->CreateLoad(rng_state_ptr, "rng_state_value"); + llvm::Value* rng_state = Load(rng_state_ptr, "rng_state_value"); // Build and return the elemental IR generator to generate a random value for // the element corresponding to the current thread. @@ -1541,8 +1484,8 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakePhiloxRngElementGenerator( // element within the sample. 
llvm::Value* elems_per_sample_value = llvm::ConstantInt::get(index_ty, elems_per_sample); - llvm::Value* sample_idx = b_->CreateUDiv(elem_idx, elems_per_sample_value); - llvm::Value* elem_offset = b_->CreateURem(elem_idx, elems_per_sample_value); + llvm::Value* sample_idx = UDiv(elem_idx, elems_per_sample_value); + llvm::Value* elem_offset = URem(elem_idx, elems_per_sample_value); std::array counter_values = CalculateSampleValues( sample_idx, hlo_random_value, global_random_number, rng_state, b_); @@ -1550,18 +1493,17 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakePhiloxRngElementGenerator( // Store the four counter_values into the sample_address alloca so we can // load the elem_offset'th one below. for (int idx = 0; idx < 4; ++idx) { - b_->CreateStore(counter_values[idx], - b_->CreateInBoundsGEP(sample_address, b_->getInt32(idx))); + Store(counter_values[idx], + InBoundsGEP(sample_address, b_->getInt32(idx))); } llvm::Type* int64_ty = b_->getInt64Ty(); CHECK(elems_per_sample == 2 || elems_per_sample == 4); llvm::Type* raw_value_ty = elems_per_sample == 2 ? int64_ty : int32_ty; // Retrieve the raw value for the current element from the current sample. 
- llvm::Value* raw_elem_value = b_->CreateLoad( - b_->CreateInBoundsGEP( - b_->CreatePointerCast(sample_address, raw_value_ty->getPointerTo()), - elem_offset), + llvm::Value* raw_elem_value = Load( + InBoundsGEP(PointerCast(sample_address, raw_value_ty->getPointerTo()), + elem_offset), "raw_elem_value"); return ConvertValueForDistribution(hlo, operand_to_generator, index, @@ -1572,7 +1514,7 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakePhiloxRngElementGenerator( StatusOr ElementalIrEmitter::EmitElementalSelect( const HloInstruction* hlo, const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator, - const llvm_ir::IrArray::Index& index) const { + const llvm_ir::IrArray::Index& index) { TF_ASSIGN_OR_RETURN(llvm::Value * pred_value, operand_to_generator.at(hlo->operand(0))( ElementwiseSourceIndex(index, *hlo, 0))); @@ -1582,14 +1524,14 @@ StatusOr ElementalIrEmitter::EmitElementalSelect( TF_ASSIGN_OR_RETURN(llvm::Value * on_false_value, operand_to_generator.at(hlo->operand(2))( ElementwiseSourceIndex(index, *hlo, 2))); - return Select(b_->CreateTrunc(pred_value, b_->getInt1Ty()), on_true_value, + return Select(Trunc(pred_value, b_->getInt1Ty()), on_true_value, on_false_value); } StatusOr ElementalIrEmitter::EmitElementalClamp( const HloInstruction* hlo, const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator, - const llvm_ir::IrArray::Index& index) const { + const llvm_ir::IrArray::Index& index) { TF_ASSIGN_OR_RETURN(llvm::Value * min_value, operand_to_generator.at(hlo->operand(0))( ElementwiseSourceIndex(index, *hlo, 0))); @@ -1615,7 +1557,7 @@ StatusOr ElementalIrEmitter::EmitElementalClamp( StatusOr ElementalIrEmitter::EmitElementalConcatenate( const HloInstruction* hlo, const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator, - const llvm_ir::IrArray::Index& target_index) const { + const llvm_ir::IrArray::Index& target_index) { const int64 concat_dim = hlo->dimensions(0); auto source_index = target_index; @@ 
-1637,9 +1579,9 @@ StatusOr ElementalIrEmitter::EmitElementalConcatenate( } llvm_ir::SetToFirstInsertPoint(exit_block, b_); - llvm::PHINode* output = b_->CreatePHI( - llvm_ir::PrimitiveTypeToIrType(hlo->shape().element_type(), module_), - hlo->operands().size()); + llvm::PHINode* output = + PHI(llvm_ir::PrimitiveTypeToIrType(hlo->shape().element_type(), module_), + hlo->operands().size()); auto prior_insert_point = b_->GetInsertPoint(); b_->SetInsertPoint(init_block); @@ -1654,9 +1596,8 @@ StatusOr ElementalIrEmitter::EmitElementalConcatenate( auto concat_dim_size = llvm::ConstantInt::get(source_index[concat_dim]->getType(), operand->shape().dimensions(concat_dim)); - b_->CreateCondBr( - b_->CreateICmpULT(source_index[concat_dim], concat_dim_size), - true_block, false_block); + CondBr(ICmpULT(source_index[concat_dim], concat_dim_size), true_block, + false_block); // Create the terminator of the true block before calling operand // generators, because they require non-degenerate basic blocks. @@ -1669,11 +1610,10 @@ StatusOr ElementalIrEmitter::EmitElementalConcatenate( // Subtract the size of the concat dimension of the current operand // from the source index. b_->SetInsertPoint(false_block); - source_index[concat_dim] = - b_->CreateSub(source_index[concat_dim], concat_dim_size); + source_index[concat_dim] = Sub(source_index[concat_dim], concat_dim_size); } - b_->CreateUnreachable(); + Unreachable(); b_->SetInsertPoint(exit_block, prior_insert_point); return output; } @@ -1681,7 +1621,7 @@ StatusOr ElementalIrEmitter::EmitElementalConcatenate( StatusOr ElementalIrEmitter::EmitElementalDynamicSlice( const HloInstruction* hlo, const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator, - const llvm_ir::IrArray::Index& index) const { + const llvm_ir::IrArray::Index& index) { // Emit IR to read dynamic start indices from hlo->operand(1). 
const HloInstruction* input_hlo = hlo->operand(0); const int64 rank = ShapeUtil::Rank(input_hlo->shape()); @@ -1698,7 +1638,7 @@ StatusOr ElementalIrEmitter::EmitElementalDynamicSlice( // Clamp the start index so that the sliced portion fits in the operand: // start_index = clamp(start_index, 0, operand_dim_size - output_dim_size) - start_index_value = b_->CreateSExtOrTrunc(start_index_value, index_type); + start_index_value = SExtOrTrunc(start_index_value, index_type); int64 largest_valid_start_index = input_hlo->shape().dimensions(i) - hlo->shape().dimensions(i); CHECK_GE(largest_valid_start_index, 0); @@ -1718,7 +1658,7 @@ StatusOr ElementalIrEmitter::EmitElementalDynamicSlice( for (int64 i = 0; i < rank; ++i) { // Emit IR which computes: // input_index = start_index + offset_index - input_index[i] = b_->CreateAdd(slice_start_index[i], index[i]); + input_index[i] = Add(slice_start_index[i], index[i]); } return operand_to_generator.at(input_hlo)(input_index); } @@ -1726,7 +1666,7 @@ StatusOr ElementalIrEmitter::EmitElementalDynamicSlice( StatusOr ElementalIrEmitter::EmitElementalGather( const HloInstruction* hlo, const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator, - const llvm_ir::IrArray::Index& index) const { + const llvm_ir::IrArray::Index& index) { const Shape& operand_shape = hlo->operand(0)->shape(); const Shape& indices_shape = hlo->operand(1)->shape(); const Shape& output_shape = hlo->shape(); @@ -1775,7 +1715,7 @@ StatusOr ElementalIrEmitter::EmitElementalGather( auto add_to_operand_index = [&](llvm::Value* index_component, int64 dim) { llvm::Value* gather_dim_component_extended = - b_->CreateSExtOrTrunc(index_component, index_type); + SExtOrTrunc(index_component, index_type); int64 operand_dim = dim_numbers.start_index_map(dim); int64 output_dim = operand_to_output_dim[operand_dim]; // If 'output_dim' is -1, it means 'operand_dim' is an elided window dim. 
@@ -1799,8 +1739,8 @@ StatusOr ElementalIrEmitter::EmitElementalGather( gather_dim_component_extended, is_signed), is_signed); - operand_index[operand_dim] = b_->CreateAdd( - operand_index[operand_dim], gather_dim_component_extended_inbound); + operand_index[operand_dim] = + Add(operand_index[operand_dim], gather_dim_component_extended_inbound); }; if (indices_shape.dimensions_size() == dim_numbers.index_vector_dim()) { @@ -1824,7 +1764,7 @@ StatusOr ElementalIrEmitter::EmitElementalGather( StatusOr ElementalIrEmitter::EmitElementalDynamicUpdateSlice( const HloInstruction* hlo, const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator, - const llvm_ir::IrArray::Index& index) const { + const llvm_ir::IrArray::Index& index) { const HloInstruction* input_hlo = hlo->operand(0); const HloInstruction* update_hlo = hlo->operand(1); const HloInstruction* start_hlo = hlo->operand(2); @@ -1847,7 +1787,7 @@ StatusOr ElementalIrEmitter::EmitElementalDynamicUpdateSlice( // Clamp the start index so that the update region fits in the operand. 
// start_index = clamp(start_index, 0, input_dim_size - update_dim_size) - start_index_value = b_->CreateSExtOrTrunc(start_index_value, index_type); + start_index_value = SExtOrTrunc(start_index_value, index_type); llvm::Value* update_dim_size = index_typed_const(update_hlo->shape().dimensions(i)); int64 largest_valid_start_index = @@ -1863,14 +1803,14 @@ StatusOr ElementalIrEmitter::EmitElementalDynamicUpdateSlice( start_index_value->setName( AsStringRef(IrName(hlo, StrCat("start_idx", i)))); slice_start_index[i] = start_index_value; - slice_limit_index[i] = b_->CreateAdd(slice_start_index[i], update_dim_size); - - slice_intersection = b_->CreateAnd( - slice_intersection, b_->CreateICmpSGE(index[i], slice_start_index[i]), - "slice_intersection"); - slice_intersection = b_->CreateAnd( - slice_intersection, b_->CreateICmpSLT(index[i], slice_limit_index[i]), - "slice_intersection"); + slice_limit_index[i] = Add(slice_start_index[i], update_dim_size); + + slice_intersection = + And(slice_intersection, ICmpSGE(index[i], slice_start_index[i]), + "slice_intersection"); + slice_intersection = + And(slice_intersection, ICmpSLT(index[i], slice_limit_index[i]), + "slice_intersection"); } // Emit: @@ -1887,26 +1827,26 @@ StatusOr ElementalIrEmitter::EmitElementalDynamicUpdateSlice( // Compute update index for intersection case. 
llvm_ir::IrArray::Index update_index(index.GetType(), rank); for (int64 i = 0; i < rank; ++i) { - update_index[i] = b_->CreateSub(index[i], slice_start_index[i]); + update_index[i] = Sub(index[i], slice_start_index[i]); } TF_ASSIGN_OR_RETURN(llvm::Value * true_value, operand_to_generator.at(update_hlo)(update_index)); - b_->CreateStore(true_value, ret_value_addr); + Store(true_value, ret_value_addr); // Handle false BB (return data from 'input') SetToFirstInsertPoint(if_data.false_block, b_); TF_ASSIGN_OR_RETURN(llvm::Value * false_value, operand_to_generator.at(input_hlo)(index)); - b_->CreateStore(false_value, ret_value_addr); + Store(false_value, ret_value_addr); SetToFirstInsertPoint(if_data.after_block, b_); - return b_->CreateLoad(ret_value_addr); + return Load(ret_value_addr); } StatusOr ElementalIrEmitter::EmitElementalPad( const HloInstruction* hlo, const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator, - const llvm_ir::IrArray::Index& padded_index) const { + const llvm_ir::IrArray::Index& padded_index) { auto index = padded_index; llvm::Value* in_bounds = b_->getTrue(); for (size_t i = 0; i < index.size(); ++i) { @@ -1914,26 +1854,22 @@ StatusOr ElementalIrEmitter::EmitElementalPad( return llvm::ConstantInt::get(index[i]->getType(), n); }; const auto& pad_dim = hlo->padding_config().dimensions(i); - index[i] = - b_->CreateSub(index[i], index_typed_const(pad_dim.edge_padding_low())); - in_bounds = b_->CreateAnd(in_bounds, - b_->CreateICmpSGE(index[i], index_typed_const(0)), - "in_bounds"); - in_bounds = b_->CreateAnd( + index[i] = Sub(index[i], index_typed_const(pad_dim.edge_padding_low())); + in_bounds = + And(in_bounds, ICmpSGE(index[i], index_typed_const(0)), "in_bounds"); + in_bounds = And( in_bounds, - b_->CreateICmpEQ( + ICmpEQ( index_typed_const(0), - b_->CreateURem(index[i], - index_typed_const(pad_dim.interior_padding() + 1))), - "in_bounds"); - index[i] = b_->CreateSDiv( - index[i], index_typed_const(pad_dim.interior_padding() 
+ 1)); - in_bounds = b_->CreateAnd( - in_bounds, - b_->CreateICmpSLT( - index[i], - index_typed_const(hlo->operand(0)->shape().dimensions(i))), + URem(index[i], index_typed_const(pad_dim.interior_padding() + 1))), "in_bounds"); + index[i] = + SDiv(index[i], index_typed_const(pad_dim.interior_padding() + 1)); + in_bounds = + And(in_bounds, + ICmpSLT(index[i], + index_typed_const(hlo->operand(0)->shape().dimensions(i))), + "in_bounds"); } // if (in_bounds) { @@ -1949,26 +1885,26 @@ StatusOr ElementalIrEmitter::EmitElementalPad( SetToFirstInsertPoint(if_data.true_block, b_); TF_ASSIGN_OR_RETURN(llvm::Value * operand_value, operand_to_generator.at(hlo->operand(0))(index)); - b_->CreateStore(operand_value, ret_value_addr); + Store(operand_value, ret_value_addr); SetToFirstInsertPoint(if_data.false_block, b_); TF_ASSIGN_OR_RETURN(llvm::Value * padding_value, operand_to_generator.at(hlo->operand(1))( IrArray::Index(index.GetType()))); - b_->CreateStore(padding_value, ret_value_addr); + Store(padding_value, ret_value_addr); SetToFirstInsertPoint(if_data.after_block, b_); // Don't create phi(operand_value, padding_value) here, because invoking // operand_to_generator may create new basic blocks, making the parent // of operand_value or padding_value no longer a predecessor of // if_data.after_block. 
- return b_->CreateLoad(ret_value_addr); + return Load(ret_value_addr); } StatusOr ElementalIrEmitter::EmitElementalDot( const HloInstruction* hlo, const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator, - const llvm_ir::IrArray::Index& dot_result_index) const { + const llvm_ir::IrArray::Index& dot_result_index) { auto lhs_generator = operand_to_generator.at(hlo->operand(0)); auto rhs_generator = operand_to_generator.at(hlo->operand(1)); @@ -1996,8 +1932,7 @@ StatusOr ElementalIrEmitter::EmitElementalDot( llvm_ir::PrimitiveTypeToIrType(primitive_type, module_); llvm::Value* accumulator_alloca = llvm_ir::EmitAllocaAtFunctionEntry(primitive_type_llvm, "dot_acc", b_); - b_->CreateStore(llvm::Constant::getNullValue(primitive_type_llvm), - accumulator_alloca); + Store(llvm::Constant::getNullValue(primitive_type_llvm), accumulator_alloca); SetToFirstInsertPoint(inner_loop->GetBodyBasicBlock(), b_); @@ -2019,42 +1954,37 @@ StatusOr ElementalIrEmitter::EmitElementalDot( } rhs_index.InsertAt(rhs_contracting_dim, inner_loop->GetIndVarValue()); - llvm::Value* current_accumulator = b_->CreateLoad(accumulator_alloca); + llvm::Value* current_accumulator = Load(accumulator_alloca); TF_ASSIGN_OR_RETURN(llvm::Value * lhs_value, lhs_generator(lhs_index)); TF_ASSIGN_OR_RETURN(llvm::Value * rhs_value, rhs_generator(rhs_index)); llvm::Value* next_accumulator; if (primitive_util::IsComplexType(primitive_type)) { - llvm::Value* product_real = b_->CreateFSub( - b_->CreateFMul(EmitExtractReal(lhs_value), EmitExtractReal(rhs_value)), - b_->CreateFMul(EmitExtractImag(lhs_value), EmitExtractImag(rhs_value))); - llvm::Value* product_imag = b_->CreateFAdd( - b_->CreateFMul(EmitExtractReal(lhs_value), EmitExtractImag(rhs_value)), - b_->CreateFMul(EmitExtractImag(lhs_value), EmitExtractReal(rhs_value))); - next_accumulator = b_->CreateInsertValue( + llvm::Value* product_real = + FSub(FMul(EmitExtractReal(lhs_value), EmitExtractReal(rhs_value)), + FMul(EmitExtractImag(lhs_value), 
EmitExtractImag(rhs_value))); + llvm::Value* product_imag = + FAdd(FMul(EmitExtractReal(lhs_value), EmitExtractImag(rhs_value)), + FMul(EmitExtractImag(lhs_value), EmitExtractReal(rhs_value))); + next_accumulator = InsertValue( current_accumulator, - b_->CreateFAdd(EmitExtractReal(current_accumulator), product_real), - {0}); - next_accumulator = b_->CreateInsertValue( + FAdd(EmitExtractReal(current_accumulator), product_real), {0}); + next_accumulator = InsertValue( next_accumulator, - b_->CreateFAdd(EmitExtractImag(current_accumulator), product_imag), - {1}); + FAdd(EmitExtractImag(current_accumulator), product_imag), {1}); } else if (primitive_util::IsFloatingPointType(primitive_type)) { - next_accumulator = b_->CreateFAdd(current_accumulator, - b_->CreateFMul(lhs_value, rhs_value)); + next_accumulator = FAdd(current_accumulator, FMul(lhs_value, rhs_value)); } else { - next_accumulator = - b_->CreateAdd(current_accumulator, b_->CreateMul(lhs_value, rhs_value)); + next_accumulator = Add(current_accumulator, Mul(lhs_value, rhs_value)); } - b_->CreateStore(next_accumulator, accumulator_alloca); + Store(next_accumulator, accumulator_alloca); SetToFirstInsertPoint(inner_loop->GetExitBasicBlock(), b_); - return b_->CreateLoad(accumulator_alloca); + return Load(accumulator_alloca); } llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator( const HloInstruction* hlo, - const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator) - const { + const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator) { switch (hlo->opcode()) { case HloOpcode::kAbs: case HloOpcode::kRoundNearestAfz: @@ -2148,10 +2078,10 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator( const HloInstruction* operand = hlo->operand(0); auto source_index = target_index; for (int64 dim : hlo->dimensions()) { - source_index[dim] = b_->CreateSub( - llvm::ConstantInt::get(target_index[dim]->getType(), - hlo->shape().dimensions(dim) - 1), - 
target_index[dim]); + source_index[dim] = + Sub(llvm::ConstantInt::get(target_index[dim]->getType(), + hlo->shape().dimensions(dim) - 1), + target_index[dim]); } return operand_to_generator.at(operand)(source_index); }; @@ -2235,23 +2165,23 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator( } } -llvm::Value* ElementalIrEmitter::EmitExtractReal(llvm::Value* value) const { - return b_->CreateExtractValue(value, {0}); +llvm::Value* ElementalIrEmitter::EmitExtractReal(llvm::Value* value) { + return ExtractValue(value, {0}); } -llvm::Value* ElementalIrEmitter::EmitExtractImag(llvm::Value* value) const { - return b_->CreateExtractValue(value, {1}); +llvm::Value* ElementalIrEmitter::EmitExtractImag(llvm::Value* value) { + return ExtractValue(value, {1}); } llvm::Value* ElementalIrEmitter::EmitComposeComplex(const HloInstruction* op, llvm::Value* real, - llvm::Value* imag) const { + llvm::Value* imag) { auto cplx_type = llvm_ir::PrimitiveTypeToIrType(op->shape().element_type(), module_); - auto complex = b_->CreateInsertValue( - llvm::ConstantAggregateZero::get(cplx_type), real, {0}); + auto complex = + InsertValue(llvm::ConstantAggregateZero::get(cplx_type), real, {0}); if (imag != nullptr) { - complex = b_->CreateInsertValue(complex, imag, {1}); + complex = InsertValue(complex, imag, {1}); } return complex; } diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/elemental_ir_emitter.h index c037b98929..d3e2acaabd 100644 --- a/tensorflow/compiler/xla/service/elemental_ir_emitter.h +++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.h @@ -23,12 +23,13 @@ limitations under the License. 
#include "llvm/IR/Value.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module_config.h" +#include "tensorflow/compiler/xla/service/llvm_ir/ir_builder_mixin.h" #include "tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h" #include "tensorflow/compiler/xla/statusor.h" namespace xla { -class ElementalIrEmitter { +class ElementalIrEmitter : public IrBuilderMixin { public: using HloToElementGeneratorMap = std::unordered_map; @@ -40,115 +41,114 @@ class ElementalIrEmitter { virtual ~ElementalIrEmitter() = default; virtual StatusOr EmitUnaryOp(const HloInstruction* op, - llvm::Value* operand_value) const; + llvm::Value* operand_value); virtual StatusOr EmitBinaryOp(const HloInstruction* op, llvm::Value* lhs_value, - llvm::Value* rhs_value) const; + llvm::Value* rhs_value); // Returns a function to generate an element of the output of `hlo`, given a // map of functions to generate elements of its operands. virtual llvm_ir::ElementGenerator MakeElementGenerator( const HloInstruction* hlo, - const HloToElementGeneratorMap& operand_to_generator) const; + const HloToElementGeneratorMap& operand_to_generator); - llvm::IRBuilder<>* b() const { return b_; } - llvm::Module* module() const { return module_; } + llvm::IRBuilder<>* b() { return b_; } + + // builder() is for IrBuilderMixin. 
+ llvm::IRBuilder<>* builder() { return b_; } + + llvm::Module* module() { return module_; } protected: - virtual StatusOr EmitIntegerUnaryOp( - const HloInstruction* op, llvm::Value* operand_value) const; + virtual StatusOr EmitIntegerUnaryOp(const HloInstruction* op, + llvm::Value* operand_value); - virtual StatusOr EmitFloatUnaryOp( - const HloInstruction* op, llvm::Value* operand_value) const; + virtual StatusOr EmitFloatUnaryOp(const HloInstruction* op, + llvm::Value* operand_value); - virtual StatusOr EmitComplexUnaryOp( - const HloInstruction* op, llvm::Value* operand_value) const; + virtual StatusOr EmitComplexUnaryOp(const HloInstruction* op, + llvm::Value* operand_value); - llvm::Value* IsZero(llvm::Value* v) const; - llvm::Value* IsIntMinDivisionOverflow(llvm::Value* lhs, - llvm::Value* rhs) const; - llvm::Value* GetZero(llvm::Type* type) const; - llvm::Value* GetOne(llvm::Type* type) const; - llvm::Value* GetIntSMin(llvm::Type* type) const; - llvm::Value* GetMinusOne(llvm::Type* type) const; - llvm::Value* Select(llvm::Value* cond, llvm::Value* if_true, - llvm::Value* if_false) const; + llvm::Value* IsZero(llvm::Value* v); + llvm::Value* IsIntMinDivisionOverflow(llvm::Value* lhs, llvm::Value* rhs); + llvm::Value* GetZero(llvm::Type* type); + llvm::Value* GetOne(llvm::Type* type); + llvm::Value* GetIntSMin(llvm::Type* type); + llvm::Value* GetMinusOne(llvm::Type* type); llvm::Value* EmitIntegerDivide(llvm::Value* lhs, llvm::Value* rhs, - bool is_signed) const; + bool is_signed); llvm::Value* EmitIntegerRemainder(llvm::Value* lhs, llvm::Value* rhs, - bool is_signed) const; + bool is_signed); virtual StatusOr EmitIntegerBinaryOp(const HloInstruction* op, llvm::Value* lhs_value, llvm::Value* rhs_value, - bool is_signed) const; + bool is_signed); - virtual StatusOr EmitFloatBinaryOp( - const HloInstruction* op, llvm::Value* lhs_value, - llvm::Value* rhs_value) const; + virtual StatusOr EmitFloatBinaryOp(const HloInstruction* op, + llvm::Value* lhs_value, + 
llvm::Value* rhs_value); - virtual StatusOr EmitComplexBinaryOp( - const HloInstruction* op, llvm::Value* lhs_value, - llvm::Value* rhs_value) const; + virtual StatusOr EmitComplexBinaryOp(const HloInstruction* op, + llvm::Value* lhs_value, + llvm::Value* rhs_value); virtual llvm::Value* EmitFloatMax(llvm::Value* lhs_value, - llvm::Value* rhs_value) const; + llvm::Value* rhs_value); virtual llvm::Value* EmitFloatMin(llvm::Value* lhs_value, - llvm::Value* rhs_value) const; + llvm::Value* rhs_value); llvm::Value* EmitIntegralMax(llvm::Value* lhs_value, llvm::Value* rhs_value, - bool is_signed) const; + bool is_signed); llvm::Value* EmitIntegralMin(llvm::Value* lhs_value, llvm::Value* rhs_value, - bool is_signed) const; + bool is_signed); virtual StatusOr EmitErfInv(PrimitiveType prim_type, - llvm::Value* value) const; + llvm::Value* value); virtual StatusOr EmitErfcInv(PrimitiveType prim_type, - llvm::Value* value) const; + llvm::Value* value); virtual StatusOr EmitAtan2(PrimitiveType prim_type, - llvm::Value* lhs, - llvm::Value* rhs) const; + llvm::Value* lhs, llvm::Value* rhs); virtual StatusOr EmitLog(PrimitiveType prim_type, - llvm::Value* value) const; + llvm::Value* value); virtual StatusOr EmitLog1p(PrimitiveType prim_type, - llvm::Value* value) const; + llvm::Value* value); virtual StatusOr EmitSin(PrimitiveType prim_type, - llvm::Value* value) const; + llvm::Value* value); virtual StatusOr EmitCos(PrimitiveType prim_type, - llvm::Value* value) const; + llvm::Value* value); virtual StatusOr EmitExp(PrimitiveType prim_type, - llvm::Value* value) const; + llvm::Value* value); virtual StatusOr EmitExpm1(PrimitiveType prim_type, - llvm::Value* value) const; + llvm::Value* value); virtual StatusOr EmitPow(PrimitiveType prim_type, - llvm::Value* lhs, - llvm::Value* rhs) const; + llvm::Value* lhs, llvm::Value* rhs); virtual StatusOr EmitTanh(PrimitiveType prim_type, - llvm::Value* value) const; + llvm::Value* value); virtual StatusOr EmitReducePrecision(const 
HloInstruction* hlo, - llvm::Value* x) const; + llvm::Value* x); - virtual llvm::Value* EmitExtractReal(llvm::Value* value) const; - virtual llvm::Value* EmitExtractImag(llvm::Value* value) const; + virtual llvm::Value* EmitExtractReal(llvm::Value* value); + virtual llvm::Value* EmitExtractImag(llvm::Value* value); // Composes a complex struct. imag may be nullptr for simple cast operations. llvm::Value* EmitComposeComplex(const HloInstruction* op, llvm::Value* real, - llvm::Value* imag) const; + llvm::Value* imag); // A helper method for MakeElementGenerator. Given an elementwise op `hlo` and // the target array index, computes the source array index of its @@ -157,50 +157,50 @@ class ElementalIrEmitter { // Precondition: `hlo` is an elementwise op. llvm_ir::IrArray::Index ElementwiseSourceIndex( const llvm_ir::IrArray::Index& target_index, const HloInstruction& hlo, - int64 operand_no) const; + int64 operand_no); // Identifier of the thread unique among all threads on the device - virtual llvm::Value* EmitThreadId() const { return b_->getIntN(128, 0); } + virtual llvm::Value* EmitThreadId() { return b_->getIntN(128, 0); } StatusOr EmitElementalSelect( const HloInstruction* hlo, const HloToElementGeneratorMap& operand_to_generator, - const llvm_ir::IrArray::Index& index) const; + const llvm_ir::IrArray::Index& index); StatusOr EmitElementalClamp( const HloInstruction* hlo, const HloToElementGeneratorMap& operand_to_generator, - const llvm_ir::IrArray::Index& index) const; + const llvm_ir::IrArray::Index& index); StatusOr EmitElementalConcatenate( const HloInstruction* hlo, const HloToElementGeneratorMap& operand_to_generator, - const llvm_ir::IrArray::Index& target_index) const; + const llvm_ir::IrArray::Index& target_index); StatusOr EmitElementalDynamicSlice( const HloInstruction* hlo, const HloToElementGeneratorMap& operand_to_generator, - const llvm_ir::IrArray::Index& index) const; + const llvm_ir::IrArray::Index& index); StatusOr EmitElementalGather( const 
HloInstruction* hlo, const HloToElementGeneratorMap& operand_to_generator, - const llvm_ir::IrArray::Index& index) const; + const llvm_ir::IrArray::Index& index); StatusOr EmitElementalDynamicUpdateSlice( const HloInstruction* hlo, const HloToElementGeneratorMap& operand_to_generator, - const llvm_ir::IrArray::Index& index) const; + const llvm_ir::IrArray::Index& index); StatusOr EmitElementalPad( const HloInstruction* hlo, const HloToElementGeneratorMap& operand_to_generator, - const llvm_ir::IrArray::Index& padded_index) const; + const llvm_ir::IrArray::Index& padded_index); StatusOr EmitElementalDot( const HloInstruction* hlo, const HloToElementGeneratorMap& operand_to_generator, - const llvm_ir::IrArray::Index& dot_result_index) const; + const llvm_ir::IrArray::Index& dot_result_index); llvm::IRBuilder<>* const b_; @@ -215,13 +215,13 @@ class ElementalIrEmitter { // random number generation algorithm. llvm_ir::ElementGenerator MakePhiloxRngElementGenerator( const HloInstruction* hlo, - const HloToElementGeneratorMap& operand_to_generator) const; + const HloToElementGeneratorMap& operand_to_generator); // Converts the raw value generated by a random number generation algorithm // to the distribution requested by the RNG HloInstruction. 
StatusOr ConvertValueForDistribution( const HloInstruction* hlo, const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator, - const llvm_ir::IrArray::Index& index, llvm::Value* raw_value) const; + const llvm_ir::IrArray::Index& index, llvm::Value* raw_value); }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index 87b799e78e..d6e9436348 100644 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -177,6 +177,7 @@ cc_library( "//tensorflow/compiler/xla/service/llvm_ir:dynamic_update_slice_util", "//tensorflow/compiler/xla/service/llvm_ir:fused_ir_emitter", "//tensorflow/compiler/xla/service/llvm_ir:ir_array", + "//tensorflow/compiler/xla/service/llvm_ir:ir_builder_mixin", "//tensorflow/compiler/xla/service/llvm_ir:kernel_support_library", "//tensorflow/compiler/xla/service/llvm_ir:kernel_tiling", "//tensorflow/compiler/xla/service/llvm_ir:llvm_loop", diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc index afcf9fa2ea..57a3a43a6f 100644 --- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc @@ -77,7 +77,7 @@ StatusOr GpuElementalIrEmitter::EmitLibdeviceMathCall( const string& callee_name, tensorflow::gtl::ArraySlice operands, tensorflow::gtl::ArraySlice input_types, - PrimitiveType output_type) const { + PrimitiveType output_type) { // The libdevice math functions differentiate between "double" and "float" by // appending an 'f' to the function's name. 
libdevice doesn't have f16 math // functions, so we convert the operands to f32 before calling the function @@ -94,7 +94,7 @@ StatusOr GpuElementalIrEmitter::EmitLibdeviceMathCall( for (int64 i = 0; i < operands.size(); ++i) { if (input_types[i] == F16) { converted_operands[i] = - b_->CreateFPCast(converted_operands[i], b_->getFloatTy()); + FPCast(converted_operands[i], b_->getFloatTy()); converted_input_types[i] = F32; } } @@ -113,7 +113,7 @@ StatusOr GpuElementalIrEmitter::EmitLibdeviceMathCall( converted_input_types, output_type) .ValueOrDie(); if (cast_result_to_fp16) { - result = b_->CreateFPCast(result, b_->getHalfTy()); + result = FPCast(result, b_->getHalfTy()); } return result; } @@ -122,7 +122,7 @@ StatusOr GpuElementalIrEmitter::EmitLlvmIntrinsicMathCall( const string& callee_name, tensorflow::gtl::ArraySlice operands, tensorflow::gtl::ArraySlice input_types, - PrimitiveType output_type) const { + PrimitiveType output_type) { // llvm intrinsics differentiate between half/float/double functions via // the suffixes ".f16", ".f32" and ".f64". string munged_callee = callee_name; @@ -147,7 +147,7 @@ StatusOr GpuElementalIrEmitter::EmitMathCall( const string& callee_name, tensorflow::gtl::ArraySlice operands, tensorflow::gtl::ArraySlice input_types, - PrimitiveType output_type) const { + PrimitiveType output_type) { // Binary math functions transform are of type [T] -> T. 
for (PrimitiveType input_type : input_types) { if (output_type != input_type) { @@ -163,8 +163,7 @@ StatusOr GpuElementalIrEmitter::EmitMathCall( } StatusOr GpuElementalIrEmitter::EmitFloatBinaryOp( - const HloInstruction* op, llvm::Value* lhs_value, - llvm::Value* rhs_value) const { + const HloInstruction* op, llvm::Value* lhs_value, llvm::Value* rhs_value) { PrimitiveType lhs_input_type = op->operand(0)->shape().element_type(); PrimitiveType rhs_input_type = op->operand(1)->shape().element_type(); PrimitiveType output_type = op->shape().element_type(); @@ -183,8 +182,7 @@ StatusOr GpuElementalIrEmitter::EmitFloatBinaryOp( } StatusOr GpuElementalIrEmitter::EmitPowerOp( - const HloInstruction* op, llvm::Value* lhs_value, - llvm::Value* rhs_value) const { + const HloInstruction* op, llvm::Value* lhs_value, llvm::Value* rhs_value) { CHECK_EQ(op->opcode(), HloOpcode::kPower); PrimitiveType lhs_input_type = op->operand(0)->shape().element_type(); PrimitiveType rhs_input_type = op->operand(1)->shape().element_type(); @@ -218,7 +216,7 @@ StatusOr GpuElementalIrEmitter::EmitPowerOp( // TODO(jlebar): Does this happen with fastmath disabled? If not, should // we force-enable it? 
TF_ASSIGN_OR_RETURN(auto* sqrt, make_sqrt()); - return b_->CreateFDiv(llvm::ConstantFP::get(llvm_ty, 1), sqrt); + return FDiv(llvm::ConstantFP::get(llvm_ty, 1), sqrt); } VLOG(10) << "emitting pow as regular call to pow(): " << op->ToString(); @@ -227,55 +225,56 @@ StatusOr GpuElementalIrEmitter::EmitPowerOp( } StatusOr GpuElementalIrEmitter::EmitErfcInv( - PrimitiveType prim_type, llvm::Value* value) const { + PrimitiveType prim_type, llvm::Value* value) { return EmitLibdeviceMathCall("__nv_erfcinv", {value}, {prim_type}, prim_type); } -StatusOr GpuElementalIrEmitter::EmitLog( - PrimitiveType prim_type, llvm::Value* value) const { +StatusOr GpuElementalIrEmitter::EmitLog(PrimitiveType prim_type, + llvm::Value* value) { return EmitLibdeviceMathCall("__nv_log", {value}, {prim_type}, prim_type); } -StatusOr GpuElementalIrEmitter::EmitLog1p( - PrimitiveType prim_type, llvm::Value* value) const { +StatusOr GpuElementalIrEmitter::EmitLog1p(PrimitiveType prim_type, + llvm::Value* value) { return EmitLibdeviceMathCall("__nv_log1p", {value}, {prim_type}, prim_type); } -StatusOr GpuElementalIrEmitter::EmitSin( - PrimitiveType prim_type, llvm::Value* value) const { +StatusOr GpuElementalIrEmitter::EmitSin(PrimitiveType prim_type, + llvm::Value* value) { return EmitLibdeviceMathCall("__nv_sin", {value}, {prim_type}, prim_type); } -StatusOr GpuElementalIrEmitter::EmitCos( - PrimitiveType prim_type, llvm::Value* value) const { +StatusOr GpuElementalIrEmitter::EmitCos(PrimitiveType prim_type, + llvm::Value* value) { return EmitLibdeviceMathCall("__nv_cos", {value}, {prim_type}, prim_type); } -StatusOr GpuElementalIrEmitter::EmitExp( - PrimitiveType prim_type, llvm::Value* value) const { +StatusOr GpuElementalIrEmitter::EmitExp(PrimitiveType prim_type, + llvm::Value* value) { return EmitLibdeviceMathCall("__nv_exp", {value}, {prim_type}, prim_type); } -StatusOr GpuElementalIrEmitter::EmitExpm1( - PrimitiveType prim_type, llvm::Value* value) const { +StatusOr 
GpuElementalIrEmitter::EmitExpm1(PrimitiveType prim_type, + llvm::Value* value) { return EmitLibdeviceMathCall("__nv_expm1", {value}, {prim_type}, prim_type); } StatusOr GpuElementalIrEmitter::EmitPow(PrimitiveType prim_type, llvm::Value* lhs, - llvm::Value* rhs) const { + llvm::Value* rhs) { return EmitLibdeviceMathCall("__nv_pow", {lhs, rhs}, {prim_type, prim_type}, prim_type); } -StatusOr GpuElementalIrEmitter::EmitAtan2( - PrimitiveType prim_type, llvm::Value* lhs, llvm::Value* rhs) const { +StatusOr GpuElementalIrEmitter::EmitAtan2(PrimitiveType prim_type, + llvm::Value* lhs, + llvm::Value* rhs) { return EmitLibdeviceMathCall("__nv_atan2", {lhs, rhs}, {prim_type, prim_type}, prim_type); } -StatusOr GpuElementalIrEmitter::EmitTanh( - PrimitiveType prim_type, llvm::Value* value) const { +StatusOr GpuElementalIrEmitter::EmitTanh(PrimitiveType prim_type, + llvm::Value* value) { // Emit a fast approximation of tanh instead of calling __nv_tanh. // __nv_tanh is particularly bad because it contains branches, thus // preventing LLVM's load-store vectorizer from working its magic across a @@ -285,9 +284,9 @@ StatusOr GpuElementalIrEmitter::EmitTanh( // Upcast F16 to F32 if necessary. llvm::Type* type = prim_type == F16 ? 
b_->getFloatTy() : value->getType(); - llvm::Value* input = b_->CreateFPCast(value, type); + llvm::Value* input = FPCast(value, type); llvm::Value* fast_tanh = llvm_ir::EmitFastTanh(b_, input); - return b_->CreateFPCast(fast_tanh, value->getType()); + return FPCast(fast_tanh, value->getType()); } llvm::Value* GpuElementalIrEmitter::EmitDeviceFunctionCall( @@ -295,7 +294,7 @@ llvm::Value* GpuElementalIrEmitter::EmitDeviceFunctionCall( tensorflow::gtl::ArraySlice operands, tensorflow::gtl::ArraySlice input_types, PrimitiveType output_type, - tensorflow::gtl::ArraySlice attributes) const { + tensorflow::gtl::ArraySlice attributes) { std::vector ir_input_types; for (PrimitiveType input_type : input_types) { ir_input_types.push_back( @@ -315,29 +314,28 @@ llvm::Value* GpuElementalIrEmitter::EmitDeviceFunctionCall( callee->addFnAttr(attribute); } - return b_->CreateCall(callee, llvm_ir::AsArrayRef(operands)); + return Call(callee, llvm_ir::AsArrayRef(operands)); } -llvm::Value* GpuElementalIrEmitter::EmitThreadId() const { - llvm::Value* block_id = b_->CreateIntCast( - llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_x, - {}, {}, b_), - b_->getIntNTy(128), /*isSigned=*/true, "block.id"); - llvm::Value* thread_id_in_block = b_->CreateIntCast( - llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x, - {}, {}, b_), - b_->getIntNTy(128), /*isSigned=*/true, "thread.id"); - llvm::Value* threads_per_block = b_->CreateIntCast( - llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x, - {}, {}, b_), - b_->getIntNTy(128), /*isSigned=*/true, "threads_per_block"); - return b_->CreateNSWAdd(b_->CreateNSWMul(block_id, threads_per_block), - thread_id_in_block); +llvm::Value* GpuElementalIrEmitter::EmitThreadId() { + llvm::Value* block_id = + IntCast(llvm_ir::EmitCallToIntrinsic( + llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_x, {}, {}, b_), + b_->getIntNTy(128), /*isSigned=*/true, "block.id"); + llvm::Value* thread_id_in_block = + 
IntCast(llvm_ir::EmitCallToIntrinsic( + llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x, {}, {}, b_), + b_->getIntNTy(128), /*isSigned=*/true, "thread.id"); + llvm::Value* threads_per_block = + IntCast(llvm_ir::EmitCallToIntrinsic( + llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x, {}, {}, b_), + b_->getIntNTy(128), /*isSigned=*/true, "threads_per_block"); + return NSWAdd(NSWMul(block_id, threads_per_block), thread_id_in_block); } llvm_ir::ElementGenerator GpuElementalIrEmitter::MakeElementGenerator( const HloInstruction* hlo, - const HloToElementGeneratorMap& operand_to_generator) const { + const HloToElementGeneratorMap& operand_to_generator) { switch (hlo->opcode()) { case HloOpcode::kMap: return [=, &operand_to_generator]( @@ -383,7 +381,7 @@ llvm_ir::ElementGenerator GpuElementalIrEmitter::MakeElementGenerator( TF_ASSIGN_OR_RETURN(llvm::Value * init_value, operand_to_generator.at(hlo->operand(1))( IrArray::Index(index.GetType()))); - b_->CreateStore(init_value, accum_ptr); + Store(init_value, accum_ptr); } llvm::Type* index_type = index.GetType(); @@ -405,22 +403,21 @@ llvm_ir::ElementGenerator GpuElementalIrEmitter::MakeElementGenerator( IrArray::Index input_index(index_type, index.size()); llvm::Value* in_bounds = b_->getInt1(true); for (size_t i = 0; i < index.size(); ++i) { - llvm::Value* stridden_index = b_->CreateNSWMul( + llvm::Value* stridden_index = NSWMul( index[i], index_typed_const(window.dimensions(i).stride())); - input_index[i] = b_->CreateNSWSub( - b_->CreateNSWAdd(stridden_index, window_index[i]), - index_typed_const(window.dimensions(i).padding_low())); + input_index[i] = + NSWSub(NSWAdd(stridden_index, window_index[i]), + index_typed_const(window.dimensions(i).padding_low())); // We must check whether 0 ≤ input_index[i] < bound, as otherwise // we are in the pad and so can skip the computation. This // comparison is equivalent to the unsigned comparison // input_index[i] < bound, as a negative value wraps to a large // positive value. 
- in_bounds = b_->CreateAnd( - in_bounds, - b_->CreateICmpULT( - input_index[i], - index_typed_const(operand->shape().dimensions(i)))); + in_bounds = + And(in_bounds, + ICmpULT(input_index[i], + index_typed_const(operand->shape().dimensions(i)))); } llvm_ir::LlvmIfData if_data = @@ -432,12 +429,11 @@ llvm_ir::ElementGenerator GpuElementalIrEmitter::MakeElementGenerator( operand_to_generator.at(operand)(input_index)); TF_ASSIGN_OR_RETURN( llvm::Value * accum_value, - compute_nested_(*hlo->to_apply(), - {b_->CreateLoad(accum_ptr), input_value})); - b_->CreateStore(accum_value, accum_ptr); + compute_nested_(*hlo->to_apply(), {Load(accum_ptr), input_value})); + Store(accum_value, accum_ptr); SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), b_); - return b_->CreateLoad(accum_ptr); + return Load(accum_ptr); }; case HloOpcode::kReduce: // TODO(b/112040122): This should be supported. diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h index 84454d31bb..91942785d2 100644 --- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h +++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h @@ -48,50 +48,50 @@ class GpuElementalIrEmitter : public ElementalIrEmitter { llvm_ir::ElementGenerator MakeElementGenerator( const HloInstruction* hlo, - const HloToElementGeneratorMap& operand_to_generator) const override; + const HloToElementGeneratorMap& operand_to_generator) override; protected: - StatusOr EmitFloatBinaryOp( - const HloInstruction* op, llvm::Value* lhs_value, - llvm::Value* rhs_value) const override; + StatusOr EmitFloatBinaryOp(const HloInstruction* op, + llvm::Value* lhs_value, + llvm::Value* rhs_value) override; StatusOr EmitErfcInv(PrimitiveType prim_type, - llvm::Value* value) const override; + llvm::Value* value) override; StatusOr EmitLog(PrimitiveType prim_type, - llvm::Value* value) const override; + llvm::Value* value) override; StatusOr 
EmitLog1p(PrimitiveType prim_type, - llvm::Value* value) const override; + llvm::Value* value) override; StatusOr EmitSin(PrimitiveType prim_type, - llvm::Value* value) const override; + llvm::Value* value) override; StatusOr EmitCos(PrimitiveType prim_type, - llvm::Value* value) const override; + llvm::Value* value) override; StatusOr EmitExp(PrimitiveType prim_type, - llvm::Value* value) const override; + llvm::Value* value) override; StatusOr EmitExpm1(PrimitiveType prim_type, - llvm::Value* value) const override; + llvm::Value* value) override; StatusOr EmitPow(PrimitiveType prim_type, llvm::Value* lhs, - llvm::Value* rhs) const override; + llvm::Value* rhs) override; StatusOr EmitAtan2(PrimitiveType prim_type, llvm::Value* lhs, - llvm::Value* rhs) const override; + llvm::Value* rhs) override; StatusOr EmitTanh(PrimitiveType prim_type, - llvm::Value* value) const override; + llvm::Value* value) override; - llvm::Value* EmitThreadId() const override; + llvm::Value* EmitThreadId() override; private: // Emits IR for op, which must have opcode kPower. StatusOr EmitPowerOp(const HloInstruction* op, llvm::Value* lhs_value, - llvm::Value* rhs_value) const; + llvm::Value* rhs_value); // Emits IR to call a device function named "callee_name" on the given // operand. Returns the IR value that represents the return value. @@ -100,7 +100,7 @@ class GpuElementalIrEmitter : public ElementalIrEmitter { tensorflow::gtl::ArraySlice operands, tensorflow::gtl::ArraySlice input_type, PrimitiveType output_type, - tensorflow::gtl::ArraySlice attributes) const; + tensorflow::gtl::ArraySlice attributes); // Emits IR to call an LLVM intrinsic of type [T] -> T. Adjusts // callee_name according to T. 
Returns the IR value that represents the @@ -109,7 +109,7 @@ class GpuElementalIrEmitter : public ElementalIrEmitter { const string& callee_name, tensorflow::gtl::ArraySlice operands, tensorflow::gtl::ArraySlice input_types, - PrimitiveType output_type) const; + PrimitiveType output_type); // Emits IR to call a libdevice function of type [T] -> T. Adjusts // callee_name according to T. Returns the IR value that represents the @@ -118,7 +118,7 @@ class GpuElementalIrEmitter : public ElementalIrEmitter { const string& callee_name, tensorflow::gtl::ArraySlice operands, tensorflow::gtl::ArraySlice input_types, - PrimitiveType output_type) const; + PrimitiveType output_type); // Emits IR to call a function of type [T] -> T. Does not munge callee_name. // Returns the IR value that represents the return value of the function. @@ -126,7 +126,7 @@ class GpuElementalIrEmitter : public ElementalIrEmitter { const string& callee_name, tensorflow::gtl::ArraySlice operands, tensorflow::gtl::ArraySlice input_types, - PrimitiveType output_type) const; + PrimitiveType output_type); const HloModuleConfig& hlo_module_config_; NestedComputer compute_nested_; diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc index 4cbb6d75a8..a620cebe04 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc @@ -156,7 +156,7 @@ Status IrEmitter::EmitCallToNestedComputation( std::vector arguments(operands.begin(), operands.end()); arguments.push_back(output); arguments.push_back(bindings_.GetTempBufferBase()); - b_.CreateCall(emitted_function, arguments); + Call(emitted_function, arguments); return Status::OK(); } @@ -178,7 +178,7 @@ bool IrEmitter::MaybeEmitDirectAtomicOperation( computation.root_instruction()->shape().element_type(); bool is_atomic_integral = element_type == S32 || element_type == U32 || element_type == S64 || element_type == U64; - llvm::Value* source = 
b_.CreateLoad(source_address, "source"); + llvm::Value* source = Load(source_address, "source"); if (root_opcode == HloOpcode::kAdd) { // NVPTX supports atomicAdd on F32 and integer types. if (element_type == F32) { @@ -190,8 +190,8 @@ bool IrEmitter::MaybeEmitDirectAtomicOperation( } if (is_atomic_integral) { // integral + integral - b_.CreateAtomicRMW(llvm::AtomicRMWInst::Add, output_address, source, - llvm::AtomicOrdering::SequentiallyConsistent); + AtomicRMW(llvm::AtomicRMWInst::Add, output_address, source, + llvm::AtomicOrdering::SequentiallyConsistent); return true; } } @@ -202,8 +202,8 @@ bool IrEmitter::MaybeEmitDirectAtomicOperation( auto opcode = primitive_util::IsSignedIntegralType(element_type) ? llvm::AtomicRMWInst::Max : llvm::AtomicRMWInst::UMax; - b_.CreateAtomicRMW(opcode, output_address, source, - llvm::AtomicOrdering::SequentiallyConsistent); + AtomicRMW(opcode, output_address, source, + llvm::AtomicOrdering::SequentiallyConsistent); return true; } @@ -212,8 +212,8 @@ bool IrEmitter::MaybeEmitDirectAtomicOperation( auto opcode = primitive_util::IsSignedIntegralType(element_type) ? llvm::AtomicRMWInst::Min : llvm::AtomicRMWInst::UMin; - b_.CreateAtomicRMW(opcode, output_address, source, - llvm::AtomicOrdering::SequentiallyConsistent); + AtomicRMW(opcode, output_address, source, + llvm::AtomicOrdering::SequentiallyConsistent); return true; } @@ -292,10 +292,10 @@ Status IrEmitter::EmitAtomicOperationUsingCAS(const HloComputation& computation, // cas_old_output_address and cas_new_output_address point to the scratch // memory where we store the old and new values for the repeated atomicCAS // operations. 
- llvm::Value* cas_old_output_address = b_.CreateAlloca( - atomic_type, /*ArraySize=*/nullptr, "cas_old_output_address"); - llvm::Value* cas_new_output_address = b_.CreateAlloca( - atomic_type, /*ArraySize=*/nullptr, "cas_new_output_address"); + llvm::Value* cas_old_output_address = + Alloca(atomic_type, /*ArraySize=*/nullptr, "cas_old_output_address"); + llvm::Value* cas_new_output_address = + Alloca(atomic_type, /*ArraySize=*/nullptr, "cas_new_output_address"); // Emit preparation code to the preheader. llvm::BasicBlock* loop_preheader_bb = b_.GetInsertBlock(); @@ -309,29 +309,26 @@ Status IrEmitter::EmitAtomicOperationUsingCAS(const HloComputation& computation, CHECK_EQ((element_size % sizeof(char)), 0); llvm::Type* address_int_type = module_->getDataLayout().getIntPtrType(output_address_type); - atomic_memory_address = b_.CreatePtrToInt(output_address, address_int_type); + atomic_memory_address = PtrToInt(output_address, address_int_type); llvm::Value* mask = llvm::ConstantInt::get(address_int_type, 3); - llvm::Value* offset = b_.CreateAnd(atomic_memory_address, mask); + llvm::Value* offset = And(atomic_memory_address, mask); mask = llvm::ConstantInt::get(address_int_type, -4); - atomic_memory_address = b_.CreateAnd(atomic_memory_address, mask); + atomic_memory_address = And(atomic_memory_address, mask); atomic_memory_address = - b_.CreateIntToPtr(atomic_memory_address, atomic_address_type); - binop_output_address = b_.CreateAdd( - b_.CreatePtrToInt(cas_new_output_address, address_int_type), offset); + IntToPtr(atomic_memory_address, atomic_address_type); binop_output_address = - b_.CreateIntToPtr(binop_output_address, element_address_type); + Add(PtrToInt(cas_new_output_address, address_int_type), offset); + binop_output_address = IntToPtr(binop_output_address, element_address_type); } else { - atomic_memory_address = - b_.CreateBitCast(output_address, atomic_address_type); + atomic_memory_address = BitCast(output_address, atomic_address_type); 
binop_output_address = - b_.CreateBitCast(cas_new_output_address, element_address_type); + BitCast(cas_new_output_address, element_address_type); } // Use the value from the memory that atomicCAS operates on to initialize // cas_old_output. - llvm::Value* cas_old_output = - b_.CreateLoad(atomic_memory_address, "cas_old_output"); - b_.CreateStore(cas_old_output, cas_old_output_address); + llvm::Value* cas_old_output = Load(atomic_memory_address, "cas_old_output"); + Store(cas_old_output, cas_old_output_address); llvm::BasicBlock* loop_exit_bb = loop_preheader_bb->splitBasicBlock( b_.GetInsertPoint(), "atomic_op_loop_exit"); @@ -344,32 +341,29 @@ Status IrEmitter::EmitAtomicOperationUsingCAS(const HloComputation& computation, // Emit the body of the loop that repeatedly invokes atomicCAS. // // Use cas_old_output to initialize cas_new_output. - cas_old_output = b_.CreateLoad(cas_old_output_address, "cas_old_output"); - b_.CreateStore(cas_old_output, cas_new_output_address); + cas_old_output = Load(cas_old_output_address, "cas_old_output"); + Store(cas_old_output, cas_new_output_address); // Emits code to calculate new_output = operation(old_output, source); TF_RETURN_IF_ERROR(EmitCallToNestedComputation( computation, {binop_output_address, source_address}, binop_output_address)); - llvm::Value* cas_new_output = - b_.CreateLoad(cas_new_output_address, "cas_new_output"); + llvm::Value* cas_new_output = Load(cas_new_output_address, "cas_new_output"); // Emit code to perform the atomicCAS operation // (cas_old_output, success) = atomicCAS(memory_address, cas_old_output, // cas_new_output); - llvm::Value* ret_value = b_.CreateAtomicCmpXchg( - atomic_memory_address, cas_old_output, cas_new_output, - llvm::AtomicOrdering::SequentiallyConsistent, - llvm::AtomicOrdering::SequentiallyConsistent); + llvm::Value* ret_value = + AtomicCmpXchg(atomic_memory_address, cas_old_output, cas_new_output, + llvm::AtomicOrdering::SequentiallyConsistent, + 
llvm::AtomicOrdering::SequentiallyConsistent); // Extract the memory value returned from atomicCAS and store it as // cas_old_output. - b_.CreateStore(b_.CreateExtractValue(ret_value, 0, "cas_old_output"), - cas_old_output_address); + Store(ExtractValue(ret_value, 0, "cas_old_output"), cas_old_output_address); // Extract the success bit returned from atomicCAS and generate a // conditional branch on the success bit. - b_.CreateCondBr(b_.CreateExtractValue(ret_value, 1, "success"), loop_exit_bb, - loop_body_bb); + CondBr(ExtractValue(ret_value, 1, "success"), loop_exit_bb, loop_body_bb); // Set the insertion point to the exit basic block so that the caller of // this method can continue emitting code to the right place. @@ -472,10 +466,10 @@ Status IrEmitter::HandleDot(HloInstruction* dot) { if (ShapeUtil::ElementIsComplex(lhs_shape)) { auto value = MultiplyComplex(lhs_value, rhs_value, &b_); result = llvm::ConstantAggregateZero::get(lhs_array.GetElementLlvmType()); - result = b_.CreateInsertValue(result, value.first, {0}); - result = b_.CreateInsertValue(result, value.second, {1}); + result = InsertValue(result, value.first, {0}); + result = InsertValue(result, value.second, {1}); } else { - result = b_.CreateFMul(lhs_value, rhs_value); + result = FMul(lhs_value, rhs_value); } target_array.EmitWriteArrayElement(/*index=*/element_index, result, &b_); return Status::OK(); @@ -559,21 +553,21 @@ Status IrEmitter::HandleDot(HloInstruction* dot) { &*reduction_loop->GetBodyBasicBlock()->getFirstInsertionPt()); llvm::Value* lhs_element = lhs_array.EmitReadArrayElement(lhs_index, &b_); llvm::Value* rhs_element = rhs_array.EmitReadArrayElement(rhs_index, &b_); - llvm::Value* accum = b_.CreateLoad(accum_address); + llvm::Value* accum = Load(accum_address); llvm::Value* updated_accum; if (ShapeUtil::ElementIsComplex(lhs_shape)) { auto value = MultiplyComplex(lhs_element, rhs_element, &b_); llvm::Value* accum_real = Real(accum, &b_); - llvm::Value* real_sum = 
b_.CreateFAdd(accum_real, value.first); - updated_accum = b_.CreateInsertValue(accum, real_sum, {0}); + llvm::Value* real_sum = FAdd(accum_real, value.first); + updated_accum = InsertValue(accum, real_sum, {0}); llvm::Value* accum_imag = Imag(accum, &b_); - llvm::Value* imag_sum = b_.CreateFAdd(accum_imag, value.second); - updated_accum = b_.CreateInsertValue(updated_accum, imag_sum, {1}); + llvm::Value* imag_sum = FAdd(accum_imag, value.second); + updated_accum = InsertValue(updated_accum, imag_sum, {1}); } else { - llvm::Value* product = b_.CreateFMul(lhs_element, rhs_element); - updated_accum = b_.CreateFAdd(accum, product); + llvm::Value* product = FMul(lhs_element, rhs_element); + updated_accum = FAdd(accum, product); } - b_.CreateStore(updated_accum, accum_address); + Store(updated_accum, accum_address); // After the reduction loop exits, store the accumulator into the target // address. The index into the target address is the concatenation of the rhs @@ -595,7 +589,7 @@ Status IrEmitter::HandleDot(HloInstruction* dot) { SetToFirstInsertPoint(reduction_loop->GetExitBasicBlock(), &b_); target_array.EmitWriteArrayElement( target_index, - b_.CreateLoad(accum_address), // The value written to the target array. + Load(accum_address), // The value written to the target array. &b_); // Set the IR builder insert point to the exit basic block of the outer most @@ -646,10 +640,9 @@ Status IrEmitter::HandleReduce(HloInstruction* reduce) { [=](const llvm_ir::IrArray::Index& index) -> StatusOr { // Initialize an accumulator with init_value. llvm::AllocaInst* accumulator_addr = - b_.CreateAlloca(llvm_ir::PrimitiveTypeToIrType( + Alloca(llvm_ir::PrimitiveTypeToIrType( reduce->shape().element_type(), module_)); - b_.CreateStore(b_.CreateLoad(GetBasePointer(*init_value)), - accumulator_addr); + Store(Load(GetBasePointer(*init_value)), accumulator_addr); // The enclosing loops go over all the target elements. Now we have to // compute the actual target element. 
For this, we build a new loop nest @@ -686,7 +679,7 @@ Status IrEmitter::HandleReduce(HloInstruction* reduce) { *function, {accumulator_addr, input_address}, accumulator_addr)); SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &b_); - return b_.CreateLoad(accumulator_addr); + return Load(accumulator_addr); }); } @@ -769,11 +762,11 @@ StatusOr IrEmitter::ComputeNestedElement( for (llvm::Value* parameter_element : parameter_elements) { parameter_buffers.push_back(llvm_ir::EmitAllocaAtFunctionEntry( parameter_element->getType(), "parameter_buffer", &b_)); - b_.CreateStore(parameter_element, parameter_buffers.back()); + Store(parameter_element, parameter_buffers.back()); } TF_RETURN_IF_ERROR(EmitCallToNestedComputation(computation, parameter_buffers, return_buffer)); - return b_.CreateLoad(return_buffer); + return Load(return_buffer); } } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.h b/tensorflow/compiler/xla/service/gpu/ir_emitter.h index 76e069fc41..e096a07704 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.h @@ -36,6 +36,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h" +#include "tensorflow/compiler/xla/service/llvm_ir/ir_builder_mixin.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h" #include "tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h" #include "tensorflow/compiler/xla/statusor.h" @@ -64,7 +65,8 @@ namespace gpu { // IrEmitterUnnested, but the code is generated using FusedIrEmitter, which is // not a subclass of gpu::IrEmitter, and in fact is better understood as an IR // generator generator. See comments on that class. 
-class IrEmitter : public DfsHloVisitorWithDefault { +class IrEmitter : public DfsHloVisitorWithDefault, + public IrBuilderMixin { public: IrEmitter(const IrEmitter&) = delete; IrEmitter& operator=(const IrEmitter&) = delete; @@ -99,6 +101,8 @@ class IrEmitter : public DfsHloVisitorWithDefault { Status FinishVisit(HloInstruction* root) override { return Status::OK(); } + llvm::IRBuilder<>* builder() { return &b_; } + protected: // Constructs an IrEmitter with the given IrEmitter context. // ir_emitter_context is owned by the caller and should outlive the IrEmitter diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 4d98955c58..c0c8ae181a 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -729,7 +729,7 @@ Status IrEmitterUnnested::EmitExtraOutputsForReduce( "extra_output_element_address"); TF_ASSIGN_OR_RETURN(llvm::Value* const extra_output_ir_value, extra_output_gens[i].first(index)); - b_.CreateStore(extra_output_ir_value, extra_output_address); + Store(extra_output_ir_value, extra_output_address); } return Status::OK(); } @@ -810,17 +810,17 @@ Status IrEmitterUnnested::EmitReductionToScalar( std::vector partial_reduction_result_addresses; for (int i = 0; i != num_reduces; ++i) { llvm::Value* partial_reduction_result_address = - b_.CreateAlloca(element_ir_type, /*ArraySize=*/nullptr, - "partial_reduction_result." + llvm::Twine(i)); + Alloca(element_ir_type, /*ArraySize=*/nullptr, + "partial_reduction_result." 
+ llvm::Twine(i)); TF_ASSIGN_OR_RETURN(llvm::Value* const init_ir_value, init_value_gens[i](IrArray::Index(index_ty))); - b_.CreateStore(init_ir_value, partial_reduction_result_address); + Store(init_ir_value, partial_reduction_result_address); partial_reduction_result_addresses.push_back( partial_reduction_result_address); } llvm::Value* x_in_tiles = tile_index[0]; - x_in_tiles = b_.CreateZExtOrTrunc(x_in_tiles, index_ty); + x_in_tiles = ZExtOrTrunc(x_in_tiles, index_ty); // Emit an inner for-loop that reduces the elements in the tile. auto emit_tile_element_loop = [=](bool tile_in_bounds) -> Status { @@ -832,15 +832,14 @@ Status IrEmitterUnnested::EmitReductionToScalar( // Emit the body of the partial reduction loop. llvm_ir::SetToFirstInsertPoint(tile_element_loop->GetBodyBasicBlock(), &b_); - llvm::Value* x = b_.CreateNSWAdd( - b_.CreateNSWMul(x_in_tiles, index_typed_constant(kTileSize)), - tile_element_loop->GetIndVarValue()); + llvm::Value* x = + NSWAdd(NSWMul(x_in_tiles, index_typed_constant(kTileSize)), + tile_element_loop->GetIndVarValue()); // Unless we know the tile is entirely in bounds, we have to emit a // x-in-bounds check before reading from the input. if (!tile_in_bounds) { llvm_ir::LlvmIfData if_data = llvm_ir::EmitIfThenElse( - b_.CreateICmpULT(x, index_typed_constant(num_elems)), "x_in_bounds", - &b_); + ICmpULT(x, index_typed_constant(num_elems)), "x_in_bounds", &b_); // Emit code that reads the input element and accumulates it to // the partial reduction result. 
@@ -849,11 +848,11 @@ Status IrEmitterUnnested::EmitReductionToScalar( IrArray::Index input_index( /*linear=*/x, input_shape, &b_); - llvm::Value* input_address = b_.CreateAlloca(element_ir_type); + llvm::Value* input_address = Alloca(element_ir_type); for (int i = 0; i != num_reduces; ++i) { TF_ASSIGN_OR_RETURN(llvm::Value* const input_ir_value, input_gens[i](input_index)); - b_.CreateStore(input_ir_value, input_address); + Store(input_ir_value, input_address); TF_RETURN_IF_ERROR(EmitCallToNestedComputation( *reducers[i], {partial_reduction_result_addresses[i], input_address}, @@ -864,14 +863,14 @@ Status IrEmitterUnnested::EmitReductionToScalar( // x_end = kTileSize + x_in_tiles * kTileSize, i.e., the location that's // immediately beyond the tile. - llvm::Value* x_end = b_.CreateNSWAdd( - index_typed_constant(kTileSize), - b_.CreateNSWMul(x_in_tiles, index_typed_constant(kTileSize))); + llvm::Value* x_end = + NSWAdd(index_typed_constant(kTileSize), + NSWMul(x_in_tiles, index_typed_constant(kTileSize))); // The tile is entirely in bound if all_threads_in_bounds or // x_end <= num_elems. 
llvm::Value* tile_in_bounds = - b_.CreateOr(b_.CreateICmpULE(x_end, index_typed_constant(num_elems)), - b_.getInt1(all_threads_in_bounds)); + Or(ICmpULE(x_end, index_typed_constant(num_elems)), + b_.getInt1(all_threads_in_bounds)); llvm_ir::LlvmIfData if_tile_in_bounds_data = llvm_ir::EmitIfThenElse(tile_in_bounds, "tile_in_bounds", &b_); llvm_ir::SetToFirstInsertPoint(if_tile_in_bounds_data.true_block, &b_); @@ -892,20 +891,18 @@ Status IrEmitterUnnested::EmitReductionToScalar( for (int shuffle_distance = kWarpSize / 2; shuffle_distance >= 1; shuffle_distance /= 2) { llvm::Value* result_from_other_lane = - b_.CreateAlloca(element_ir_type, nullptr, "result_from_other_lane"); + Alloca(element_ir_type, nullptr, "result_from_other_lane"); for (int i = 0; i != num_reduces; ++i) { - llvm::Value* partial_reduction_result = b_.CreateLoad( - b_.CreateBitCast(partial_reduction_result_addresses[i], - shuffle_ir_type->getPointerTo()), - "partial_reduction_result"); + llvm::Value* partial_reduction_result = + Load(BitCast(partial_reduction_result_addresses[i], + shuffle_ir_type->getPointerTo()), + "partial_reduction_result"); CHECK_EQ(launch_dimensions.threads_per_block() % kWarpSize, 0) << "Requires block size a multiple of the warp size, otherwise we " "will read undefined elements."; - b_.CreateStore( - EmitFullWarpShuffleDown(partial_reduction_result, - b_.getInt32(shuffle_distance), &b_), - b_.CreateBitCast(result_from_other_lane, - shuffle_ir_type->getPointerTo())); + Store(EmitFullWarpShuffleDown(partial_reduction_result, + b_.getInt32(shuffle_distance), &b_), + BitCast(result_from_other_lane, shuffle_ir_type->getPointerTo())); TF_RETURN_IF_ERROR(EmitCallToNestedComputation( *reducers[i], {partial_reduction_result_addresses[i], result_from_other_lane}, @@ -920,10 +917,9 @@ Status IrEmitterUnnested::EmitReductionToScalar( // lane 0 (which holds the partially accumulated result for its warp) to the // output element. 
llvm::Value* lane_id = - b_.CreateURem(x_in_tiles, index_typed_constant(kWarpSize), "lane_id"); + URem(x_in_tiles, index_typed_constant(kWarpSize), "lane_id"); llvm_ir::LlvmIfData if_lane_id_is_zero_data = llvm_ir::EmitIfThenElse( - b_.CreateICmpEQ(lane_id, index_typed_constant(0)), "lane_id_is_zero", - &b_); + ICmpEQ(lane_id, index_typed_constant(0)), "lane_id_is_zero", &b_); llvm_ir::SetToFirstInsertPoint(if_lane_id_is_zero_data.true_block, &b_); for (int i = 0; i != num_reduces; ++i) { @@ -1043,12 +1039,12 @@ Status IrEmitterUnnested::EmitColumnReduction( for (int i = 0; i != num_reduces; ++i) { for (int x_offset = 0; x_offset < kTileWidth; ++x_offset) { llvm::Value* partial_reduction_result_address = - b_.CreateAlloca(element_ir_type, /*ArraySize=*/nullptr, - "partial_reduction_result." + - llvm::Twine(i * kTileWidth + x_offset)); + Alloca(element_ir_type, /*ArraySize=*/nullptr, + "partial_reduction_result." + + llvm::Twine(i * kTileWidth + x_offset)); TF_ASSIGN_OR_RETURN(llvm::Value* const init_ir_value, init_value_gens[i](IrArray::Index(index_ty))); - b_.CreateStore(init_ir_value, partial_reduction_result_address); + Store(init_ir_value, partial_reduction_result_address); partial_reduction_result_addresses.push_back( partial_reduction_result_address); } @@ -1059,8 +1055,8 @@ Status IrEmitterUnnested::EmitColumnReduction( llvm::Value* y_in_tiles = tile_index[0]; llvm::Value* x_in_tiles = tile_index[1]; - y_in_tiles = b_.CreateZExtOrTrunc(y_in_tiles, index_ty); - x_in_tiles = b_.CreateZExtOrTrunc(x_in_tiles, index_ty); + y_in_tiles = ZExtOrTrunc(y_in_tiles, index_ty); + x_in_tiles = ZExtOrTrunc(x_in_tiles, index_ty); auto emit_tile_element_loop = [=](bool tile_in_y_bounds, bool tile_in_x_bounds) -> Status { @@ -1072,34 +1068,32 @@ Status IrEmitterUnnested::EmitColumnReduction( // Emit the body of the partial reduction loop. 
llvm_ir::SetToFirstInsertPoint(tile_element_loop->GetBodyBasicBlock(), &b_); - llvm::Value* y = b_.CreateNSWAdd( - b_.CreateNSWMul(y_in_tiles, index_typed_constant(kTileHeight)), - tile_element_loop->GetIndVarValue()); + llvm::Value* y = + NSWAdd(NSWMul(y_in_tiles, index_typed_constant(kTileHeight)), + tile_element_loop->GetIndVarValue()); // Unless we know that y is in bounds, we have to emit a check before // reading from the input. if (!tile_in_y_bounds) { llvm_ir::LlvmIfData if_data = llvm_ir::EmitIfThenElse( - b_.CreateICmpULT(y, index_typed_constant(height)), "y_in_bounds", - &b_); + ICmpULT(y, index_typed_constant(height)), "y_in_bounds", &b_); // Emit code that reads the input element and accumulates it to // the partial reduction result. llvm_ir::SetToFirstInsertPoint(if_data.true_block, &b_); } for (int x_offset = 0; x_offset < kTileWidth; ++x_offset) { - llvm::Value* x = b_.CreateNSWAdd( - b_.CreateNSWMul(x_in_tiles, index_typed_constant(kTileWidth)), - index_typed_constant(x_offset)); + llvm::Value* x = + NSWAdd(NSWMul(x_in_tiles, index_typed_constant(kTileWidth)), + index_typed_constant(x_offset)); // Unless we know that x is in bounds, we have to emit a check before // reading from the input. if (!tile_in_x_bounds) { llvm_ir::LlvmIfData if_data = llvm_ir::EmitIfThenElse( - b_.CreateICmpULT(x, index_typed_constant(width)), "x_in_bounds", - &b_); + ICmpULT(x, index_typed_constant(width)), "x_in_bounds", &b_); llvm_ir::SetToFirstInsertPoint(if_data.true_block, &b_); } - llvm::Value* input_address = b_.CreateAlloca(element_ir_type); + llvm::Value* input_address = Alloca(element_ir_type); // {y,x} is an index to input_matrix_shape [height,width]. We need to // convert that to an index to input_shape (the shape of the operand of // "reduce"). 
This conversion is composed of a transposition from @@ -1126,7 +1120,7 @@ Status IrEmitterUnnested::EmitColumnReduction( for (int i = 0; i != num_reduces; ++i) { TF_ASSIGN_OR_RETURN(llvm::Value* const input_ir_value, input_gens[i](input_index)); - b_.CreateStore(input_ir_value, input_address); + Store(input_ir_value, input_address); TF_RETURN_IF_ERROR(EmitCallToNestedComputation( *reducers[i], {partial_reduction_result_addresses[i * kTileWidth + x_offset], @@ -1141,20 +1135,20 @@ Status IrEmitterUnnested::EmitColumnReduction( // y_end = kTileHeight + y_in_tiles * kTileHeight, i.e., the y location // that's immediately beyond the tile. - llvm::Value* y_end = b_.CreateNSWAdd( - index_typed_constant(kTileHeight), - b_.CreateNSWMul(y_in_tiles, index_typed_constant(kTileHeight))); + llvm::Value* y_end = + NSWAdd(index_typed_constant(kTileHeight), + NSWMul(y_in_tiles, index_typed_constant(kTileHeight))); // x_end = kTileWidth + x_in_tiles * kTileWidth, i.e., the x location // that's immediately beyond the tile. - llvm::Value* x_end = b_.CreateNSWAdd( - index_typed_constant(kTileWidth), - b_.CreateNSWMul(x_in_tiles, index_typed_constant(kTileWidth))); + llvm::Value* x_end = + NSWAdd(index_typed_constant(kTileWidth), + NSWMul(x_in_tiles, index_typed_constant(kTileWidth))); llvm::Value* tile_in_y_bounds = - b_.CreateOr(b_.CreateICmpULE(y_end, index_typed_constant(height)), - b_.getInt1(height % kTileHeight == 0)); + Or(ICmpULE(y_end, index_typed_constant(height)), + b_.getInt1(height % kTileHeight == 0)); llvm::Value* tile_in_x_bounds = - b_.CreateOr(b_.CreateICmpULE(x_end, index_typed_constant(width)), - b_.getInt1(width % kTileWidth == 0)); + Or(ICmpULE(x_end, index_typed_constant(width)), + b_.getInt1(width % kTileWidth == 0)); // The tile is in y bounds if "height" is a multiple of kTileHeight or // y_end <= height. llvm_ir::LlvmIfData if_tile_in_y_bounds_data = @@ -1188,9 +1182,9 @@ Status IrEmitterUnnested::EmitColumnReduction( reduce->IsFused() ? 
reduce->parent()->FusionInstruction() : reduce; for (int i = 0; i != num_reduces; ++i) { for (int x_offset = 0; x_offset < kTileWidth; ++x_offset) { - llvm::Value* x = b_.CreateNSWAdd( - b_.CreateNSWMul(x_in_tiles, index_typed_constant(kTileWidth)), - index_typed_constant(x_offset)); + llvm::Value* x = + NSWAdd(NSWMul(x_in_tiles, index_typed_constant(kTileWidth)), + index_typed_constant(x_offset)); llvm::Value* output_address = GetIrArray(*output, *output, reduce_output_shapes[i]) .EmitArrayElementAddress( @@ -1379,11 +1373,11 @@ Status IrEmitterUnnested::EmitRowReduction( std::vector partial_reduction_result_addresses; for (int i = 0; i != num_reduces; ++i) { llvm::Value* partial_reduction_result_address = - b_.CreateAlloca(element_ir_type, /*ArraySize=*/nullptr, - "partial_reduction_result." + llvm::Twine(i)); + Alloca(element_ir_type, /*ArraySize=*/nullptr, + "partial_reduction_result." + llvm::Twine(i)); TF_ASSIGN_OR_RETURN(llvm::Value* const init_ir_value, init_value_gens[i](IrArray::Index(index_ty))); - b_.CreateStore(init_ir_value, partial_reduction_result_address); + Store(init_ir_value, partial_reduction_result_address); partial_reduction_result_addresses.push_back( partial_reduction_result_address); } @@ -1392,22 +1386,20 @@ Status IrEmitterUnnested::EmitRowReduction( llvm::Value* y = tile_index[1]; llvm::Value* x_tile = tile_index[2]; - x_tile = b_.CreateZExtOrTrunc(x_tile, index_ty); + x_tile = ZExtOrTrunc(x_tile, index_ty); llvm::Value* warp_id = - b_.CreateUDiv(x_tile, index_typed_constant(kWarpSize), "warp_id"); + UDiv(x_tile, index_typed_constant(kWarpSize), "warp_id"); llvm::Value* lane_id = - b_.CreateURem(x_tile, index_typed_constant(kWarpSize), "lane_id"); + URem(x_tile, index_typed_constant(kWarpSize), "lane_id"); // The x-location of the last element in this z-x-tile. 
// last_x = lane_id + warpSize * (x_tile_size - 1 + warp_id * x_tile_size); - llvm::Value* last_x = b_.CreateNSWAdd( + llvm::Value* last_x = NSWAdd( lane_id, - b_.CreateNSWMul( - index_typed_constant(kWarpSize), - b_.CreateNSWAdd( - index_typed_constant(x_tile_size - 1), - b_.CreateNSWMul(warp_id, index_typed_constant(x_tile_size))))); + NSWMul(index_typed_constant(kWarpSize), + NSWAdd(index_typed_constant(x_tile_size - 1), + NSWMul(warp_id, index_typed_constant(x_tile_size))))); KernelSupportLibrary ksl( &b_, @@ -1419,9 +1411,8 @@ Status IrEmitterUnnested::EmitRowReduction( auto emit_z_x_tile_element_loop = [&](bool x_tile_in_bounds, int64 x_tile_loop_bound) -> Status { auto emit_z_tile_element_loop = [&](llvm::Value* z_indvar) -> Status { - llvm::Value* z = b_.CreateNSWAdd( - z_indvar, - b_.CreateNSWMul(index_typed_constant(z_tile_size), z_tile)); + llvm::Value* z = + NSWAdd(z_indvar, NSWMul(index_typed_constant(z_tile_size), z_tile)); TF_RETURN_IF_ERROR(ksl.For( "x_tile", /*start=*/index_typed_constant(0), @@ -1429,22 +1420,20 @@ Status IrEmitterUnnested::EmitRowReduction( /*step=*/1, [&](llvm::Value* x_indvar) -> Status { // x = lane_id + // warpSize * (element_id_in_x_tile + warp_id * x_tile_size); - llvm::Value* x = b_.CreateNSWAdd( + llvm::Value* x = NSWAdd( lane_id, - b_.CreateNSWMul( - index_typed_constant(kWarpSize), - b_.CreateNSWAdd( - x_indvar, b_.CreateNSWMul( - warp_id, llvm::ConstantInt::get( - index_ty, x_tile_size))))); + NSWMul(index_typed_constant(kWarpSize), + NSWAdd(x_indvar, + NSWMul(warp_id, llvm::ConstantInt::get( + index_ty, x_tile_size))))); // Unless we know the x-tile is entirely in bounds, we have to // emit a x-in-bounds check before reading from the input. if (!x_tile_in_bounds) { llvm_ir::LlvmIfData if_x_in_bounds_data = llvm_ir::EmitIfThenElse( - b_.CreateICmpULT(x, index_typed_constant(width)), - "x_in_bounds", &b_); + ICmpULT(x, index_typed_constant(width)), "x_in_bounds", + &b_); // Points b_ to the then-block. 
llvm_ir::SetToFirstInsertPoint(if_x_in_bounds_data.true_block, &b_); @@ -1452,7 +1441,7 @@ Status IrEmitterUnnested::EmitRowReduction( // Emit code that reads the input element and accumulates it // to the partial reduction result. - llvm::Value* input_address = b_.CreateAlloca(element_ir_type); + llvm::Value* input_address = Alloca(element_ir_type); { // {z,y,x} is an index to input_3d_tensor_shape // [depth,height,width]. We need to convert that to an index @@ -1483,7 +1472,7 @@ Status IrEmitterUnnested::EmitRowReduction( for (int i = 0; i != num_reduces; ++i) { TF_ASSIGN_OR_RETURN(llvm::Value* const input_ir_value, input_gens[i](input_index)); - b_.CreateStore(input_ir_value, input_address); + Store(input_ir_value, input_address); TF_RETURN_IF_ERROR(EmitCallToNestedComputation( *reducers[i], {partial_reduction_result_addresses[i], input_address}, @@ -1503,8 +1492,8 @@ Status IrEmitterUnnested::EmitRowReduction( }; llvm::Value* tile_in_bounds = - b_.CreateOr(b_.getInt1(width % (x_tile_size * kWarpSize) == 0), - b_.CreateICmpULT(last_x, index_typed_constant(width))); + Or(b_.getInt1(width % (x_tile_size * kWarpSize) == 0), + ICmpULT(last_x, index_typed_constant(width))); TF_RETURN_IF_ERROR( ksl.If(tile_in_bounds, @@ -1532,20 +1521,18 @@ Status IrEmitterUnnested::EmitRowReduction( for (int shuffle_distance = 16; shuffle_distance >= 1; shuffle_distance /= 2) { llvm::Value* result_from_other_lane = - b_.CreateAlloca(element_ir_type, nullptr, "result_from_other_lane"); + Alloca(element_ir_type, nullptr, "result_from_other_lane"); for (int i = 0; i != num_reduces; ++i) { - llvm::Value* partial_reduction_result = b_.CreateLoad( - b_.CreateBitCast(partial_reduction_result_addresses[i], - shuffle_ir_type->getPointerTo()), - "partial_reduction_result"); + llvm::Value* partial_reduction_result = + Load(BitCast(partial_reduction_result_addresses[i], + shuffle_ir_type->getPointerTo()), + "partial_reduction_result"); CHECK_EQ(launch_dimensions.threads_per_block() % kWarpSize, 
0) << "Requires block size a multiple of the warp size, otherwise we " "will read undefined elements."; - b_.CreateStore( - EmitFullWarpShuffleDown(partial_reduction_result, - b_.getInt32(shuffle_distance), &b_), - b_.CreateBitCast(result_from_other_lane, - shuffle_ir_type->getPointerTo())); + Store(EmitFullWarpShuffleDown(partial_reduction_result, + b_.getInt32(shuffle_distance), &b_), + BitCast(result_from_other_lane, shuffle_ir_type->getPointerTo())); TF_RETURN_IF_ERROR(EmitCallToNestedComputation( *reducers[i], {partial_reduction_result_addresses[i], result_from_other_lane}, @@ -1560,8 +1547,7 @@ Status IrEmitterUnnested::EmitRowReduction( // lane 0 (which holds the partially accumulated result for its warp) to the // output element. llvm_ir::LlvmIfData if_lane_id_is_zero_data = llvm_ir::EmitIfThenElse( - b_.CreateICmpEQ(lane_id, index_typed_constant(0)), "lane_id_is_zero", - &b_); + ICmpEQ(lane_id, index_typed_constant(0)), "lane_id_is_zero", &b_); llvm_ir::SetToFirstInsertPoint(if_lane_id_is_zero_data.true_block, &b_); for (int i = 0; i != num_reduces; ++i) { llvm::Value* output_address = @@ -1845,7 +1831,7 @@ Status IrEmitterUnnested::HandleSelectAndScatter( &b_); llvm::Value* initialized_flag_address = llvm_ir::EmitAllocaAtFunctionEntry( b_.getInt1Ty(), "initialized_flag_address", &b_); - b_.CreateStore(b_.getInt1(false), initialized_flag_address); + Store(b_.getInt1(false), initialized_flag_address); // Create the inner loop to iterate over the window. 
llvm_ir::ForLoopNest window_loops(IrName(select_and_scatter, "inner"), &b_, @@ -1866,15 +1852,15 @@ Status IrEmitterUnnested::HandleSelectAndScatter( IrArray::Index operand_index(index_type, source_index.size()); llvm::Value* in_bounds_condition = b_.getInt1(true); for (int64 i = 0; i < rank; ++i) { - llvm::Value* strided_index = b_.CreateNSWMul( + llvm::Value* strided_index = NSWMul( source_index[i], index_typed_constant(window.dimensions(i).stride())); - operand_index[i] = b_.CreateNSWSub( - b_.CreateNSWAdd(strided_index, window_index[i]), - index_typed_constant(window.dimensions(i).padding_low())); - llvm::Value* index_condition = b_.CreateICmpULT( + operand_index[i] = + NSWSub(NSWAdd(strided_index, window_index[i]), + index_typed_constant(window.dimensions(i).padding_low())); + llvm::Value* index_condition = ICmpULT( operand_index[i], index_typed_constant(ShapeUtil::GetDimension(operand->shape(), i))); - in_bounds_condition = b_.CreateAnd(in_bounds_condition, index_condition); + in_bounds_condition = And(in_bounds_condition, index_condition); } CHECK(in_bounds_condition != nullptr); @@ -1884,7 +1870,7 @@ Status IrEmitterUnnested::HandleSelectAndScatter( llvm_ir::EmitIfThenElse(in_bounds_condition, "in-bounds", &b_); llvm_ir::SetToFirstInsertPoint(if_in_bounds.true_block, &b_); llvm_ir::LlvmIfData if_initialized = llvm_ir::EmitIfThenElse( - b_.CreateLoad(initialized_flag_address), "initialized", &b_); + Load(initialized_flag_address), "initialized", &b_); // If the initialized_flag is false, initialize the selected value and index // with the currently visiting operand. 
@@ -1892,16 +1878,16 @@ Status IrEmitterUnnested::HandleSelectAndScatter( const auto save_operand_index = [&](const IrArray::Index& operand_index) { for (int64 i = 0; i < rank; ++i) { llvm::Value* selected_index_address_slot = - b_.CreateInBoundsGEP(selected_index_address, {b_.getInt32(i)}); - b_.CreateStore(operand_index[i], selected_index_address_slot); + InBoundsGEP(selected_index_address, {b_.getInt32(i)}); + Store(operand_index[i], selected_index_address_slot); } }; IrArray operand_array = GetIrArray(*operand, *select_and_scatter); llvm::Value* operand_data = operand_array.EmitReadArrayElement(operand_index, &b_); - b_.CreateStore(operand_data, selected_value_address); + Store(operand_data, selected_value_address); save_operand_index(operand_index); - b_.CreateStore(b_.getInt1(true), initialized_flag_address); + Store(b_.getInt1(true), initialized_flag_address); // If the initialized_flag is true, call the `select` function to // potentially update the selected value and index with the currently @@ -1917,11 +1903,11 @@ Status IrEmitterUnnested::HandleSelectAndScatter( TF_RETURN_IF_ERROR(EmitCallToNestedComputation( *select_and_scatter->select(), {selected_value_address, operand_address}, select_return_buffer)); - llvm::Value* result = b_.CreateLoad(select_return_buffer); + llvm::Value* result = Load(select_return_buffer); // If the 'select' function returns false, update the selected value and the // index to the currently visiting operand. 
- llvm::Value* cond = b_.CreateICmpNE( + llvm::Value* cond = ICmpNE( result, llvm::ConstantInt::get(llvm_ir::PrimitiveTypeToIrType( PRED, ir_emitter_context_->llvm_module()), @@ -1930,7 +1916,7 @@ Status IrEmitterUnnested::HandleSelectAndScatter( llvm_ir::LlvmIfData if_select_lhs = llvm_ir::EmitIfThenElse(cond, "if-select-lhs", &b_); llvm_ir::SetToFirstInsertPoint(if_select_lhs.false_block, &b_); - b_.CreateStore(b_.CreateLoad(operand_address), selected_value_address); + Store(Load(operand_address), selected_value_address); save_operand_index(operand_index); // After iterating over the window elements, scatter the source element to @@ -1942,8 +1928,8 @@ Status IrEmitterUnnested::HandleSelectAndScatter( IrArray::Index selected_index(operand_index.GetType()); for (int64 i = 0; i < rank; ++i) { llvm::Value* selected_index_address_slot = - b_.CreateInBoundsGEP(selected_index_address, {b_.getInt32(i)}); - selected_index.push_back(b_.CreateLoad(selected_index_address_slot)); + InBoundsGEP(selected_index_address, {b_.getInt32(i)}); + selected_index.push_back(Load(selected_index_address_slot)); } llvm::Value* source_value_address = GetIrArray(*source, *select_and_scatter) @@ -2367,8 +2353,8 @@ std::unique_ptr IrEmitterUnnested::BuildKernelThunk( *slice.allocation()))); CHECK_NE(loc, nullptr); } else { - loc = b_.CreateInBoundsGEP(kernel_args.at(slice.allocation()), - {b_.getInt64(slice.offset())}); + loc = InBoundsGEP(kernel_args.at(slice.allocation()), + {b_.getInt64(slice.offset())}); } // If gte_index is nonempty, we have to dereference `loc` to get to the @@ -2376,8 +2362,8 @@ std::unique_ptr IrEmitterUnnested::BuildKernelThunk( llvm::Type* int8_double_pointer = llvm::PointerType::get(b_.getInt8PtrTy(), /*AddressSpace=*/0); for (int64 idx : gte_index) { - loc = b_.CreateBitCast(loc, int8_double_pointer); - loc = b_.CreateLoad(b_.CreateInBoundsGEP(loc, {b_.getInt64(idx)})); + loc = BitCast(loc, int8_double_pointer); + loc = Load(InBoundsGEP(loc, {b_.getInt64(idx)})); } 
bindings_.BindHloToIrValue(*instr, loc, index); @@ -3154,9 +3140,8 @@ LaunchDimensions IrEmitterUnnested::EmitHlo021Tile( const IrArray::Index output_tile_origin = [&] { IrArray::Index index = output_tile_index; for (int i = 1; i < 3; ++i) { - index[i] = - b_.CreateMul(output_tile_index[i], index_typed_constant(kTileSize), - "tile_origin." + std::to_string(i)); + index[i] = Mul(output_tile_index[i], index_typed_constant(kTileSize), + "tile_origin." + std::to_string(i)); } return index; }(); @@ -3169,12 +3154,12 @@ LaunchDimensions IrEmitterUnnested::EmitHlo021Tile( std::vector output_tile_bounds(3); for (int i = 1; i < 3; ++i) { // Only last row or column may not have full size. - output_tile_bounds[i] = b_.CreateSelect( - b_.CreateICmpEQ(output_tile_index[i], - index_typed_constant(output_dims_in_tiles[i] - 1)), - index_typed_constant(reduced_output_dims[i] - - (output_dims_in_tiles[i] - 1) * kTileSize), - index_typed_constant(kTileSize), "kTileSize"); + output_tile_bounds[i] = + Select(ICmpEQ(output_tile_index[i], + index_typed_constant(output_dims_in_tiles[i] - 1)), + index_typed_constant(reduced_output_dims[i] - + (output_dims_in_tiles[i] - 1) * kTileSize), + index_typed_constant(kTileSize), "kTileSize"); } KernelSupportLibrary ksl(&b_, llvm_ir::UnrollMode::kDefaultUnroll); @@ -3192,7 +3177,7 @@ LaunchDimensions IrEmitterUnnested::EmitHlo021Tile( // Adds `addend` to the given `dim` of `index`. auto offset_dim = [&](IrArray::Index index, llvm::Value* addend, int64 dim) { - index[dim] = b_.CreateAdd(index[dim], addend); + index[dim] = Add(index[dim], addend); return index; }; const IrArray::Index input_index = @@ -3208,10 +3193,9 @@ LaunchDimensions IrEmitterUnnested::EmitHlo021Tile( llvm::Value* shmem_buffer = param_shmem_buffers[id]; // TODO(jlebar): Add AA metadata to this store. Tile buffers are // global variables, so LLVM can't infer much about it. 
- b_.CreateStore( - input_in_logical_shape.EmitReadArrayElement(index, &b_, - "input_element"), - b_.CreateGEP(shmem_buffer, {index_typed_constant(0), y_loc, x})); + Store(input_in_logical_shape.EmitReadArrayElement(index, &b_, + "input_element"), + GEP(shmem_buffer, {index_typed_constant(0), y_loc, x})); } }); @@ -3232,9 +3216,9 @@ LaunchDimensions IrEmitterUnnested::EmitHlo021Tile( output_index, "output", output_tile_bounds[2], output_tile_bounds[1], [&](const IrArray::Index& index, llvm::Value* y_loc) { // TODO(jlebar): Add AA metadata to this load. - llvm::Instruction* load_from_shmem_buffer = b_.CreateLoad( - b_.CreateGEP(param_shmem_buffers[0], {b_.getInt64(0), x, y_loc}), - "output_element"); + llvm::Instruction* load_from_shmem_buffer = + Load(GEP(param_shmem_buffers[0], {b_.getInt64(0), x, y_loc}), + "output_element"); output_in_reduced_shape_arrays[0].EmitWriteArrayElement( index, load_from_shmem_buffer, &b_); }); @@ -3262,7 +3246,7 @@ LaunchDimensions IrEmitterUnnested::EmitHlo021Tile( output_in_reduced_shape_arrays.size()); for (int64 i = 0; i < output_in_reduced_shape_arrays.size(); ++i) { output_in_reduced_shape_arrays[i].EmitWriteArrayElement( - index, b_.CreateExtractValue(output_value, i), &b_); + index, ExtractValue(output_value, i), &b_); } } else { output_in_reduced_shape_arrays[0].EmitWriteArrayElement( diff --git a/tensorflow/compiler/xla/service/llvm_ir/BUILD b/tensorflow/compiler/xla/service/llvm_ir/BUILD index 786448ea76..be12d7c90c 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/BUILD +++ b/tensorflow/compiler/xla/service/llvm_ir/BUILD @@ -249,3 +249,12 @@ cc_library( "@llvm//:core", ], ) + +cc_library( + name = "ir_builder_mixin", + srcs = [], + hdrs = ["ir_builder_mixin.h"], + deps = [ + "@llvm//:core", + ], +) diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_builder_mixin.h b/tensorflow/compiler/xla/service/llvm_ir/ir_builder_mixin.h new file mode 100644 index 0000000000..abc06fb7b4 --- /dev/null +++ 
b/tensorflow/compiler/xla/service/llvm_ir/ir_builder_mixin.h @@ -0,0 +1,400 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_IR_BUILDER_MIXIN_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_IR_BUILDER_MIXIN_H_ + +#include "llvm/IR/IRBuilder.h" + +namespace xla { + +// Mixin class that injects more ergonomic versions of llvm::IRBuilder methods +// into a class. Intended to be used as a CRTP base class, like: +// +// class MyIrEmitter : public IrBuilderMixin { +// llvm::IRBuilder<>* builder() { return builder_; } +// +// void EmitFoo(HloInstruction* foo) { +// Add(Mul(...), FPToUI(...)); +// } +// }; + +template +class IrBuilderMixin { + protected: + template + llvm::Value* Add(Args&&... args) { + return mixin_builder()->CreateAdd(std::forward(args)...); + } + + template + llvm::LoadInst* AlignedLoad(Args&&... args) { + return mixin_builder()->CreateAlignedLoad(std::forward(args)...); + } + + template + llvm::StoreInst* AlignedStore(Args&&... args) { + return mixin_builder()->CreateAlignedStore(std::forward(args)...); + } + + template + llvm::AllocaInst* Alloca(Args&&... args) { + return mixin_builder()->CreateAlloca(std::forward(args)...); + } + + template + llvm::Value* And(Args&&... 
args) { + return mixin_builder()->CreateAnd(std::forward(args)...); + } + + template + llvm::Value* AtomicCmpXchg(Args&&... args) { + return mixin_builder()->CreateAtomicCmpXchg(std::forward(args)...); + } + + template + llvm::Value* AtomicRMW(Args&&... args) { + return mixin_builder()->CreateAtomicRMW(std::forward(args)...); + } + + template + llvm::Value* BitCast(Args&&... args) { + return mixin_builder()->CreateBitCast(std::forward(args)...); + } + + template + llvm::Value* Br(Args&&... args) { + return mixin_builder()->CreateBr(std::forward(args)...); + } + + llvm::CallInst* Call(llvm::Value* callee, + llvm::ArrayRef args = llvm::None, + const llvm::Twine& name = "", + llvm::MDNode* fp_math_tag = nullptr) { + return mixin_builder()->CreateCall(callee, args, name, fp_math_tag); + } + + template + llvm::BranchInst* CondBr(Args&&... args) { + return mixin_builder()->CreateCondBr(std::forward(args)...); + } + + template + llvm::Value* ConstInBoundsGEP1_32(Args&&... args) { + return mixin_builder()->CreateConstInBoundsGEP1_32( + std::forward(args)...); + } + + template + llvm::Value* FAdd(Args&&... args) { + return mixin_builder()->CreateFAdd(std::forward(args)...); + } + + template + llvm::Value* FMul(Args&&... args) { + return mixin_builder()->CreateFMul(std::forward(args)...); + } + + llvm::Value* GEP(llvm::Value* ptr, llvm::ArrayRef idx_list, + const llvm::Twine& name = "") { + return mixin_builder()->CreateGEP(ptr, idx_list, name); + } + + template + llvm::Value* ICmpEQ(Args&&... args) { + return mixin_builder()->CreateICmpEQ(std::forward(args)...); + } + + template + llvm::Value* ICmpNE(Args&&... args) { + return mixin_builder()->CreateICmpNE(std::forward(args)...); + } + + template + llvm::Value* ICmpULE(Args&&... args) { + return mixin_builder()->CreateICmpULE(std::forward(args)...); + } + + template + llvm::Value* ICmpULT(Args&&... 
args) { + return mixin_builder()->CreateICmpULT(std::forward(args)...); + } + + llvm::Value* InBoundsGEP(llvm::Value* ptr, + llvm::ArrayRef idx_list, + const llvm::Twine& name = "") { + return mixin_builder()->CreateInBoundsGEP(ptr, idx_list, name); + } + + llvm::Value* ExtractValue(llvm::Value* agg, llvm::ArrayRef idxs, + const llvm::Twine& name = "") { + return mixin_builder()->CreateExtractValue(agg, idxs, name); + } + + llvm::Value* InsertValue(llvm::Value* agg, llvm::Value* val, + llvm::ArrayRef idxs, + const llvm::Twine& name = "") { + return mixin_builder()->CreateInsertValue(agg, val, idxs, name); + } + + template + llvm::Value* IntToPtr(Args&&... args) { + return mixin_builder()->CreateIntToPtr(std::forward(args)...); + } + + template + llvm::LoadInst* Load(Args&&... args) { + return mixin_builder()->CreateLoad(std::forward(args)...); + } + + template + llvm::CallInst* MemCpy(Args&&... args) { + return mixin_builder()->CreateMemCpy(std::forward(args)...); + } + + template + llvm::Value* Mul(Args&&... args) { + return mixin_builder()->CreateMul(std::forward(args)...); + } + + template + llvm::Value* NSWAdd(Args&&... args) { + return mixin_builder()->CreateNSWAdd(std::forward(args)...); + } + + template + llvm::Value* NSWMul(Args&&... args) { + return mixin_builder()->CreateNSWMul(std::forward(args)...); + } + + template + llvm::Value* NSWSub(Args&&... args) { + return mixin_builder()->CreateNSWSub(std::forward(args)...); + } + + template + llvm::Value* Or(Args&&... args) { + return mixin_builder()->CreateOr(std::forward(args)...); + } + + template + llvm::Value* PointerCast(Args&&... args) { + return mixin_builder()->CreatePointerCast(std::forward(args)...); + } + + template + llvm::Value* PtrToInt(Args&&... args) { + return mixin_builder()->CreatePtrToInt(std::forward(args)...); + } + + template + llvm::Value* SDiv(Args&&... args) { + return mixin_builder()->CreateSDiv(std::forward(args)...); + } + + template + llvm::Value* Select(Args&&... 
args) { + return mixin_builder()->CreateSelect(std::forward(args)...); + } + + template + llvm::Value* SRem(Args&&... args) { + return mixin_builder()->CreateSRem(std::forward(args)...); + } + + template + llvm::StoreInst* Store(Args&&... args) { + return mixin_builder()->CreateStore(std::forward(args)...); + } + + template + llvm::Value* UDiv(Args&&... args) { + return mixin_builder()->CreateUDiv(std::forward(args)...); + } + + template + llvm::Value* URem(Args&&... args) { + return mixin_builder()->CreateURem(std::forward(args)...); + } + + template + llvm::Value* VectorSplat(Args&&... args) { + return mixin_builder()->CreateVectorSplat(std::forward(args)...); + } + + template + llvm::Value* ZExtOrTrunc(Args&&... args) { + return mixin_builder()->CreateZExtOrTrunc(std::forward(args)...); + } + + template + llvm::Value* AShr(Args&&... args) { + return mixin_builder()->CreateAShr(std::forward(args)...); + } + + template + llvm::Value* FCmpOEQ(Args&&... args) { + return mixin_builder()->CreateFCmpOEQ(std::forward(args)...); + } + + template + llvm::Value* FCmpOLT(Args&&... args) { + return mixin_builder()->CreateFCmpOLT(std::forward(args)...); + } + + template + llvm::Value* FCmpONE(Args&&... args) { + return mixin_builder()->CreateFCmpONE(std::forward(args)...); + } + + template + llvm::Value* FCmpUNE(Args&&... args) { + return mixin_builder()->CreateFCmpUNE(std::forward(args)...); + } + + template + llvm::Value* FDiv(Args&&... args) { + return mixin_builder()->CreateFDiv(std::forward(args)...); + } + + template + llvm::Value* FNeg(Args&&... args) { + return mixin_builder()->CreateFNeg(std::forward(args)...); + } + + template + llvm::Value* FPCast(Args&&... args) { + return mixin_builder()->CreateFPCast(std::forward(args)...); + } + + template + llvm::Value* FPToSI(Args&&... args) { + return mixin_builder()->CreateFPToSI(std::forward(args)...); + } + + template + llvm::Value* FPToUI(Args&&... 
args) { + return mixin_builder()->CreateFPToUI(std::forward(args)...); + } + + template + llvm::Value* FPTrunc(Args&&... args) { + return mixin_builder()->CreateFPTrunc(std::forward(args)...); + } + + template + llvm::Value* FRem(Args&&... args) { + return mixin_builder()->CreateFRem(std::forward(args)...); + } + + template + llvm::Value* FSub(Args&&... args) { + return mixin_builder()->CreateFSub(std::forward(args)...); + } + + template + llvm::Value* ICmpSGE(Args&&... args) { + return mixin_builder()->CreateICmpSGE(std::forward(args)...); + } + + template + llvm::Value* ICmpSLT(Args&&... args) { + return mixin_builder()->CreateICmpSLT(std::forward(args)...); + } + + template + llvm::Value* IntCast(Args&&... args) { + return mixin_builder()->CreateIntCast(std::forward(args)...); + } + + template + llvm::Value* LShr(Args&&... args) { + return mixin_builder()->CreateLShr(std::forward(args)...); + } + + template + llvm::Value* MemSet(Args&&... args) { + return mixin_builder()->CreateMemSet(std::forward(args)...); + } + + template + llvm::Value* Neg(Args&&... args) { + return mixin_builder()->CreateNeg(std::forward(args)...); + } + + template + llvm::Value* Not(Args&&... args) { + return mixin_builder()->CreateNot(std::forward(args)...); + } + + template + llvm::PHINode* PHI(Args&&... args) { + return mixin_builder()->CreatePHI(std::forward(args)...); + } + + template + llvm::Value* RetVoid(Args&&... args) { + return mixin_builder()->CreateRetVoid(std::forward(args)...); + } + + template + llvm::Value* SExtOrTrunc(Args&&... args) { + return mixin_builder()->CreateSExtOrTrunc(std::forward(args)...); + } + + template + llvm::Value* Shl(Args&&... args) { + return mixin_builder()->CreateShl(std::forward(args)...); + } + + template + llvm::Value* SIToFP(Args&&... args) { + return mixin_builder()->CreateSIToFP(std::forward(args)...); + } + + template + llvm::Value* Sub(Args&&... 
args) { + return mixin_builder()->CreateSub(std::forward(args)...); + } + + template + llvm::Value* Trunc(Args&&... args) { + return mixin_builder()->CreateTrunc(std::forward(args)...); + } + + template + llvm::Value* UIToFP(Args&&... args) { + return mixin_builder()->CreateUIToFP(std::forward(args)...); + } + + template + llvm::Value* Unreachable(Args&&... args) { + return mixin_builder()->CreateUnreachable(std::forward(args)...); + } + + template + llvm::Value* Xor(Args&&... args) { + return mixin_builder()->CreateXor(std::forward(args)...); + } + + private: + llvm::IRBuilder<>* mixin_builder() { + return static_cast(this)->builder(); + } +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_IR_BUILDER_MIXIN_H_ -- GitLab From 0ca6969756137735b5ddd33618c06221063f5b6e Mon Sep 17 00:00:00 2001 From: Paul Donnelly Date: Mon, 27 Aug 2018 20:24:16 -0700 Subject: [PATCH 198/598] GPU int8x4 implementation of Relu PiperOrigin-RevId: 210482715 --- tensorflow/core/kernels/relu_op.cc | 27 ++++++++++++++ tensorflow/core/kernels/relu_op_gpu.cu.cc | 35 ++++++++++++++++++- tensorflow/core/ops/nn_ops.cc | 2 +- .../python/kernel_tests/relu_op_test.py | 30 ++++++++++++++++ 4 files changed, 92 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/kernels/relu_op.cc b/tensorflow/core/kernels/relu_op.cc index d52358737f..173fea37ed 100644 --- a/tensorflow/core/kernels/relu_op.cc +++ b/tensorflow/core/kernels/relu_op.cc @@ -124,6 +124,12 @@ namespace functor { typename TTypes::Tensor backprops); \ extern template struct SeluGrad; +template <> +void Relu::operator()( + const GPUDevice& d, typename TTypes::ConstTensor features, + typename TTypes::Tensor activations); +extern template struct Relu; + TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC); } // namespace functor @@ -157,6 +163,27 @@ TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC); TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); #undef REGISTER_GPU_KERNELS +template +class ReluOp + : public UnaryElementWiseOp> { 
+ public: + using UnaryElementWiseOp>::UnaryElementWiseOp; + + void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) { + auto flat_input = input.flat(); + OP_REQUIRES(context, (flat_input.size() % 4) == 0, + errors::InvalidArgument( + "Tensor size must be a multiple of 4 for Relu. Got ", + flat_input.size())); + functor::Relu func; + func(context->eigen_device(), flat_input, output->flat()); + } +}; + +REGISTER_KERNEL_BUILDER( + Name("Relu").Device(DEVICE_GPU).TypeConstraint("T"), + ReluOp); + #endif // GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/relu_op_gpu.cu.cc b/tensorflow/core/kernels/relu_op_gpu.cu.cc index 089ca8ed27..b9391517c1 100644 --- a/tensorflow/core/kernels/relu_op_gpu.cu.cc +++ b/tensorflow/core/kernels/relu_op_gpu.cu.cc @@ -103,7 +103,7 @@ struct ReluGrad { int32 count = gradient.size(); if (count == 0) return; int32 half2_count = Eigen::divup(count, 2); - const int32 kThreadInBlock = 512; + constexpr int32 kThreadInBlock = 512; CudaLaunchConfig config = GetCudaLaunchConfigFixedBlockSize( half2_count, d, ReluGradHalfKernel, 0, kThreadInBlock); ReluGradHalfKernel<< { backprop.data(), count); } }; + +__global__ void Relu_int8x4_kernel(int vect_count, const int32* input, + int32* output) { + CUDA_1D_KERNEL_LOOP(index, vect_count) { + output[index] = __vmaxs4(input[index], 0); + } +} + +// Functor used by ReluOp to do the computations. +template +struct Relu { + // Computes Relu activation of 'input' containing int8 elements, whose buffer + // size should be a multiple of 4, and aligned to an int32* boundary. + // (Alignment should be guaranteed by the GPU tensor allocator). + // 'output' should have the same size as 'input'. 
+ void operator()(const Device& d, typename TTypes::ConstTensor input, + typename TTypes::Tensor output) { + int32 count = input.size(); + if (count == 0) return; + + int32 vect_count = Eigen::divup(count, 4); + constexpr int32 kThreadInBlock = 512; + CudaLaunchConfig config = GetCudaLaunchConfigFixedBlockSize( + vect_count, d, Relu_int8x4_kernel, 0, kThreadInBlock); + Relu_int8x4_kernel<<>>( + vect_count, reinterpret_cast(input.data()), + reinterpret_cast(output.data())); + } +}; + } // namespace functor // Definition of the GPU implementations declared in relu_op.cc. @@ -126,6 +157,8 @@ struct ReluGrad { TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS); +template struct functor::Relu; + } // end namespace tensorflow #endif // GOOGLE_CUDA diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc index 658e116ac8..2485fa4717 100644 --- a/tensorflow/core/ops/nn_ops.cc +++ b/tensorflow/core/ops/nn_ops.cc @@ -960,7 +960,7 @@ REGISTER_OP("Dilation2DBackpropFilter") REGISTER_OP("Relu") .Input("features: T") .Output("activations: T") - .Attr("T: realnumbertype") + .Attr("T: {realnumbertype, qint8}") .SetShapeFn(shape_inference::UnchangedShape); REGISTER_OP("ReluGrad") diff --git a/tensorflow/python/kernel_tests/relu_op_test.py b/tensorflow/python/kernel_tests/relu_op_test.py index 25e947f09e..657d92fa23 100644 --- a/tensorflow/python/kernel_tests/relu_op_test.py +++ b/tensorflow/python/kernel_tests/relu_op_test.py @@ -23,6 +23,7 @@ from six.moves import xrange # pylint: disable=redefined-builtin from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors from tensorflow.python.ops import array_ops from tensorflow.python.ops import gradient_checker from tensorflow.python.ops import gradients_impl @@ -71,6 +72,35 @@ class ReluTest(test.TestCase): np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t), use_gpu=True) + def _testReluInt8x4(self, np_inputs): + if not 
test.is_gpu_available(cuda_only=True): + return + np_relu = self._npRelu(np_inputs) + with self.test_session(use_gpu=True): + relu = nn_ops.relu(constant_op.constant(np_inputs, dtypes.qint8)) + if np_inputs.size % 4 == 0: + tf_relu = relu.eval() + self.assertAllClose(np_relu, tf_relu) + self.assertShapeEqual(np_relu, relu) + else: + with self.assertRaisesRegexp( + errors.InvalidArgumentError, + "Tensor size must be a multiple of 4 for Relu. Got %d" % + np_inputs.size): + tf_relu = relu.eval() + + def testReluInt8x4GoodShape(self): + self._testReluInt8x4(np.array([[-50, 7, 23, 0], [-1, -5, 6, 11]])) + + def testReluInt8x4BadShape(self): + np_inputs = np.array([[-50, 7, 23], [0, 1, -5], [6, -2, 11]]) + self.assertEqual(np_inputs.size, 9) + self._testReluInt8x4(np_inputs) + np_inputs = np.array( + [1, -2, 3, -4, 5, -6, 7, -8, 9, -8, 7, -6, 5, -4, 3, -2, 1]) + self.assertEqual(np_inputs.size, 17) + self._testReluInt8x4(np_inputs) + # The gradient test for ReLU is a bit tricky as the derivative is not well # defined at around zero and we want to avoid that in terms of input values. def testGradientFloat32(self): -- GitLab From 7e3683826049bb59bc1e2ec6603613dfb5772ac5 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 27 Aug 2018 20:30:14 -0700 Subject: [PATCH 199/598] Update visibility rules. 
PiperOrigin-RevId: 210483116 --- tensorflow/contrib/coder/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/contrib/coder/BUILD b/tensorflow/contrib/coder/BUILD index 855c824ead..4bfd753bb1 100644 --- a/tensorflow/contrib/coder/BUILD +++ b/tensorflow/contrib/coder/BUILD @@ -3,6 +3,7 @@ package(default_visibility = [ "//learning/brain:__subpackages__", + "//research/vision/piedpiper:__subpackages__", "//tensorflow:__subpackages__", ]) -- GitLab From 52ed16ca14cd2db0cba1a33ff6f2d70f56a1bb0e Mon Sep 17 00:00:00 2001 From: Dimitris Vardoulakis Date: Mon, 27 Aug 2018 20:36:26 -0700 Subject: [PATCH 200/598] [TF:XLA] Run the points-to analysis for a module group. PiperOrigin-RevId: 210483654 --- tensorflow/compiler/xla/service/BUILD | 1 + .../compiler/xla/service/hlo_module_group_metadata.cc | 9 +++++++++ .../compiler/xla/service/hlo_module_group_metadata.h | 8 ++++++++ 3 files changed, 18 insertions(+) diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index 025f0b0195..716c75da39 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -1114,6 +1114,7 @@ cc_library( deps = [ ":hlo", ":hlo_casting_utils", + ":tuple_points_to_analysis", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status", "//tensorflow/compiler/xla:status_macros", diff --git a/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc b/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc index a9c5d48983..9c01862a4b 100644 --- a/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc +++ b/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc @@ -22,6 +22,7 @@ limitations under the License. 
#include "absl/memory/memory.h" #include "tensorflow/compiler/xla/service/hlo_casting_utils.h" #include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/util.h" @@ -131,6 +132,14 @@ Status HloModuleGroupMetadata::Build() { if (VLOG_IS_ON(4)) { DumpCollectedStats(); } + + for (HloModule* module : modules_) { + TF_ASSIGN_OR_RETURN( + std::unique_ptr points_to_analysis, + TuplePointsToAnalysis::Run(module)); + points_to_analyses_[module] = std::move(points_to_analysis); + } + return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/hlo_module_group_metadata.h b/tensorflow/compiler/xla/service/hlo_module_group_metadata.h index dead6d9c20..768b0c7eb3 100644 --- a/tensorflow/compiler/xla/service/hlo_module_group_metadata.h +++ b/tensorflow/compiler/xla/service/hlo_module_group_metadata.h @@ -26,6 +26,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h" #include "tensorflow/compiler/xla/status.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/core/lib/core/status.h" @@ -197,6 +198,10 @@ class HloModuleGroupMetadata { // Returns the maximum channel id or all_reduce_id used in the module group. int64 max_channel_id() const { return max_channel_id_; } + TuplePointsToAnalysis* points_to_analysis(HloModule* module) const { + return points_to_analyses_.at(module).get(); + } + private: Status Build(); @@ -271,6 +276,9 @@ class HloModuleGroupMetadata { // The modules that this metadata was built from. 
const std::vector& modules_; + + tensorflow::gtl::FlatMap> + points_to_analyses_; }; } // namespace xla -- GitLab From 41948f588ba2852ebae712358117ffa86e32a24b Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 27 Aug 2018 21:23:37 -0700 Subject: [PATCH 201/598] Update ops-related pbtxt files. PiperOrigin-RevId: 210487140 --- .../core/ops/compat/ops_history.v1.pbtxt | 32 +++++++++++++++++++ tensorflow/core/ops/ops.pbtxt | 1 + 2 files changed, 33 insertions(+) diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt index 97a212b8f3..b341e1332d 100644 --- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt +++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt @@ -43939,6 +43939,38 @@ op { } } } +op { + name: "Relu" + input_arg { + name: "features" + type_attr: "T" + } + output_arg { + name: "activations" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT32 + type: DT_UINT8 + type: DT_INT16 + type: DT_INT8 + type: DT_INT64 + type: DT_BFLOAT16 + type: DT_UINT16 + type: DT_HALF + type: DT_UINT32 + type: DT_UINT64 + type: DT_QINT8 + } + } + } +} op { name: "Relu6" input_arg { diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt index 9091622f09..fe8caf0e03 100644 --- a/tensorflow/core/ops/ops.pbtxt +++ b/tensorflow/core/ops/ops.pbtxt @@ -22389,6 +22389,7 @@ op { type: DT_HALF type: DT_UINT32 type: DT_UINT64 + type: DT_QINT8 } } } -- GitLab From 5da18c4078341b88750507e2e0f3fce9d3ed58e8 Mon Sep 17 00:00:00 2001 From: Justin Lebar Date: Mon, 27 Aug 2018 23:22:57 -0700 Subject: [PATCH 202/598] Mark MatMulAndAddCompWithProfiling as manual. 
PiperOrigin-RevId: 210495040 --- tensorflow/compiler/aot/tests/BUILD | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tensorflow/compiler/aot/tests/BUILD b/tensorflow/compiler/aot/tests/BUILD index 7364d63b53..723e9bec8a 100644 --- a/tensorflow/compiler/aot/tests/BUILD +++ b/tensorflow/compiler/aot/tests/BUILD @@ -187,6 +187,9 @@ tf_library( cpp_class = "MatMulAndAddCompWithProfiling", enable_xla_hlo_profiling = True, graph = "test_graph_tfmatmulandadd.pb", + tags = [ + "manual", + ], ) tf_library( -- GitLab From 821e2c31f7d4cd1f932a094e0860a03b8d056f88 Mon Sep 17 00:00:00 2001 From: Tom Hennigan Date: Tue, 28 Aug 2018 00:51:09 -0700 Subject: [PATCH 203/598] Replace `get_default_context()` with `context()`. PiperOrigin-RevId: 210501378 --- tensorflow/python/eager/context.py | 9 +-------- tensorflow/python/eager/execution_callbacks.py | 8 ++++---- tensorflow/python/framework/test_util.py | 2 +- tensorflow/python/kernel_tests/array_ops_test.py | 2 +- tensorflow/python/kernel_tests/constant_op_eager_test.py | 2 +- 5 files changed, 8 insertions(+), 15 deletions(-) diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py index f75ea6c265..13fb0e88a6 100644 --- a/tensorflow/python/eager/context.py +++ b/tensorflow/python/eager/context.py @@ -629,14 +629,7 @@ def context(): def context_safe(): - return _context - - -# TODO(agarwal): remove this. -def get_default_context(): - """Same as context.""" - if _context is None: - _initialize_context() + """Returns current context (or None if one hasn't been initialized).""" return _context diff --git a/tensorflow/python/eager/execution_callbacks.py b/tensorflow/python/eager/execution_callbacks.py index 9a08259653..80ff4459d6 100644 --- a/tensorflow/python/eager/execution_callbacks.py +++ b/tensorflow/python/eager/execution_callbacks.py @@ -146,7 +146,7 @@ def inf_nan_callback(op_type, """ del attrs, inputs # Not used. 
- ctx = context.get_default_context() + ctx = context.context() for index, output in enumerate(outputs): if not output.dtype.is_numpy_compatible: @@ -263,12 +263,12 @@ def add_execution_callback(callback): Return value(s) from the callback are ignored. """ execute.execute = execute.execute_with_callbacks - context.get_default_context().add_post_execution_callback(callback) + context.context().add_post_execution_callback(callback) def clear_execution_callbacks(): """Clear all execution callbacks from the default eager context.""" - context.get_default_context().clear_post_execution_callbacks() + context.context().clear_post_execution_callbacks() def seterr(inf_or_nan=None): @@ -309,7 +309,7 @@ def seterr(inf_or_nan=None): "Valid actions are %s." % (inf_or_nan, _VALID_CALLBACK_ACTIONS)) old_settings = {"inf_or_nan": "ignore"} - default_context = context.get_default_context() + default_context = context.context() carryover_callbacks = [] for callback in default_context.post_execution_callbacks: diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py index 155134fac4..7cddd861c8 100644 --- a/tensorflow/python/framework/test_util.py +++ b/tensorflow/python/framework/test_util.py @@ -547,7 +547,7 @@ def assert_no_new_tensors(f): f(self, **kwargs) # Make an effort to clear caches, which would otherwise look like leaked # Tensors. 
- context.get_default_context()._clear_caches() # pylint: disable=protected-access + context.context()._clear_caches() # pylint: disable=protected-access gc.collect() tensors_after = [ obj for obj in gc.get_objects() diff --git a/tensorflow/python/kernel_tests/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops_test.py index 81442d12e9..b2bafeadba 100644 --- a/tensorflow/python/kernel_tests/array_ops_test.py +++ b/tensorflow/python/kernel_tests/array_ops_test.py @@ -1145,7 +1145,7 @@ class IdentityTest(test_util.TensorFlowTestCase): def testEagerIdentity(self): with context.eager_mode(): - ctx = context.get_default_context() + ctx = context.context() if not ctx.num_gpus(): self.skipTest("No GPUs found") diff --git a/tensorflow/python/kernel_tests/constant_op_eager_test.py b/tensorflow/python/kernel_tests/constant_op_eager_test.py index a0d5557b92..cc788219ef 100644 --- a/tensorflow/python/kernel_tests/constant_op_eager_test.py +++ b/tensorflow/python/kernel_tests/constant_op_eager_test.py @@ -523,7 +523,7 @@ class OnesLikeTest(test.TestCase): class FillTest(test.TestCase): def _compare(self, dims, val, np_ans, use_gpu): - ctx = context.get_default_context() + ctx = context.context() device = "GPU:0" if (use_gpu and ctx.num_gpus()) else "CPU:0" with ops.device(device): tf_ans = array_ops.fill(dims, val, name="fill") -- GitLab From f255b51c6e637ac7701996b4457157d3c313dca4 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Tue, 28 Aug 2018 01:39:19 -0700 Subject: [PATCH 204/598] Make init_scope() public PiperOrigin-RevId: 210505860 --- tensorflow/python/framework/ops.py | 18 ++++++++++++++++++ .../tools/api/golden/v1/tensorflow.pbtxt | 4 ++++ .../tools/api/golden/v2/tensorflow.pbtxt | 4 ++++ 3 files changed, 26 insertions(+) diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py index ae86d55d3e..192aadbaba 100644 --- a/tensorflow/python/framework/ops.py +++ b/tensorflow/python/framework/ops.py @@ -5217,6 +5217,7 @@ _default_graph_stack = _DefaultGraphStack() # pylint: disable=g-doc-return-or-yield,line-too-long +@tf_export("init_scope") @tf_contextlib.contextmanager def init_scope(): """A context manager that lifts ops out of control-flow scopes and function-building graphs. @@ -5246,6 +5247,23 @@ def init_scope(): (3) The gradient tape is paused while the scope is active. + When eager execution is enabled, code inside an init_scope block runs with + eager execution enabled even when defining graph functions via + tf.contrib.eager.defun. For example: + + ```python + tf.enable_eager_execution() + + @tf.contrib.eager.defun + def func(): + # A defun-decorated function constructs TensorFlow graphs, + # it does not execute eagerly. + assert not tf.executing_eagerly() + with tf.init_scope(): + # Initialization runs with eager execution enabled + assert tf.executing_eagerly() + ``` + Raises: RuntimeError: if graph state is incompatible with this initialization. 
""" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt index f710524031..00fe63f55e 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt @@ -1288,6 +1288,10 @@ tf_module { name: "import_graph_def" argspec: "args=[\'graph_def\', \'input_map\', \'return_elements\', \'name\', \'op_dict\', \'producer_op_list\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], " } + member_method { + name: "init_scope" + argspec: "args=[], varargs=None, keywords=None, defaults=None" + } member_method { name: "initialize_all_tables" argspec: "args=[\'name\'], varargs=None, keywords=None, defaults=[\'init_all_tables\'], " diff --git a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt index c8114c431a..807908617a 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt @@ -1264,6 +1264,10 @@ tf_module { name: "import_graph_def" argspec: "args=[\'graph_def\', \'input_map\', \'return_elements\', \'name\', \'op_dict\', \'producer_op_list\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], " } + member_method { + name: "init_scope" + argspec: "args=[], varargs=None, keywords=None, defaults=None" + } member_method { name: "initialize_all_tables" argspec: "args=[\'name\'], varargs=None, keywords=None, defaults=[\'init_all_tables\'], " -- GitLab From 3e13ae966115b1aaf793601b0647b40efb25a2da Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 28 Aug 2018 01:52:59 -0700 Subject: [PATCH 205/598] Implementation of reduce_any. 
PiperOrigin-RevId: 210507220 --- tensorflow/contrib/lite/build_def.bzl | 1 + .../internal/reference/reference_ops.h | 17 ++++ tensorflow/contrib/lite/kernels/reduce.cc | 42 +++++++++ .../contrib/lite/kernels/reduce_test.cc | 93 ++++++++++++++++++- tensorflow/contrib/lite/kernels/register.cc | 2 + tensorflow/contrib/lite/model.cc | 4 +- .../contrib/lite/testing/generate_examples.py | 23 ++++- .../contrib/lite/toco/export_tensorflow.cc | 20 +--- .../propagate_fixed_sizes.cc | 64 +------------ .../contrib/lite/toco/import_tensorflow.cc | 20 +--- tensorflow/contrib/lite/toco/model.h | 6 +- .../contrib/lite/toco/tflite/operator.cc | 27 +++++- .../contrib/lite/toco/tflite/operator_test.cc | 27 ++++-- 13 files changed, 228 insertions(+), 118 deletions(-) diff --git a/tensorflow/contrib/lite/build_def.bzl b/tensorflow/contrib/lite/build_def.bzl index 458a50f25c..30252831a3 100644 --- a/tensorflow/contrib/lite/build_def.bzl +++ b/tensorflow/contrib/lite/build_def.bzl @@ -266,6 +266,7 @@ def generated_test_models(): "padv2", "prelu", "pow", + "reduce_any", "reduce_max", "reduce_min", "reduce_prod", diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h index ff77f61191..f67d0a8752 100644 --- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h +++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h @@ -4133,6 +4133,23 @@ inline bool ReduceProd(const T* input_data, const int* input_dims, resolved_axis, init_value, reducer); } +// Computes the logical_or of elements across dimensions given in axis. 
+inline bool ReduceAny(const bool* input_data, const int* input_dims, + const int input_num_dims, bool* output_data, + const int* output_dims, const int output_num_dims, + const int* axis, const int64_t num_axis_dimensions, + bool keep_dims, int* temp_index, int* resolved_axis) { + bool init_value = false; + + auto reducer = [](const bool current, const bool in) -> bool { + return current || in; + }; + return ReduceGeneric(input_data, input_dims, input_num_dims, + output_data, output_dims, output_num_dims, axis, + num_axis_dimensions, keep_dims, temp_index, + resolved_axis, init_value, reducer); +} + // Computes the mean of elements across dimensions given in axis. // It does so in two stages, first calculates the sum of elements along the axis // then divides it by the number of element in axis. diff --git a/tensorflow/contrib/lite/kernels/reduce.cc b/tensorflow/contrib/lite/kernels/reduce.cc index 839b48cb83..4001cf357f 100644 --- a/tensorflow/contrib/lite/kernels/reduce.cc +++ b/tensorflow/contrib/lite/kernels/reduce.cc @@ -177,6 +177,9 @@ TfLiteStatus InitializeTemporaries(TfLiteContext* context, TfLiteNode* node, case kTfLiteUInt8: temp_sum->type = kTfLiteInt32; break; + case kTfLiteBool: + temp_sum->type = kTfLiteBool; + break; default: return kTfLiteError; } @@ -204,6 +207,13 @@ TfLiteStatus PrepareSimple(TfLiteContext* context, TfLiteNode* node) { return kTfLiteOk; } +TfLiteStatus PrepareAny(TfLiteContext* context, TfLiteNode* node) { + TF_LITE_ENSURE_EQ(context, NumInputs(node), 2); + const TfLiteTensor* input = GetInput(context, node, 0); + TF_LITE_ENSURE_EQ(context, input->type, kTfLiteBool); + return PrepareSimple(context, node); +} + TfLiteStatus PrepareMean(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_OK(context, PrepareSimple(context, node)); @@ -476,6 +486,31 @@ TfLiteStatus EvalMin(TfLiteContext* context, TfLiteNode* node) { #undef TF_LITE_MIN return kTfLiteOk; } + +template +TfLiteStatus EvalAny(TfLiteContext* context, TfLiteNode* 
node) { + OpContext op_context(context, node); + int64_t num_axis = NumElements(op_context.axis); + TfLiteTensor* temp_index = GetTemporary(context, node, /*index=*/0); + TfLiteTensor* resolved_axis = GetTemporary(context, node, /*index=*/1); + // Resize the output tensor if the output tensor is dynamic. + if (IsDynamicTensor(op_context.output)) { + TF_LITE_ENSURE_OK(context, + ResizeTempAxis(context, &op_context, resolved_axis)); + TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context)); + } + if (kernel_type == kReference) { + reference_ops::ReduceAny( + GetTensorData(op_context.input), op_context.input->dims->data, + op_context.input->dims->size, GetTensorData(op_context.output), + op_context.output->dims->data, op_context.output->dims->size, + GetTensorData(op_context.axis), num_axis, + op_context.params->keep_dims, GetTensorData(temp_index), + GetTensorData(resolved_axis)); + } + + return kTfLiteOk; +} } // namespace reduce TfLiteRegistration* Register_MEAN_REF() { @@ -513,6 +548,12 @@ TfLiteRegistration* Register_REDUCE_MIN_REF() { return &r; } +TfLiteRegistration* Register_REDUCE_ANY_REF() { + static TfLiteRegistration r = {reduce::Init, reduce::Free, reduce::PrepareAny, + reduce::EvalAny}; + return &r; +} + // TODO(kanlig): add optimized implementation of Mean. 
TfLiteRegistration* Register_MEAN() { return Register_MEAN_REF(); } TfLiteRegistration* Register_SUM() { return Register_SUM_REF(); } @@ -521,6 +562,7 @@ TfLiteRegistration* Register_REDUCE_PROD() { } TfLiteRegistration* Register_REDUCE_MAX() { return Register_REDUCE_MAX_REF(); } TfLiteRegistration* Register_REDUCE_MIN() { return Register_REDUCE_MIN_REF(); } +TfLiteRegistration* Register_REDUCE_ANY() { return Register_REDUCE_ANY_REF(); } } // namespace builtin } // namespace ops diff --git a/tensorflow/contrib/lite/kernels/reduce_test.cc b/tensorflow/contrib/lite/kernels/reduce_test.cc index 69a07f76b6..6d289b14d8 100644 --- a/tensorflow/contrib/lite/kernels/reduce_test.cc +++ b/tensorflow/contrib/lite/kernels/reduce_test.cc @@ -198,6 +198,35 @@ class MinOpDynamicModel : public BaseOpModel { } }; +// Model for the tests case where axis is a const tensor. +class AnyOpConstModel : public BaseOpModel { + public: + AnyOpConstModel(const TensorData& input, const TensorData& output, + std::initializer_list axis_shape, + std::initializer_list axis, bool keep_dims) { + input_ = AddInput(input); + axis_ = AddConstInput(TensorType_INT32, axis, axis_shape); + output_ = AddOutput(output); + SetBuiltinOp(BuiltinOperator_REDUCE_ANY, BuiltinOptions_ReducerOptions, + CreateReducerOptions(builder_, keep_dims).Union()); + BuildInterpreter({GetShape(input_)}); + } +}; + +// Model for the tests case where axis is a dynamic tensor. 
+class AnyOpDynamicModel : public BaseOpModel { + public: + AnyOpDynamicModel(const TensorData& input, const TensorData& output, + const TensorData& axis, bool keep_dims) { + input_ = AddInput(input); + axis_ = AddInput(axis); + output_ = AddOutput(output); + SetBuiltinOp(BuiltinOperator_REDUCE_ANY, BuiltinOptions_ReducerOptions, + CreateReducerOptions(builder_, keep_dims).Union()); + BuildInterpreter({GetShape(input_)}); + } +}; + // for quantized Add, the error shouldn't exceed step float GetTolerance(int min, int max) { return (max - min) / 255.0; } @@ -778,7 +807,7 @@ TEST(DynamicFloatMinOpTest, KeepDims) { ElementsAreArray(ArrayFloatNear({1, 3, 5}))); } -TEST(DynamicFloatMinOpTest, Scale) { +TEST(DynamicFloatMinOpTest, Scalar) { std::vector data = {9.527}; MinOpDynamicModel m({TensorType_FLOAT32, {1}}, {TensorType_FLOAT32, {1}}, {TensorType_INT32, {1}}, true); @@ -862,6 +891,68 @@ TEST(DynamicUint8MinOpTest, Scalar) { ElementsAreArray(ArrayFloatNear({11.1294}, kQuantizedTolerance))); } +// Tests for reduce_any + +TEST(ConstAnyOpTest, NotKeepDims) { + std::vector data = {false, false, false, false, false, false, + false, true, false, false, false, true}; + AnyOpConstModel m({TensorType_BOOL, {2, 3, 2}}, {TensorType_BOOL, {2}}, {4}, + {1, 0, -3, -3}, false); + m.SetInput(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({false, true})); +} + +TEST(ConstAnyOpTest, KeepDims) { + std::vector data = {false, false, false, false, false, false, + false, true, false, false, false, true}; + AnyOpConstModel m({TensorType_BOOL, {2, 3, 2}}, {TensorType_BOOL, {3}}, {2}, + {0, 2}, true); + m.SetInput(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3, 1})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({true, false, true})); +} + +TEST(DynamicAnyOpTest, NotKeepDims) { + std::vector data = {false, false, false, false, false, false, + false, true, false, false, false, true}; + 
AnyOpDynamicModel m({TensorType_BOOL, {2, 3, 2}}, {TensorType_BOOL, {2}}, + {TensorType_INT32, {4}}, false); + std::vector axis = {1, 0, -3, -3}; + m.SetAxis(axis); + m.SetInput(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({false, true})); +} + +TEST(DynamicAnyOpTest, KeepDims) { + std::vector data = {false, false, false, false, false, false, + false, true, false, false, false, true}; + AnyOpDynamicModel m({TensorType_BOOL, {2, 3, 2}}, {TensorType_BOOL, {3}}, + {TensorType_INT32, {2}}, true); + std::vector axis = {0, 2}; + m.SetAxis(axis); + m.SetInput(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3, 1})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({true, false, true})); +} + +TEST(DynamicAnyOpTest, Scalar) { + std::vector data = {false}; + AnyOpDynamicModel m({TensorType_BOOL, {1}}, {TensorType_BOOL, {1}}, + {TensorType_INT32, {1}}, true); + std::vector axis = {0}; + m.SetAxis(axis); + m.SetInput(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({false})); +} + } // namespace } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/register.cc b/tensorflow/contrib/lite/kernels/register.cc index 341fd14127..175dfab210 100644 --- a/tensorflow/contrib/lite/kernels/register.cc +++ b/tensorflow/contrib/lite/kernels/register.cc @@ -95,6 +95,7 @@ TfLiteRegistration* Register_SUM(); TfLiteRegistration* Register_REDUCE_PROD(); TfLiteRegistration* Register_REDUCE_MAX(); TfLiteRegistration* Register_REDUCE_MIN(); +TfLiteRegistration* Register_REDUCE_ANY(); TfLiteRegistration* Register_SELECT(); TfLiteRegistration* Register_SLICE(); TfLiteRegistration* Register_SIN(); @@ -222,6 +223,7 @@ BuiltinOpResolver::BuiltinOpResolver() { AddBuiltin(BuiltinOperator_REDUCE_PROD, Register_REDUCE_PROD()); AddBuiltin(BuiltinOperator_REDUCE_MAX, Register_REDUCE_MAX()); 
AddBuiltin(BuiltinOperator_REDUCE_MIN, Register_REDUCE_MIN()); + AddBuiltin(BuiltinOperator_REDUCE_ANY, Register_REDUCE_ANY()); AddBuiltin(BuiltinOperator_EXPAND_DIMS, Register_EXPAND_DIMS()); AddBuiltin(BuiltinOperator_SPARSE_TO_DENSE, Register_SPARSE_TO_DENSE()); AddBuiltin(BuiltinOperator_EQUAL, Register_EQUAL()); diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc index da3ed42e20..aa410ab002 100644 --- a/tensorflow/contrib/lite/model.cc +++ b/tensorflow/contrib/lite/model.cc @@ -624,7 +624,8 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type, case BuiltinOperator_REDUCE_MAX: case BuiltinOperator_REDUCE_MIN: case BuiltinOperator_REDUCE_PROD: - case BuiltinOperator_SUM: { + case BuiltinOperator_SUM: + case BuiltinOperator_REDUCE_ANY: { auto* params = MallocPOD(); if (auto* schema_params = op->builtin_options_as_ReducerOptions()) { params->keep_dims = schema_params->keep_dims(); @@ -800,7 +801,6 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type, case BuiltinOperator_LOGICAL_AND: case BuiltinOperator_LOGICAL_NOT: case BuiltinOperator_FLOOR_DIV: - case BuiltinOperator_REDUCE_ANY: break; } return kTfLiteOk; diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py index a329bb3a25..cbd6a2a7f0 100644 --- a/tensorflow/contrib/lite/testing/generate_examples.py +++ b/tensorflow/contrib/lite/testing/generate_examples.py @@ -821,13 +821,17 @@ def make_binary_op_tests(zip_path, binary_operator): make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs) -def make_reduce_tests(reduce_op, min_value=-10, max_value=10): +def make_reduce_tests(reduce_op, + min_value=-10, + max_value=10, + boolean_tensor_only=False): """Make a set of tests to do reduce operation. Args: reduce_op: TensorFlow reduce operation to test, i.e. `tf.reduce_mean`. min_value: min value for created tensor data. max_value: max value for created tensor data. 
+ boolean_tensor_only: If true, will only generate tensor with boolean value. Returns: a function representing the true generator with `reduce_op_in` curried. @@ -867,10 +871,11 @@ def make_reduce_tests(reduce_op, min_value=-10, max_value=10): def build_graph(parameters): """Build the mean op testing graph.""" + dtype = parameters["input_dtype"] + if boolean_tensor_only: + dtype = tf.bool input_tensor = tf.placeholder( - dtype=parameters["input_dtype"], - name="input", - shape=parameters["input_shape"]) + dtype=dtype, name="input", shape=parameters["input_shape"]) # Get axis as either a placeholder or constants. if parameters["const_axis"]: @@ -889,9 +894,12 @@ def make_reduce_tests(reduce_op, min_value=-10, max_value=10): return input_tensors, [out] def build_inputs(parameters, sess, inputs, outputs): + dtype = parameters["input_dtype"] + if boolean_tensor_only: + dtype = tf.bool values = [ create_tensor_data( - parameters["input_dtype"], + dtype, parameters["input_shape"], min_value=min_value, max_value=max_value) @@ -931,6 +939,11 @@ def make_reduce_min_tests(zip_path): return make_reduce_tests(tf.reduce_min)(zip_path) +def make_reduce_any_tests(zip_path): + """Make a set of tests to do any.""" + return make_reduce_tests(tf.reduce_any, boolean_tensor_only=True)(zip_path) + + def make_exp_tests(zip_path): """Make a set of tests to do exp.""" diff --git a/tensorflow/contrib/lite/toco/export_tensorflow.cc b/tensorflow/contrib/lite/toco/export_tensorflow.cc index 94602445c2..6fdf47dedc 100644 --- a/tensorflow/contrib/lite/toco/export_tensorflow.cc +++ b/tensorflow/contrib/lite/toco/export_tensorflow.cc @@ -1900,21 +1900,6 @@ void ConvertPowOperator(const Model& model, const PowOperator& src_op, (*pow_op->mutable_attr())["T"].set_type(data_type); } -void ConvertAnyOperator(const Model& model, const AnyOperator& src_op, - GraphDef* tensorflow_graph) { - tensorflow::NodeDef* any_op = tensorflow_graph->add_node(); - any_op->set_op("Any"); - 
any_op->set_name(src_op.outputs[0]); - CHECK_EQ(src_op.inputs.size(), 2); - for (int i = 0; i < 2; ++i) { - *any_op->add_input() = src_op.inputs[i]; - } - const tensorflow::DataType data_type = - GetTensorFlowDataType(model, src_op.inputs[1]); - (*any_op->mutable_attr())["Tidx"].set_type(data_type); - (*any_op->mutable_attr())["keep_dims"].set_b(src_op.keep_dims); -} - void ConvertLogicalAndOperator(const Model& model, const LogicalAndOperator& src_op, GraphDef* tensorflow_graph) { @@ -2221,8 +2206,9 @@ void ConvertOperator(const Model& model, const Operator& src_op, ConvertPowOperator(model, static_cast(src_op), "Pow", tensorflow_graph); } else if (src_op.type == OperatorType::kAny) { - ConvertAnyOperator(model, static_cast(src_op), - tensorflow_graph); + ConvertReduceOperator(model, + static_cast(src_op), + tensorflow_graph, "Any"); } else if (src_op.type == OperatorType::kLogicalAnd) { ConvertLogicalAndOperator(model, static_cast(src_op), diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc index fa2be961f5..28effc2a67 100644 --- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc +++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc @@ -539,6 +539,8 @@ bool KeepDims(const Operator& op) { return static_cast(op).keep_dims; case OperatorType::kMean: return static_cast(op).keep_dims; + case OperatorType::kAny: + return static_cast(op).keep_dims; default: LOG(FATAL) << "Not a reduction operator!"; return false; @@ -1515,65 +1517,6 @@ void ProcessTileOperator(Model* model, TensorFlowTileOperator* op) { } } -void ProcessAnyOperator(Model* model, AnyOperator* op) { - CHECK_EQ(op->inputs.size(), 2); - CHECK_EQ(op->outputs.size(), 1); - - auto& output_array = model->GetArray(op->outputs[0]); - if (output_array.has_shape()) { - // We have already run. 
- return; - } - - const auto& input_array = model->GetArray(op->inputs[0]); - if (!input_array.has_shape()) { - // Yield until input dims have been resolved. - return; - } - const auto& input_shape = input_array.shape(); - - auto& reduction_indices_array = model->GetArray(op->inputs[1]); - if (!reduction_indices_array.has_shape()) { - // Yield until reduction indices shape been resolved. - return; - } - if (!reduction_indices_array.buffer) { - // Yield until the reduction indices are constant. - return; - } - CHECK(reduction_indices_array.data_type == ArrayDataType::kInt32) - << "Any reduction input must be int32"; - - int input_rank = input_shape.dimensions_count(); - std::set true_indices; - const auto& reduction_indices = - reduction_indices_array.GetBuffer().data; - for (int i = 0; i < reduction_indices.size(); ++i) { - const int32 reduction_index = reduction_indices[i]; - if (reduction_index < -input_rank || reduction_index >= input_rank) { - CHECK(false) << "Invalid reduction dimension " << reduction_index - << " for input with " << input_rank << " dimensions"; - } - int32 wrapped_index = reduction_index; - if (wrapped_index < 0) { - wrapped_index += input_rank; - } - true_indices.insert(wrapped_index); - } - - auto* mutable_dims = output_array.mutable_shape()->mutable_dims(); - mutable_dims->clear(); - for (int i = 0; i < input_rank; ++i) { - if (true_indices.count(i) > 0) { - if (op->keep_dims) { - mutable_dims->emplace_back(1); - } - } else { - mutable_dims->emplace_back(input_shape.dims(i)); - } - } -} - void ProcessOneHotOperator(Model* model, OneHotOperator* op) { CHECK_EQ(op->inputs.size(), 4); CHECK_EQ(op->outputs.size(), 1); @@ -1769,6 +1712,7 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) { case OperatorType::kSum: case OperatorType::kReduceProd: case OperatorType::kMean: + case OperatorType::kAny: ProcessTensorFlowReductionOperator(model, op); break; case OperatorType::kSelect: @@ -1900,8 +1844,6 @@ bool 
PropagateFixedSizes::Run(Model* model, std::size_t op_index) { case OperatorType::kTile: ProcessTileOperator(model, static_cast(op)); break; - case OperatorType::kAny: - ProcessAnyOperator(model, static_cast(op)); break; case OperatorType::kOneHot: ProcessOneHotOperator(model, static_cast(op)); diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc index 0e04ee4ccb..cb6da21039 100644 --- a/tensorflow/contrib/lite/toco/import_tensorflow.cc +++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc @@ -1638,24 +1638,6 @@ tensorflow::Status ConvertShapeOperator( return tensorflow::Status::OK(); } -tensorflow::Status ConvertAnyOperator( - const NodeDef& node, const TensorFlowImportFlags& tf_import_flags, - Model* model) { - CHECK_EQ(node.op(), "Any"); - TF_QCHECK_OK(CheckInputsCount(node, tf_import_flags, 2)); - const auto idx_type = - HasAttr(node, "Tidx") ? GetDataTypeAttr(node, "Tidx") : DT_INT32; - CHECK(idx_type == DT_INT32); - auto op = absl::make_unique(); - op->inputs.push_back(node.input(0)); - op->inputs.push_back(node.input(1)); - op->outputs.push_back(node.name()); - op->keep_dims = - HasAttr(node, "keep_dims") ? 
GetBoolAttr(node, "keep_dims") : false; - model->operators.push_back(std::move(op)); - return tensorflow::Status::OK(); -} - void StripCaretFromArrayNames(Model* model) { for (auto& op : model->operators) { for (auto& input : op->inputs) { @@ -1937,7 +1919,7 @@ ConverterMapType GetTensorFlowNodeConverterMap() { {"Add", ConvertSimpleOperator}, {"AddN", ConvertSimpleOperator}, {"All", ConvertSimpleOperator}, - {"Any", ConvertAnyOperator}, + {"Any", ConvertReduceOperator}, {"ArgMax", ConvertArgMaxOperator}, {"ArgMin", ConvertArgMinOperator}, {"Assert", ConvertSimpleOperator}, diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h index 3a909c3d8e..fa1c459f0e 100644 --- a/tensorflow/contrib/lite/toco/model.h +++ b/tensorflow/contrib/lite/toco/model.h @@ -1768,11 +1768,11 @@ struct PowOperator : Operator { // // Inputs: // Inputs[0]: required: A boolean input tensor. -// Inputs[1]: required: reduction_indices. // // TensorFlow equivalent: tf.reduce_any. -struct AnyOperator : Operator { - AnyOperator() : Operator(OperatorType::kAny) {} +struct TensorFlowAnyOperator : Operator { + TensorFlowAnyOperator() : Operator(OperatorType::kAny) {} + std::vector axis; bool keep_dims = false; }; diff --git a/tensorflow/contrib/lite/toco/tflite/operator.cc b/tensorflow/contrib/lite/toco/tflite/operator.cc index e9383098cc..f687e9689e 100644 --- a/tensorflow/contrib/lite/toco/tflite/operator.cc +++ b/tensorflow/contrib/lite/toco/tflite/operator.cc @@ -769,7 +769,7 @@ class Sum }; class ReduceMax - : public BuiltinOperator { public: using BuiltinOperator::BuiltinOperator; @@ -788,7 +788,7 @@ class ReduceMax }; class ReduceMin - : public BuiltinOperator { public: using BuiltinOperator::BuiltinOperator; @@ -807,7 +807,26 @@ class ReduceMin }; class ReduceProd - : public BuiltinOperator { + public: + using BuiltinOperator::BuiltinOperator; + flatbuffers::Offset WriteOptions( + const TocoOperator& op, + flatbuffers::FlatBufferBuilder* builder) const 
override { + return ::tflite::CreateReducerOptions(*builder, op.keep_dims); + } + + void ReadOptions(const TfLiteOptions& options, + TocoOperator* op) const override { + op->keep_dims = options.keep_dims(); + } + + int GetVersion(const Operator& op) const override { return 1; } +}; + +class ReduceAny + : public BuiltinOperator { public: using BuiltinOperator::BuiltinOperator; @@ -1336,6 +1355,8 @@ std::vector> BuildOperatorList() { OperatorType::kReduceMax)); ops.push_back(MakeUnique(::tflite::BuiltinOperator_REDUCE_MIN, OperatorType::kReduceMin)); + ops.push_back(MakeUnique(::tflite::BuiltinOperator_REDUCE_ANY, + OperatorType::kAny)); ops.push_back( MakeUnique(::tflite::BuiltinOperator_RESIZE_BILINEAR, OperatorType::kResizeBilinear)); diff --git a/tensorflow/contrib/lite/toco/tflite/operator_test.cc b/tensorflow/contrib/lite/toco/tflite/operator_test.cc index bb0b457483..6da9317e4f 100644 --- a/tensorflow/contrib/lite/toco/tflite/operator_test.cc +++ b/tensorflow/contrib/lite/toco/tflite/operator_test.cc @@ -97,6 +97,16 @@ class OperatorTest : public ::testing::Test { ASSERT_NE(nullptr, output_toco_op.get()); } + + template + void CheckReducerOperator(const string& name, OperatorType type) { + T op; + + op.keep_dims = false; + + auto output_toco_op = SerializeAndDeserialize(GetOperator(name, type), op); + EXPECT_EQ(op.keep_dims, output_toco_op->keep_dims); + } }; TEST_F(OperatorTest, SimpleOperators) { @@ -144,13 +154,16 @@ TEST_F(OperatorTest, BuiltinAdd) { output_toco_op->fused_activation_function); } -TEST_F(OperatorTest, BuiltinMean) { - MeanOperator op; - op.keep_dims = false; - - auto output_toco_op = - SerializeAndDeserialize(GetOperator("MEAN", OperatorType::kMean), op); - EXPECT_EQ(op.keep_dims, output_toco_op->keep_dims); +TEST_F(OperatorTest, BuiltinReducerOps) { + CheckReducerOperator("MEAN", OperatorType::kMean); + CheckReducerOperator("SUM", OperatorType::kSum); + CheckReducerOperator("REDUCE_PROD", + OperatorType::kReduceProd); + 
CheckReducerOperator("REDUCE_MAX", + OperatorType::kReduceMax); + CheckReducerOperator("REDUCE_MIN", + OperatorType::kReduceMin); + CheckReducerOperator("REDUCE_ANY", OperatorType::kAny); } TEST_F(OperatorTest, BuiltinCast) { -- GitLab From a6ebb9294a8eb848bcba46905d5f71ccf0da1a18 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 28 Aug 2018 02:16:17 -0700 Subject: [PATCH 206/598] compat: Update forward compatibility horizon to 2018-08-28 PiperOrigin-RevId: 210509807 --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index d9f92c3eda..74b001a572 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -26,7 +26,7 @@ import datetime from tensorflow.python.util import tf_contextlib from tensorflow.python.util.tf_export import tf_export -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 8, 27) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 8, 28) @tf_export("compat.forward_compatible") -- GitLab From 5045a71675fd198b5ae322593ec47ea4327ad348 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 28 Aug 2018 02:45:57 -0700 Subject: [PATCH 207/598] [TF:XLA] Add support for mirror_pad in symmetric mode. 
PiperOrigin-RevId: 210512603 --- tensorflow/compiler/tests/binary_ops_test.py | 33 ++++++++++++++++++- .../compiler/tf2xla/kernels/mirror_pad_op.cc | 32 ++++++++++++++---- 2 files changed, 57 insertions(+), 8 deletions(-) diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py index ed4940f204..17280e445b 100644 --- a/tensorflow/compiler/tests/binary_ops_test.py +++ b/tensorflow/compiler/tests/binary_ops_test.py @@ -1010,7 +1010,38 @@ class BinaryOpsTest(xla_test.XLATestCase): [7, 7, 7, 7, 7, 7]], dtype=dtype)) - def testMirrorPad(self): + def testSymmetricMirrorPad(self): + mirror_pad = lambda t, paddings: array_ops.pad(t, paddings, "SYMMETRIC") + for dtype in self.numeric_types: + self._testBinary( + mirror_pad, + np.array( + [ + [1, 2, 3], # + [4, 5, 6], # + ], + dtype=dtype), + np.array([[ + 2, + 2, + ], [3, 3]], dtype=np.int32), + expected=np.array( + [ + [6, 5, 4, 4, 5, 6, 6, 5, 4], # + [3, 2, 1, 1, 2, 3, 3, 2, 1], # + [3, 2, 1, 1, 2, 3, 3, 2, 1], # + [6, 5, 4, 4, 5, 6, 6, 5, 4], # + [6, 5, 4, 4, 5, 6, 6, 5, 4], # + [3, 2, 1, 1, 2, 3, 3, 2, 1], # + ], + dtype=dtype)) + self._testBinary( + mirror_pad, + np.array([[1, 2, 3], [4, 5, 6]], dtype=dtype), + np.array([[0, 0], [0, 0]], dtype=np.int32), + expected=np.array([[1, 2, 3], [4, 5, 6]], dtype=dtype)) + + def testReflectMirrorPad(self): mirror_pad = lambda t, paddings: array_ops.pad(t, paddings, "REFLECT") for dtype in self.numeric_types: self._testBinary( diff --git a/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc b/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc index eedfc3c914..2a42eeaf76 100644 --- a/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc @@ -29,7 +29,14 @@ class MirrorPadOp : public XlaOpKernel { xla::StatusOr DoMirrorPad(const xla::XlaOp& t, const xla::Shape& original_shape, const xla::LiteralSlice& pad_literal, + const MirrorPadMode mode, xla::XlaBuilder* b) { + // The 
difference in the semantics of REFLECT and SYMMETRIC is that REFLECT + // will not mirror the border values while symmetric does. + // e.g. input is [1, 2, 3] and paddings is [0, 2], then the output is: + // - [1, 2, 3, 2, 1] in reflect mode + // - [1, 2, 3, 3, 2] in symmetric mode. + int64 excluded_edges = mode == MirrorPadMode::REFLECT ? 1 : 0; xla::XlaOp accum = t; for (int64 dimno = xla::ShapeUtil::Rank(original_shape) - 1; dimno >= 0; --dimno) { @@ -39,9 +46,19 @@ class MirrorPadOp : public XlaOpKernel { TF_ASSIGN_OR_RETURN(int64 rhs_padding, pad_literal.GetIntegralAsS64({dimno, 1})); int64 dim_size = original_shape.dimensions(dimno); - auto lhs_pad = xla::SliceInDim(t_rev, dim_size - 1 - lhs_padding, - dim_size - 1, 1, dimno); - auto rhs_pad = xla::SliceInDim(t_rev, 1, 1 + rhs_padding, 1, dimno); + + // Padding amounts on each side must be no more than the size of the + // original shape. + TF_RET_CHECK(lhs_padding >= 0 && + lhs_padding <= dim_size - excluded_edges); + TF_RET_CHECK(rhs_padding >= 0 && + rhs_padding <= dim_size - excluded_edges); + + auto lhs_pad = + xla::SliceInDim(t_rev, dim_size - excluded_edges - lhs_padding, + dim_size - excluded_edges, 1, dimno); + auto rhs_pad = xla::SliceInDim(t_rev, excluded_edges, + excluded_edges + rhs_padding, 1, dimno); accum = xla::ConcatInDim(b, {lhs_pad, accum, rhs_pad}, dimno); } return accum; @@ -53,9 +70,10 @@ class MirrorPadOp : public XlaOpKernel { MirrorPadMode mode; OP_REQUIRES_OK(ctx, GetNodeAttr(def(), "mode", &mode)); - OP_REQUIRES(ctx, mode == MirrorPadMode::REFLECT, - xla::Unimplemented( - "Only REFLECT MirrorPad mode is currently supported")); + OP_REQUIRES( + ctx, mode == MirrorPadMode::REFLECT || mode == MirrorPadMode::SYMMETRIC, + xla::Unimplemented("Unsupported MirrorPad mode. 
Only SYMMETRIC and " + "REFLECT modes are currently supported")); const int dims = input_shape.dims(); OP_REQUIRES( @@ -83,7 +101,7 @@ class MirrorPadOp : public XlaOpKernel { xla::StatusOr in0_shape = b->GetShape(in0); OP_REQUIRES(ctx, in0_shape.ok(), in0_shape.status()); xla::StatusOr accum_status = - DoMirrorPad(in0, in0_shape.ValueOrDie(), pad_literal, b); + DoMirrorPad(in0, in0_shape.ValueOrDie(), pad_literal, mode, b); OP_REQUIRES_OK(ctx, accum_status.status()); -- GitLab From 0e8e9c48fc1d59e13c567fbc869bb2dc55d595a8 Mon Sep 17 00:00:00 2001 From: Thomas Joerg Date: Tue, 28 Aug 2018 05:08:13 -0700 Subject: [PATCH 208/598] Fix Typo. PiperOrigin-RevId: 210524100 --- tensorflow/compiler/xla/service/gpu/multi_output_fusion.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.h b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.h index 67ca5d49ee..f0b4d67ab8 100644 --- a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.h +++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.h @@ -22,7 +22,7 @@ namespace xla { namespace gpu { // Multi-output fusion of sibling and producer-consumer instructions for the -// Jellyfish backend. +// GPU backend. class GpuMultiOutputFusion : public MultiOutputFusion { public: GpuMultiOutputFusion(); -- GitLab From 57919740bf151cb6395aa60e30404ee9caa066d6 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 28 Aug 2018 05:23:18 -0700 Subject: [PATCH 209/598] Domain tuple sharding propagation from users instead of from operands. 
PiperOrigin-RevId: 210525464 --- .../compiler/xla/service/hlo_domain_map.cc | 21 +- .../compiler/xla/service/hlo_domain_map.h | 11 +- .../xla/service/hlo_domain_metadata.h | 5 +- .../compiler/xla/service/hlo_domain_test.cc | 118 +++++++- .../xla/service/hlo_sharding_metadata.cc | 279 ++++++++++++------ 5 files changed, 322 insertions(+), 112 deletions(-) diff --git a/tensorflow/compiler/xla/service/hlo_domain_map.cc b/tensorflow/compiler/xla/service/hlo_domain_map.cc index edf0073f30..8b2846e0c2 100644 --- a/tensorflow/compiler/xla/service/hlo_domain_map.cc +++ b/tensorflow/compiler/xla/service/hlo_domain_map.cc @@ -72,6 +72,11 @@ Status HloDomainMap::TryProcessEmptyDomain(HloInstruction* instruction) { } Status HloDomainMap::Populate(HloComputation* computation) { + InstructionOrderMap instructions_post_order; + int64 count = 0; + for (HloInstruction* instruction : computation->MakeInstructionPostOrder()) { + instructions_post_order.insert(std::make_pair(instruction, count++)); + } for (HloInstruction* instruction : computation->instructions()) { if (IsDomainInstruction(instruction)) { // If this is a kDomain of the kind we are currently processing, check @@ -85,7 +90,7 @@ Status HloDomainMap::Populate(HloComputation* computation) { continue; } TF_ASSIGN_OR_RETURN(std::unique_ptr domain, - CreateDomain(instruction)); + CreateDomain(instruction, instructions_post_order)); TF_RETURN_IF_ERROR(InsertDomain(std::move(domain))); } return Status::OK(); @@ -143,10 +148,12 @@ Status HloDomainMap::ExpandDomain(HloInstruction* instruction, } StatusOr> HloDomainMap::CreateDomain( - HloInstruction* instruction) const { + HloInstruction* instruction, + const InstructionOrderMap& instructions_order) const { auto domain = absl::make_unique(); TF_RETURN_IF_ERROR(ExpandDomain(instruction, domain.get())); - domain->instructions = MakeNonDomainInstructions(domain->reach_set); + domain->instructions = + MakeNonDomainInstructions(domain->reach_set, instructions_order); return 
std::move(domain); } @@ -168,7 +175,8 @@ bool HloDomainMap::IsDomainInstruction(HloInstruction* instruction) const { /* static */ std::vector HloDomainMap::MakeNonDomainInstructions( - const tensorflow::gtl::FlatSet& instruction_set) { + const tensorflow::gtl::FlatSet& instruction_set, + const InstructionOrderMap& instructions_order) { std::vector instructions; instructions.reserve(instruction_set.size()); for (HloInstruction* instruction : instruction_set) { @@ -176,9 +184,10 @@ HloDomainMap::MakeNonDomainInstructions( instructions.push_back(instruction); } } + // sort instructions according to instructions_order std::sort(instructions.begin(), instructions.end(), - [](HloInstruction* a, HloInstruction* b) { - return a->unique_id() < b->unique_id(); + [&instructions_order](HloInstruction* a, HloInstruction* b) { + return instructions_order.at(a) < instructions_order.at(b); }); return instructions; } diff --git a/tensorflow/compiler/xla/service/hlo_domain_map.h b/tensorflow/compiler/xla/service/hlo_domain_map.h index 1ca7159725..633109249a 100644 --- a/tensorflow/compiler/xla/service/hlo_domain_map.h +++ b/tensorflow/compiler/xla/service/hlo_domain_map.h @@ -70,6 +70,11 @@ class HloDomainMap { int64 GetDomainId(HloInstruction* instruction) const; private: + // Map used for representing instruction ordering, i.e. + // order_map[a] < order_map[b] means a must be ordered before b. + using InstructionOrderMap = + tensorflow::gtl::FlatMap; + HloDomainMap(string domain_kind) : domain_kind_(std::move(domain_kind)) {} // Check if the kDomain instruction is facing (via its operand link) another @@ -95,12 +100,14 @@ class HloDomainMap { // Creates a domain data structure using the ExpandDomain() API. StatusOr> CreateDomain( - HloInstruction* instruction) const; + HloInstruction* instruction, + const InstructionOrderMap& instructions_order) const; // Out of an instruction set, returns a vector of all the ones which are not // a kDomain kind. 
static std::vector MakeNonDomainInstructions( - const tensorflow::gtl::FlatSet& instruction_set); + const tensorflow::gtl::FlatSet& instruction_set, + const InstructionOrderMap& instructions_order); string domain_kind_; std::vector> instruction_domains_; diff --git a/tensorflow/compiler/xla/service/hlo_domain_metadata.h b/tensorflow/compiler/xla/service/hlo_domain_metadata.h index 575149c8b8..6c142ee474 100644 --- a/tensorflow/compiler/xla/service/hlo_domain_metadata.h +++ b/tensorflow/compiler/xla/service/hlo_domain_metadata.h @@ -44,7 +44,10 @@ class DomainMetadata { // two domains of different kind intersect each other. tensorflow::gtl::FlatSet reach_set; - // The same instructions in reach_set, but purged from kDomain instructions. + // The same instructions in reach_set, but purged from kDomain instructions + // and ordered according to their computation graph post-order, i.e. + // if instructions[pos_a] depends on instructions[pos_b], then pos_a > + // pos_b. std::vector instructions; // If we consider a graph edge as an arrow oriented from the operand to the diff --git a/tensorflow/compiler/xla/service/hlo_domain_test.cc b/tensorflow/compiler/xla/service/hlo_domain_test.cc index 79e78ee2d0..c8e0a9e289 100644 --- a/tensorflow/compiler/xla/service/hlo_domain_test.cc +++ b/tensorflow/compiler/xla/service/hlo_domain_test.cc @@ -350,7 +350,8 @@ ENTRY entry { token = token[] after-all() infeed = ((f32[4], f32[4]), token[]) infeed(token), sharding={{maximal device=1}, {maximal device=0}, {maximal device=0}} - infeed.data = (f32[4], f32[4]) get-tuple-element(infeed), index=0 + infeed.data = (f32[4], f32[4]) get-tuple-element(infeed), index=0, + sharding={{maximal device=1}, {maximal device=0}} gte0 = f32[4] get-tuple-element(infeed.data), index=0 gte1 = f32[4] get-tuple-element(infeed.data), index=1 copy0 = f32[4] copy(gte0) @@ -384,11 +385,8 @@ ENTRY entry { // \ / // TUPLE // | - HloInstruction* infeed = FindInstruction(module, "infeed"); - ASSERT_NE(infeed, 
nullptr); - HloInstruction* infeed_data = - infeed->parent()->AddInstruction(HloInstruction::CreateGetTupleElement( - ShapeUtil::GetTupleElementShape(infeed->shape(), 0), infeed, 0)); + HloInstruction* infeed_data = FindInstruction(module, "infeed.data"); + ASSERT_NE(infeed_data, nullptr); auto infeed_data_users = infeed_data->users(); HloInstruction* new_gte0 = infeed_data->parent()->AddInstruction( @@ -496,6 +494,7 @@ TEST_F(HloDomainTest, DumpParseNullSharding) { ASSERT_TRUE(ParseModule(hlo_string).status().ok()); } +// Tuple inputs are domain instructions. TEST_F(HloDomainTest, DomainTuple) { const char* const hlo_string = R"( HloModule Module @@ -503,7 +502,8 @@ HloModule Module ENTRY entry { p0 = f32[4] parameter(0), sharding={maximal device=0} cst = u32[] constant(0), sharding={maximal device=1} - tpl = (u32[], f32[4]) tuple(cst, p0), sharding={{maximal device=1}, {maximal device=0}} + tpl = (u32[], f32[4]) tuple(cst, p0), + sharding={{maximal device=1}, {maximal device=0}} ROOT gte = f32[4] get-tuple-element(tpl), index=1, sharding={maximal device=0} } )"; @@ -588,5 +588,109 @@ ENTRY %entry (p0: (f32[4], f32[4])) -> (f32[4], f32[4], f32[4]) { EXPECT_FALSE(HasDomainEdge(module, "d", "c")); } +// Emulate instructions inserted at top and bottom within nested tuple domain. 
+TEST_F(HloDomainTest, DomainTupleTopBottomInsert) { + const char* const hlo_string = R"( +HloModule Module + +ENTRY entry { + p0 = f32[4] parameter(0), sharding={maximal device=1} + p1 = (f32[5], f32[6]) parameter(1), + sharding={{maximal device=1}, {maximal device=0}} + tuple.0 = (f32[4], (f32[5], f32[6])) tuple(p0, p1), + sharding={{maximal device=1}, {maximal device=1}, {maximal device=0}} + ROOT res = (f32[5], f32[6]) get-tuple-element(tuple.0), index=1, + sharding={{maximal device=1}, {maximal device=0}} +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string)); + + HloDomainIsolator isolator(ShardingDomainCreator{}); + TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module)); + EXPECT_TRUE(isolator_changed); + + // Clear sharding of tuple.0 instruction, in order to test domain sharding + // application. + auto tuple0 = FindInstruction(module, "tuple.0"); + tuple0->clear_sharding(); + + // Insert the following instructons above and below tuple.0, to emulate other + // passes effects: + // COPY.0 + // \ / + // TUPLE.0 + // / \ + // COPY.1 \ + // / \ + // GTE.0 GTE.1 + // | | + // | COPY.2 + // \ / + // \ / + // TUPLE.1 + // | + auto tuple0_users = tuple0->users(); + auto computation = tuple0->parent(); + HloInstruction* copy0 = computation->AddInstruction( + HloInstruction::CreateUnary(tuple0->operand(1)->shape(), HloOpcode::kCopy, + tuple0->mutable_operand(1))); + TF_EXPECT_OK(tuple0->ReplaceOperandWith(1, copy0)); + + HloInstruction* copy1 = computation->AddInstruction( + HloInstruction::CreateUnary(tuple0->shape(), HloOpcode::kCopy, tuple0)); + HloInstruction* gte0 = + computation->AddInstruction(HloInstruction::CreateGetTupleElement( + ShapeUtil::GetTupleElementShape(copy1->shape(), 0), copy1, 0)); + HloInstruction* gte1 = + computation->AddInstruction(HloInstruction::CreateGetTupleElement( + ShapeUtil::GetTupleElementShape(tuple0->shape(), 1), tuple0, 1)); + HloInstruction* copy2 = computation->AddInstruction( + 
HloInstruction::CreateUnary(gte1->shape(), HloOpcode::kCopy, gte1)); + HloInstruction* tuple1 = + computation->AddInstruction(HloInstruction::CreateTuple({gte0, copy2})); + + for (HloInstruction* user : tuple0_users) { + TF_EXPECT_OK(tuple0->ReplaceUseWith(user, tuple1)); + } + + HloDomainRemover remover(ShardingMetadata::KindName(), + ShardingMetadata::NormalizeShardingDomain); + TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module)); + EXPECT_TRUE(remover_changed); + + EXPECT_TRUE(tuple0->has_sharding()); + EXPECT_EQ(HloSharding::Tuple(tuple0->shape(), {HloSharding::AssignDevice(1), + HloSharding::AssignDevice(1), + HloSharding::AssignDevice(0)}), + tuple0->sharding()); + + EXPECT_TRUE(copy0->has_sharding()); + EXPECT_EQ(HloSharding::Tuple(copy0->shape(), {HloSharding::AssignDevice(1), + HloSharding::AssignDevice(0)}), + copy0->sharding()); + + // copy1 has partial information only from gte.0, so in the end it gets no + // sharding at all. During propagation it does propagate the information from + // gte.0 though, enabling Tuple.0 to be fully sharded. 
+ EXPECT_FALSE(copy1->has_sharding()); + + EXPECT_TRUE(gte0->has_sharding()); + EXPECT_EQ(HloSharding::AssignDevice(1), gte0->sharding()); + + EXPECT_TRUE(gte1->has_sharding()); + EXPECT_EQ(HloSharding::Tuple(gte1->shape(), {HloSharding::AssignDevice(1), + HloSharding::AssignDevice(0)}), + gte1->sharding()); + + EXPECT_TRUE(copy2->has_sharding()); + EXPECT_EQ(HloSharding::Tuple(copy2->shape(), {HloSharding::AssignDevice(1), + HloSharding::AssignDevice(0)}), + copy2->sharding()); + + EXPECT_TRUE(tuple1->has_sharding()); + EXPECT_EQ(tuple0->sharding(), tuple1->sharding()); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc b/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc index a9b3b66934..6e9b96488c 100644 --- a/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc +++ b/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc @@ -24,6 +24,23 @@ namespace xla { namespace { +// AssignmentKind and kUnassignedDevice are used during tuple domain sharding +// propagation in order to distinguish among three cases: +// kUnassigned: no assignment has occurred +// kAssigned: at least an assignment has occurred +// kConflict: no assignment has occurred because of conflicting propagations, +// which occurs when multiple users of an instruction have different +// shardings. +enum class AssignmentKind { kUnassigned, kAssigned, kConflict }; + +// kUnassignedDevice can only be assigned to tuple leaf shardings to indicate +// absence of sharding information for that particular sub-sharding during +// sharding propagation. It is used to be able to express tuple shardings with +// partial information. At the end of the propagation the sharding of +// tuple-shaped instructions using kUnassignedDevice's is cleared. +// TODO(b/112883246): Centralized enum of reserved devices. 
+constexpr int64 kUnassignedDevice = -2; + struct PassThrough { PassThrough(HloInstruction* user, HloInstruction* operand) : user(user), operand(operand) {} @@ -147,108 +164,174 @@ Status ApplyDomainSingleSharding(const DomainMetadata::Domain& domain, return Status::OK(); } -// Retrieves the sharding of a tuple shaped instruction in form of a ShapeTree. -// If the instruction has no sharding, a ShapeTree with HloSharding::Replicate() -// sharding will be returned. -ShapeTree GetTupleSharding(HloInstruction* tuple) { - if (tuple->has_sharding()) { - return tuple->sharding().GetAsShapeTree(tuple->shape()); +// Return the ShapeTree of the user argument. The user argument +// is assumed to be a user of the instruction argument. +// If user is a tuple instruction, return the tuple subsharding corresponding to +// the operand matching the instruction argument, because that is the +// subsharding corresponding to instruction. +ShapeTree GetShardingTreeFromUser( + const HloInstruction& instruction, const HloInstruction& user) { + if (user.opcode() == HloOpcode::kTuple) { + return user.sharding() + .GetSubSharding(user.shape(), {user.operand_index(&instruction)}) + .GetAsShapeTree(instruction.shape()); + } + return user.sharding().GetAsShapeTree(user.shape()); +} + +// Assign rhs to lhs. If rhs is unassigned (assigned to kUnassignedDevice) +// then no assignment is made. Therefore kUnassignedDevice is never propagated. +// kConflict is returned if lhs is already assigned and rhs is assigned to a +// different device. +StatusOr AssignLeafSharding(HloSharding* lhs, + const HloSharding& rhs) { + TF_RET_CHECK(!lhs->IsTuple() && !rhs.IsTuple()); + if (rhs.UsesDevice(kUnassignedDevice)) { + return AssignmentKind::kUnassigned; + } + if (lhs->UsesDevice(kUnassignedDevice)) { + *lhs = rhs; + return AssignmentKind::kAssigned; } - return ShapeTree(tuple->shape(), HloSharding::Replicate()); + return lhs->UniqueDevice() != rhs.UniqueDevice() + ? 
AssignmentKind::kConflict + : AssignmentKind::kUnassigned; } -// Retrieves the sharding of operand, asked from a user instruction which is -// within domain. If operand is a kDomain, it means that sharding argument is -// the operand sharding, otherwise the operand's own sharding will be returned. -const HloSharding* GetOperandSharding(const HloInstruction* operand, +// Assigns the whole rhs tree to lhs_tree, starting at lhs_it. +// In case of conflicting assignment AssignmentKind::kConflict is returned. In +// this case lhs_tree is partially assigned, up to the conflicting leaf. It is +// up to the caller to discard the partial assignment in case of conflict. +StatusOr AssignTreeSharding( + ShapeTree* lhs_tree, ShapeTree::iterator lhs_it, + const ShapeTree& rhs_tree) { + AssignmentKind assigned = AssignmentKind::kUnassigned; + auto rhs_it = rhs_tree.begin(); + for (; lhs_it != lhs_tree->end() && rhs_it != rhs_tree.end(); + ++lhs_it, ++rhs_it) { + // TODO(b/112885211): Add ShapeTree::IsLeaf(const ShapeTreeIterator &it) + if (rhs_tree.IsLeaf(rhs_it->first)) { + TF_RET_CHECK(lhs_tree->IsLeaf(lhs_it->first)); + TF_ASSIGN_OR_RETURN(AssignmentKind sub_assigned, + AssignLeafSharding(&lhs_it->second, rhs_it->second)); + if (sub_assigned == AssignmentKind::kConflict) { + // In case of conflict we return conflict to the caller. At this point + // partial assignments to lhs_tree may have been made already. It is up + // to the caller to discard the partial assignment in case of conflict. + return AssignmentKind::kConflict; + } else if (sub_assigned == AssignmentKind::kAssigned) { + assigned = sub_assigned; + } + } + } + TF_RET_CHECK(rhs_it == rhs_tree.end()); + return assigned; +} + +StatusOr ApplyShardingFromUsers(HloInstruction* instruction, const DomainMetadata::Domain& domain, - const HloSharding& sharding) { - // Here the user of operand is within the domain instruction set, and since it - // is user of operand, we need to look into the enter_domains set. 
If this is - // not a kDomain within the user domains set, then return the operand - // sharding, if any. - if (operand->opcode() != HloOpcode::kDomain || - domain.enter_domains.count(const_cast(operand)) == 0) { - return operand->has_sharding() ? &operand->sharding() : nullptr; + const HloSharding& domain_sharding) { + if (instruction->users().empty()) { + // No sharding from users, use domain_sharding, after checking + // compatibility. + TF_RET_CHECK(ShapeUtil::IsTuple(instruction->shape()) && + ShapeUtil::GetLeafCount(instruction->shape()) == + domain_sharding.tuple_elements().size()); + instruction->set_sharding(domain_sharding); + return true; + } + AssignmentKind assigned = AssignmentKind::kUnassigned; + // The sharding_tree leaves are initialized to kUnassignedDevice. Only Tuple + // subshardings can result in a final sharding assignment containing + // kUnassignedDevice leaves, in case some tuple indexes are not used, or are + // used by users that don't have a sharding. + // Non-tuple shardings are either assigned to a real sharding, or are not + // assigned at all. As such they will never get assigned to kUnassignedDevice. + // In any case, kUnassignedDevice is never propagated, from the implementation + // of AssignLeafSharding. + ShapeTree sharding_tree( + instruction->shape(), HloSharding::AssignDevice(kUnassignedDevice)); + for (HloInstruction* user : instruction->users()) { + if (user->opcode() == HloOpcode::kDomain && + domain.exit_domains.count(const_cast(user)) > 0) { + // If a user is a domain and it is registered in the domain exits, then + // the instruction sharding is taken directly from the domain, and no + // further users need to be visited. 
+ instruction->set_sharding(domain_sharding); + return true; + } + if (!user->has_sharding()) { + continue; + } + AssignmentKind sub_assigned = AssignmentKind::kUnassigned; + ShapeTree user_sharding_tree = + GetShardingTreeFromUser(*instruction, *user); + if (ShapeUtil::IsTuple(instruction->shape())) { + // For tuple-shaped instructions collect individual tuple subshardings + // from the uses, and then combine them into the tuple sharding. + // If the user is a GTE its sharding concerns only the subtree of + // sharding_tree at index user->tuple_index, otherwise the whole + // sharding_tree is affected. + ShapeTree::iterator sharding_tree_begin = + user->opcode() == HloOpcode::kGetTupleElement + ? sharding_tree.find({user->tuple_index()}) + : sharding_tree.begin(); + TF_ASSIGN_OR_RETURN( + sub_assigned, AssignTreeSharding(&sharding_tree, sharding_tree_begin, + user_sharding_tree)); + } else { + // Non-tuple shape: assign common users sharding. + TF_RET_CHECK(user_sharding_tree.leaf_count() == 1) + << "Expected non-tuple user sharding"; + TF_ASSIGN_OR_RETURN( + sub_assigned, + AssignTreeSharding(&sharding_tree, sharding_tree.begin(), + user_sharding_tree)); + } + + if (sub_assigned == AssignmentKind::kConflict) { + // In case of conflict we don't assign any sharding. + return false; + } else if (sub_assigned == AssignmentKind::kAssigned) { + assigned = sub_assigned; + } + } + + if (assigned == AssignmentKind::kAssigned) { + if (ShapeUtil::IsTuple(instruction->shape())) { + instruction->set_sharding(HloSharding::Tuple(sharding_tree)); + } else { + TF_RET_CHECK(sharding_tree.leaf_count() == 1); + instruction->set_sharding(sharding_tree.leaf_begin()->second); + } + return true; } - // At this point operand is a kDomain of the currently processed domain, so we - // can refer to sharding as the domain sharding. 
- return &sharding; + return false; } // Tries to propagate the sharding information into the instructions that are -// part of the domain, in a post order manner (operand propagate to user). +// part of the domain, in a reverse post order manner (users propoagate to +// instruction). StatusOr ApplyDomainShardingPass(const DomainMetadata::Domain& domain, - const HloSharding& sharding) { + const HloSharding& domain_sharding) { int64 assigned = 0; - for (HloInstruction* instruction : domain.instructions) { + // domain.instructions are ordered in a post-order manner. As we do + // user->operand propagation we process instructions in reverse order. In so + // doing we are guaranteed to process all users before their operands. + for (auto it = domain.instructions.rbegin(); it != domain.instructions.rend(); + ++it) { + HloInstruction* instruction = *it; if (instruction->has_sharding()) { continue; } - if (instruction->opcode() == HloOpcode::kGetTupleElement) { - HloInstruction* tuple = instruction->mutable_operand(0); - const HloSharding* tuple_sharding = - GetOperandSharding(tuple, domain, sharding); - if (tuple_sharding != nullptr) { - if (tuple_sharding->IsTuple()) { - HloSharding sub_sharding = tuple_sharding->GetSubSharding( - tuple->shape(), {instruction->tuple_index()}); - VLOG(4) << " " << instruction->name() << " to sharding " - << sub_sharding; - instruction->set_sharding(sub_sharding); - } else { - SetSingleSharding(instruction, *tuple_sharding); - } - ++assigned; - } - } else if (instruction->opcode() == HloOpcode::kTuple) { - int64 tuple_assigned = 0; - ShapeTree shape_tree = GetTupleSharding(instruction); - for (int64 i = 0; i < instruction->operand_count(); ++i) { - const HloSharding* operand_sharding = - GetOperandSharding(instruction->operand(i), domain, sharding); - if (operand_sharding != nullptr) { - HloSharding operand_subsharding = HloSharding::Replicate(); - if (operand_sharding == &sharding) { - operand_subsharding = - 
sharding.GetSubSharding(instruction->shape(), {i}); - operand_sharding = &operand_subsharding; - } - if (shape_tree.element({i}) != *operand_sharding) { - *shape_tree.mutable_element({i}) = *operand_sharding; - ++tuple_assigned; - } - } - } - if (tuple_assigned > 0) { - HloSharding tuple_sharding = HloSharding::Tuple(shape_tree); - VLOG(4) << " " << instruction->name() << " to sharding " - << tuple_sharding; - instruction->set_sharding(tuple_sharding); - ++assigned; - } - } else { - // If all the operand of the given instruction has the same single device - // assignment, assign that device to this instruction as well. - const HloSharding* common_sharding = nullptr; - for (const HloInstruction* operand : instruction->operands()) { - const HloSharding* operand_sharding = - GetOperandSharding(operand, domain, sharding); - if (operand_sharding != nullptr) { - if (common_sharding != nullptr && - *common_sharding != *operand_sharding) { - common_sharding = nullptr; - break; - } - common_sharding = operand_sharding; - } - } - if (common_sharding != nullptr) { - VLOG(4) << " " << instruction->name() << " to sharding " - << *common_sharding; - instruction->set_sharding(*common_sharding); - ++assigned; - } + // Take the sharding from the users. 
+ TF_ASSIGN_OR_RETURN( + bool instruction_assigned, + ApplyShardingFromUsers(instruction, domain, domain_sharding)); + if (instruction_assigned) { + ++assigned; + VLOG(4) << " " << instruction->name() << " to sharding " + << instruction->sharding(); } } return assigned; @@ -266,18 +349,22 @@ Status ApplyDomainSharding(const DomainMetadata::Domain& domain, return ApplyDomainSingleSharding(domain, *single_sharding); } VLOG(1) << "Assigning non-trivial sharding " << sharding; - for (;;) { - TF_ASSIGN_OR_RETURN(int64 assigned, - ApplyDomainShardingPass(domain, sharding)); - if (assigned == 0) { - break; - } - } + TF_RETURN_IF_ERROR(ApplyDomainShardingPass(domain, sharding).status()); + int64 unassigned = 0; for (HloInstruction* instruction : domain.instructions) { if (!instruction->has_sharding()) { LOG(WARNING) << "Unassigned instruction: " << instruction->ToString(); ++unassigned; + } else { + // Un-set sharding of tuples whose sub-sgardings are assigned to + // kUnassignedDevice. Indeed in case of doubt it is better to leave the + // entire tuple unassigned, and let the device placer decide for it. + if (instruction->sharding().UsesDevice(kUnassignedDevice)) { + TF_RET_CHECK(ShapeUtil::IsTuple(instruction->shape())) + << "Only tuples can have kUnassignedDevice sub shardings"; + instruction->clear_sharding(); + } } } // Should we error out if unassigned > 0? -- GitLab From de1696e9a818646fe6f200db42b150f1b7141900 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 28 Aug 2018 06:36:02 -0700 Subject: [PATCH 210/598] Fix performance of HloComputation::ComputeChannelDependencies Previously it used an std::map containing std::vector's, which added a large overhead to HloComputation::MakeInstructionPostOrder when a model contained a large number of channels. The new implementation replaced it with a FlatMap and an InlineVector, which eliminates a large number of allocations and improves performance by a lot.
PiperOrigin-RevId: 210531816 --- .../compiler/xla/service/hlo_computation.cc | 47 ++++++++++--------- .../compiler/xla/service/hlo_computation.h | 11 ++++- 2 files changed, 34 insertions(+), 24 deletions(-) diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc index 4a59380ed9..c2d0673f49 100644 --- a/tensorflow/compiler/xla/service/hlo_computation.cc +++ b/tensorflow/compiler/xla/service/hlo_computation.cc @@ -319,12 +319,12 @@ void ComputeComputationPostOrder( } } -enum State { kVisiting, kVisited }; +} // namespace -void ComputeInstructionPostOrder( - std::map> channel_dependency_map, +void HloComputation::ComputeInstructionPostOrder( + const HloComputation::ChannelDependencyMap& channel_dependency_map, std::vector* post_order, HloInstruction* root, - tensorflow::gtl::FlatMap* visited) { + tensorflow::gtl::FlatMap* visited) const { std::vector dfs_stack; dfs_stack.push_back(root); while (!dfs_stack.empty()) { @@ -362,20 +362,22 @@ void ComputeInstructionPostOrder( // dependencies. 
switch (current->opcode()) { case HloOpcode::kRecvDone: { - const auto& dependencies = - channel_dependency_map[current->channel_id()]; - for (HloInstruction* op : dependencies) { - dfs_stack.emplace_back(op); + auto it = channel_dependency_map.find(current->channel_id()); + if (it != channel_dependency_map.end()) { + for (HloInstruction* op : it->second) { + dfs_stack.emplace_back(op); + } } break; } case HloOpcode::kCrossReplicaSum: { auto all_reduce_id = current->all_reduce_id(); if (all_reduce_id) { - const auto& dependencies = - channel_dependency_map[all_reduce_id.value()]; - for (HloInstruction* op : dependencies) { - dfs_stack.emplace_back(op); + auto it = channel_dependency_map.find(all_reduce_id.value()); + if (it != channel_dependency_map.end()) { + for (HloInstruction* op : it->second) { + dfs_stack.emplace_back(op); + } } } break; @@ -386,11 +388,9 @@ void ComputeInstructionPostOrder( } } -} // namespace - -std::map> +HloComputation::ChannelDependencyMap HloComputation::ComputeChannelDependencies() const { - std::map> channel_dependency_map; + ChannelDependencyMap channel_dependency_map; for (const auto& instruction : instructions_) { switch (instruction->opcode()) { case HloOpcode::kSend: { @@ -421,7 +421,7 @@ std::vector HloComputation::MakeInstructionPostOrder() const { std::vector post_order; post_order.reserve(instruction_count()); std::vector trace_instructions; - tensorflow::gtl::FlatMap visited; + tensorflow::gtl::FlatMap visited; for (auto& instruction : instructions_) { if (instruction->opcode() == HloOpcode::kTrace) { // Trace instructions aren't handled by the DFS visitor. 
Add trace @@ -746,16 +746,19 @@ std::unique_ptr HloComputation::ComputeReachability() switch (hlo->opcode()) { case HloOpcode::kRecvDone: { - const auto& dependencies = channel_dependency_map[hlo->channel_id()]; - absl::c_copy(dependencies, std::back_inserter(inputs)); + auto it = channel_dependency_map.find(hlo->channel_id()); + if (it != channel_dependency_map.end()) { + absl::c_copy(it->second, std::back_inserter(inputs)); + } break; } case HloOpcode::kCrossReplicaSum: { auto all_reduce_id = hlo->all_reduce_id(); if (all_reduce_id) { - const auto& dependencies = - channel_dependency_map[all_reduce_id.value()]; - absl::c_copy(dependencies, std::back_inserter(inputs)); + auto it = channel_dependency_map.find(all_reduce_id.value()); + if (it != channel_dependency_map.end()) { + absl::c_copy(it->second, std::back_inserter(inputs)); + } } break; } diff --git a/tensorflow/compiler/xla/service/hlo_computation.h b/tensorflow/compiler/xla/service/hlo_computation.h index 8d9b694977..59016624f7 100644 --- a/tensorflow/compiler/xla/service/hlo_computation.h +++ b/tensorflow/compiler/xla/service/hlo_computation.h @@ -403,8 +403,15 @@ class HloComputation { // instructions. For send&recv pairs it means the send instruction and for // cross-replica-sum the union of the dependencies for all participating // instructions. - std::map> ComputeChannelDependencies() - const; + using ChannelDependencyMap = + tensorflow::gtl::FlatMap>; + ChannelDependencyMap ComputeChannelDependencies() const; + + enum VisitState { kVisiting, kVisited }; + void ComputeInstructionPostOrder( + const HloComputation::ChannelDependencyMap& channel_dependency_map, + std::vector* post_order, HloInstruction* root, + tensorflow::gtl::FlatMap* visited) const; string name_; int64 unique_id_; -- GitLab From 5656c3db01c8d98758c0edeb6934dbd4698f39d1 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 28 Aug 2018 06:56:23 -0700 Subject: [PATCH 211/598] Implementation of floor_div. 
PiperOrigin-RevId: 210533721 --- tensorflow/contrib/lite/build_def.bzl | 1 + .../lite/g3doc/tf_ops_compatibility.md | 12 ++ tensorflow/contrib/lite/kernels/BUILD | 15 ++ tensorflow/contrib/lite/kernels/floor_div.cc | 146 ++++++++++++++++++ .../contrib/lite/kernels/floor_div_test.cc | 90 +++++++++++ .../internal/reference/reference_ops.h | 15 ++ tensorflow/contrib/lite/kernels/register.cc | 2 + .../contrib/lite/testing/generate_examples.py | 13 +- .../testing/generated_examples_zip_test.cc | 9 ++ .../contrib/lite/toco/tflite/operator.cc | 2 + .../contrib/lite/toco/tflite/operator_test.cc | 1 + 11 files changed, 304 insertions(+), 2 deletions(-) create mode 100644 tensorflow/contrib/lite/kernels/floor_div.cc create mode 100644 tensorflow/contrib/lite/kernels/floor_div_test.cc diff --git a/tensorflow/contrib/lite/build_def.bzl b/tensorflow/contrib/lite/build_def.bzl index 30252831a3..fc199f0a0e 100644 --- a/tensorflow/contrib/lite/build_def.bzl +++ b/tensorflow/contrib/lite/build_def.bzl @@ -235,6 +235,7 @@ def generated_test_models(): "exp", "expand_dims", "floor", + "floor_div", "fully_connected", "fused_batch_norm", "gather", diff --git a/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md b/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md index fb9d5f6787..8660d29855 100644 --- a/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md +++ b/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md @@ -854,6 +854,18 @@ Outputs { } ``` +**FLOOR_DIV** + +``` +Inputs { + 0: a list of tensors. + 1: a list of tensors. +} +Outputs { + 0: A tensor of floor_div output tensors. 
+} +``` + And these are TensorFlow Lite operations that are present but not ready for custom models yet: diff --git a/tensorflow/contrib/lite/kernels/BUILD b/tensorflow/contrib/lite/kernels/BUILD index 407d52f0e8..8287115f5c 100644 --- a/tensorflow/contrib/lite/kernels/BUILD +++ b/tensorflow/contrib/lite/kernels/BUILD @@ -172,6 +172,7 @@ cc_library( "expand_dims.cc", "fake_quant.cc", "floor.cc", + "floor_div.cc", "fully_connected.cc", "gather.cc", "hashtable_lookup.cc", @@ -1216,6 +1217,20 @@ tf_cc_test( ], ) +tf_cc_test( + name = "floor_div_test", + size = "small", + srcs = ["floor_div_test.cc"], + tags = ["tflite_not_portable_ios"], + deps = [ + ":builtin_ops", + "//tensorflow/contrib/lite:builtin_op_data", + "//tensorflow/contrib/lite:framework", + "//tensorflow/contrib/lite/kernels:test_util", + "@com_google_googletest//:gtest", + ], +) + filegroup( name = "all_files", srcs = glob( diff --git a/tensorflow/contrib/lite/kernels/floor_div.cc b/tensorflow/contrib/lite/kernels/floor_div.cc new file mode 100644 index 0000000000..3c177ea330 --- /dev/null +++ b/tensorflow/contrib/lite/kernels/floor_div.cc @@ -0,0 +1,146 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" +#include "tensorflow/contrib/lite/kernels/internal/tensor.h" +#include "tensorflow/contrib/lite/kernels/kernel_util.h" +#include "tensorflow/contrib/lite/kernels/op_macros.h" + +namespace tflite { +namespace ops { +namespace builtin { +namespace floor_div { +namespace { + +// Input/output tensor index. +constexpr int kInputTensor1 = 0; +constexpr int kInputTensor2 = 1; +constexpr int kOutputTensor = 0; + +// Op data for floor_div op. +struct OpData { + bool requires_broadcast; +}; + +template +T FloorDiv(T input1, T input2) { + return std::floor(std::divides()(static_cast(input1), + static_cast(input2))); +} + +void* Init(TfLiteContext* context, const char* buffer, size_t length) { + auto* data = new OpData; + data->requires_broadcast = false; + return data; +} + +void Free(TfLiteContext* context, void* buffer) { + delete reinterpret_cast(buffer); +} + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + TF_LITE_ENSURE_EQ(context, NumInputs(node), 2); + TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); + + // Reinterpret the opaque data provided by user.
+ OpData* data = reinterpret_cast(node->user_data); + + const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1); + const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2); + TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + + TF_LITE_ENSURE_EQ(context, input1->type, input2->type); + + const TfLiteType type = input1->type; + if (type != kTfLiteInt32) { + context->ReportError(context, "Currently floor_div only supports int32."); + return kTfLiteError; + } + output->type = type; + + data->requires_broadcast = !HaveSameShapes(input1, input2); + + TfLiteIntArray* output_size = nullptr; + if (data->requires_broadcast) { + TF_LITE_ENSURE_OK(context, CalculateShapeForBroadcast( + context, input1, input2, &output_size)); + } else { + output_size = TfLiteIntArrayCopy(input1->dims); + } + + return context->ResizeTensor(context, output, output_size); +} + +template +TfLiteStatus EvalImpl(TfLiteContext* context, bool requires_broadcast, + const TfLiteTensor* input1, const TfLiteTensor* input2, + TfLiteTensor* output) { + const T* denominator_data = GetTensorData(input2); + + // Validate the denominator. 
+ for (int i = 0; i < NumElements(input2); ++i) { + if (std::equal_to()(denominator_data[i], 0)) { + context->ReportError(context, "Division by 0"); + return kTfLiteError; + } + } + if (requires_broadcast) { + reference_ops::BroadcastBinaryFunction( + GetTensorData(input1), GetTensorDims(input1), denominator_data, + GetTensorDims(input2), GetTensorData(output), GetTensorDims(output), + FloorDiv); + } else { + reference_ops::BinaryFunction( + GetTensorData(input1), GetTensorDims(input1), + GetTensorData(input2), GetTensorDims(input2), + GetTensorData(output), GetTensorDims(output), FloorDiv); + } + + return kTfLiteOk; +} + +TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + OpData* data = reinterpret_cast(node->user_data); + + const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1); + const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2); + TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + + switch (input1->type) { + case kTfLiteInt32: { + return EvalImpl(context, data->requires_broadcast, input1, + input2, output); + } + default: { + context->ReportError(context, "Currently floor_div only supports int32."); + return kTfLiteError; + } + } +} + +} // namespace +} // namespace floor_div + +TfLiteRegistration* Register_FLOOR_DIV() { + // Init, Free, Prepare, Eval are satisfying the Interface required by + // TfLiteRegistration. + static TfLiteRegistration r = {floor_div::Init, floor_div::Free, + floor_div::Prepare, floor_div::Eval}; + return &r; +} + +} // namespace builtin +} // namespace ops +} // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/floor_div_test.cc b/tensorflow/contrib/lite/kernels/floor_div_test.cc new file mode 100644 index 0000000000..eea69b61ac --- /dev/null +++ b/tensorflow/contrib/lite/kernels/floor_div_test.cc @@ -0,0 +1,90 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include +#include "tensorflow/contrib/lite/interpreter.h" +#include "tensorflow/contrib/lite/kernels/register.h" +#include "tensorflow/contrib/lite/kernels/test_util.h" +#include "tensorflow/contrib/lite/model.h" + +namespace tflite { +namespace { + +using ::testing::ElementsAre; + +template +class FloorDivModel : public SingleOpModel { + public: + FloorDivModel(const TensorData& input1, const TensorData& input2, + const TensorData& output) { + input1_ = AddInput(input1); + input2_ = AddInput(input2); + output_ = AddOutput(output); + SetBuiltinOp(BuiltinOperator_FLOOR_DIV, BuiltinOptions_FloorDivOptions, + CreateFloorDivOptions(builder_).Union()); + BuildInterpreter({GetShape(input1_), GetShape(input2_)}); + } + + int input1() { return input1_; } + int input2() { return input2_; } + + std::vector GetOutput() { return ExtractVector(output_); } + std::vector GetOutputShape() { return GetTensorShape(output_); } + + private: + int input1_; + int input2_; + int output_; +}; + +TEST(PowOpModel, Simple) { + FloorDivModel model({TensorType_INT32, {1, 2, 2, 1}}, + {TensorType_INT32, {1, 2, 2, 1}}, + {TensorType_INT32, {}}); + model.PopulateTensor(model.input1(), {10, 9, 11, 3}); + model.PopulateTensor(model.input2(), {2, 2, 3, 4}); + model.Invoke(); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 2, 2, 1)); + EXPECT_THAT(model.GetOutput(), ElementsAre(5, 4, 3, 
0)); +} + +TEST(PowOpModel, NegativeValue) { + FloorDivModel model({TensorType_INT32, {1, 2, 2, 1}}, + {TensorType_INT32, {1, 2, 2, 1}}, + {TensorType_INT32, {}}); + model.PopulateTensor(model.input1(), {10, -9, -11, 7}); + model.PopulateTensor(model.input2(), {2, 2, -3, -4}); + model.Invoke(); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 2, 2, 1)); + EXPECT_THAT(model.GetOutput(), ElementsAre(5, -5, 3, -2)); +} + +TEST(PowOpModel, BroadcastFloorDiv) { + FloorDivModel model({TensorType_INT32, {1, 2, 2, 1}}, + {TensorType_INT32, {1}}, {TensorType_INT32, {}}); + model.PopulateTensor(model.input1(), {10, -9, -11, 7}); + model.PopulateTensor(model.input2(), {-3}); + model.Invoke(); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 2, 2, 1)); + EXPECT_THAT(model.GetOutput(), ElementsAre(-4, 3, 3, -3)); +} + +} // namespace +} // namespace tflite + +int main(int argc, char** argv) { + ::tflite::LogToStderr(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h index f67d0a8752..b47a7f3b45 100644 --- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h +++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h @@ -4949,6 +4949,21 @@ inline void BroadcastBinaryFunction(const T1* input1_data, DimsToShape(output_dims), output_data, func); } +// Legacy Dims<4> version. +// +// R: Result type. T1: Input 1 type. T2: Input 2 type. +// TODO(renjieliu): Refactor other binary functions to use this one. 
+template +inline void BinaryFunction(const T1* input1_data, const Dims<4>& input1_dims, + const T2* input2_data, const Dims<4>& input2_dims, + R* output_data, const Dims<4>& output_dims, + R (*func)(T1, T2)) { + const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims); + for (int i = 0; i < flat_size; ++i) { + output_data[i] = func(input1_data[i], input2_data[i]); + } +} + } // namespace reference_ops } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/register.cc b/tensorflow/contrib/lite/kernels/register.cc index 175dfab210..7b859dc332 100644 --- a/tensorflow/contrib/lite/kernels/register.cc +++ b/tensorflow/contrib/lite/kernels/register.cc @@ -115,6 +115,7 @@ TfLiteRegistration* Register_LOGICAL_OR(); TfLiteRegistration* Register_LOGICAL_AND(); TfLiteRegistration* Register_LOGICAL_NOT(); TfLiteRegistration* Register_UNPACK(); +TfLiteRegistration* Register_FLOOR_DIV(); TfLiteStatus UnsupportedTensorFlowOp(TfLiteContext* context, TfLiteNode* node) { context->ReportError( @@ -239,6 +240,7 @@ BuiltinOpResolver::BuiltinOpResolver() { AddBuiltin(BuiltinOperator_LOGICAL_AND, Register_LOGICAL_AND()); AddBuiltin(BuiltinOperator_LOGICAL_NOT, Register_LOGICAL_NOT()); AddBuiltin(BuiltinOperator_UNPACK, Register_UNPACK()); + AddBuiltin(BuiltinOperator_FLOOR_DIV, Register_FLOOR_DIV()); // TODO(andrewharp, ahentz): Move these somewhere more appropriate so that // custom ops aren't always included by default. 
diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py index cbd6a2a7f0..57134ccd15 100644 --- a/tensorflow/contrib/lite/testing/generate_examples.py +++ b/tensorflow/contrib/lite/testing/generate_examples.py @@ -780,10 +780,15 @@ def make_binary_op_tests(zip_path, binary_operator): "input_shape_2": [[5]], "activation": [False, True] }, { - "dtype": [tf.float32], + "dtype": [tf.float32, tf.int32], "input_shape_1": [[1, 3, 4, 3]], "input_shape_2": [[3]], - "activation": [True] + "activation": [True, False] + }, { + "dtype": [tf.float32, tf.int32], + "input_shape_1": [[3]], + "input_shape_2": [[1, 3, 4, 3]], + "activation": [True, False] }, { "dtype": [tf.float32], "input_shape_1": [[]], @@ -1098,6 +1103,10 @@ def make_pow_tests(zip_path): make_binary_op_tests(zip_path, tf.pow) +def make_floor_div_tests(zip_path): + make_binary_op_tests(zip_path, tf.floor_div) + + def make_gather_tests(zip_path): """Make a set of tests to do gather.""" diff --git a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc index e67fee2a1c..37c7ae0e1c 100644 --- a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc +++ b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc @@ -101,6 +101,15 @@ std::map kBrokenTests = { "77546240"}, {R"(^\/arg_min_max.*axis_is_last_dim=False.*input_shape=\[.,.\])", "77546240"}, + + // No support for float. + {R"(^\/floor_div.*dtype=tf\.float32)", "112859002"}, + + // Relu does not support int32. + // These test cases append a Relu after the tested ops when + // activation=True. The tests are failing since Relu doesn't support int32.
+ {R"(^\/div.*activation=True.*dtype=tf\.int32)", "112968789"}, + {R"(^\/floor_div.*activation=True.*dtype=tf\.int32)", "112968789"}, }; // Allows test data to be unarchived into a temporary directory and makes diff --git a/tensorflow/contrib/lite/toco/tflite/operator.cc b/tensorflow/contrib/lite/toco/tflite/operator.cc index f687e9689e..a314c8d53a 100644 --- a/tensorflow/contrib/lite/toco/tflite/operator.cc +++ b/tensorflow/contrib/lite/toco/tflite/operator.cc @@ -1458,6 +1458,8 @@ std::vector> BuildOperatorList() { "LOGICAL_AND", OperatorType::kLogicalAnd)); ops.emplace_back(new SimpleOperator( "LOGICAL_NOT", OperatorType::kLogicalNot)); + ops.emplace_back(new SimpleOperator( + "FLOOR_DIV", OperatorType::kFloorDiv)); // Element-wise operator ops.push_back( MakeUnique>("SIN", OperatorType::kSin)); diff --git a/tensorflow/contrib/lite/toco/tflite/operator_test.cc b/tensorflow/contrib/lite/toco/tflite/operator_test.cc index 6da9317e4f..519a3a4e01 100644 --- a/tensorflow/contrib/lite/toco/tflite/operator_test.cc +++ b/tensorflow/contrib/lite/toco/tflite/operator_test.cc @@ -143,6 +143,7 @@ TEST_F(OperatorTest, SimpleOperators) { OperatorType::kLogicalAnd); CheckSimpleOperator("LOGICAL_NOT", OperatorType::kLogicalNot); + CheckSimpleOperator("FLOOR_DIV", OperatorType::kFloorDiv); } TEST_F(OperatorTest, BuiltinAdd) { -- GitLab From 2cb954e5441605c8668d2aecbf12e324a07b3c89 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 28 Aug 2018 08:29:07 -0700 Subject: [PATCH 212/598] Convert more kernel signatures to use runtime shapes. 
PiperOrigin-RevId: 210544473 --- .../internal/optimized/optimized_ops.h | 149 +++++++++--------- .../internal/reference/reference_ops.h | 137 ++++++++-------- 2 files changed, 150 insertions(+), 136 deletions(-) diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h index 7319636bf5..0cc7a7f2e7 100644 --- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h +++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h @@ -2323,53 +2323,6 @@ inline void SpaceToDepth(const T* input_data, const Dims<4>& input_dims, DimsToShape(output_dims), output_data); } -template -void NonGlobalBatchNormalization( - const float* input_data, const Dims<4>& input_dims, const float* mean_data, - const Dims<4>& mean_dims, const float* multiplier_data, - const Dims<4>& multiplier_dims, const float* offset_data, - const Dims<4>& offset_dims, float* output_data, - const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("NonGlobalBatchNormalization"); - const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int inner_size = MatchingFlatSizeSkipDim( - input_dims, 3, mean_dims, multiplier_dims, offset_dims, output_dims); - - for (int b = 0; b < batches; ++b) { - for (int i = 0; i < inner_size; ++i) { - *output_data = ActivationFunction( - (*input_data - mean_data[i]) * multiplier_data[i] + offset_data[i]); - ++output_data; - ++input_data; - } - } -} - -template -void GlobalBatchNormalization(const float* input_data, - const Dims<4>& input_dims, const float* mean_data, - const Dims<4>& mean_dims, - const float* multiplier_data, - const Dims<4>& multiplier_dims, - const float* offset_data, - const Dims<4>& offset_dims, float* output_data, - const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("GlobalBatchNormalization"); - const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims); - const int depth = - 
MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0, - offset_dims, 0, output_dims, 0); - - for (int i = 0; i < outer_size; ++i) { - for (int c = 0; c < depth; ++c) { - *output_data = ActivationFunction( - (*input_data - mean_data[c]) * multiplier_data[c] + offset_data[c]); - ++output_data; - ++input_data; - } - } -} - inline void Relu(const RuntimeShape& input_shape, const float* input_data, const RuntimeShape& output_shape, float* output_data) { gemmlowp::ScopedProfilingLabel label("Relu (not fused)"); @@ -2379,11 +2332,12 @@ inline void Relu(const RuntimeShape& input_shape, const float* input_data, output = input.cwiseMax(0.0f); } -template -void L2Normalization(const float* input_data, const RuntimeShape& input_shape, - float* output_data, const RuntimeShape& output_shape) { +inline void L2Normalization(const tflite::L2NormalizationParams& op_params, + const RuntimeShape& input_shape, + const float* input_data, + const RuntimeShape& output_shape, + float* output_data) { gemmlowp::ScopedProfilingLabel label("L2Normalization"); - static_assert(Ac == FusedActivationFunctionType::kNone, ""); const int trailing_dim = input_shape.DimensionsCount() - 1; const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); @@ -2404,6 +2358,18 @@ void L2Normalization(const float* input_data, const RuntimeShape& input_shape, } } +// Legacy. +template +void L2Normalization(const float* input_data, const RuntimeShape& input_shape, + float* output_data, const RuntimeShape& output_shape) { + static_assert(Ac == FusedActivationFunctionType::kNone, ""); + tflite::L2NormalizationParams op_params; + // No params need to be set for float. 
+ + L2Normalization(op_params, input_shape, input_data, output_shape, + output_data); +} + inline void GetInvSqrtQuantizedMultiplierExp(int32 input, int32* output_inv_sqrt, int* output_shift) { @@ -2452,16 +2418,18 @@ inline void GetInvSqrtQuantizedMultiplierExp(int32 input, *output_shift *= kReverseShift; } -inline void L2Normalization(const uint8* input_data, +inline void L2Normalization(const tflite::L2NormalizationParams& op_params, const RuntimeShape& input_shape, - int32 input_zero_point, uint8* output_data, - const RuntimeShape& output_shape) { + const uint8* input_data, + const RuntimeShape& output_shape, + uint8* output_data) { gemmlowp::ScopedProfilingLabel label("L2Normalization/8bit"); const int trailing_dim = input_shape.DimensionsCount() - 1; const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); + const int32 input_zero_point = op_params.input_zero_point; for (int i = 0; i < outer_size; ++i) { int32 square_l2_norm = 0; for (int c = 0; c < depth; c++) { @@ -2487,6 +2455,18 @@ inline void L2Normalization(const uint8* input_data, } } +// Legacy. 
+inline void L2Normalization(const uint8* input_data, + const RuntimeShape& input_shape, + int32 input_zero_point, uint8* output_data, + const RuntimeShape& output_shape) { + tflite::L2NormalizationParams op_params; + op_params.input_zero_point = input_zero_point; + + L2Normalization(op_params, input_shape, input_data, output_shape, + output_data); +} + inline void Add(const ArithmeticParams& params, const RuntimeShape& input1_shape, const float* input1_data, const RuntimeShape& input2_shape, const float* input2_data, @@ -4077,29 +4057,28 @@ inline void L2Pool(const PoolParams& params, const RuntimeShape& input_shape, } } -inline void LocalResponseNormalization(const float* input_data, - const Dims<4>& input_dims, int range, - float bias, float alpha, float beta, - float* output_data, - const Dims<4>& output_dims) { +inline void LocalResponseNormalization( + const tflite::LocalResponseNormalizationParams& op_params, + const RuntimeShape& input_shape, const float* input_data, + const RuntimeShape& output_shape, float* output_data) { gemmlowp::ScopedProfilingLabel label("LocalResponseNormalization"); - MatchingFlatSize(input_dims, output_dims); + MatchingFlatSize(input_shape, output_shape); - const auto data_in = MapAsMatrixWithFirstDimAsRows(input_data, input_dims); - auto data_out = MapAsMatrixWithFirstDimAsRows(output_data, output_dims); + const auto data_in = MapAsMatrixWithLastDimAsRows(input_data, input_shape); + auto data_out = MapAsMatrixWithLastDimAsRows(output_data, output_shape); // Carry out local response normalization, vector by vector. // Since the data are stored column major, making row-wise operation // probably not memory efficient anyway, we do an explicit for loop over // the columns. 
- const int double_range = range * 2; + const int double_range = op_params.range * 2; Eigen::VectorXf padded_square(data_in.rows() + double_range); padded_square.setZero(); for (int r = 0; r < data_in.cols(); ++r) { // Do local response normalization for data_in(:, r) // first, compute the square and store them in buffer for repeated use - padded_square.block(range, 0, data_in.rows(), 1) = - data_in.col(r).cwiseProduct(data_in.col(r)) * alpha; + padded_square.block(op_params.range, 0, data_in.rows(), 1) = + data_in.col(r).cwiseProduct(data_in.col(r)) * op_params.alpha; // Then, compute the scale and writes them to data_out float accumulated_scale = 0; for (int i = 0; i < double_range; ++i) { @@ -4107,21 +4086,37 @@ inline void LocalResponseNormalization(const float* input_data, } for (int i = 0; i < data_in.rows(); ++i) { accumulated_scale += padded_square(i + double_range); - data_out(i, r) = bias + accumulated_scale; + data_out(i, r) = op_params.bias + accumulated_scale; accumulated_scale -= padded_square(i); } } // In a few cases, the pow computation could benefit from speedups. - if (beta == 1) { + if (op_params.beta == 1) { data_out.array() = data_in.array() * data_out.array().inverse(); - } else if (beta == 0.5) { + } else if (op_params.beta == 0.5) { data_out.array() = data_in.array() * data_out.array().sqrt().inverse(); } else { - data_out.array() = data_in.array() * data_out.array().pow(-beta); + data_out.array() = data_in.array() * data_out.array().pow(-op_params.beta); } } +// Legacy Dims<4>. 
+inline void LocalResponseNormalization(const float* input_data, + const Dims<4>& input_dims, int range, + float bias, float alpha, float beta, + float* output_data, + const Dims<4>& output_dims) { + tflite::LocalResponseNormalizationParams op_params; + op_params.range = range; + op_params.bias = bias; + op_params.alpha = alpha; + op_params.beta = beta; + + LocalResponseNormalization(op_params, DimsToShape(input_dims), input_data, + DimsToShape(output_dims), output_data); +} + inline void Softmax(const float* input_data, const RuntimeShape& input_shape, float beta, float* output_data, const RuntimeShape& output_shape) { @@ -5055,14 +5050,22 @@ inline void Tanh(const int16* input_data, const RuntimeShape& input_shape, } template -inline void Cast(const SrcT* input_data, const Dims<4>& input_dims, - DstT* output_data, const Dims<4>& output_dims) { +inline void Cast(const RuntimeShape& input_shape, const SrcT* input_data, + const RuntimeShape& output_shape, DstT* output_data) { gemmlowp::ScopedProfilingLabel label("Cast"); - auto input_map = MapAsVector(input_data, input_dims); - auto output_map = MapAsVector(output_data, output_dims); + auto input_map = MapAsVector(input_data, input_shape); + auto output_map = MapAsVector(output_data, output_shape); output_map.array() = input_map.array().template cast(); } +// Legacy Dims<4> version. 
+template +void Cast(const SrcT* input_data, const Dims<4>& input_dims, DstT* output_data, + const Dims<4>& output_dims) { + Cast(DimsToShape(input_dims), input_data, DimsToShape(output_dims), + output_data); +} + inline void Floor(const RuntimeShape& input_shape, const float* input_data, const RuntimeShape& output_shape, float* output_data) { gemmlowp::ScopedProfilingLabel label("Floor"); diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h index b47a7f3b45..9c957a3936 100644 --- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h +++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h @@ -849,49 +849,6 @@ void FullyConnected(const uint8* input_data, const Dims<4>& input_dims, output_activation_max, output_data, output_dims, gemm_context); } -template -void NonGlobalBatchNormalization( - const float* input_data, const Dims<4>& input_dims, const float* mean_data, - const Dims<4>& mean_dims, const float* multiplier_data, - const Dims<4>& multiplier_dims, const float* offset_data, - const Dims<4>& offset_dims, float* output_data, - const Dims<4>& output_dims) { - const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int inner_size = MatchingFlatSizeSkipDim( - input_dims, 3, mean_dims, multiplier_dims, offset_dims, output_dims); - - for (int b = 0; b < batches; ++b) { - for (int i = 0; i < inner_size; ++i) { - output_data[b * inner_size + i] = ActivationFunction( - (input_data[b * inner_size + i] - mean_data[i]) * multiplier_data[i] + - offset_data[i]); - } - } -} - -template -void GlobalBatchNormalization(const float* input_data, - const Dims<4>& input_dims, const float* mean_data, - const Dims<4>& mean_dims, - const float* multiplier_data, - const Dims<4>& multiplier_dims, - const float* offset_data, - const Dims<4>& offset_dims, float* output_data, - const Dims<4>& output_dims) { - const int outer_size = 
MatchingFlatSizeSkipDim(input_dims, 0, output_dims); - const int depth = - MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0, - offset_dims, 0, output_dims, 0); - - for (int i = 0; i < outer_size; ++i) { - for (int c = 0; c < depth; ++c) { - output_data[depth * i + c] = ActivationFunction( - (input_data[depth * i + c] - mean_data[c]) * multiplier_data[c] + - offset_data[c]); - } - } -} - inline void Relu(const RuntimeShape& input_shape, const float* input_data, const RuntimeShape& output_shape, float* output_data) { const int flat_size = MatchingFlatSize(input_shape, output_shape); @@ -955,10 +912,11 @@ inline void ReluX(uint8 min_value, uint8 max_value, const uint8* input_data, ReluX(params, input_shape, input_data, output_shape, output_data); } -template -void L2Normalization(const float* input_data, const RuntimeShape& input_shape, - float* output_data, const RuntimeShape& output_shape) { - static_assert(Ac == FusedActivationFunctionType::kNone, ""); +inline void L2Normalization(const tflite::L2NormalizationParams& op_params, + const RuntimeShape& input_shape, + const float* input_data, + const RuntimeShape& output_shape, + float* output_data) { const int trailing_dim = input_shape.DimensionsCount() - 1; const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); @@ -977,6 +935,18 @@ void L2Normalization(const float* input_data, const RuntimeShape& input_shape, } } +// Legacy . +template +void L2Normalization(const float* input_data, const RuntimeShape& input_shape, + float* output_data, const RuntimeShape& output_shape) { + static_assert(Ac == FusedActivationFunctionType::kNone, ""); + tflite::L2NormalizationParams op_params; + // No params need to be set for float. 
+ + L2Normalization(op_params, input_shape, input_data, output_shape, + output_data); +} + inline void GetInvSqrtQuantizedMultiplierExp(int32 input, int32* output_inv_sqrt, int* output_shift) { @@ -1025,15 +995,17 @@ inline void GetInvSqrtQuantizedMultiplierExp(int32 input, *output_shift *= kReverseShift; } -inline void L2Normalization(const uint8* input_data, +inline void L2Normalization(const tflite::L2NormalizationParams& op_params, const RuntimeShape& input_shape, - int32 input_zero_point, uint8* output_data, - const RuntimeShape& output_shape) { + const uint8* input_data, + const RuntimeShape& output_shape, + uint8* output_data) { const int trailing_dim = input_shape.DimensionsCount() - 1; const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); + const int32 input_zero_point = op_params.input_zero_point; for (int i = 0; i < outer_size; ++i) { int32 square_l2_norm = 0; for (int c = 0; c < depth; c++) { @@ -1056,6 +1028,18 @@ inline void L2Normalization(const uint8* input_data, } } +// Legacy. 
+inline void L2Normalization(const uint8* input_data, + const RuntimeShape& input_shape, + int32 input_zero_point, uint8* output_data, + const RuntimeShape& output_shape) { + tflite::L2NormalizationParams op_params; + op_params.input_zero_point = input_zero_point; + + L2Normalization(op_params, input_shape, input_data, output_shape, + output_data); +} + template inline void Add(const ArithmeticParams& params, const RuntimeShape& input1_shape, const T* input1_data, @@ -2836,29 +2820,48 @@ inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape, } } -inline void LocalResponseNormalization(const float* input_data, - const Dims<4>& input_dims, int range, - float bias, float alpha, float beta, - float* output_data, - const Dims<4>& output_dims) { - const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims); - const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); +inline void LocalResponseNormalization( + const tflite::LocalResponseNormalizationParams& op_params, + const RuntimeShape& input_shape, const float* input_data, + const RuntimeShape& output_shape, float* output_data) { + const int trailing_dim = input_shape.DimensionsCount() - 1; + const int outer_size = + MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); + const int depth = + MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); for (int i = 0; i < outer_size; ++i) { for (int c = 0; c < depth; ++c) { - const int begin_input_c = std::max(0, c - range); - const int end_input_c = std::min(depth, c + range); + const int begin_input_c = std::max(0, c - op_params.range); + const int end_input_c = std::min(depth, c + op_params.range); float accum = 0.f; for (int input_c = begin_input_c; input_c < end_input_c; ++input_c) { const float input_val = input_data[i * depth + input_c]; accum += input_val * input_val; } - const float multiplier = std::pow(bias + alpha * accum, -beta); + const float multiplier = + std::pow(op_params.bias + 
op_params.alpha * accum, -op_params.beta); output_data[i * depth + c] = input_data[i * depth + c] * multiplier; } } } +// Legacy Dims<4>. +inline void LocalResponseNormalization(const float* input_data, + const Dims<4>& input_dims, int range, + float bias, float alpha, float beta, + float* output_data, + const Dims<4>& output_dims) { + tflite::LocalResponseNormalizationParams op_params; + op_params.range = range; + op_params.bias = bias; + op_params.alpha = alpha; + op_params.beta = beta; + + LocalResponseNormalization(op_params, DimsToShape(input_dims), input_data, + DimsToShape(output_dims), output_data); +} + inline void Softmax(const float* input_data, const RuntimeShape& input_shape, float beta, float* output_data, const RuntimeShape& output_shape) { @@ -3388,9 +3391,9 @@ inline void FakeQuant(const float* input_data, const Dims<4>& input_dims, } template -inline void Cast(const SrcT* input_data, const Dims<4>& input_dims, - DstT* output_data, const Dims<4>& output_dims) { - const int flat_size = MatchingFlatSize(output_dims, input_dims); +inline void Cast(const RuntimeShape& input_shape, const SrcT* input_data, + const RuntimeShape& output_shape, DstT* output_data) { + const int flat_size = MatchingFlatSize(input_shape, output_shape); for (int i = 0; i < flat_size; i++) { int offset = i; @@ -3398,6 +3401,14 @@ inline void Cast(const SrcT* input_data, const Dims<4>& input_dims, } } +// Legacy Dims<4> version. +template +void Cast(const SrcT* input_data, const Dims<4>& input_dims, DstT* output_data, + const Dims<4>& output_dims) { + Cast(DimsToShape(input_dims), input_data, DimsToShape(output_dims), + output_data); +} + inline void Floor(const RuntimeShape& input_shape, const float* input_data, const RuntimeShape& output_shape, float* output_data) { const int flat_size = MatchingFlatSize(input_shape, output_shape); -- GitLab From f8f5bbe006bc98e98fd939898ceff08dbaace34f Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Tue, 28 Aug 2018 09:07:14 -0700 Subject: [PATCH 213/598] Removed ToString method from tensorflow::StringPiece. This will make it easier to replace tensorflow::StringPiece with absl::string_view, as absl::string_view does not contain a ToString method. PiperOrigin-RevId: 210550029 --- .../core/grappler/optimizers/data/filter_fusion_test.cc | 2 +- tensorflow/core/lib/core/stringpiece.h | 4 ---- tensorflow/core/platform/s3/s3_file_system.cc | 4 ++-- tensorflow/core/platform/windows/windows_file_system.h | 2 +- tensorflow/stream_executor/dso_loader.cc | 8 ++++---- 5 files changed, 8 insertions(+), 12 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/data/filter_fusion_test.cc b/tensorflow/core/grappler/optimizers/data/filter_fusion_test.cc index 5a289e60d0..12b1924efd 100644 --- a/tensorflow/core/grappler/optimizers/data/filter_fusion_test.cc +++ b/tensorflow/core/grappler/optimizers/data/filter_fusion_test.cc @@ -30,7 +30,7 @@ namespace { NodeDef MakeFilterNode(StringPiece name, StringPiece input_node_name) { return test::function::NDef( - name, "FilterDataset", {input_node_name.ToString()}, + name, "FilterDataset", {string(input_node_name)}, {{"predicate", FunctionDefHelper::FunctionRef("IsZero")}, {"Targuments", {}}, {"output_shapes", {}}, diff --git a/tensorflow/core/lib/core/stringpiece.h b/tensorflow/core/lib/core/stringpiece.h index be659e5f8e..02dded42c1 100644 --- a/tensorflow/core/lib/core/stringpiece.h +++ b/tensorflow/core/lib/core/stringpiece.h @@ -92,10 +92,6 @@ class StringPiece { StringPiece substr(size_t pos, size_t n = npos) const; - // Return a string that contains the copy of the referenced data. - // DEPRECATED: use std::string(sv) instead. - std::string ToString() const { return std::string(data_, size_); } - // Three-way comparison. 
Returns value: // < 0 iff "*this" < "b", // == 0 iff "*this" == "b", diff --git a/tensorflow/core/platform/s3/s3_file_system.cc b/tensorflow/core/platform/s3/s3_file_system.cc index 462113f9bb..ce0f6cd741 100644 --- a/tensorflow/core/platform/s3/s3_file_system.cc +++ b/tensorflow/core/platform/s3/s3_file_system.cc @@ -150,13 +150,13 @@ Status ParseS3Path(const string& fname, bool empty_object_ok, string* bucket, return errors::InvalidArgument("S3 path doesn't start with 's3://': ", fname); } - *bucket = bucketp.ToString(); + *bucket = string(bucketp); if (bucket->empty() || *bucket == ".") { return errors::InvalidArgument("S3 path doesn't contain a bucket name: ", fname); } str_util::ConsumePrefix(&objectp, "/"); - *object = objectp.ToString(); + *object = string(objectp); if (!empty_object_ok && object->empty()) { return errors::InvalidArgument("S3 path doesn't contain an object name: ", fname); diff --git a/tensorflow/core/platform/windows/windows_file_system.h b/tensorflow/core/platform/windows/windows_file_system.h index 6b04720c68..1f4c535f24 100644 --- a/tensorflow/core/platform/windows/windows_file_system.h +++ b/tensorflow/core/platform/windows/windows_file_system.h @@ -71,7 +71,7 @@ class LocalWinFileSystem : public WindowsFileSystem { string TranslateName(const string& name) const override { StringPiece scheme, host, path; io::ParseURI(name, &scheme, &host, &path); - return path.ToString(); + return string(path); } }; diff --git a/tensorflow/stream_executor/dso_loader.cc b/tensorflow/stream_executor/dso_loader.cc index 114143b3ab..ea5dffd15e 100644 --- a/tensorflow/stream_executor/dso_loader.cc +++ b/tensorflow/stream_executor/dso_loader.cc @@ -121,7 +121,7 @@ static mutex& GetRpathMutex() { /* static */ void DsoLoader::RegisterRpath(port::StringPiece path) { mutex_lock lock{GetRpathMutex()}; - GetRpaths()->push_back(path.ToString()); + GetRpaths()->emplace_back(path); } /* static */ port::Status DsoLoader::GetDsoHandle(port::StringPiece path, @@ -131,7 
+131,7 @@ static mutex& GetRpathMutex() { return port::Status(port::error::INVALID_ARGUMENT, "Only LoadKind::kLocal is currently supported"); } - string path_string = path.ToString(); + string path_string(path); port::Status s = port::Env::Default()->LoadLibrary(path_string.c_str(), dso_handle); if (!s.ok()) { @@ -154,7 +154,7 @@ static mutex& GetRpathMutex() { /* static */ string DsoLoader::GetBinaryDirectory(bool strip_executable_name) { string exe_path = port::Env::Default()->GetExecutablePath(); - return strip_executable_name ? port::Dirname(exe_path).ToString() : exe_path; + return strip_executable_name ? string(port::Dirname(exe_path)) : exe_path; } // Creates a heap-allocated vector for initial rpaths. @@ -212,7 +212,7 @@ static std::vector* CreatePrimordialRpaths() { } attempted.push_back(candidate); - return library_name.ToString(); + return string(library_name); } /* static */ string DsoLoader::GetCudaLibraryDirPath() { -- GitLab From 8987d1cfd3c17eab4e28da376fdc718f53d82e19 Mon Sep 17 00:00:00 2001 From: Misha Brukman Date: Tue, 28 Aug 2018 09:09:03 -0700 Subject: [PATCH 214/598] Provide an alternative method to find gRPC `roots.pem` file, using an environment variable, to avoid having to copy a particular file to `/usr/share`. PiperOrigin-RevId: 210550389 --- tensorflow/contrib/bigtable/README.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tensorflow/contrib/bigtable/README.md b/tensorflow/contrib/bigtable/README.md index b9abfa8295..f33eaf7e3d 100644 --- a/tensorflow/contrib/bigtable/README.md +++ b/tensorflow/contrib/bigtable/README.md @@ -324,8 +324,14 @@ If you encounter a log line that includes the following: "filename":"/usr/share/grpc/roots.pem" ``` -you likely need to copy the [gRPC `roots.pem` file][grpcPem] to -`/usr/share/grpc/roots.pem` on your local machine. 
+you can solve it via either of the following approaches: + +* copy the [gRPC `roots.pem` file][grpcPem] to + `/usr/share/grpc/roots.pem` on your local machine, which is the default + location where gRPC will look for this file +* export the environment variable `GRPC_DEFAULT_SSL_ROOTS_FILE_PATH` to point to + the full path of the gRPC `roots.pem` file on your file system if it's in a + different location [grpcPem]: https://github.com/grpc/grpc/blob/master/etc/roots.pem -- GitLab From 8e5c118ce835e0b8625ef073e2f4d978c70498ae Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 28 Aug 2018 09:10:49 -0700 Subject: [PATCH 215/598] While loop dispatch depends only on whether variables directly referenced in the condition are tensors. This fixes a bug where a variable in an inner loop could be referenced before creation. These variables would be used in the AG while_stmt to determine whether to dispatch to tf.while_loop or run the Python loop. PiperOrigin-RevId: 210550604 --- .../autograph/converters/control_flow.py | 3 ++- .../autograph/converters/control_flow_test.py | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/tensorflow/contrib/autograph/converters/control_flow.py b/tensorflow/contrib/autograph/converters/control_flow.py index 8d314250a0..3530fbb2ec 100644 --- a/tensorflow/contrib/autograph/converters/control_flow.py +++ b/tensorflow/contrib/autograph/converters/control_flow.py @@ -217,7 +217,7 @@ class ControlFlowTransformer(converter.Base): cond_scope = anno.getanno(node, annos.NodeAnno.COND_SCOPE) cond_closure = set() - for s in cond_scope.referenced: + for s in cond_scope.used: for root in s.support_set: if root not in body_scope.created: cond_closure.add(root) @@ -250,6 +250,7 @@ class ControlFlowTransformer(converter.Base): node_body = ast_util.rename_symbols(node.body, ssf_map) test = ast_util.rename_symbols(node.test, ssf_map) + # TODO(b/113118541) investigate the need-for and correctness-of extra_deps 
template = """ def test_name(state_ssf): return test diff --git a/tensorflow/contrib/autograph/converters/control_flow_test.py b/tensorflow/contrib/autograph/converters/control_flow_test.py index 2a6f3cb395..1d04ba3ba6 100644 --- a/tensorflow/contrib/autograph/converters/control_flow_test.py +++ b/tensorflow/contrib/autograph/converters/control_flow_test.py @@ -48,6 +48,24 @@ class ControlFlowTest(converter_testing.TestCase): self.assertTransformedResult(test_fn, constant_op.constant(5), (10, 5, 5)) + def test_while_nested(self): + + def test_fn(n): + i = 0 + j = 0 + s = 0 + while i < n: + while j < i: + j += 3 + u = i + j # 'u' is not defined within the inner loop + s += u + i += 1 + j = 0 + return s, i, j, n + + self.assertTransformedResult(test_fn, constant_op.constant(5), + (25, 5, 0, 5)) + def test_while_single_output(self): def test_fn(n): -- GitLab From 54ff806b9512ecfecfa9990abf235a459fd720b5 Mon Sep 17 00:00:00 2001 From: Peter Buchlovsky Date: Tue, 28 Aug 2018 09:11:34 -0700 Subject: [PATCH 216/598] Minor documentation corrections. PiperOrigin-RevId: 210550697 --- tensorflow/contrib/distribute/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/contrib/distribute/README.md b/tensorflow/contrib/distribute/README.md index 2f5dd10550..ba92ea0b12 100644 --- a/tensorflow/contrib/distribute/README.md +++ b/tensorflow/contrib/distribute/README.md @@ -1,6 +1,6 @@ # Distribution Strategy -> *NOTE*: This is a experimental feature. The API and performance +> *NOTE*: This is an experimental feature. The API and performance > characteristics are subject to change. ## Overview @@ -9,7 +9,7 @@ API is an easy way to distribute your training across multiple devices/machines. Our goal is to allow users to use existing models and training code with minimal changes to enable distributed training. 
-Moreover, we've design the API in such a way that it works with both eager and +Moreover, we've designed the API in such a way that it works with both eager and graph execution. Currently we support one type of strategy, called -- GitLab From 00045099ee05f85f05c8367a122bcd9ef6fc6b07 Mon Sep 17 00:00:00 2001 From: Karmel Allison Date: Tue, 28 Aug 2018 10:06:50 -0700 Subject: [PATCH 217/598] Add estimator.export_saved_model without removing export_savedmodel in order to stage TF 2.0 removal of export_savedmodel. The new export_saved_model will not have the option to strip_default_attrs; the new behavior is that default attributes are always stripped. PiperOrigin-RevId: 210559617 --- tensorflow/python/estimator/estimator.py | 62 ++++++++++++++----- ...rflow.estimator.-baseline-classifier.pbtxt | 4 ++ ...orflow.estimator.-baseline-regressor.pbtxt | 4 ++ ....estimator.-boosted-trees-classifier.pbtxt | 4 ++ ...w.estimator.-boosted-trees-regressor.pbtxt | 4 ++ ...nsorflow.estimator.-d-n-n-classifier.pbtxt | 4 ++ ...or.-d-n-n-linear-combined-classifier.pbtxt | 4 ++ ...tor.-d-n-n-linear-combined-regressor.pbtxt | 4 ++ ...ensorflow.estimator.-d-n-n-regressor.pbtxt | 4 ++ .../v1/tensorflow.estimator.-estimator.pbtxt | 4 ++ ...sorflow.estimator.-linear-classifier.pbtxt | 4 ++ ...nsorflow.estimator.-linear-regressor.pbtxt | 4 ++ ...rflow.estimator.-baseline-classifier.pbtxt | 4 ++ ...orflow.estimator.-baseline-regressor.pbtxt | 4 ++ ....estimator.-boosted-trees-classifier.pbtxt | 4 ++ ...w.estimator.-boosted-trees-regressor.pbtxt | 4 ++ ...nsorflow.estimator.-d-n-n-classifier.pbtxt | 4 ++ ...or.-d-n-n-linear-combined-classifier.pbtxt | 4 ++ ...tor.-d-n-n-linear-combined-regressor.pbtxt | 4 ++ ...ensorflow.estimator.-d-n-n-regressor.pbtxt | 4 ++ .../v2/tensorflow.estimator.-estimator.pbtxt | 4 ++ ...sorflow.estimator.-linear-classifier.pbtxt | 4 ++ ...nsorflow.estimator.-linear-regressor.pbtxt | 4 ++ 23 files changed, 135 insertions(+), 15 deletions(-) diff --git 
a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py index f55ca93c0d..97a02bd1e8 100644 --- a/tensorflow/python/estimator/estimator.py +++ b/tensorflow/python/estimator/estimator.py @@ -120,7 +120,9 @@ class Estimator(object): warm_start_from=None): """Constructs an `Estimator` instance. - See [estimators](https://tensorflow.org/guide/estimators) for more information. + See [estimators](https://tensorflow.org/guide/estimators) for more + information. + To warm-start an `Estimator`: ```python @@ -286,8 +288,8 @@ class Estimator(object): Args: input_fn: A function that provides input data for training as minibatches. - See [Premade - Estimators](https://tensorflow.org/guide/premade_estimators#create_input_functions) + See [Premade Estimators]( + https://tensorflow.org/guide/premade_estimators#create_input_functions) for more information. The function should construct and return one of the following: * A `tf.data.Dataset` object: Outputs of `Dataset` object must be a tuple @@ -405,7 +407,8 @@ class Estimator(object): Args: input_fn: A function that constructs the input data for evaluation. See - [Premade Estimators](https://tensorflow.org/guide/premade#create_input_functions} + [Premade Estimators]( + https://tensorflow.org/guide/premade#create_input_functions) for more information. The function should construct and return one of the following: * A `tf.data.Dataset` object: Outputs of `Dataset` object must be a tuple @@ -492,8 +495,8 @@ class Estimator(object): input_fn: A function that constructs the features. Prediction continues until `input_fn` raises an end-of-input exception (`tf.errors.OutOfRangeError` or `StopIteration`). - See [Premade - Estimators](https://tensorflow.org/guide/premade_estimators#create_input_functions) + See [Premade Estimators]( + https://tensorflow.org/guide/premade_estimators#create_input_functions) for more information. 
The function should construct and return one of the following: @@ -606,6 +609,38 @@ class Estimator(object): as_text=False, checkpoint_path=None, strip_default_attrs=False): + # pylint: disable=line-too-long,g-doc-args,g-doc-return-or-yield + """Exports inference graph as a `SavedModel` into the given dir. + + Note that `export_to_savedmodel` will be renamed to `export_to_saved_model` + in TensorFlow 2.0. At that time, `export_to_savedmodel` without the + additional underscore will be available only through tf.compat.v1. + + Please see `tf.estimator.Estimator.export_saved_model` for more information. + + There is one additional arg versus the new method: + strip_default_attrs: This parameter is going away in TF 2.0, and + the new behavior will automatically strip all default attributes. + Boolean. If `True`, default-valued attributes will be + removed from the `NodeDef`s. For a detailed guide, see [Stripping + Default-Valued Attributes]( + https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes). + """ + # pylint: enable=line-too-long,g-doc-args,g-doc-return-or-yield + return self._export_saved_model_for_mode( + export_dir_base, + serving_input_receiver_fn, + assets_extra=assets_extra, + as_text=as_text, + checkpoint_path=checkpoint_path, + strip_default_attrs=strip_default_attrs, + mode=model_fn_lib.ModeKeys.PREDICT) + + def export_saved_model( + self, export_dir_base, serving_input_receiver_fn, + assets_extra=None, + as_text=False, + checkpoint_path=None): # pylint: disable=line-too-long """Exports inference graph as a `SavedModel` into the given dir. @@ -652,28 +687,25 @@ class Estimator(object): as_text: whether to write the `SavedModel` proto in text format. checkpoint_path: The checkpoint path to export. If `None` (the default), the most recent checkpoint found within the model directory is chosen. - strip_default_attrs: Boolean. 
If `True`, default-valued attributes will be - removed from the `NodeDef`s. For a detailed guide, see [Stripping - Default-Valued - Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes). Returns: The string path to the exported directory. Raises: ValueError: if no `serving_input_receiver_fn` is provided, no - `export_outputs` - are provided, or no checkpoint can be found. + `export_outputs` are provided, or no checkpoint can be found. """ # pylint: enable=line-too-long - return self._export_saved_model_for_mode( + # TODO(b/111442174): `export_to_savedmodel` will be renamed to + # `export_to_saved_model` in TensorFlow 2.0. This function is a wrapper + # while staging the new version; do not add any logic here. + return self.export_savedmodel( export_dir_base, serving_input_receiver_fn, assets_extra=assets_extra, as_text=as_text, checkpoint_path=checkpoint_path, - strip_default_attrs=strip_default_attrs, - mode=model_fn_lib.ModeKeys.PREDICT) + strip_default_attrs=True) def _export_saved_model_for_mode( self, export_dir_base, input_receiver_fn, diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-classifier.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-classifier.pbtxt index cf22e39d4c..082e26b99b 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-classifier.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-classifier.pbtxt @@ -31,6 +31,10 @@ tf_class { name: "evaluate" argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " } + member_method { + name: "export_saved_model" + argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', 
\'None\'], " + } member_method { name: "export_savedmodel" argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], " diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-regressor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-regressor.pbtxt index a363bceae3..7cc4191eb3 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-regressor.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-regressor.pbtxt @@ -31,6 +31,10 @@ tf_class { name: "evaluate" argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " } + member_method { + name: "export_saved_model" + argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], " + } member_method { name: "export_savedmodel" argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], " diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-classifier.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-classifier.pbtxt index c23b04b4ef..7027e78df4 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-classifier.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-classifier.pbtxt @@ -31,6 +31,10 @@ tf_class { name: "evaluate" argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, 
keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " } + member_method { + name: "export_saved_model" + argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], " + } member_method { name: "export_savedmodel" argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], " diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-regressor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-regressor.pbtxt index 6878d28fff..d8167ea7cb 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-regressor.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-regressor.pbtxt @@ -31,6 +31,10 @@ tf_class { name: "evaluate" argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " } + member_method { + name: "export_saved_model" + argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], " + } member_method { name: "export_savedmodel" argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], " diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-classifier.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-classifier.pbtxt index 0c6b7e4a82..718f415a77 100644 --- 
a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-classifier.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-classifier.pbtxt @@ -31,6 +31,10 @@ tf_class { name: "evaluate" argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " } + member_method { + name: "export_saved_model" + argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], " + } member_method { name: "export_savedmodel" argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], " diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt index 9c1c072124..b23c019d6c 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt @@ -31,6 +31,10 @@ tf_class { name: "evaluate" argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " } + member_method { + name: "export_saved_model" + argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], " + } member_method { name: "export_savedmodel" argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', 
\'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], " diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt index 7391d4b07a..caa9e3f1de 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt @@ -31,6 +31,10 @@ tf_class { name: "evaluate" argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " } + member_method { + name: "export_saved_model" + argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], " + } member_method { name: "export_savedmodel" argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], " diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-regressor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-regressor.pbtxt index f50e375f7c..1f5e650940 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-regressor.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-regressor.pbtxt @@ -31,6 +31,10 @@ tf_class { name: "evaluate" argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " } + member_method { + name: "export_saved_model" + argspec: "args=[\'self\', 
\'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], " + } member_method { name: "export_savedmodel" argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], " diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-estimator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-estimator.pbtxt index d72b576977..ebd3869c9b 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-estimator.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-estimator.pbtxt @@ -30,6 +30,10 @@ tf_class { name: "evaluate" argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " } + member_method { + name: "export_saved_model" + argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], " + } member_method { name: "export_savedmodel" argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], " diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-classifier.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-classifier.pbtxt index 154f171e89..53ec5a0c78 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-classifier.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-classifier.pbtxt @@ -31,6 +31,10 @@ tf_class { name: "evaluate" argspec: 
"args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " } + member_method { + name: "export_saved_model" + argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], " + } member_method { name: "export_savedmodel" argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], " diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-regressor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-regressor.pbtxt index 4d46d1e6b6..3791162619 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-regressor.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-regressor.pbtxt @@ -31,6 +31,10 @@ tf_class { name: "evaluate" argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " } + member_method { + name: "export_saved_model" + argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], " + } member_method { name: "export_savedmodel" argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], " diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-classifier.pbtxt 
b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-classifier.pbtxt index cf22e39d4c..082e26b99b 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-classifier.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-classifier.pbtxt @@ -31,6 +31,10 @@ tf_class { name: "evaluate" argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " } + member_method { + name: "export_saved_model" + argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], " + } member_method { name: "export_savedmodel" argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], " diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-regressor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-regressor.pbtxt index a363bceae3..7cc4191eb3 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-regressor.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-regressor.pbtxt @@ -31,6 +31,10 @@ tf_class { name: "evaluate" argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " } + member_method { + name: "export_saved_model" + argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], " + } member_method { name: "export_savedmodel" argspec: "args=[\'self\', \'export_dir_base\', 
\'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], " diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-classifier.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-classifier.pbtxt index c23b04b4ef..7027e78df4 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-classifier.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-classifier.pbtxt @@ -31,6 +31,10 @@ tf_class { name: "evaluate" argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " } + member_method { + name: "export_saved_model" + argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], " + } member_method { name: "export_savedmodel" argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], " diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-regressor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-regressor.pbtxt index 6878d28fff..d8167ea7cb 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-regressor.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-regressor.pbtxt @@ -31,6 +31,10 @@ tf_class { name: "evaluate" argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " } + member_method { + 
name: "export_saved_model" + argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], " + } member_method { name: "export_savedmodel" argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], " diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-classifier.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-classifier.pbtxt index 0c6b7e4a82..718f415a77 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-classifier.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-classifier.pbtxt @@ -31,6 +31,10 @@ tf_class { name: "evaluate" argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " } + member_method { + name: "export_saved_model" + argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], " + } member_method { name: "export_savedmodel" argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], " diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt index 9c1c072124..b23c019d6c 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt +++ 
b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt @@ -31,6 +31,10 @@ tf_class { name: "evaluate" argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " } + member_method { + name: "export_saved_model" + argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], " + } member_method { name: "export_savedmodel" argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], " diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt index 7391d4b07a..caa9e3f1de 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt @@ -31,6 +31,10 @@ tf_class { name: "evaluate" argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " } + member_method { + name: "export_saved_model" + argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], " + } member_method { name: "export_savedmodel" argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, 
defaults=[\'None\', \'False\', \'None\', \'False\'], " diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-regressor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-regressor.pbtxt index f50e375f7c..1f5e650940 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-regressor.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-regressor.pbtxt @@ -31,6 +31,10 @@ tf_class { name: "evaluate" argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " } + member_method { + name: "export_saved_model" + argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], " + } member_method { name: "export_savedmodel" argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], " diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-estimator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-estimator.pbtxt index d72b576977..ebd3869c9b 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-estimator.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-estimator.pbtxt @@ -30,6 +30,10 @@ tf_class { name: "evaluate" argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " } + member_method { + name: "export_saved_model" + argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], " 
+ } member_method { name: "export_savedmodel" argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], " diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-classifier.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-classifier.pbtxt index 154f171e89..53ec5a0c78 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-classifier.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-classifier.pbtxt @@ -31,6 +31,10 @@ tf_class { name: "evaluate" argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " } + member_method { + name: "export_saved_model" + argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], " + } member_method { name: "export_savedmodel" argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], " diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-regressor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-regressor.pbtxt index 4d46d1e6b6..3791162619 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-regressor.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-regressor.pbtxt @@ -31,6 +31,10 @@ tf_class { name: "evaluate" argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', 
\'None\'], " } + member_method { + name: "export_saved_model" + argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], " + } member_method { name: "export_savedmodel" argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], " -- GitLab From b7f2d11cc308631a8f0b733a1b2db39696507155 Mon Sep 17 00:00:00 2001 From: Jiri Simsa Date: Tue, 28 Aug 2018 10:07:45 -0700 Subject: [PATCH 218/598] [tf.data] Enable optimizations for input pipelines with stateful functions. PiperOrigin-RevId: 210559796 --- .../optimization/map_vectorization_test.py | 19 ++++++++-------- .../kernel_tests/optimize_dataset_op_test.py | 13 +++++++++++ .../data/python/kernel_tests/test_utils.py | 18 +++++++++++---- tensorflow/core/framework/dataset.cc | 21 ++++++++++-------- tensorflow/core/framework/dataset.h | 22 +++++++++++-------- .../core/kernels/data/filter_dataset_op.cc | 2 +- .../core/kernels/data/flat_map_dataset_op.cc | 2 +- .../data/group_by_reducer_dataset_op.cc | 9 ++++---- .../data/group_by_window_dataset_op.cc | 7 +++--- .../kernels/data/interleave_dataset_op.cc | 2 +- .../kernels/data/map_and_batch_dataset_op.cc | 2 +- .../core/kernels/data/map_dataset_op.cc | 2 +- .../core/kernels/data/optimize_dataset_op.cc | 1 + .../data/parallel_interleave_dataset_op.cc | 3 +-- .../kernels/data/parallel_map_dataset_op.cc | 2 +- .../core/kernels/data/scan_dataset_op.cc | 2 +- 16 files changed, 77 insertions(+), 50 deletions(-) diff --git a/tensorflow/contrib/data/python/kernel_tests/optimization/map_vectorization_test.py b/tensorflow/contrib/data/python/kernel_tests/optimization/map_vectorization_test.py index 57bf22591a..e2c9bc82df 100644 --- 
a/tensorflow/contrib/data/python/kernel_tests/optimization/map_vectorization_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/optimization/map_vectorization_test.py @@ -122,15 +122,12 @@ class MapVectorizationTest(test_utils.DatasetTestBase, parameterized.TestCase): base_dataset = dataset_ops.Dataset.from_tensor_slices([[1, 2], [3, 4]]).repeat(5) - _, optimized = self._get_test_datasets( + unoptimized, optimized = self._get_test_datasets( base_dataset, map_fn, expect_optimized=False) - nxt = optimized.make_one_shot_iterator().get_next() - - # NOTE: Right now, it raises an error because we can't save datasets that - # are stateful, and we rely on this saving mechanism to optimize datasets, - # so stateful functions can't be optimized. - with self.assertRaisesRegexp(errors.InvalidArgumentError, "[Ss]tateful"): - self.evaluate(nxt) + self._assert_datasets_raise_same_error( + unoptimized, optimized, errors.InvalidArgumentError, + [("OneShotIterator", "OneShotIterator_1", 1), + ("IteratorGetNext", "IteratorGetNext_1", 1)]) def testOptimizationIgnoreRagged(self): # Make sure we ignore inputs that might not be uniformly sized @@ -151,8 +148,10 @@ class MapVectorizationTest(test_utils.DatasetTestBase, parameterized.TestCase): base_dataset = dataset_ops.Dataset.range(20).batch(1, drop_remainder=True) unoptimized, optimized = self._get_test_datasets( base_dataset, map_fn, expect_optimized=False) - self._assert_datasets_raise_same_error(unoptimized, optimized, - errors.InvalidArgumentError) + self._assert_datasets_raise_same_error( + unoptimized, optimized, errors.InvalidArgumentError, + [("OneShotIterator", "OneShotIterator_1", 1), + ("IteratorGetNext", "IteratorGetNext_1", 1)]) class MapVectorizationBenchmark(test.Benchmark): diff --git a/tensorflow/contrib/data/python/kernel_tests/optimize_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/optimize_dataset_op_test.py index ec43bc3653..446bf8d749 100644 --- 
a/tensorflow/contrib/data/python/kernel_tests/optimize_dataset_op_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/optimize_dataset_op_test.py @@ -22,6 +22,7 @@ from absl.testing import parameterized from tensorflow.contrib.data.python.ops import optimization from tensorflow.python.data.ops import dataset_ops from tensorflow.python.framework import errors +from tensorflow.python.ops import random_ops from tensorflow.python.platform import test @@ -100,6 +101,18 @@ class OptimizeDatasetTest(test.TestCase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): sess.run(get_next) + def testStatefulFunctionOptimization(self): + dataset = dataset_ops.Dataset.range(10).apply( + optimization.assert_next([ + "MapAndBatch" + ])).map(lambda _: random_ops.random_uniform([])).batch(10).apply( + optimization.optimize(["map_and_batch_fusion"])) + iterator = dataset.make_one_shot_iterator() + get_next = iterator.get_next() + + with self.test_session() as sess: + sess.run(get_next) + if __name__ == "__main__": test.main() diff --git a/tensorflow/contrib/data/python/kernel_tests/test_utils.py b/tensorflow/contrib/data/python/kernel_tests/test_utils.py index 1b962b3418..1d70b16041 100644 --- a/tensorflow/contrib/data/python/kernel_tests/test_utils.py +++ b/tensorflow/contrib/data/python/kernel_tests/test_utils.py @@ -17,6 +17,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import re + from tensorflow.python.data.util import nest from tensorflow.python.framework import errors from tensorflow.python.platform import test @@ -45,7 +47,11 @@ class DatasetTestBase(test.TestCase): for i in range(len(op1)): self.assertAllEqual(op1[i], op2[i]) - def _assert_datasets_raise_same_error(self, dataset1, dataset2, exc_class): + def _assert_datasets_raise_same_error(self, + dataset1, + dataset2, + exception_class, + replacements=None): next1 = dataset1.make_one_shot_iterator().get_next() next2 = 
dataset2.make_one_shot_iterator().get_next() with self.test_session() as sess: @@ -53,8 +59,12 @@ class DatasetTestBase(test.TestCase): sess.run(next1) raise ValueError( "Expected dataset to raise an error of type %s, but it did not." % - repr(exc_class)) - except exc_class as e: + repr(exception_class)) + except exception_class as e: + expected_message = e.message + for old, new, count in replacements: + expected_message = expected_message.replace(old, new, count) # Check that the first segment of the error messages are the same. - with self.assertRaisesRegexp(exc_class, e.message.split(". ")[0]): + with self.assertRaisesRegexp(exception_class, + re.escape(expected_message)): sess.run(next2) diff --git a/tensorflow/core/framework/dataset.cc b/tensorflow/core/framework/dataset.cc index f3c7189292..b0b27ce94f 100644 --- a/tensorflow/core/framework/dataset.cc +++ b/tensorflow/core/framework/dataset.cc @@ -133,22 +133,25 @@ Status GraphDefBuilderWrapper::AddDataset( return Status::OK(); } -Status GraphDefBuilderWrapper::AddFunction( - const FunctionLibraryDefinition& flib_def, const string& function_name) { +Status GraphDefBuilderWrapper::AddFunction(SerializationContext* ctx, + const string& function_name) { if (b_->HasFunction(function_name)) { VLOG(1) << "Function with name " << function_name << "already exists in" << " the graph. 
It will not be added again."; return Status::OK(); } - TF_RETURN_IF_ERROR(EnsureFunctionIsStateless(flib_def, function_name)); - const FunctionDef* f_def = flib_def.Find(function_name); + if (!ctx->allow_stateful_functions()) { + TF_RETURN_IF_ERROR( + EnsureFunctionIsStateless(ctx->flib_def(), function_name)); + } + const FunctionDef* f_def = ctx->flib_def().Find(function_name); if (f_def == nullptr) { return errors::InvalidArgument("Unable to find FunctionDef for ", function_name, " in the registry."); } FunctionDefLibrary def; *def.add_function() = *f_def; - const string gradient_func = flib_def.FindGradient(function_name); + const string gradient_func = ctx->flib_def().FindGradient(function_name); if (!gradient_func.empty()) { GradientDef* g_def = def.add_gradient(); g_def->set_function_name(function_name); @@ -159,19 +162,19 @@ Status GraphDefBuilderWrapper::AddFunction( // Recursively add functions in inputs of function_name. for (const NodeDef& node_def : f_def->node_def()) { const OpRegistrationData* op_reg_data = nullptr; - TF_RETURN_IF_ERROR(flib_def.LookUp(node_def.op(), &op_reg_data)); + TF_RETURN_IF_ERROR(ctx->flib_def().LookUp(node_def.op(), &op_reg_data)); if (op_reg_data->is_function_op) { - TF_RETURN_IF_ERROR(AddFunction(flib_def, op_reg_data->op_def.name())); + TF_RETURN_IF_ERROR(AddFunction(ctx, op_reg_data->op_def.name())); } // Recursively add functions in attrs of this NodeDef. for (const auto& pair : node_def.attr()) { - TF_RETURN_IF_ERROR(AddAttrFunctions(pair.second, flib_def)); + TF_RETURN_IF_ERROR(AddAttrFunctions(ctx, pair.second)); } } // Recursively add functions in attrs of function_name. 
for (auto iter = f_def->attr().begin(); iter != f_def->attr().end(); iter++) { - TF_RETURN_IF_ERROR(AddAttrFunctions(iter->second, flib_def)); + TF_RETURN_IF_ERROR(AddAttrFunctions(ctx, iter->second)); } return Status::OK(); } diff --git a/tensorflow/core/framework/dataset.h b/tensorflow/core/framework/dataset.h index e0c26d9286..e06ca68bca 100644 --- a/tensorflow/core/framework/dataset.h +++ b/tensorflow/core/framework/dataset.h @@ -41,6 +41,7 @@ limitations under the License. namespace tensorflow { class DatasetBase; +class SerializationContext; // Interface for reading values from a key-value store. // Used for restoring iterator state. @@ -155,11 +156,11 @@ class GraphDefBuilderWrapper { // Adds a user-defined function with name `function_name` to the graph and // recursively adds all functions it references. If a function with a matching // name has already been added, returns with OK status. If a user-defined with - // name `function_name` is not found in the FunctionLibraryDefinition, returns - // an InvalidArgumentError. If the function with name `function_name` or any - // of its dependent functions are stateful, returns an InvalidArgument error. - Status AddFunction(const FunctionLibraryDefinition& flib_def, - const string& function_name); + // name `function_name` is not found in the context's function library, + // returns an InvalidArgumentError. If the function with name `function_name` + // or any of its dependent functions are stateful, and the context does not + // explicitly permit stateful functions, returns an InvalidArgument error. 
+ Status AddFunction(SerializationContext* ctx, const string& function_name); template void BuildAttrValue(const T& value, AttrValue* attr) { @@ -220,13 +221,13 @@ class GraphDefBuilderWrapper { return false; } - Status AddAttrFunctions(const AttrValue& attr_value, - const FunctionLibraryDefinition& flib_def) { + Status AddAttrFunctions(SerializationContext* ctx, + const AttrValue& attr_value) { if (attr_value.has_func()) { - TF_RETURN_IF_ERROR(AddFunction(flib_def, attr_value.func().name())); + TF_RETURN_IF_ERROR(AddFunction(ctx, attr_value.func().name())); } else if (attr_value.has_list()) { for (const NameAttrList& name_attr_list : attr_value.list().func()) { - TF_RETURN_IF_ERROR(AddFunction(flib_def, name_attr_list.name())); + TF_RETURN_IF_ERROR(AddFunction(ctx, name_attr_list.name())); } } return Status::OK(); @@ -332,11 +333,14 @@ class IteratorContext { class SerializationContext { public: struct Params { + bool allow_stateful_functions = false; const FunctionLibraryDefinition* flib_def; // Not owned. 
}; explicit SerializationContext(Params params) : params_(std::move(params)) {} + bool allow_stateful_functions() { return params_.allow_stateful_functions; } + const FunctionLibraryDefinition& flib_def() { return *params_.flib_def; } private: diff --git a/tensorflow/core/kernels/data/filter_dataset_op.cc b/tensorflow/core/kernels/data/filter_dataset_op.cc index f5c7d336a6..bbce001eaf 100644 --- a/tensorflow/core/kernels/data/filter_dataset_op.cc +++ b/tensorflow/core/kernels/data/filter_dataset_op.cc @@ -112,7 +112,7 @@ class FilterDatasetOp : public UnaryDatasetOpKernel { Status AsGraphDefInternal(SerializationContext* ctx, DatasetGraphDefBuilder* b, Node** output) const override { - TF_RETURN_IF_ERROR(b->AddFunction(ctx->flib_def(), func_.name())); + TF_RETURN_IF_ERROR(b->AddFunction(ctx, func_.name())); Node* input_graph_node; TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node)); diff --git a/tensorflow/core/kernels/data/flat_map_dataset_op.cc b/tensorflow/core/kernels/data/flat_map_dataset_op.cc index 21e627a8e8..b1eb2fd849 100644 --- a/tensorflow/core/kernels/data/flat_map_dataset_op.cc +++ b/tensorflow/core/kernels/data/flat_map_dataset_op.cc @@ -94,7 +94,7 @@ class FlatMapDatasetOp : public UnaryDatasetOpKernel { Status AsGraphDefInternal(SerializationContext* ctx, DatasetGraphDefBuilder* b, Node** output) const override { - TF_RETURN_IF_ERROR(b->AddFunction(ctx->flib_def(), func_.name())); + TF_RETURN_IF_ERROR(b->AddFunction(ctx, func_.name())); Node* input_graph_node = nullptr; TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node)); diff --git a/tensorflow/core/kernels/data/group_by_reducer_dataset_op.cc b/tensorflow/core/kernels/data/group_by_reducer_dataset_op.cc index 4a388645f2..130f04da3e 100644 --- a/tensorflow/core/kernels/data/group_by_reducer_dataset_op.cc +++ b/tensorflow/core/kernels/data/group_by_reducer_dataset_op.cc @@ -109,11 +109,10 @@ class GroupByReducerDatasetOp : public UnaryDatasetOpKernel { Status 
AsGraphDefInternal(SerializationContext* ctx, DatasetGraphDefBuilder* b, Node** output) const override { - TF_RETURN_IF_ERROR(b->AddFunction(ctx->flib_def(), key_func().name())); - TF_RETURN_IF_ERROR(b->AddFunction(ctx->flib_def(), init_func().name())); - TF_RETURN_IF_ERROR(b->AddFunction(ctx->flib_def(), reduce_func().name())); - TF_RETURN_IF_ERROR( - b->AddFunction(ctx->flib_def(), finalize_func().name())); + TF_RETURN_IF_ERROR(b->AddFunction(ctx, key_func().name())); + TF_RETURN_IF_ERROR(b->AddFunction(ctx, init_func().name())); + TF_RETURN_IF_ERROR(b->AddFunction(ctx, reduce_func().name())); + TF_RETURN_IF_ERROR(b->AddFunction(ctx, finalize_func().name())); Node* input_graph_node = nullptr; TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node)); diff --git a/tensorflow/core/kernels/data/group_by_window_dataset_op.cc b/tensorflow/core/kernels/data/group_by_window_dataset_op.cc index f993a68934..46a3185b49 100644 --- a/tensorflow/core/kernels/data/group_by_window_dataset_op.cc +++ b/tensorflow/core/kernels/data/group_by_window_dataset_op.cc @@ -139,10 +139,9 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel { Status AsGraphDefInternal(SerializationContext* ctx, DatasetGraphDefBuilder* b, Node** output) const override { - TF_RETURN_IF_ERROR(b->AddFunction(ctx->flib_def(), key_func_.name())); - TF_RETURN_IF_ERROR(b->AddFunction(ctx->flib_def(), reduce_func_.name())); - TF_RETURN_IF_ERROR( - b->AddFunction(ctx->flib_def(), window_size_func_.name())); + TF_RETURN_IF_ERROR(b->AddFunction(ctx, key_func_.name())); + TF_RETURN_IF_ERROR(b->AddFunction(ctx, reduce_func_.name())); + TF_RETURN_IF_ERROR(b->AddFunction(ctx, window_size_func_.name())); Node* input_graph_node = nullptr; TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node)); diff --git a/tensorflow/core/kernels/data/interleave_dataset_op.cc b/tensorflow/core/kernels/data/interleave_dataset_op.cc index 6bba667759..716e040277 100644 --- 
a/tensorflow/core/kernels/data/interleave_dataset_op.cc +++ b/tensorflow/core/kernels/data/interleave_dataset_op.cc @@ -116,7 +116,7 @@ class InterleaveDatasetOp : public UnaryDatasetOpKernel { Status AsGraphDefInternal(SerializationContext* ctx, DatasetGraphDefBuilder* b, Node** output) const override { - TF_RETURN_IF_ERROR(b->AddFunction(ctx->flib_def(), func_.name())); + TF_RETURN_IF_ERROR(b->AddFunction(ctx, func_.name())); Node* input_node; TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_node)); Node* cycle_length_node; diff --git a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc index c4df7f2756..8b0c9ad6b2 100644 --- a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc +++ b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc @@ -147,7 +147,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel { Status AsGraphDefInternal(SerializationContext* ctx, DatasetGraphDefBuilder* b, Node** output) const override { - TF_RETURN_IF_ERROR(b->AddFunction(ctx->flib_def(), map_fn_.name())); + TF_RETURN_IF_ERROR(b->AddFunction(ctx, map_fn_.name())); Node* input_graph_node = nullptr; TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node)); Node* batch_size_node; diff --git a/tensorflow/core/kernels/data/map_dataset_op.cc b/tensorflow/core/kernels/data/map_dataset_op.cc index 26ae26a7fd..7f8182d917 100644 --- a/tensorflow/core/kernels/data/map_dataset_op.cc +++ b/tensorflow/core/kernels/data/map_dataset_op.cc @@ -92,7 +92,7 @@ class MapDatasetOp : public UnaryDatasetOpKernel { Status AsGraphDefInternal(SerializationContext* ctx, DatasetGraphDefBuilder* b, Node** output) const override { - TF_RETURN_IF_ERROR(b->AddFunction(ctx->flib_def(), func_.name())); + TF_RETURN_IF_ERROR(b->AddFunction(ctx, func_.name())); Node* input_graph_node = nullptr; TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node)); diff --git 
a/tensorflow/core/kernels/data/optimize_dataset_op.cc b/tensorflow/core/kernels/data/optimize_dataset_op.cc index 9b14078407..831e7252da 100644 --- a/tensorflow/core/kernels/data/optimize_dataset_op.cc +++ b/tensorflow/core/kernels/data/optimize_dataset_op.cc @@ -92,6 +92,7 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel { DatasetGraphDefBuilder db(&b); Node* input_node = nullptr; SerializationContext::Params params; + params.allow_stateful_functions = true; params.flib_def = ctx->function_library()->GetFunctionLibraryDefinition(); SerializationContext serialization_ctx(params); TF_RETURN_IF_ERROR( diff --git a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc index bf86361a71..f6b3fd97e3 100644 --- a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc +++ b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc @@ -137,8 +137,7 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel { Status AsGraphDefInternal(SerializationContext* ctx, DatasetGraphDefBuilder* b, Node** output) const override { - TF_RETURN_IF_ERROR( - b->AddFunction(ctx->flib_def(), interleave_func_.name())); + TF_RETURN_IF_ERROR(b->AddFunction(ctx, interleave_func_.name())); Node* input_node; TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_node)); Node* cycle_length_node; diff --git a/tensorflow/core/kernels/data/parallel_map_dataset_op.cc b/tensorflow/core/kernels/data/parallel_map_dataset_op.cc index e03a4e353b..bff54813d6 100644 --- a/tensorflow/core/kernels/data/parallel_map_dataset_op.cc +++ b/tensorflow/core/kernels/data/parallel_map_dataset_op.cc @@ -142,7 +142,7 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel { b->AddScalar(num_parallel_calls_, &num_parallel_calls)); // Attr: f - TF_RETURN_IF_ERROR(b->AddFunction(ctx->flib_def(), func_.name())); + TF_RETURN_IF_ERROR(b->AddFunction(ctx, func_.name())); AttrValue f; b->BuildAttrValue(func_, &f); diff 
--git a/tensorflow/core/kernels/data/scan_dataset_op.cc b/tensorflow/core/kernels/data/scan_dataset_op.cc index 5d3319b19f..fccad933d0 100644 --- a/tensorflow/core/kernels/data/scan_dataset_op.cc +++ b/tensorflow/core/kernels/data/scan_dataset_op.cc @@ -109,7 +109,7 @@ class ScanDatasetOp : public UnaryDatasetOpKernel { Status AsGraphDefInternal(SerializationContext* ctx, DatasetGraphDefBuilder* b, Node** output) const override { - TF_RETURN_IF_ERROR(b->AddFunction(ctx->flib_def(), func_.name())); + TF_RETURN_IF_ERROR(b->AddFunction(ctx, func_.name())); Node* input_node; TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_node)); std::vector initial_state_nodes; -- GitLab From e5f8fadc6c9f3b6f3513fd5dc1fa05a572e8a2b1 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 28 Aug 2018 10:10:04 -0700 Subject: [PATCH 219/598] Convert more kernel signatures to use runtime shapes. PiperOrigin-RevId: 210560223 --- .../internal/optimized/optimized_ops.h | 214 +++++++++++------- .../internal/reference/reference_ops.h | 210 +++++++++++------ .../contrib/lite/kernels/internal/types.h | 25 +- 3 files changed, 286 insertions(+), 163 deletions(-) diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h index 0cc7a7f2e7..b00097c433 100644 --- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h +++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h @@ -2748,17 +2748,16 @@ inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params, } } -inline void Mul(const float* input1_data, const Dims<4>& input1_dims, - const float* input2_data, const Dims<4>& input2_dims, - float output_activation_min, float output_activation_max, - float* output_data, const Dims<4>& output_dims) { +inline void Mul(const ArithmeticParams& params, + const RuntimeShape& input1_shape, const float* input1_data, + const RuntimeShape& input2_shape, const 
float* input2_data, + const RuntimeShape& output_shape, float* output_data) { gemmlowp::ScopedProfilingLabel label("Mul"); - TFLITE_DCHECK(IsPackedWithoutStrides(input1_dims)); - TFLITE_DCHECK(IsPackedWithoutStrides(input2_dims)); - TFLITE_DCHECK(IsPackedWithoutStrides(output_dims)); + const float output_activation_min = params.float_activation_min; + const float output_activation_max = params.float_activation_max; int i = 0; - const int size = MatchingFlatSize(input1_dims, input2_dims, output_dims); + const int size = MatchingFlatSize(input1_shape, input2_shape, output_shape); #ifdef USE_NEON const auto activation_min = vdupq_n_f32(output_activation_min); const auto activation_max = vdupq_n_f32(output_activation_max); @@ -2809,6 +2808,20 @@ inline void Mul(const float* input1_data, const Dims<4>& input1_dims, } } +// Legacy Dims<4>. +inline void Mul(const float* input1_data, const Dims<4>& input1_dims, + const float* input2_data, const Dims<4>& input2_dims, + float output_activation_min, float output_activation_max, + float* output_data, const Dims<4>& output_dims) { + tflite::ArithmeticParams op_params; + op_params.float_activation_min = output_activation_min; + op_params.float_activation_max = output_activation_max; + + Mul(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, DimsToShape(output_dims), + output_data); +} + // legacy, for compatibility with old checked-in code template void Mul(const float* input1_data, const Dims<4>& input1_dims, @@ -2821,13 +2834,16 @@ void Mul(const float* input1_data, const Dims<4>& input1_dims, output_activation_max, output_data, output_dims); } -inline void Mul(const int32* input1_data, const Dims<4>& input1_dims, - const int32* input2_data, const Dims<4>& input2_dims, - int32 output_activation_min, int32 output_activation_max, - int32* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("Mul/int32"); +inline void Mul(const ArithmeticParams& params, + const 
RuntimeShape& input1_shape, const int32* input1_data, + const RuntimeShape& input2_shape, const int32* input2_data, + const RuntimeShape& output_shape, int32* output_data) { + gemmlowp::ScopedProfilingLabel label("Mul/int32/activation"); - const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims); + const int flat_size = + MatchingFlatSize(input1_shape, input2_shape, output_shape); + const int32 output_activation_min = params.quantized_activation_min; + const int32 output_activation_max = params.quantized_activation_max; for (int i = 0; i < flat_size; ++i) { output_data[i] = ActivationFunctionWithMinMax( input1_data[i] * input2_data[i], output_activation_min, @@ -2835,22 +2851,38 @@ inline void Mul(const int32* input1_data, const Dims<4>& input1_dims, } } -template -void Mul(const int32* input1_data, const Dims<4>& input1_dims, - const int32* input2_data, const Dims<4>& input2_dims, - int32* output_data, const Dims<4>& output_dims) { +// Legacy Dims<4>. +inline void Mul(const int32* input1_data, const Dims<4>& input1_dims, + const int32* input2_data, const Dims<4>& input2_dims, + int32 output_activation_min, int32 output_activation_max, + int32* output_data, const Dims<4>& output_dims) { + tflite::ArithmeticParams op_params; + op_params.quantized_activation_min = output_activation_min; + op_params.quantized_activation_max = output_activation_max; + + Mul(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, DimsToShape(output_dims), + output_data); +} + +inline void MulNoActivation(const ArithmeticParams& params, + const RuntimeShape& input1_shape, + const int32* input1_data, + const RuntimeShape& input2_shape, + const int32* input2_data, + const RuntimeShape& output_shape, + int32* output_data) { gemmlowp::ScopedProfilingLabel label("Mul/int32"); - TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone); - auto input1_map = MapAsVector(input1_data, input1_dims); - auto input2_map = MapAsVector(input2_data, 
input2_dims); - auto output_map = MapAsVector(output_data, output_dims); - if (AreSameDims(input1_dims, input2_dims)) { + auto input1_map = MapAsVector(input1_data, input1_shape); + auto input2_map = MapAsVector(input2_data, input2_shape); + auto output_map = MapAsVector(output_data, output_shape); + if (input1_shape == input2_shape) { output_map.array() = input1_map.array() * input2_map.array(); - } else if (FlatSize(input2_dims) == 1) { + } else if (input2_shape.FlatSize() == 1) { auto scalar = input2_data[0]; output_map.array() = input1_map.array() * scalar; - } else if (FlatSize(input1_dims) == 1) { + } else if (input1_shape.FlatSize() == 1) { auto scalar = input1_data[0]; output_map.array() = scalar * input2_map.array(); } else { @@ -2859,14 +2891,30 @@ void Mul(const int32* input1_data, const Dims<4>& input1_dims, } } -inline void Mul(const int16* input1_data, const Dims<4>& input1_dims, - const int16* input2_data, const Dims<4>& input2_dims, - int16* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("Mul/Int16"); +// Legacy Dims<4>. +template +void Mul(const int32* input1_data, const Dims<4>& input1_dims, + const int32* input2_data, const Dims<4>& input2_dims, + int32* output_data, const Dims<4>& output_dims) { + TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone); + tflite::ArithmeticParams op_params; + // No parameters needed. + + MulNoActivation(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, + DimsToShape(output_dims), output_data); +} + +inline void Mul(const ArithmeticParams& params, + const RuntimeShape& input1_shape, const int16* input1_data, + const RuntimeShape& input2_shape, const int16* input2_data, + const RuntimeShape& output_shape, int16* output_data) { + gemmlowp::ScopedProfilingLabel label("Mul/Int16/NoActivation"); // This is a copy of the reference implementation. We do not currently have a // properly optimized version. 
- const int flat_size = MatchingFlatSize(output_dims, input1_dims, input2_dims); + const int flat_size = + MatchingFlatSize(input1_shape, input2_shape, output_shape); for (int i = 0; i < flat_size; i++) { // F0 uses 0 integer bits, range [-1, 1]. @@ -2878,17 +2926,32 @@ inline void Mul(const int16* input1_data, const Dims<4>& input1_dims, } } +// Legacy Dims<4>. inline void Mul(const int16* input1_data, const Dims<4>& input1_dims, const int16* input2_data, const Dims<4>& input2_dims, - int32 output_offset, int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - const Dims<4>& output_dims) { + int16* output_data, const Dims<4>& output_dims) { + tflite::ArithmeticParams op_params; + // No parameters needed. + + Mul(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, DimsToShape(output_dims), + output_data); +} + +inline void Mul(const ArithmeticParams& params, + const RuntimeShape& input1_shape, const int16* input1_data, + const RuntimeShape& input2_shape, const int16* input2_data, + const RuntimeShape& output_shape, uint8* output_data) { gemmlowp::ScopedProfilingLabel label("Mul/Int16Uint8"); // This is a copy of the reference implementation. We do not currently have a // properly optimized version. + const int32 output_activation_min = params.quantized_activation_min; + const int32 output_activation_max = params.quantized_activation_max; + const int32 output_offset = params.output_offset; TFLITE_DCHECK_LE(output_activation_min, output_activation_max); - const int flat_size = MatchingFlatSize(output_dims, input1_dims, input2_dims); + const int flat_size = + MatchingFlatSize(input1_shape, input2_shape, output_shape); for (int i = 0; i < flat_size; i++) { // F0 uses 0 integer bits, range [-1, 1]. 
@@ -2906,62 +2969,51 @@ inline void Mul(const int16* input1_data, const Dims<4>& input1_dims, } } -// TODO(jiawen): We can implement BroadcastMul on buffers of arbitrary -// dimensionality if the runtime code does a single loop over one dimension -// that handles broadcasting as the base case. The code generator would then -// generate max(D1, D2) nested for loops. -// TODO(benoitjacob): BroadcastMul is intentionally duplicated from -// reference_ops.h. Once an optimized version is implemented and NdArrayDesc -// is no longer referenced in this file, move NdArrayDesc from types.h to -// reference_ops.h. +// Legacy Dims<4>. +inline void Mul(const int16* input1_data, const Dims<4>& input1_dims, + const int16* input2_data, const Dims<4>& input2_dims, + int32 output_offset, int32 output_activation_min, + int32 output_activation_max, uint8* output_data, + const Dims<4>& output_dims) { + tflite::ArithmeticParams op_params; + op_params.output_offset = output_offset; + op_params.quantized_activation_min = output_activation_min; + op_params.quantized_activation_max = output_activation_max; + + Mul(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, DimsToShape(output_dims), + output_data); +} + +// Legacy Dims<4>. 
template void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims, const T* input2_data, const Dims<4>& input2_dims, T output_activation_min, T output_activation_max, T* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("BroadcastMul"); + tflite::ArithmeticParams op_params; + SetActivationParams(output_activation_min, output_activation_max, &op_params); - NdArrayDesc<4> desc1; - NdArrayDesc<4> desc2; - NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); - - // In Tensorflow, the dimensions are canonically named (batch_number, row, - // col, channel), with extents (batches, height, width, depth), with the - // trailing dimension changing most rapidly (channels has the smallest stride, - // typically 1 element). - // - // In generated C code, we store arrays with the dimensions reversed. The - // first dimension has smallest stride. - // - // We name our variables by their Tensorflow convention, but generate C code - // nesting loops such that the innermost loop has the smallest stride for the - // best cache behavior. - for (int b = 0; b < ArraySize(output_dims, 3); ++b) { - for (int y = 0; y < ArraySize(output_dims, 2); ++y) { - for (int x = 0; x < ArraySize(output_dims, 1); ++x) { - for (int c = 0; c < ArraySize(output_dims, 0); ++c) { - output_data[Offset(output_dims, c, x, y, b)] = - ActivationFunctionWithMinMax( - input1_data[SubscriptToIndex(desc1, c, x, y, b)] * - input2_data[SubscriptToIndex(desc2, c, x, y, b)], - output_activation_min, output_activation_max); - } - } - } - } + BroadcastMul4DSlow(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, + DimsToShape(output_dims), output_data); } +// Legacy Dims<4>. 
// legacy, for compatibility with old checked-in code -template -void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims, - const T* input2_data, const Dims<4>& input2_dims, - T* output_data, const Dims<4>& output_dims) { - T output_activation_min, output_activation_max; - GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); - - BroadcastMul(input1_data, input1_dims, input2_data, input2_dims, - output_activation_min, output_activation_max, output_data, - output_dims); +template +inline void BroadcastMul(const float* input1_data, const Dims<4>& input1_dims, + const float* input2_data, const Dims<4>& input2_dims, + float* output_data, const Dims<4>& output_dims) { + tflite::ArithmeticParams op_params; + float float_activation_min; + float float_activation_max; + GetActivationMinMax(Ac, &float_activation_min, &float_activation_max); + SetActivationParams(float_activation_min, float_activation_max, &op_params); + + BroadcastMul4DSlow(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, + DimsToShape(output_dims), output_data); } // Element-wise mul that can often be used for inner loop of broadcast Mul as diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h index 9c957a3936..3875b73e05 100644 --- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h +++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h @@ -1363,11 +1363,16 @@ inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params, } template -inline void Mul(const T* input1_data, const Dims<4>& input1_dims, - const T* input2_data, const Dims<4>& input2_dims, - T output_activation_min, T output_activation_max, - T* output_data, const Dims<4>& output_dims) { - const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims); +inline void Mul(const ArithmeticParams& params, + const 
RuntimeShape& input1_shape, const T* input1_data, + const RuntimeShape& input2_shape, const T* input2_data, + const RuntimeShape& output_shape, T* output_data) { + T output_activation_min; + T output_activation_max; + GetActivationParams(params, &output_activation_min, &output_activation_max); + + const int flat_size = + MatchingFlatSize(input1_shape, input2_shape, output_shape); for (int i = 0; i < flat_size; ++i) { output_data[i] = ActivationFunctionWithMinMax( input1_data[i] * input2_data[i], output_activation_min, @@ -1375,6 +1380,20 @@ inline void Mul(const T* input1_data, const Dims<4>& input1_dims, } } +// Legacy Dims<4>. +template +inline void Mul(const T* input1_data, const Dims<4>& input1_dims, + const T* input2_data, const Dims<4>& input2_dims, + T output_activation_min, T output_activation_max, + T* output_data, const Dims<4>& output_dims) { + tflite::ArithmeticParams op_params; + SetActivationParams(output_activation_min, output_activation_max, &op_params); + + Mul(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, DimsToShape(output_dims), + output_data); +} + // legacy, for compatibility with old checked-in code template void Mul(const float* input1_data, const Dims<4>& input1_dims, @@ -1383,44 +1402,65 @@ void Mul(const float* input1_data, const Dims<4>& input1_dims, float output_activation_min, output_activation_max; GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); - Mul(input1_data, input1_dims, input2_data, input2_dims, output_activation_min, - output_activation_max, output_data, output_dims); + tflite::ArithmeticParams op_params; + SetActivationParams(output_activation_min, output_activation_max, &op_params); + + Mul(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, DimsToShape(output_dims), + output_data); } // TODO(jiawen): We can implement BroadcastMul on buffers of arbitrary // dimensionality if the runtime code does a single loop over 
one dimension // that handles broadcasting as the base case. The code generator would then // generate max(D1, D2) nested for loops. +// TODO(benoitjacob): BroadcastMul is intentionally duplicated from +// reference_ops.h. Once an optimized version is implemented and NdArrayDesc +// is no longer referenced in this file, move NdArrayDesc from types.h to +// reference_ops.h. template -void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims, - const T* input2_data, const Dims<4>& input2_dims, - T output_activation_min, T output_activation_max, - T* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("BroadcastMul"); +void BroadcastMul4DSlow(const ArithmeticParams& params, + const RuntimeShape& unextended_input1_shape, + const T* input1_data, + const RuntimeShape& unextended_input2_shape, + const T* input2_data, + const RuntimeShape& unextended_output_shape, + T* output_data) { + gemmlowp::ScopedProfilingLabel label("BroadcastMul4DSlow"); + T output_activation_min; + T output_activation_max; + GetActivationParams(params, &output_activation_min, &output_activation_max); + + TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); + RuntimeShape output_shape = + RuntimeShape::ExtendedShape(4, unextended_output_shape); NdArrayDesc<4> desc1; NdArrayDesc<4> desc2; - NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); + NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, + unextended_input2_shape, &desc1, &desc2); // In Tensorflow, the dimensions are canonically named (batch_number, row, // col, channel), with extents (batches, height, width, depth), with the - // trailing dimension changing most rapidly (channels has the smallest - // stride, typically 1 element). 
+ // trailing dimension changing most rapidly (channels has the smallest stride, + // typically 1 element). // // In generated C code, we store arrays with the dimensions reversed. The // first dimension has smallest stride. // // We name our variables by their Tensorflow convention, but generate C code - // nesting loops such that the innermost loop has the smallest stride for - // the best cache behavior. - for (int b = 0; b < ArraySize(output_dims, 3); ++b) { - for (int y = 0; y < ArraySize(output_dims, 2); ++y) { - for (int x = 0; x < ArraySize(output_dims, 1); ++x) { - for (int c = 0; c < ArraySize(output_dims, 0); ++c) { - output_data[Offset(output_dims, c, x, y, b)] = + // nesting loops such that the innermost loop has the smallest stride for the + // best cache behavior. + for (int b = 0; b < output_shape.Dims(0); ++b) { + for (int y = 0; y < output_shape.Dims(1); ++y) { + for (int x = 0; x < output_shape.Dims(2); ++x) { + for (int c = 0; c < output_shape.Dims(3); ++c) { + output_data[Offset(output_shape, b, y, x, c)] = ActivationFunctionWithMinMax( - input1_data[SubscriptToIndex(desc1, c, x, y, b)] * - input2_data[SubscriptToIndex(desc2, c, x, y, b)], + input1_data[SubscriptToIndex(desc1, b, y, x, c)] * + input2_data[SubscriptToIndex(desc2, b, y, x, c)], output_activation_min, output_activation_max); } } @@ -1428,6 +1468,20 @@ void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims, } } +// Legacy. 
+template +void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims, + const T* input2_data, const Dims<4>& input2_dims, + T output_activation_min, T output_activation_max, + T* output_data, const Dims<4>& output_dims) { + tflite::ArithmeticParams op_params; + SetActivationParams(output_activation_min, output_activation_max, &op_params); + + BroadcastMul4DSlow(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, + DimsToShape(output_dims), output_data); +} + // legacy, for compatibility with old checked-in code template void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims, @@ -1436,9 +1490,12 @@ void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims, T output_activation_min, output_activation_max; GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); - BroadcastMul(input1_data, input1_dims, input2_data, input2_dims, - output_activation_min, output_activation_max, output_data, - output_dims); + tflite::ArithmeticParams op_params; + SetActivationParams(output_activation_min, output_activation_max, &op_params); + + BroadcastMul4DSlow(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, + DimsToShape(output_dims), output_data); } // Element-wise mul that can often be used for inner loop of broadcast Mul as @@ -1569,6 +1626,7 @@ inline void BroadcastMul4DSlow(const ArithmeticParams& params, } } +// Legacy. // Transitional version that will be moved shortly to legacy_reference_ops, as // part of RuntimeShape revisions. 
inline void BroadcastMul4DSlow(const uint8* input1_data, @@ -1579,52 +1637,27 @@ inline void BroadcastMul4DSlow(const uint8* input1_data, int output_shift, int32 output_activation_min, int32 output_activation_max, uint8* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("BroadcastMul/8bit"); - - NdArrayDesc<4> desc1; - NdArrayDesc<4> desc2; - NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); - - // In Tensorflow, the dimensions are canonically named (batch_number, row, - // col, channel), with extents (batches, height, width, depth), with the - // trailing dimension changing most rapidly (channels has the smallest - // stride, typically 1 element). - // - // In generated C code, we store arrays with the dimensions reversed. The - // first dimension has smallest stride. - // - // We name our variables by their Tensorflow convention, but generate C code - // nesting loops such that the innermost loop has the smallest stride for - // the best cache behavior. 
- for (int b = 0; b < ArraySize(output_dims, 3); ++b) { - for (int y = 0; y < ArraySize(output_dims, 2); ++y) { - for (int x = 0; x < ArraySize(output_dims, 1); ++x) { - for (int c = 0; c < ArraySize(output_dims, 0); ++c) { - const int32 input1_val = - input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)]; - const int32 input2_val = - input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)]; - const int32 unclamped_result = - output_offset + - MultiplyByQuantizedMultiplierSmallerThanOneExp( - input1_val * input2_val, output_multiplier, output_shift); - const int32 clamped_output = - std::min(output_activation_max, - std::max(output_activation_min, unclamped_result)); - output_data[Offset(output_dims, c, x, y, b)] = - static_cast(clamped_output); - } - } - } - } + tflite::ArithmeticParams op_params; + SetActivationParams(output_activation_min, output_activation_max, &op_params); + op_params.input1_offset = input1_offset; + op_params.input2_offset = input2_offset; + op_params.output_offset = output_offset; + op_params.output_multiplier = output_multiplier; + op_params.output_shift = output_shift; + + BroadcastMul4DSlow(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, + DimsToShape(output_dims), output_data); } -inline void Mul(const int16* input1_data, const Dims<4>& input1_dims, - const int16* input2_data, const Dims<4>& input2_dims, - int16* output_data, const Dims<4>& output_dims) { +inline void Mul(const ArithmeticParams& params, + const RuntimeShape& input1_shape, const int16* input1_data, + const RuntimeShape& input2_shape, const int16* input2_data, + const RuntimeShape& output_shape, int16* output_data) { gemmlowp::ScopedProfilingLabel label("Mul/Int16"); - const int flat_size = MatchingFlatSize(output_dims, input1_dims, input2_dims); + const int flat_size = + MatchingFlatSize(input1_shape, input2_shape, output_shape); for (int i = 0; i < flat_size; i++) { // F0 uses 0 integer bits, range [-1, 1]. 
@@ -1636,15 +1669,30 @@ inline void Mul(const int16* input1_data, const Dims<4>& input1_dims, } } +// Legacy Dims<4>. inline void Mul(const int16* input1_data, const Dims<4>& input1_dims, const int16* input2_data, const Dims<4>& input2_dims, - int32 output_offset, int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - const Dims<4>& output_dims) { + int16* output_data, const Dims<4>& output_dims) { + tflite::ArithmeticParams op_params; + // No params in this version. + + Mul(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, DimsToShape(output_dims), + output_data); +} + +inline void Mul(const ArithmeticParams& params, + const RuntimeShape& input1_shape, const int16* input1_data, + const RuntimeShape& input2_shape, const int16* input2_data, + const RuntimeShape& output_shape, uint8* output_data) { gemmlowp::ScopedProfilingLabel label("Mul/Int16Uint8"); + int32 output_offset = params.output_offset; + int32 output_activation_min = params.quantized_activation_min; + int32 output_activation_max = params.quantized_activation_max; TFLITE_DCHECK_LE(output_activation_min, output_activation_max); - const int flat_size = MatchingFlatSize(output_dims, input1_dims, input2_dims); + const int flat_size = + MatchingFlatSize(input1_shape, input2_shape, output_shape); for (int i = 0; i < flat_size; i++) { // F0 uses 0 integer bits, range [-1, 1]. @@ -1662,6 +1710,22 @@ inline void Mul(const int16* input1_data, const Dims<4>& input1_dims, } } +// Legacy Dims<4>. 
+inline void Mul(const int16* input1_data, const Dims<4>& input1_dims, + const int16* input2_data, const Dims<4>& input2_dims, + int32 output_offset, int32 output_activation_min, + int32 output_activation_max, uint8* output_data, + const Dims<4>& output_dims) { + tflite::ArithmeticParams op_params; + op_params.quantized_activation_min = output_activation_min; + op_params.quantized_activation_max = output_activation_max; + op_params.output_offset = output_offset; + + Mul(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, DimsToShape(output_dims), + output_data); +} + // TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary // dimensionality if the runtime code does a single loop over one dimension // that handles broadcasting as the base case. The code generator would then diff --git a/tensorflow/contrib/lite/kernels/internal/types.h b/tensorflow/contrib/lite/kernels/internal/types.h index 2603ed2eb7..8e17eaa964 100644 --- a/tensorflow/contrib/lite/kernels/internal/types.h +++ b/tensorflow/contrib/lite/kernels/internal/types.h @@ -913,23 +913,30 @@ struct TanhParams { int input_left_shift; }; -template -inline void SetActivationParams(T min, T max, ArithmeticParams* params); - -template <> -inline void SetActivationParams(float min, float max, - ArithmeticParams* params) { +template +inline void SetActivationParams(float min, float max, P* params) { params->float_activation_min = min; params->float_activation_max = max; } -template <> -inline void SetActivationParams(int32 min, int32 max, - ArithmeticParams* params) { +template +inline void SetActivationParams(int32 min, int32 max, P* params) { params->quantized_activation_min = min; params->quantized_activation_max = max; } +template +inline void GetActivationParams(const P& params, int32* min, int32* max) { + *min = params.quantized_activation_min; + *max = params.quantized_activation_max; +} + +template +inline void GetActivationParams(const P& params, float* 
min, float* max) { + *min = params.float_activation_min; + *max = params.float_activation_max; +} + } // namespace tflite #endif // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TYPES_H_ -- GitLab From 9d8b58a8074a5bdc152cd5a2a9260ccb72eaef90 Mon Sep 17 00:00:00 2001 From: Allen Lavoie Date: Tue, 28 Aug 2018 10:18:28 -0700 Subject: [PATCH 220/598] Work around an eager reference leak when slicing EagerTensors Removes a function monkey patch from EagerTensor slicing. There's a TODO to figure out why that monkey patching was problematic. Fixes #20218 (or at least makes it better) PiperOrigin-RevId: 210561714 --- .../python/kernel_tests/array_ops_test.py | 8 ++++ tensorflow/python/kernel_tests/rnn_test.py | 7 +++ tensorflow/python/ops/array_ops.py | 48 ++++++++++--------- 3 files changed, 41 insertions(+), 22 deletions(-) diff --git a/tensorflow/python/kernel_tests/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops_test.py index b2bafeadba..b0e24e969c 100644 --- a/tensorflow/python/kernel_tests/array_ops_test.py +++ b/tensorflow/python/kernel_tests/array_ops_test.py @@ -559,6 +559,14 @@ class StridedSliceTest(test_util.TensorFlowTestCase): s = array_ops.strided_slice(x, begin, end, strides) self.assertAllEqual([3.], self.evaluate(s)) + @test_util.assert_no_new_pyobjects_executing_eagerly + def testEagerMemory(self): + with context.eager_mode(): + inputs = constant_op.constant( + [[[1], [2], [3], [4]]], dtype=dtypes.float32) + # Tests that slicing an EagerTensor doesn't leak memory + inputs[0] # pylint: disable=pointless-statement + def testDegenerateSlices(self): with self.test_session(use_gpu=True): checker = StridedSliceChecker(self, StridedSliceChecker.REF_TENSOR) diff --git a/tensorflow/python/kernel_tests/rnn_test.py b/tensorflow/python/kernel_tests/rnn_test.py index 78f2993d27..562d11f0b0 100644 --- a/tensorflow/python/kernel_tests/rnn_test.py +++ b/tensorflow/python/kernel_tests/rnn_test.py @@ -229,6 +229,13 @@ class RNNTest(test.TestCase): 
self.assertAllEqual([[[1, 1], [2, 2], [3, 3], [4, 4]]], outputs[1]) self.assertAllEqual(4, state) + @test_util.assert_no_new_pyobjects_executing_eagerly + def testEagerMemory(self): + with context.eager_mode(): + cell = TensorArrayStateRNNCell() + inputs = np.array([[[1], [2], [3], [4]]], dtype=np.float32) + rnn.dynamic_rnn(cell, inputs, dtype=dtypes.float32, sequence_length=[4]) + @test_util.run_in_graph_and_eager_modes def testTensorArrayStateIsAccepted(self): cell = TensorArrayStateRNNCell() diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 66bc4df18c..5b079ebb0b 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -691,28 +691,32 @@ def strided_slice(input_, parent_name = name - def assign(val, name=None): - """Closure that holds all the arguments to create an assignment.""" - - if var is None: - raise ValueError("Sliced assignment is only supported for variables") - - if name is None: - name = parent_name + "_assign" - - return var._strided_slice_assign( - begin=begin, - end=end, - strides=strides, - value=val, - name=name, - begin_mask=begin_mask, - end_mask=end_mask, - ellipsis_mask=ellipsis_mask, - new_axis_mask=new_axis_mask, - shrink_axis_mask=shrink_axis_mask) - - op.assign = assign + if not (var is None and isinstance(op, ops.EagerTensor)): + # TODO(b/113297051): Assigning a function to an EagerTensor seems to leak + # memory. Slicing variables still leaks, although ".assign" is removed for + # EagerTensors which are not variable slices to mitigate the issue. 
+ def assign(val, name=None): + """Closure that holds all the arguments to create an assignment.""" + + if var is None: + raise ValueError("Sliced assignment is only supported for variables") + + if name is None: + name = parent_name + "_assign" + + return var._strided_slice_assign( + begin=begin, + end=end, + strides=strides, + value=val, + name=name, + begin_mask=begin_mask, + end_mask=end_mask, + ellipsis_mask=ellipsis_mask, + new_axis_mask=new_axis_mask, + shrink_axis_mask=shrink_axis_mask) + + op.assign = assign return op -- GitLab From 1711dc4bf8331b2a00dd2c4a9a54a8e441c0e6aa Mon Sep 17 00:00:00 2001 From: Yifei Feng Date: Tue, 28 Aug 2018 10:22:15 -0700 Subject: [PATCH 221/598] Update and re-enable CODEOWNERS. PiperOrigin-RevId: 210562358 --- CODEOWNERS | 103 +++++++++++++++++++++++++++++------------------------ 1 file changed, 56 insertions(+), 47 deletions(-) diff --git a/CODEOWNERS b/CODEOWNERS index b9f0313cc6..113eaf798f 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,53 +1,62 @@ -# NOTE: Disabled temporarily because it's too noisy on pushes. # Where component owners are known, add them here. 
-# /tensorflow/core/platform/windows/ @mrry -# /tensorflow/java/ @asimshankar -# /tensorflow/tensorboard/ @jart @dandelionmane -# /tensorflow/tools/docs/ @markdaoust +/tensorflow/core/debug @caisq +/tensorflow/core/platform/windows/ @mrry +/tensorflow/go @asimshankar +/tensorflow/java/ @asimshankar +/tensorflow/python/debug @caisq +/tensorflow/python/tools/api/generator/ @annarev +/tensorflow/tensorboard/ @jart +/tensorflow/tools/docs/ @markdaoust # contrib -# NEED OWNER: /tensorflow/contrib/avro/ -# /tensorflow/contrib/batching/ @alextp @chrisolston -# /tensorflow/contrib/bayesflow/ @ebrevdo @rsepassi @jvdillon -# /tensorflow/contrib/boosted_trees/ @sshrdp @yk5 @nataliaponomareva -# /tensorflow/contrib/cmake/ @mrry @benoitsteiner -# /tensorflow/contrib/copy_graph/ @tucker @poxvoculi -# /tensorflow/contrib/crf/ @kentonl -# /tensorflow/contrib/data/ @mrry -# /tensorflow/contrib/distributions/ @jvdillon @langmore @rsepassi -# /tensorflow/contrib/factorization/ @agarwal-ashish @xavigonzalvo -# /tensorflow/contrib/ffmpeg/ @fredbertsch -# NEED OWNER: /tensorflow/contrib/framework/ -# /tensorflow/contrib/graph_editor/ @purpledog +# NEED OWNER: /tensorflow/contrib/all_reduce +/tensorflow/contrib/batching/ @alextp @chrisolston +/tensorflow/contrib/bayesflow/ @ebrevdo @rsepassi @jvdillon +/tensorflow/contrib/boosted_trees/ @sshrdp @yk5 @nataliaponomareva +/tensorflow/contrib/checkpoint/ @allenlavoie +/tensorflow/contrib/cluster_resolver/ @frankchn +/tensorflow/contrib/cmake/ @mrry +/tensorflow/contrib/copy_graph/ @tucker @poxvoculi +/tensorflow/contrib/crf/ @kentonl +/tensorflow/contrib/data/ @mrry +/tensorflow/contrib/distribute @joshl @priyag @sourabhbajaj @frankchn +/tensorflow/contrib/distributions/ @jvdillon @langmore @rsepassi +/tensorflow/contrib/eager @alextp @asimshankar +/tensorflow/contrib/factorization/ @agarwal-ashish @xavigonzalvo +/tensorflow/contrib/ffmpeg/ @fredbertsch +/tensorflow/contrib/framework/ @ebrevdo +/tensorflow/contrib/gan/
@joel-shor +/tensorflow/contrib/graph_editor/ @purpledog # NEED OWNER: /tensorflow/contrib/grid_rnn/ -# /tensorflow/contrib/hvx/ @satok16 -# /tensorflow/contrib/integrate/ @shoyer -# /tensorflow/contrib/kernel_methods/ @petrosmol -# /tensorflow/contrib/ios_examples/ @petewarden -# /tensorflow/contrib/labeled_tensor/ @shoyer -# /tensorflow/contrib/layers/ @fchollet @martinwicke -# /tensorflow/contrib/learn/ @martinwicke @ispirmustafa @alextp -# /tensorflow/contrib/linalg/ @langmore -# /tensorflow/contrib/linear_optimizer/ @petrosmol @andreasst @katsiapis -# /tensorflow/contrib/lookup/ @ysuematsu @andreasst -# /tensorflow/contrib/losses/ @alextp @ispirmustafa -# /tensorflow/contrib/makefile/ @petewarden @satok16 @wolffg -# /tensorflow/contrib/metrics/ @alextp @honkentuber @ispirmustafa -# /tensorflow/contrib/nccl/ @cwhipkey @zheng-xq -# /tensorflow/contrib/opt/ @strategist333 -# /tensorflow/contrib/pi_examples/ @maciekcc -# /tensorflow/contrib/quantization/ @petewarden @cwhipkey @keveman -# /tensorflow/contrib/rnn/ @ebrevdo -# /tensorflow/contrib/saved_model/ @nfiedel @sukritiramesh -# /tensorflow/contrib/seq2seq/ @lukaszkaiser -# /tensorflow/contrib/session_bundle/ @nfiedel @sukritiramesh -# /tensorflow/contrib/slim/ @sguada @thenbasilmanran -# /tensorflow/contrib/stateless/ @girving -# /tensorflow/contrib/tensor_forest/ @gilberthendry @thomascolthurst @yupbank -# /tensorflow/contrib/testing/ @dandelionmane -# /tensorflow/contrib/timeseries/ @allenlavoie -# /tensorflow/contrib/tpu/ @frankchn @saeta @jhseu -# /tensorflow/contrib/training/ @joel-shor @ebrevdo -# /tensorflow/contrib/util/ @sherrym +/tensorflow/contrib/hvx/ @satok16 +/tensorflow/contrib/integrate/ @shoyer +/tensorflow/contrib/kernel_methods/ @petrosmol +/tensorflow/contrib/ios_examples/ @petewarden +/tensorflow/contrib/labeled_tensor/ @shoyer +/tensorflow/contrib/layers/ @fchollet @martinwicke +/tensorflow/contrib/learn/ @martinwicke @ispirmustafa @alextp +/tensorflow/contrib/linalg/ @langmore 
+/tensorflow/contrib/linear_optimizer/ @petrosmol @andreasst @katsiapis +/tensorflow/contrib/lookup/ @ysuematsu @andreasst +/tensorflow/contrib/losses/ @alextp @ispirmustafa +/tensorflow/contrib/makefile/ @petewarden @satok16 @wolffg +/tensorflow/contrib/metrics/ @alextp @honkentuber @ispirmustafa +/tensorflow/contrib/nccl/ @cwhipkey @zheng-xq +/tensorflow/contrib/opt/ @strategist333 @alextp +/tensorflow/contrib/pi_examples/ @maciekcc +/tensorflow/contrib/quantization/ @petewarden +/tensorflow/contrib/rnn/ @ebrevdo @scottzhu +/tensorflow/contrib/saved_model/ @nfiedel @sukritiramesh @allenl +/tensorflow/contrib/seq2seq/ @ebrevdo @lmthang +/tensorflow/contrib/session_bundle/ @nfiedel @sukritiramesh +/tensorflow/contrib/slim/ @sguada @thenbasilmanran +/tensorflow/contrib/stateless/ @girving @alextp +/tensorflow/contrib/tensor_forest/ @gilberthendry @thomascolthurst @yupbank +/tensorflow/contrib/tensorrt/ @laigd +# NEED OWNER: /tensorflow/contrib/testing/ +/tensorflow/contrib/timeseries/ @allenlavoie +/tensorflow/contrib/tpu/ @frankchn @saeta @jhseu @sourabhbajaj +/tensorflow/contrib/training/ @joel-shor @ebrevdo +/tensorflow/contrib/util/ @sherrym \ No newline at end of file -- GitLab From d8f64b90e6f0bfec5135bdf99e42b8fdaf53788d Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 28 Aug 2018 10:35:29 -0700 Subject: [PATCH 222/598] Adding note about missing definitions of gradients to sparse reduction function, and that they shouldn't be used in models that need training. PiperOrigin-RevId: 210564790 --- tensorflow/python/ops/sparse_ops.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py index 38ce5236e3..d1b8be4df7 100644 --- a/tensorflow/python/ops/sparse_ops.py +++ b/tensorflow/python/ops/sparse_ops.py @@ -889,6 +889,9 @@ def sparse_reduce_max(sp_input, axis=None, keepdims=None, `tf.reduce_max()`. 
In particular, this Op also returns a dense `Tensor` instead of a sparse one. + Note: A gradient is not defined for this function, so it can't be used + in training models that need gradient descent. + Reduces `sp_input` along the dimensions given in `reduction_axes`. Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each entry in `reduction_axes`. If `keepdims` is true, the reduced dimensions are retained @@ -956,6 +959,9 @@ def sparse_reduce_max_sparse(sp_input, `tf.reduce_max()`. In contrast to SparseReduceSum, this Op returns a SparseTensor. + Note: A gradient is not defined for this function, so it can't be used + in training models that need gradient descent. + Reduces `sp_input` along the dimensions given in `reduction_axes`. Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each entry in `reduction_axes`. If `keepdims` is true, the reduced dimensions are retained @@ -1057,6 +1063,9 @@ def sparse_reduce_sum_sparse(sp_input, `tf.reduce_sum()`. In contrast to SparseReduceSum, this Op returns a SparseTensor. + Note: A gradient is not defined for this function, so it can't be used + in training models that need gradient descent. + Reduces `sp_input` along the dimensions given in `reduction_axes`. Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each entry in `reduction_axes`. If `keepdims` is true, the reduced dimensions are retained -- GitLab From 4f4e1b48862c30a21dedffb7a1929a2b2600ec9f Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 28 Aug 2018 10:36:25 -0700 Subject: [PATCH 223/598] Removed redundant std::string -> string conversions. 
PiperOrigin-RevId: 210565027 --- .../compiler/xla/service/hlo_creation_utils.cc | 2 +- .../compiler/xla/service/hlo_graph_dumper.cc | 2 +- tensorflow/compiler/xla/service/hlo_lexer.cc | 2 +- tensorflow/compiler/xla/service/hlo_parser.cc | 2 +- .../compiler/xla/service/hlo_pass_pipeline.cc | 12 ++++++------ .../service/human_readable_profile_builder.h | 9 ++++----- tensorflow/compiler/xla/service/name_uniquer.cc | 2 +- .../compiler/xla/service/shape_inference.cc | 2 +- tensorflow/core/framework/function.cc | 2 +- tensorflow/core/framework/node_def_builder.cc | 17 ++++++++--------- tensorflow/core/framework/node_def_util.cc | 6 +++--- tensorflow/core/framework/op_def_builder.cc | 4 ++-- tensorflow/core/framework/op_gen_lib.cc | 2 +- tensorflow/core/framework/op_kernel.cc | 2 +- .../core/framework/shape_inference_testutil.h | 2 +- tensorflow/core/graph/graph.cc | 4 ++-- tensorflow/core/graph/graph_constructor.cc | 10 +++++----- tensorflow/core/graph/graph_constructor_test.cc | 5 ++--- tensorflow/core/graph/graph_def_builder.cc | 4 ++-- tensorflow/core/graph/graph_def_builder.h | 2 +- tensorflow/core/graph/graph_partition.cc | 2 +- tensorflow/core/graph/node_builder.cc | 2 +- tensorflow/core/graph/while_context.cc | 2 +- 23 files changed, 48 insertions(+), 51 deletions(-) diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils.cc b/tensorflow/compiler/xla/service/hlo_creation_utils.cc index 0ceb6a2968..131846794d 100644 --- a/tensorflow/compiler/xla/service/hlo_creation_utils.cc +++ b/tensorflow/compiler/xla/service/hlo_creation_utils.cc @@ -338,7 +338,7 @@ StatusOr BroadcastZeros( StatusOr> CreateComputationWithSignature( ArraySlice domain, const Shape& range, absl::string_view name) { - HloComputation::Builder b{std::string(name)}; + HloComputation::Builder b{string(name)}; int64 param_idx = 0; for (const Shape* param_shape : domain) { b.AddInstruction(HloInstruction::CreateParameter( diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc 
b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc index 6cf7730fdc..f2f9ed5969 100644 --- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc +++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc @@ -305,7 +305,7 @@ class HloDotDumper { const DebugOptions& debug_options, bool show_backend_config, const HloExecutionProfile* profile, NodeFilter filter) : computation_(computation), - label_(std::string(label)), + label_(label), debug_options_(debug_options), show_backend_config_(show_backend_config), profile_(profile), diff --git a/tensorflow/compiler/xla/service/hlo_lexer.cc b/tensorflow/compiler/xla/service/hlo_lexer.cc index 5b23ee7d00..8350285e67 100644 --- a/tensorflow/compiler/xla/service/hlo_lexer.cc +++ b/tensorflow/compiler/xla/service/hlo_lexer.cc @@ -269,7 +269,7 @@ TokKind HloLexer::LexIdentifier() { } } - str_val_ = std::string(identifier); + str_val_ = string(identifier); return TokKind::kIdent; } diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc index ba0f07dd14..e4edb87aa5 100644 --- a/tensorflow/compiler/xla/service/hlo_parser.cc +++ b/tensorflow/compiler/xla/service/hlo_parser.cc @@ -324,7 +324,7 @@ bool HloParser::Error(LocTy loc, absl::string_view msg) { std::vector error_lines; error_lines.push_back( StrCat("was parsing ", line, ":", col, ": error: ", msg)); - error_lines.push_back(std::string(lexer_.GetLine(loc))); + error_lines.emplace_back(lexer_.GetLine(loc)); error_lines.push_back(col == 0 ? 
"" : StrCat(string(col - 1, ' '), "^")); error_.push_back(StrJoin(error_lines, "\n")); diff --git a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc index de7ad6d209..6e4ed0de62 100644 --- a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc +++ b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc @@ -91,7 +91,7 @@ StatusOr HloPassPipeline::Run(HloModule* module) { return Status::OK(); }; - string prefix = std::string(name()) + ": pipeline start"; + string prefix = StrCat(name(), ": pipeline start"); bool changed = false; string message; TF_RETURN_IF_ERROR( @@ -99,12 +99,12 @@ StatusOr HloPassPipeline::Run(HloModule* module) { const string xla_dump_per_pass_hlo_proto_to = module->config().debug_options().xla_dump_per_pass_hlo_proto_to(); if (!xla_dump_per_pass_hlo_proto_to.empty()) { - DumpModuleProto(*module, xla_dump_per_pass_hlo_proto_to, - std::string(name()), "pipeline_start"); + DumpModuleProto(*module, xla_dump_per_pass_hlo_proto_to, string(name()), + "pipeline_start"); } for (auto& pass : passes_) { - if (disabled_passes.count(std::string(pass->name())) > 0) { + if (disabled_passes.count(string(pass->name())) > 0) { VLOG(1) << " Skipping HLO pass " << pass->name() << ", disabled by --xla_disable_hlo_passes"; continue; @@ -121,8 +121,8 @@ StatusOr HloPassPipeline::Run(HloModule* module) { TF_RETURN_IF_ERROR( run_invariant_checkers(StrCat("after running pass: ", pass->name()))); if (!xla_dump_per_pass_hlo_proto_to.empty()) { - DumpModuleProto(*module, xla_dump_per_pass_hlo_proto_to, - std::string(name()), std::string(pass->name())); + DumpModuleProto(*module, xla_dump_per_pass_hlo_proto_to, string(name()), + string(pass->name())); } changed |= changed_this_pass; diff --git a/tensorflow/compiler/xla/service/human_readable_profile_builder.h b/tensorflow/compiler/xla/service/human_readable_profile_builder.h index b99624460e..925111fa1f 100644 --- 
a/tensorflow/compiler/xla/service/human_readable_profile_builder.h +++ b/tensorflow/compiler/xla/service/human_readable_profile_builder.h @@ -32,7 +32,7 @@ class HumanReadableProfileBuilder { explicit HumanReadableProfileBuilder(absl::string_view computation_name, int64 total_cycles, double clock_rate_ghz) - : computation_name_(std::string(computation_name)), + : computation_name_(computation_name), total_cycles_(total_cycles), clock_rate_ghz_(clock_rate_ghz) { CHECK_GE(clock_rate_ghz, 1e-9); @@ -47,10 +47,9 @@ class HumanReadableProfileBuilder { absl::string_view category, int64 cycles, int64 flop_count, int64 transcendental_count, int64 bytes_accessed, float optimal_seconds) { - op_infos_.push_back({std::string(op_name), std::string(short_name), - std::string(category), cycles, flop_count, - transcendental_count, bytes_accessed, - optimal_seconds}); + op_infos_.push_back({string(op_name), string(short_name), string(category), + cycles, flop_count, transcendental_count, + bytes_accessed, optimal_seconds}); } // Gets the human-readable profile. diff --git a/tensorflow/compiler/xla/service/name_uniquer.cc b/tensorflow/compiler/xla/service/name_uniquer.cc index 70cd0a339a..bd8fb17a23 100644 --- a/tensorflow/compiler/xla/service/name_uniquer.cc +++ b/tensorflow/compiler/xla/service/name_uniquer.cc @@ -54,7 +54,7 @@ NameUniquer::NameUniquer(const string& separator) { } string NameUniquer::GetUniqueName(absl::string_view prefix) { - string root = GetSanitizedName(prefix.empty() ? "name" : std::string(prefix)); + string root = GetSanitizedName(prefix.empty() ? "name" : string(prefix)); // Strip away numeric suffix (if any). Only recognize separator if it is in // the middle of the name. 
diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc index ae6a366d25..b04d2a7ba6 100644 --- a/tensorflow/compiler/xla/service/shape_inference.cc +++ b/tensorflow/compiler/xla/service/shape_inference.cc @@ -52,7 +52,7 @@ bool AllUnique(tensorflow::gtl::ArraySlice slice) { Status ExpectArray(const Shape& shape, absl::string_view op_type) { if (!ShapeUtil::IsArray(shape)) { return InvalidArgument("Expected array argument for %s, but got %s.", - std::string(op_type), ShapeUtil::HumanString(shape)); + string(op_type), ShapeUtil::HumanString(shape)); } return Status::OK(); } diff --git a/tensorflow/core/framework/function.cc b/tensorflow/core/framework/function.cc index 6b92e10d76..26f32677af 100644 --- a/tensorflow/core/framework/function.cc +++ b/tensorflow/core/framework/function.cc @@ -504,7 +504,7 @@ string Print(const NodeDef& n) { std::vector dep; for (StringPiece s : n.input()) { if (str_util::ConsumePrefix(&s, "^")) { - dep.push_back(std::string(s)); + dep.emplace_back(s); } else { dat.push_back(s); } diff --git a/tensorflow/core/framework/node_def_builder.cc b/tensorflow/core/framework/node_def_builder.cc index 8e00bfe4f8..348a825af9 100644 --- a/tensorflow/core/framework/node_def_builder.cc +++ b/tensorflow/core/framework/node_def_builder.cc @@ -24,23 +24,22 @@ limitations under the License. namespace tensorflow { NodeDefBuilder::NodeOut::NodeOut(StringPiece n, int i, DataType dt) - : node(std::string(n)), index(i), data_type(dt) {} + : node(n), index(i), data_type(dt) {} NodeDefBuilder::NodeOut::NodeOut() { // uninitialized, call Reset() before use. 
} void NodeDefBuilder::NodeOut::Reset(StringPiece n, int i, DataType dt) { - node = std::string(n); + node = string(n); index = i; data_type = dt; } NodeDefBuilder::NodeDefBuilder(StringPiece name, StringPiece op_name, const OpRegistryInterface* op_registry) { - node_def_.set_name(std::string(name)); - const Status status = - op_registry->LookUpOpDef(std::string(op_name), &op_def_); + node_def_.set_name(string(name)); + const Status status = op_registry->LookUpOpDef(string(op_name), &op_def_); if (status.ok()) { Initialize(); } else { @@ -51,7 +50,7 @@ NodeDefBuilder::NodeDefBuilder(StringPiece name, StringPiece op_name, NodeDefBuilder::NodeDefBuilder(StringPiece name, const OpDef* op_def) : op_def_(op_def) { - node_def_.set_name(std::string(name)); + node_def_.set_name(string(name)); Initialize(); } @@ -171,7 +170,7 @@ void NodeDefBuilder::AddInput(StringPiece src_node, int src_index) { } else if (src_index > 0) { node_def_.add_input(strings::StrCat(src_node, ":", src_index)); } else { - node_def_.add_input(std::string(src_node)); + node_def_.add_input(string(src_node)); } } @@ -194,12 +193,12 @@ void NodeDefBuilder::VerifyInputRef(const OpDef::ArgDef* input_arg, } NodeDefBuilder& NodeDefBuilder::ControlInput(StringPiece src_node) { - control_inputs_.push_back(std::string(src_node)); + control_inputs_.emplace_back(src_node); return *this; } NodeDefBuilder& NodeDefBuilder::Device(StringPiece device_spec) { - node_def_.set_device(std::string(device_spec)); + node_def_.set_device(string(device_spec)); return *this; } diff --git a/tensorflow/core/framework/node_def_util.cc b/tensorflow/core/framework/node_def_util.cc index 0bd79366eb..bacc1d72c4 100644 --- a/tensorflow/core/framework/node_def_util.cc +++ b/tensorflow/core/framework/node_def_util.cc @@ -254,7 +254,7 @@ DEFINE_GET_ATTR(NameAttrList, func, "func", emplace_back, v, ;); #undef DEFINE_GET_ATTR bool HasNodeAttr(const NodeDef& node_def, StringPiece attr_name) { - return 
node_def.attr().find(std::string(attr_name)) != node_def.attr().end(); + return node_def.attr().find(string(attr_name)) != node_def.attr().end(); } static const string& kEmptyString = *new string(); @@ -653,7 +653,7 @@ Status AttachDef(const Status& status, const Node& node) { void AddNodeAttr(StringPiece name, const AttrValue& value, NodeDef* node_def) { node_def->mutable_attr()->insert( - AttrValueMap::value_type(std::string(name), value)); + AttrValueMap::value_type(string(name), value)); } #define ADD_NODE_ATTR(T) \ @@ -691,7 +691,7 @@ ADD_NODE_ATTR(gtl::ArraySlice) #undef ADD_NODE_ATTR void AddAttr(StringPiece name, const AttrValue& value, AttrValueMap* map) { - map->insert(AttrValueMap::value_type(std::string(name), value)); + map->insert(AttrValueMap::value_type(string(name), value)); } #define ADD_ATTR(T) \ diff --git a/tensorflow/core/framework/op_def_builder.cc b/tensorflow/core/framework/op_def_builder.cc index 91eb6c0672..34a7a43d38 100644 --- a/tensorflow/core/framework/op_def_builder.cc +++ b/tensorflow/core/framework/op_def_builder.cc @@ -527,7 +527,7 @@ void FinalizeDoc(const string& text, OpDef* op_def, } // namespace OpDefBuilder::OpDefBuilder(StringPiece op_name) { - op_def()->set_name(std::string(op_name)); // NOLINT + op_def()->set_name(string(op_name)); // NOLINT } OpDefBuilder& OpDefBuilder::Attr(StringPiece spec) { @@ -584,7 +584,7 @@ OpDefBuilder& OpDefBuilder::Deprecated(int version, StringPiece explanation) { } else { OpDeprecation* deprecation = op_def()->mutable_deprecation(); deprecation->set_version(version); - deprecation->set_explanation(std::string(explanation)); + deprecation->set_explanation(string(explanation)); } return *this; } diff --git a/tensorflow/core/framework/op_gen_lib.cc b/tensorflow/core/framework/op_gen_lib.cc index 4b56d807df..505ab54775 100644 --- a/tensorflow/core/framework/op_gen_lib.cc +++ b/tensorflow/core/framework/op_gen_lib.cc @@ -186,7 +186,7 @@ static bool FindMultiline(StringPiece line, size_t colon, 
string* end) { while (str_util::ConsumePrefix(&line, " ")) { } if (str_util::ConsumePrefix(&line, "<<")) { - *end = std::string(line); + *end = string(line); return true; } return false; diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc index b285accce7..c694e10193 100644 --- a/tensorflow/core/framework/op_kernel.cc +++ b/tensorflow/core/framework/op_kernel.cc @@ -913,7 +913,7 @@ void OpKernelContext::clear_recorded_memory() { struct KernelRegistration { KernelRegistration(const KernelDef& d, StringPiece c, kernel_factory::OpKernelRegistrar::Factory f) - : def(d), kernel_class_name(std::string(c)), factory(f) {} + : def(d), kernel_class_name(c), factory(f) {} const KernelDef def; const string kernel_class_name; const kernel_factory::OpKernelRegistrar::Factory factory; diff --git a/tensorflow/core/framework/shape_inference_testutil.h b/tensorflow/core/framework/shape_inference_testutil.h index f6656b3b45..bb4dc25da4 100644 --- a/tensorflow/core/framework/shape_inference_testutil.h +++ b/tensorflow/core/framework/shape_inference_testutil.h @@ -32,7 +32,7 @@ class Tensor; struct ShapeInferenceTestOp { typedef std::pair ShapeAndType; - explicit ShapeInferenceTestOp(StringPiece name) : name(std::string(name)) {} + explicit ShapeInferenceTestOp(StringPiece name) : name(string(name)) {} string name; NodeDef node_def; std::vector input_tensors; diff --git a/tensorflow/core/graph/graph.cc b/tensorflow/core/graph/graph.cc index 568f0870c0..ade9266231 100644 --- a/tensorflow/core/graph/graph.cc +++ b/tensorflow/core/graph/graph.cc @@ -483,7 +483,7 @@ const Edge* Graph::AddControlEdge(Node* source, Node* dest, void Graph::RemoveControlEdge(const Edge* e) { if (!e->src_->IsSource() && !e->dst_->IsSink()) { e->dst_->MaybeCopyOnWrite(); - std::string e_src_name = strings::StrCat("^", e->src_->name()); + string e_src_name = strings::StrCat("^", e->src_->name()); auto* inputs = e->dst_->props_->node_def.mutable_input(); for (auto it = 
inputs->begin(); it != inputs->end(); ++it) { if (*it == e_src_name) { @@ -721,7 +721,7 @@ Status Graph::AddWhileContext(StringPiece frame_name, std::vector body_outputs, WhileContext** result) { auto pair = while_ctxs_.insert(std::pair( - std::string(frame_name), + string(frame_name), WhileContext(frame_name, std::move(enter_nodes), std::move(exit_nodes), cond_output, std::move(body_inputs), std::move(body_outputs)))); diff --git a/tensorflow/core/graph/graph_constructor.cc b/tensorflow/core/graph/graph_constructor.cc index 8c73f8f712..ee10194142 100644 --- a/tensorflow/core/graph/graph_constructor.cc +++ b/tensorflow/core/graph/graph_constructor.cc @@ -513,7 +513,7 @@ Status GraphConstructor::InitFromEdges() { num_control_edges++; } else { TensorId id(ParseTensorName(input_name)); - if (next_iteration_nodes_.find(std::string(id.first)) != + if (next_iteration_nodes_.find(string(id.first)) != next_iteration_nodes_.end()) { has_loop_back_edge = true; } @@ -835,7 +835,7 @@ void GraphConstructor::UniquifyNames( // We require that UniquifyNames() is called on all NodeDefs in topological // order. This guarantees that node_def's inputs will already be uniquified // if necessary. 
- auto iter = uniquified_names_.find(std::string(id.first)); + auto iter = uniquified_names_.find(string(id.first)); if (iter == uniquified_names_.end()) continue; id.first = iter->second; node_def->set_input(i, id.ToString()); @@ -854,7 +854,7 @@ void GraphConstructor::UpdateUniquifiedColocationNames() { for (int i = 0; i < coloc_values.size(); ++i) { StringPiece val(coloc_values[i]); if (str_util::ConsumePrefix(&val, kColocationGroupPrefix)) { - const auto& name_pair = uniquified_names_.find(std::string(val)); + const auto& name_pair = uniquified_names_.find(string(val)); if (name_pair == uniquified_names_.end()) continue; updated = true; coloc_values[i] = @@ -880,7 +880,7 @@ bool GraphConstructor::NameExistsInGraphDef(StringPiece name) { } string GraphConstructor::FindUniqueName(StringPiece original_name) { - string name = std::string(original_name); + string name(original_name); int count = 0; // Check that any generated names don't collide with imported NodeDefs (as // well as nodes in g_). @@ -997,7 +997,7 @@ Status GraphConstructor::Convert() { src_node->num_outputs(), " outputs"); } - inputs.emplace_back(std::string(id.first), src_node, src_index); + inputs.emplace_back(string(id.first), src_node, src_index); } if (has_data_back_edge && !IsMerge(*node_def)) { diff --git a/tensorflow/core/graph/graph_constructor_test.cc b/tensorflow/core/graph/graph_constructor_test.cc index e338840eeb..73142ebde7 100644 --- a/tensorflow/core/graph/graph_constructor_test.cc +++ b/tensorflow/core/graph/graph_constructor_test.cc @@ -156,9 +156,8 @@ class GraphConstructorTest : public ::testing::Test { return ""; } StringPiece loc(value[0]); - return str_util::ConsumePrefix(&loc, kColocationGroupPrefix) - ? std::string(loc) - : ""; + return str_util::ConsumePrefix(&loc, kColocationGroupPrefix) ? 
string(loc) + : ""; } string GraphDebugString() const { diff --git a/tensorflow/core/graph/graph_def_builder.cc b/tensorflow/core/graph/graph_def_builder.cc index dd84c4f7c7..6d5df7efba 100644 --- a/tensorflow/core/graph/graph_def_builder.cc +++ b/tensorflow/core/graph/graph_def_builder.cc @@ -44,12 +44,12 @@ GraphDefBuilder::Options GraphDefBuilder::Options::WithControlInputs( } GraphDefBuilder::Options GraphDefBuilder::Options::WithNameImpl( StringPiece name) { - name_ = std::string(name); + name_ = string(name); return *this; } GraphDefBuilder::Options GraphDefBuilder::Options::WithDeviceImpl( StringPiece device) { - device_ = std::string(device); + device_ = string(device); return *this; } GraphDefBuilder::Options GraphDefBuilder::Options::WithControlInputImpl( diff --git a/tensorflow/core/graph/graph_def_builder.h b/tensorflow/core/graph/graph_def_builder.h index ec131580ae..400d8b6c84 100644 --- a/tensorflow/core/graph/graph_def_builder.h +++ b/tensorflow/core/graph/graph_def_builder.h @@ -128,7 +128,7 @@ class GraphDefBuilder { Options WithControlInputsImpl(gtl::ArraySlice control_inputs); template Options WithAttrImpl(StringPiece name, T&& value) { - attrs_.emplace_back(std::string(name), AttrValue()); + attrs_.emplace_back(string(name), AttrValue()); SetAttrValue(std::forward(value), &attrs_.back().second); return *this; } diff --git a/tensorflow/core/graph/graph_partition.cc b/tensorflow/core/graph/graph_partition.cc index ea0a814ab8..1dbcebab59 100644 --- a/tensorflow/core/graph/graph_partition.cc +++ b/tensorflow/core/graph/graph_partition.cc @@ -793,7 +793,7 @@ Status TopologicalSortNodesWithTimePriority( for (int n = 0; n < gdef->node_size(); ++n) { const NodeDef* ndef = &gdef->node(n); for (int i = 0; i < ndef->input_size(); ++i) { - node_to_output_nodes[std::string(ParseTensorName(ndef->input(i)).first)] + node_to_output_nodes[string(ParseTensorName(ndef->input(i)).first)] .push_back(ndef); } int64 start_time; diff --git 
a/tensorflow/core/graph/node_builder.cc b/tensorflow/core/graph/node_builder.cc index 03f3bbd663..a446e0d136 100644 --- a/tensorflow/core/graph/node_builder.cc +++ b/tensorflow/core/graph/node_builder.cc @@ -30,7 +30,7 @@ NodeBuilder::NodeOut::NodeOut(Node* n, int32 i) // NOLINT(runtime/explicit) dt(SafeGetOutput(node, i, &error)) {} NodeBuilder::NodeOut::NodeOut(StringPiece n, int32 i, DataType t) - : node(nullptr), error(false), name(std::string(n)), index(i), dt(t) {} + : node(nullptr), error(false), name(n), index(i), dt(t) {} NodeBuilder::NodeOut::NodeOut() : node(nullptr), error(true), index(0), dt(DT_FLOAT) {} diff --git a/tensorflow/core/graph/while_context.cc b/tensorflow/core/graph/while_context.cc index 1b38aac35d..8e89bc4c75 100644 --- a/tensorflow/core/graph/while_context.cc +++ b/tensorflow/core/graph/while_context.cc @@ -23,7 +23,7 @@ WhileContext::WhileContext(StringPiece frame_name, OutputTensor cond_output, std::vector body_inputs, std::vector body_outputs) - : frame_name_(std::string(frame_name)), + : frame_name_(frame_name), enter_nodes_(std::move(enter_nodes)), exit_nodes_(std::move(exit_nodes)), cond_output_(cond_output), -- GitLab From 65a7ae95d431b98ab4afc796b1bc9854d1d14286 Mon Sep 17 00:00:00 2001 From: Nupur Garg Date: Tue, 28 Aug 2018 10:38:26 -0700 Subject: [PATCH 224/598] Internal change. PiperOrigin-RevId: 210565419 --- tensorflow/contrib/lite/python/tflite_convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/contrib/lite/python/tflite_convert.py b/tensorflow/contrib/lite/python/tflite_convert.py index 7d7a4ba94a..46bdb3e553 100644 --- a/tensorflow/contrib/lite/python/tflite_convert.py +++ b/tensorflow/contrib/lite/python/tflite_convert.py @@ -312,7 +312,7 @@ def run_main(_): "quantization via \"dummy quantization\". (default None)")) parser.add_argument( "--quantize_weights", - type=bool, + action="store_true", help=("Store float weights as quantized weights followed by dequantize " "operations. 
Inference is still done in FLOAT, but reduces model " "size (at the cost of accuracy and latency).")) -- GitLab From 5aaebe06b476d7b7484d6eb2b68440654557018a Mon Sep 17 00:00:00 2001 From: Shashi Shekhar Date: Tue, 28 Aug 2018 10:41:46 -0700 Subject: [PATCH 225/598] Add missing file. PiperOrigin-RevId: 210565982 --- .../lite/tools/accuracy/ilsvrc/README.md | 2 +- .../ilsvrc/generate_validation_labels.py | 101 ++++++++++++++++++ 2 files changed, 102 insertions(+), 1 deletion(-) create mode 100644 tensorflow/contrib/lite/tools/accuracy/ilsvrc/generate_validation_labels.py diff --git a/tensorflow/contrib/lite/tools/accuracy/ilsvrc/README.md b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/README.md index 3c6a0d85b3..9b3b99451d 100644 --- a/tensorflow/contrib/lite/tools/accuracy/ilsvrc/README.md +++ b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/README.md @@ -47,7 +47,7 @@ category labels. The `validation_ground_truth.txt` can be converted by the follo ILSVRC_2012_DEVKIT_DIR=[set to path to ILSVRC 2012 devkit] VALIDATION_LABELS=[set to path to output] -python generate_validation_labels -- \ +python generate_validation_labels.py -- \ --ilsvrc_devkit_dir=${ILSVRC_2012_DEVKIT_DIR} \ --validation_labels_output=${VALIDATION_LABELS} ``` diff --git a/tensorflow/contrib/lite/tools/accuracy/ilsvrc/generate_validation_labels.py b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/generate_validation_labels.py new file mode 100644 index 0000000000..7e2c8fd40e --- /dev/null +++ b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/generate_validation_labels.py @@ -0,0 +1,101 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tool to convert ILSVRC devkit validation ground truth to synset labels.""" + +import argparse +from os import path +import sys +import scipy.io + +_SYNSET_ARRAYS_RELATIVE_PATH = 'data/meta.mat' +_VALIDATION_FILE_RELATIVE_PATH = 'data/ILSVRC2012_validation_ground_truth.txt' + + +def _synset_to_word(filepath): + """Returns synset to word dictionary by reading sysnset arrays.""" + mat = scipy.io.loadmat(filepath) + entries = mat['synsets'] + # These fields are listed in devkit readme.txt + fields = [ + 'synset_id', 'WNID', 'words', 'gloss', 'num_children', 'children', + 'wordnet_height', 'num_train_images' + ] + synset_index = fields.index('synset_id') + words_index = fields.index('words') + synset_to_word = {} + for entry in entries: + entry = entry[0] + synset_id = int(entry[synset_index][0]) + first_word = entry[words_index][0].split(',')[0] + synset_to_word[synset_id] = first_word + return synset_to_word + + +def _validation_file_path(ilsvrc_dir): + return path.join(ilsvrc_dir, _VALIDATION_FILE_RELATIVE_PATH) + + +def _synset_array_path(ilsvrc_dir): + return path.join(ilsvrc_dir, _SYNSET_ARRAYS_RELATIVE_PATH) + + +def _generate_validation_labels(ilsvrc_dir, output_file): + synset_to_word = _synset_to_word(_synset_array_path(ilsvrc_dir)) + with open(_validation_file_path(ilsvrc_dir), 'r') as synset_id_file, open( + output_file, 'w') as output: + for synset_id in synset_id_file: + synset_id = int(synset_id) + output.write('%s\n' % synset_to_word[synset_id]) + + +def 
_check_arguments(args): + if not args.validation_labels_output: + raise ValueError('Invalid path to output file.') + ilsvrc_dir = args.ilsvrc_devkit_dir + if not ilsvrc_dir or not path.isdir(ilsvrc_dir): + raise ValueError('Invalid path to ilsvrc_dir') + if not path.exists(_validation_file_path(ilsvrc_dir)): + raise ValueError('Invalid path to ilsvrc_dir, cannot find validation file.') + if not path.exists(_synset_array_path(ilsvrc_dir)): + raise ValueError( + 'Invalid path to ilsvrc_dir, cannot find synset arrays file.') + + +def main(): + parser = argparse.ArgumentParser( + description='Converts ILSVRC devkit validation_ground_truth.txt to synset' + ' labels file that can be used by the accuracy script.') + parser.add_argument( + '--validation_labels_output', + type=str, + help='Full path for outputting validation labels.') + parser.add_argument( + '--ilsvrc_devkit_dir', + type=str, + help='Full path to ILSVRC 2012 devikit directory.') + args = parser.parse_args() + try: + _check_arguments(args) + except ValueError as e: + parser.print_usage() + file_name = path.basename(sys.argv[0]) + sys.stderr.write('{0}: error: {1}\n'.format(file_name, str(e))) + sys.exit(1) + _generate_validation_labels(args.ilsvrc_devkit_dir, + args.validation_labels_output) + + +if __name__ == '__main__': + main() -- GitLab From 8f99e5ad11040a6f0b5c12648e98bdbfe4dc3970 Mon Sep 17 00:00:00 2001 From: Alan Chiao Date: Tue, 28 Aug 2018 10:49:02 -0700 Subject: [PATCH 226/598] Optimized hybrid convolution with symmetric quantization. Add unit tests for multiple channels. 
PiperOrigin-RevId: 210567300 --- tensorflow/contrib/lite/kernels/conv.cc | 153 ++++++++++++++++-- tensorflow/contrib/lite/kernels/conv_test.cc | 145 +++++++++++++++++ .../contrib/lite/kernels/internal/BUILD | 2 + .../internal/optimized/optimized_ops.h | 80 +++++++++ 4 files changed, 363 insertions(+), 17 deletions(-) diff --git a/tensorflow/contrib/lite/kernels/conv.cc b/tensorflow/contrib/lite/kernels/conv.cc index 50fe5c2e04..51989f541f 100644 --- a/tensorflow/contrib/lite/kernels/conv.cc +++ b/tensorflow/contrib/lite/kernels/conv.cc @@ -30,6 +30,7 @@ limitations under the License. #include "tensorflow/contrib/lite/kernels/internal/quantization_util.h" #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/contrib/lite/kernels/internal/tensor.h" +#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h" #include "tensorflow/contrib/lite/kernels/kernel_util.h" #include "tensorflow/contrib/lite/kernels/op_macros.h" #include "tensorflow/contrib/lite/kernels/padding.h" @@ -60,6 +61,8 @@ struct OpData { // memory buffers. int im2col_id = kTensorNotAllocated; int hwcn_weights_id = kTensorNotAllocated; + int input_quantized_id = kTensorNotAllocated; + int scaling_factors_id = kTensorNotAllocated; TfLitePaddingValues padding; // The scaling factor from input to output (aka the 'real multiplier') can @@ -74,6 +77,8 @@ struct OpData { // of the allocated temporaries. 
int32_t im2col_index; int32_t hwcn_weights_index; + int32_t input_quantized_index; + int32_t scaling_factors_index; bool need_hwcn_weights; bool have_weights_been_transposed; bool need_im2col; @@ -125,6 +130,9 @@ static TfLiteStatus AllocateTemporaryTensorsIfRequired(TfLiteContext* context, TfLiteTensor* input = &context->tensors[node->inputs->data[0]]; TfLiteTensor* filter = &context->tensors[node->inputs->data[1]]; + const bool is_hybrid = + (input->type == kTfLiteFloat32 && filter->type == kTfLiteUInt8); + int filter_width = filter->dims->data[2]; int filter_height = filter->dims->data[1]; @@ -145,8 +153,8 @@ static TfLiteStatus AllocateTemporaryTensorsIfRequired(TfLiteContext* context, // buffer to store the results. // This path is only used for float processing, so only create the buffer if // we're running with that data type. - data->need_hwcn_weights = - (input->type == kTfLiteFloat32 && data->run_multithreaded_kernel); + data->need_hwcn_weights = (input->type == kTfLiteFloat32 && + data->run_multithreaded_kernel && !is_hybrid); int temporaries_count = 0; if (data->need_im2col) { @@ -164,6 +172,25 @@ static TfLiteStatus AllocateTemporaryTensorsIfRequired(TfLiteContext* context, ++temporaries_count; } + if (is_hybrid) { + // Allocate tensor to store the on-the-fly quantized inputs. + data->input_quantized_index = temporaries_count; + if (data->input_quantized_id == kTensorNotAllocated) { + TF_LITE_ENSURE_OK( + context, context->AddTensors(context, 1, &data->input_quantized_id)); + } + ++temporaries_count; + + // Allocate tensor to store the quantization params computed during + // on-the-fly input quantization. 
+ data->scaling_factors_index = temporaries_count; + if (data->scaling_factors_id == kTensorNotAllocated) { + TF_LITE_ENSURE_OK( + context, context->AddTensors(context, 1, &data->scaling_factors_id)); + } + ++temporaries_count; + } + TfLiteIntArrayFree(node->temporaries); node->temporaries = TfLiteIntArrayCreate(temporaries_count); @@ -174,10 +201,6 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { auto* params = reinterpret_cast(node->builtin_data); OpData* data = reinterpret_cast(node->user_data); - data->run_multithreaded_kernel = context->recommended_num_threads != 1; - - TF_LITE_ENSURE_STATUS(AllocateTemporaryTensorsIfRequired(context, node)); - bool has_bias = node->inputs->size == 3; // Check number of inputs/outputs TF_LITE_ENSURE(context, has_bias || node->inputs->size == 2); @@ -193,11 +216,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, input->dims->data[3], filter->dims->data[3]); // Check types. (We assume that UINT8 refers to quantized tensors) - TfLiteType data_type = input->type; + TfLiteType input_type = input->type; TF_LITE_ENSURE(context, - data_type == kTfLiteFloat32 || data_type == kTfLiteUInt8); - TF_LITE_ENSURE_EQ(context, output->type, data_type); - TF_LITE_ENSURE_EQ(context, filter->type, data_type); + input_type == kTfLiteFloat32 || input_type == kTfLiteUInt8); + TF_LITE_ENSURE_EQ(context, output->type, input_type); TfLiteTensor* bias = nullptr; @@ -207,15 +229,26 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { if (has_bias) { bias = &context->tensors[node->inputs->data[2]]; - if (data_type == kTfLiteUInt8) { + if (input_type == kTfLiteUInt8) { TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteInt32); TF_LITE_ENSURE_EQ(context, bias->params.zero_point, 0); } else { - TF_LITE_ENSURE_EQ(context, bias->type, data_type); + TF_LITE_ENSURE_EQ(context, bias->type, input_type); } TF_LITE_ENSURE_EQ(context, NumElements(bias), SizeOfDimension(filter, 0)); } + const 
bool is_hybrid = + (input->type == kTfLiteFloat32 && filter->type == kTfLiteUInt8); + + data->run_multithreaded_kernel = context->recommended_num_threads != 1; + // Hybrid kernels don't support multithreading yet. + if (is_hybrid) { + data->run_multithreaded_kernel = false; + } + + TF_LITE_ENSURE_STATUS(AllocateTemporaryTensorsIfRequired(context, node)); + int channels_out = filter->dims->data[0]; int width = input->dims->data[2]; int height = input->dims->data[1]; @@ -250,9 +283,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE(context, has_bias); - // Note that quantized inference requires that all tensors have their + // Note that full fixed-point inference requires that all tensors have their // parameters set. This is usually done during quantized training. - if (data_type != kTfLiteFloat32) { + if (input_type != kTfLiteFloat32) { double real_multiplier = 0.0; TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler( context, input, filter, bias, output, &real_multiplier)); @@ -287,7 +320,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TfLiteTensor* im2col = &context->tensors[node->temporaries->data[data->im2col_index]]; - im2col->type = data_type; + im2col->type = input->type; + if (is_hybrid) { + im2col->type = kTfLiteUInt8; + } im2col->allocation_type = kTfLiteArenaRw; auto im2col_status = context->ResizeTensor(context, im2col, im2col_size); if (im2col_status != kTfLiteOk) return im2col_status; @@ -307,7 +343,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TfLiteTensor* hwcn_weights = &context->tensors[node->temporaries->data[data->hwcn_weights_index]]; - hwcn_weights->type = data_type; + hwcn_weights->type = input_type; hwcn_weights->allocation_type = kTfLiteArenaRwPersistent; auto hwcn_weights_status = @@ -319,6 +355,35 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { data->have_weights_been_transposed = false; } + if (is_hybrid) { + 
node->temporaries->data[data->input_quantized_index] = + data->input_quantized_id; + TfLiteTensor* input_quantized = + GetTemporary(context, node, data->input_quantized_index); + input_quantized->type = kTfLiteUInt8; + input_quantized->allocation_type = kTfLiteArenaRw; + if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) { + TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims); + TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, input_quantized, + input_quantized_size)); + } + + node->temporaries->data[data->scaling_factors_index] = + data->scaling_factors_id; + TfLiteTensor* scaling_factors = + GetTemporary(context, node, data->scaling_factors_index); + scaling_factors->type = kTfLiteInt32; + scaling_factors->allocation_type = kTfLiteArenaRw; + TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1); + // Only one scale factor per batch is typically necessary. See optimized + // implementation for why we need to allocate for height elements here. 
+ scaling_factors_size->data[0] = height; + if (!TfLiteIntArrayEqual(scaling_factors->dims, scaling_factors_size)) { + TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scaling_factors, + scaling_factors_size)); + } + } + return kTfLiteOk; } @@ -455,6 +520,57 @@ void EvalFloat(TfLiteContext* context, TfLiteNode* node, } } +template +void EvalHybrid(TfLiteContext* context, TfLiteNode* node, + TfLiteConvParams* params, OpData* data, TfLiteTensor* input, + TfLiteTensor* filter, TfLiteTensor* bias, TfLiteTensor* im2col, + TfLiteTensor* hwcn_weights, TfLiteTensor* output) { + float output_activation_min, output_activation_max; + CalculateActivationRange(params->activation, &output_activation_min, + &output_activation_max); + + const int input_size = NumElements(input) / SizeOfDimension(input, 0); + const int batch_size = SizeOfDimension(input, 0); + + const TfLiteTensor* input_quantized = + GetTemporary(context, node, data->input_quantized_index); + int8_t* quantized_input_ptr_batch = + reinterpret_cast(input_quantized->data.uint8); + float* scaling_factors_ptr = + GetTemporary(context, node, data->scaling_factors_index)->data.f; + + // Per-batch input quantization for higher accuracy. + for (int b = 0; b < batch_size; ++b) { + float unused_min, unused_max; + const int offset = b * input_size; + tensor_utils::SymmetricQuantizeFloats( + input->data.f + offset, input_size, quantized_input_ptr_batch + offset, + &unused_min, &unused_max, &scaling_factors_ptr[b]); + scaling_factors_ptr[b] *= filter->params.scale; + } + + int8_t* im2col_ptr = reinterpret_cast(im2col->data.uint8); + int8_t* filter_ptr = reinterpret_cast(filter->data.uint8); + + switch (kernel_type) { + case kReference: + case kGenericOptimized: + case kMultithreadOptimized: + case kCblasOptimized: + // There is only one implementation for hybrid kernel. Note + // this does not make use of gemmlowp nor supports multithreading. 
+ optimized_ops::HybridConv( + quantized_input_ptr_batch, GetTensorDims(input), filter_ptr, + GetTensorDims(filter), GetTensorData(bias), + GetTensorDims(bias), params->stride_width, params->stride_height, + data->padding.width, data->padding.height, scaling_factors_ptr, + output_activation_min, output_activation_max, + GetTensorData(output), GetTensorDims(output), im2col_ptr, + GetTensorDims(im2col)); + break; + } +} + template TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { auto* params = reinterpret_cast(node->builtin_data); @@ -484,7 +600,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { // separate ops to avoid dispatch overhead here. switch (input->type) { // Already know in/outtypes are same. case kTfLiteFloat32: - if (data->run_multithreaded_kernel) { + if (filter->type == kTfLiteUInt8) { + EvalHybrid(context, node, params, data, input, filter, + bias, im2col, hwcn_weights, output); + } else if (data->run_multithreaded_kernel) { EvalFloat(context, node, params, data, input, filter, bias, im2col, hwcn_weights, output); } else { diff --git a/tensorflow/contrib/lite/kernels/conv_test.cc b/tensorflow/contrib/lite/kernels/conv_test.cc index 98152043c9..a4b9fb1a0b 100644 --- a/tensorflow/contrib/lite/kernels/conv_test.cc +++ b/tensorflow/contrib/lite/kernels/conv_test.cc @@ -142,6 +142,41 @@ TEST_P(ConvolutionOpTest, SimpleTestFloat32) { })); } +// This test's output is equivalent to the SimpleTestFloat32 +// because we break each input into two channels, each with half of the value, +// while keeping the filters for each channel equivalent. +// +// 2 * (A/2) * B = A * B, where the left side is this new test. 
+TEST_P(ConvolutionOpTest, SimpleTestFloat32WithChannels) { + ConvolutionOpModel m(GetRegistration(), {TensorType_FLOAT32, {2, 2, 4, 2}}, + {TensorType_FLOAT32, {3, 2, 2, 2}}, + {TensorType_FLOAT32, {}}); + + m.SetInput({ + // First batch + 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, // row = 1 + 1, 1, 1, 1, 1, 1, 1, 1, // row = 2 + // Second batch + 0.5, 0.5, 1, 1, 1.5, 1.5, 2, 2, // row = 1 + 0.5, 0.5, 1, 1, 1.5, 1.5, 2, 2 // row = 2 + }); + m.SetFilter({ + 1, 1, 2, 2, 3, 3, 4, 4, // first 2x2 filter + -1, -1, 1, 1, -1, -1, 1, 1, // second 2x2 filter + -1, -1, -1, -1, 1, 1, 1, 1 // third 2x2 filter + }); + m.SetBias({1, 2, 3}); + + m.Invoke(); + + EXPECT_THAT(m.GetOutput(), ElementsAreArray({ + 18, 2, 5, // first batch, left + 18, 2, 5, // first batch, right + 17, 4, 3, // second batch, left + 37, 4, 3, // second batch, right + })); +} + TEST_P(ConvolutionOpTest, SimpleTestFloat32WithAnisotropicStrides) { ConvolutionOpModel m(GetRegistration(), {TensorType_FLOAT32, {1, 3, 6, 1}}, {TensorType_FLOAT32, {1, 2, 2, 1}}, @@ -624,6 +659,116 @@ TEST_P(ConvolutionOpTest, SimpleTestQuantizedWithDilation) { ElementsAreArray({5, 5, 5, 5, 5, 5, 5, 5, 5})); } +class HybridConvolutionOpModel : public BaseConvolutionOpModel { + public: + using BaseConvolutionOpModel::BaseConvolutionOpModel; + + void SetInput(std::initializer_list data) { + PopulateTensor(input_, data); + } + + void SetFilter(std::initializer_list f) { + SymmetricQuantizeAndPopulate(filter_, f); + } + + void SetBias(std::initializer_list data) { + PopulateTensor(bias_, data); + } + + std::vector GetOutput() { return ExtractVector(output_); } +}; + +TEST_P(ConvolutionOpTest, SimpleTestHybrid) { + HybridConvolutionOpModel m( + GetRegistration(), {TensorType_FLOAT32, {2, 2, 4, 1}}, + {TensorType_UINT8, {3, 2, 2, 1}}, {TensorType_FLOAT32, {}}); + + m.SetInput({ + // First batch + 1, 1, 1, 1, // row = 1 + 2, 2, 2, 2, // row = 2 + // Second batch + 1, 2, 3, 4, // row = 1 + 1, 2, 3, 4, // row = 2 + }); + m.SetFilter({ + 1, 
2, 3, 4, // first 2x2 filter + -1, 1, -1, 1, // second 2x2 filter + -1, -1, 1, 1, // third 2x2 filter + }); + m.SetBias({1, 2, 3}); + + m.Invoke(); + + // Example: we get 17.1577 instead of 17. + // + // Second batch: + // 1 2 3 4 -> 32 64 95 127 with scale factor 127/4. + // 1 2 3 4 32 64 95 127 + // + // First filter: + // 1 2 -> 32 64 with scale factor of 127/4. + // 3 4 95 127 + // + // The left half of the input gives us 16288. Multiply by (4/127)^2 for + // dequantization and adding 1 for the bias gives us the result. and adding + // the bias gives us the result. + // + // The optimized kernel converts the input into this matrix via Im2Col + // + // 1 1 2 2 + // 1 1 2 2 + // 1 2 1 2 + // 3 4 3 4 + // + // and multiplies it with the filter directly. + EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear( + { + 18, 2, 5, // first batch, left + 18, 2, 5, // first batch, right + 17, 4, 3, // second batch, left + 37, 4, 3, // second batch, right + }, + 0.16))); +} + +// This test's output is equivalent to the SimpleTestHybrid +// because we break each input into two channels, each with half of the value, +// while keeping the filters for each channel equivalent. +// +// 2 * (A/2) * B = A * B, where the left side is this new test. 
+TEST_P(ConvolutionOpTest, SimpleTestHybridWithChannels) { + HybridConvolutionOpModel m( + GetRegistration(), {TensorType_FLOAT32, {2, 2, 4, 2}}, + {TensorType_UINT8, {3, 2, 2, 2}}, {TensorType_FLOAT32, {}}); + + m.SetInput({ + // First batch + 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, // row = 1 + 1, 1, 1, 1, 1, 1, 1, 1, // row = 2 + // Second batch + 0.5, 0.5, 1, 1, 1.5, 1.5, 2, 2, // row = 1 + 0.5, 0.5, 1, 1, 1.5, 1.5, 2, 2 // row = 2 + }); + m.SetFilter({ + 1, 1, 2, 2, 3, 3, 4, 4, // first 2x2 filter + -1, -1, 1, 1, -1, -1, 1, 1, // second 2x2 filter + -1, -1, -1, -1, 1, 1, 1, 1 // third 2x2 filter + }); + m.SetBias({1, 2, 3}); + + m.Invoke(); + + EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear( + { + 18, 2, 5, // first batch, left + 18, 2, 5, // first batch, right + 17, 4, 3, // second batch, left + 37, 4, 3, // second batch, right + }, + 0.16))); +} + INSTANTIATE_TEST_CASE_P( ConvolutionOpTest, ConvolutionOpTest, ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap))); diff --git a/tensorflow/contrib/lite/kernels/internal/BUILD b/tensorflow/contrib/lite/kernels/internal/BUILD index 96798c900e..464163bd78 100644 --- a/tensorflow/contrib/lite/kernels/internal/BUILD +++ b/tensorflow/contrib/lite/kernels/internal/BUILD @@ -160,6 +160,7 @@ cc_library( ":types", ":reference_base", ":round", + ":tensor_utils", "//third_party/eigen3", "@gemmlowp", "//tensorflow/contrib/lite:builtin_op_data", @@ -191,6 +192,7 @@ cc_library( deps = [ ":quantization_util", ":strided_slice_logic", + ":tensor_utils", ":types", ":legacy_reference_base", ":round", diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h index b00097c433..e4bb4e0534 100644 --- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h +++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h @@ -34,6 +34,7 @@ limitations under the License. 
#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/contrib/lite/kernels/internal/round.h" #include "tensorflow/contrib/lite/kernels/internal/strided_slice_logic.h" +#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h" #include "tensorflow/contrib/lite/kernels/internal/types.h" namespace tflite { @@ -1935,6 +1936,85 @@ inline void Conv(const float* input_data, const Dims<4>& input_dims, output_activation_max); } +inline void HybridConv(const int8_t* input_data, const Dims<4>& input_dims, + const int8_t* filter_data, const Dims<4>& filter_dims, + const float* bias_data, const Dims<4>& bias_dims, + int stride_width, int stride_height, int pad_width, + int pad_height, float* scaling_factors_ptr, + float output_activation_min, float output_activation_max, + float* output_data, const Dims<4>& output_dims, + int8_t* im2col_data, const Dims<4>& im2col_dims) { + const int batch_size = input_dims.sizes[3]; + const int filter_width = ArraySize(filter_dims, 1); + const int filter_height = ArraySize(filter_dims, 2); + + const int8* gemm_input_data = nullptr; + int num_input; + const bool need_im2col = stride_width != 1 || stride_height != 1 || + filter_width != 1 || filter_height != 1; + + if (need_im2col) { + TFLITE_DCHECK(im2col_data); + // symmetric quantization assumes zero point of 0. + const int input_zero_point = 0; + Im2col(input_data, input_dims, stride_width, stride_height, pad_width, + pad_height, filter_height, filter_width, input_zero_point, + im2col_data, im2col_dims); + gemm_input_data = im2col_data; + num_input = im2col_dims.sizes[0] * im2col_dims.sizes[1] * + im2col_dims.sizes[2] * im2col_dims.sizes[3]; + } else { + TFLITE_DCHECK(!im2col_data); + gemm_input_data = input_data; + num_input = input_dims.sizes[0] * input_dims.sizes[1] * + input_dims.sizes[2] * input_dims.sizes[3]; + } + + // Flatten 4D matrices into 2D matrices for matrix multiplication. 
+ + // Flatten so that each filter has its own row. + const int filter_rows = filter_dims.sizes[3]; + const int filter_cols = + filter_dims.sizes[0] * filter_dims.sizes[1] * filter_dims.sizes[2]; + + // In MatrixBatchVectorMultiplyAccumulate, each output value is the + // dot product of one row of the first matrix with one row of the second + // matrix. Therefore, the number of cols in each matrix are equivalent. + // + // After Im2Col, each input patch becomes a row. + const int gemm_input_cols = filter_cols; + const int gemm_input_rows = num_input / gemm_input_cols; + + const int output_cols = output_dims.sizes[0]; + const int output_rows = + output_dims.sizes[1] * output_dims.sizes[2] * output_dims.sizes[3]; + TFLITE_DCHECK_EQ(output_cols, filter_rows); + TFLITE_DCHECK_EQ(output_rows, gemm_input_rows); + TFLITE_DCHECK_EQ(bias_dims.sizes[0], output_cols); + TFLITE_DCHECK_EQ(bias_dims.sizes[1], 1); + TFLITE_DCHECK_EQ(bias_dims.sizes[2], 1); + TFLITE_DCHECK_EQ(bias_dims.sizes[3], 1); + + // MatrixBatchVectorMultiplyAccumulate assumes that each row of the second + // input matrix has its own scale factor. This code duplicates the scale + // factors for each row in the same batch. 
+ const int rows_per_batch = gemm_input_rows / batch_size; + for (int i = gemm_input_rows - 1; i >= 0; --i) { + scaling_factors_ptr[i] = scaling_factors_ptr[i / rows_per_batch]; + } + + tensor_utils::ZeroVector(output_data, output_rows * output_cols); + + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + filter_data, filter_rows, filter_cols, gemm_input_data, + scaling_factors_ptr, /*n_batch=*/gemm_input_rows, output_data, + /*result_stride=*/1); + + AddBiasAndEvalActivationFunction(bias_data, bias_dims, output_data, + output_dims, output_activation_min, + output_activation_max); +} + template void Conv(const float* input_data, const Dims<4>& input_dims, const float* filter_data, const Dims<4>& filter_dims, -- GitLab From cf879ef5dddee8d1b5081afe5bd8f49f15245d08 Mon Sep 17 00:00:00 2001 From: Rachel Lim Date: Tue, 28 Aug 2018 11:07:35 -0700 Subject: [PATCH 227/598] Adds a tf.ensure_shape function as a substitute for tensor.set_shape, which validates the true shape of the tensor at runtime. 
PiperOrigin-RevId: 210570878 --- .../base_api/api_def_EnsureShape.pbtxt | 26 ++++ .../python_api/api_def_EnsureShape.pbtxt | 4 + tensorflow/core/kernels/shape_ops.cc | 93 +++++++++++ tensorflow/core/ops/array_ops.cc | 24 +++ tensorflow/python/framework/ops.py | 5 + tensorflow/python/kernel_tests/BUILD | 2 + .../python/kernel_tests/check_ops_test.py | 146 ++++++++++++++++++ tensorflow/python/ops/check_ops.py | 49 ++++++ .../tools/api/golden/v1/tensorflow.pbtxt | 4 + .../tools/api/golden/v2/tensorflow.pbtxt | 4 + 10 files changed, 357 insertions(+) create mode 100644 tensorflow/core/api_def/base_api/api_def_EnsureShape.pbtxt create mode 100644 tensorflow/core/api_def/python_api/api_def_EnsureShape.pbtxt diff --git a/tensorflow/core/api_def/base_api/api_def_EnsureShape.pbtxt b/tensorflow/core/api_def/base_api/api_def_EnsureShape.pbtxt new file mode 100644 index 0000000000..1658472209 --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_EnsureShape.pbtxt @@ -0,0 +1,26 @@ +op { + graph_op_name: "EnsureShape" + in_arg { + name: "input" + description: <