From 21a9efc4cddbce661073544db31a63639686310a Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Tue, 28 Nov 2017 05:28:49 -0800 Subject: [PATCH 001/405] Add complex dtypes support for `tf.squared_difference` This fix tries to address the issue raised in 14932 where complex dtypes are not supported for `tf.squared_difference`, which is different from the doc string in `math_ops.cc` (see `BINARY_FEWER`). This fix adds the complex64 and complex128 support in kernel, and adds additional test cases. This fix fixes 14932. Signed-off-by: Yong Tang --- tensorflow/core/kernels/cwise_op_squared_difference.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/kernels/cwise_op_squared_difference.cc b/tensorflow/core/kernels/cwise_op_squared_difference.cc index 78fefc69c7..d0ff271df6 100644 --- a/tensorflow/core/kernels/cwise_op_squared_difference.cc +++ b/tensorflow/core/kernels/cwise_op_squared_difference.cc @@ -16,8 +16,8 @@ limitations under the License. #include "tensorflow/core/kernels/cwise_ops_common.h" namespace tensorflow { -REGISTER5(BinaryOp, CPU, "SquaredDifference", functor::squared_difference, - float, Eigen::half, double, int32, int64); +REGISTER7(BinaryOp, CPU, "SquaredDifference", functor::squared_difference, + float, Eigen::half, double, int32, int64, complex64, complex128); #if GOOGLE_CUDA REGISTER4(BinaryOp, GPU, "SquaredDifference", functor::squared_difference, float, Eigen::half, double, int64); -- GitLab From 4f5e66aca388ee13e925d173a82644eed9d5a760 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Tue, 28 Nov 2017 05:32:08 -0800 Subject: [PATCH 002/405] Add test cases for complex dtypes support with `tf.squared_difference` Signed-off-by: Yong Tang --- tensorflow/python/ops/math_ops_test.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py index d314124ccd..7078ac99c8 100644 --- a/tensorflow/python/ops/math_ops_test.py +++ 
b/tensorflow/python/ops/math_ops_test.py @@ -203,7 +203,9 @@ class SquaredDifferenceTest(test_util.TensorFlowTestCase): @test_util.run_in_graph_and_eager_modes() def testSquaredDifference(self): - for dtype in [np.int32, np.float16]: + for dtype in [np.float16, np.float32, np.float64, + np.int32, np.int64, + np.complex64, np.complex128]: x = np.array([[1, 2, 3], [4, 5, 6]], dtype=dtype) y = np.array([-3, -2, -1], dtype=dtype) z = (x - y) * (x - y) -- GitLab From f7bb3741549e791f687fa8289fb281717eae7426 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sat, 24 Feb 2018 19:05:52 +0000 Subject: [PATCH 003/405] Add additional test to cover squared difference for complex where imag parts are not 0. Signed-off-by: Yong Tang --- tensorflow/python/ops/math_ops_test.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py index 7078ac99c8..3224e40db2 100644 --- a/tensorflow/python/ops/math_ops_test.py +++ b/tensorflow/python/ops/math_ops_test.py @@ -213,6 +213,16 @@ class SquaredDifferenceTest(test_util.TensorFlowTestCase): z_tf = self.evaluate(math_ops.squared_difference(x, y)) self.assertAllClose(z, z_tf) + @test_util.run_in_graph_and_eager_modes() + def testComplexSquaredDifference(self): + for dtype in [np.complex64, np.complex128]: + x = np.array([[1+3j, 2+2j, 3+1j], [4-1j, 5-2j, 6-3j]], dtype=dtype) + y = np.array([-3+1j, -2+2j, -1+3j], dtype=dtype) + z = (x - y) * (x - y) + with test_util.device(use_gpu=True): + z_tf = self.evaluate(math_ops.squared_difference(x, y)) + self.assertAllClose(z, z_tf) + @test_util.with_c_api class ApproximateEqualTest(test_util.TensorFlowTestCase): -- GitLab From ab5cdb187d96e3a865724c3d41671dd253288456 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sat, 24 Feb 2018 21:22:26 +0000 Subject: [PATCH 004/405] Enable squared_difference complex on CPU only Signed-off-by: Yong Tang --- tensorflow/python/ops/math_ops_test.py | 5 ++--- 1 file changed, 2 
insertions(+), 3 deletions(-) diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py index 3224e40db2..533a00e737 100644 --- a/tensorflow/python/ops/math_ops_test.py +++ b/tensorflow/python/ops/math_ops_test.py @@ -204,8 +204,7 @@ class SquaredDifferenceTest(test_util.TensorFlowTestCase): @test_util.run_in_graph_and_eager_modes() def testSquaredDifference(self): for dtype in [np.float16, np.float32, np.float64, - np.int32, np.int64, - np.complex64, np.complex128]: + np.int32, np.int64]: x = np.array([[1, 2, 3], [4, 5, 6]], dtype=dtype) y = np.array([-3, -2, -1], dtype=dtype) z = (x - y) * (x - y) @@ -219,7 +218,7 @@ class SquaredDifferenceTest(test_util.TensorFlowTestCase): x = np.array([[1+3j, 2+2j, 3+1j], [4-1j, 5-2j, 6-3j]], dtype=dtype) y = np.array([-3+1j, -2+2j, -1+3j], dtype=dtype) z = (x - y) * (x - y) - with test_util.device(use_gpu=True): + with test_util.device(use_gpu=False): z_tf = self.evaluate(math_ops.squared_difference(x, y)) self.assertAllClose(z, z_tf) -- GitLab From 7950d197767ef24a1525b809a310b82020f665ba Mon Sep 17 00:00:00 2001 From: mbhuiyan Date: Wed, 1 Aug 2018 13:43:35 -0700 Subject: [PATCH 005/405] MKL DNN: Adding support of fusing Pad and Conv2D in MKL DNN optimized code --- tensorflow/core/graph/mkl_layout_pass.cc | 298 +++++++++++++++++- tensorflow/core/graph/mkl_layout_pass_test.cc | 70 ++++ tensorflow/core/kernels/BUILD | 28 ++ tensorflow/core/kernels/mkl_conv_ops.cc | 96 +++++- tensorflow/core/kernels/mkl_conv_ops.h | 28 +- tensorflow/core/ops/nn_ops.cc | 49 +++ 6 files changed, 554 insertions(+), 15 deletions(-) diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc index c22e0a3872..d0abe5da35 100644 --- a/tensorflow/core/graph/mkl_layout_pass.cc +++ b/tensorflow/core/graph/mkl_layout_pass.cc @@ -2432,6 +2432,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass { csinfo_.mkl_conv2d_with_bias = "_MklConv2DWithBias"; 
csinfo_.mkl_conv2d_grad_filter_with_bias = "_MklConv2DBackpropFilterWithBias"; + csinfo_.mkl_pad_with_conv2d = "_MklPadWithConv2D"; + csinfo_.pad = "Pad"; + csinfo_.pad_with_conv2d = "__MklDummyPadWithConv2D"; csinfo_.relu = "Relu"; csinfo_.relu_grad = "ReluGrad"; csinfo_.tanh = "Tanh"; @@ -2508,6 +2511,8 @@ class MklLayoutRewritePass : public GraphOptimizationPass { rinfo_.push_back({csinfo_.mul, mkl_op_registry::GetMklOpName(csinfo_.mul), CopyAttrsDataType, AlwaysRewrite}); + rinfo_.push_back({csinfo_.pad_with_conv2d, csinfo_.mkl_pad_with_conv2d, + CopyAttrsPadWithConv2D, AlwaysRewrite}); rinfo_.push_back({csinfo_.relu, mkl_op_registry::GetMklOpName(csinfo_.relu), CopyAttrsDataType, AlwaysRewrite}); rinfo_.push_back({csinfo_.relu_grad, @@ -2546,6 +2551,10 @@ class MklLayoutRewritePass : public GraphOptimizationPass { minfo_.push_back({csinfo_.conv2d_grad_filter, csinfo_.bias_add_grad, csinfo_.conv2d_grad_filter_with_bias, GetConv2DBackpropFilterOrBiasAddGrad}); + minfo_.push_back({csinfo_.pad, csinfo_.conv2d, + csinfo_.pad_with_conv2d, GetPadOrConv2D}); + //TODO : Need to check if pad is with zero or not + // if is zero then replace, if not then do not replace } // Standard interface to run pass @@ -2628,7 +2637,10 @@ class MklLayoutRewritePass : public GraphOptimizationPass { string mkl_conv2d_grad_filter; string mkl_conv2d_grad_filter_with_bias; string mkl_conv2d_with_bias; + string mkl_pad_with_conv2d; string mul; + string pad; + string pad_with_conv2d; string relu; string relu_grad; string tanh; @@ -2734,6 +2746,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass { // Helper function to merge different nodes Status MergeConv2DWithBiasAdd(std::unique_ptr* g, Node* m, Node* n); + Status MergePadWithConv2D(std::unique_ptr* g, Node* m, Node* n); Status MergeConv2DBackpropFilterWithBiasAddGrad(std::unique_ptr* g, Node* m, Node* n); @@ -2771,6 +2784,59 @@ class MklLayoutRewritePass : public GraphOptimizationPass { return n; } + // Find Pad or Conv2D node 
that can be merged with input node 'm'. + // If input 'm' is Pad, then check if there exists Conv2D node that can be + // merged with 'm'. If input 'm' is Conv2D, then check if there exists BiasAdd + // node that can be merged with 'm'. + static Node* GetPadOrConv2D(const Node* m) { + CHECK_NOTNULL(m); + Node* n = nullptr; + + if (m->type_string() == csinfo_.pad) { + // If m is Pad, then Conv2D is the output of Pad. + for (const Edge* e : m->out_edges()) { + if (!e->IsControlEdge() && + e->dst()->type_string() == csinfo_.conv2d) { + n = e->dst(); + break; + } + } + } else { + CHECK_EQ(m->type_string(), csinfo_.conv2d); + // If m is conv2D, Go over all input edges + // and search for Pad Node. + for (const Edge* e : m->in_edges()) { + if (!e->IsControlEdge() && + e->src()->type_string() == csinfo_.pad) { + n = e->src(); + break; + } + } + } + // Check if only VALID type of padding is used + // or not. + if (n != nullptr) { + const Node* conv_node; + if (m->type_string() == csinfo_.conv2d) + conv_node = m; + else + conv_node = n; + string padding; + TF_CHECK_OK(GetNodeAttr(conv_node->def(), "padding", &padding)); + if (padding != "VALID") + // Then do not merge. + // Only VALID type of padding in conv op can be + // merged with Pad op. + n = nullptr; + } + if (n == nullptr) { + VLOG(1) << "MklLayoutRewritePass: Could not find matching " + << "Pad and Conv2D node for merging. Input node: " + << m->DebugString(); + } + + return n; + } // Find Conv2DBackpropFilter or BiasAddGrad node that can be merged with input // node 'm'. If input 'm' is Conv2DBackpropFilter, then check if there exists // BiasAddGrad node that can be merged with 'm'. 
If input 'm' is BiasAddGrad, @@ -3090,6 +3156,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass { static void CopyAttrsDataType(const Node* orig_node, NodeBuilder* nb); static void CopyAttrsFusedBatchNorm(const Node* orig_node, NodeBuilder* nb); static void CopyAttrsLRN(const Node* orig_node, NodeBuilder* nb); + static void CopyAttrsPadWithConv2D(const Node* orig_node, NodeBuilder* nb); + static void CopyAttrsFromPadAndConv2D(const Node* orig_node1, const Node* orig_node2, + NodeBuilder* nb); static void CopyAttrsPooling(const Node* orig_node, NodeBuilder* nb); static void CopyAttrsReshape(const Node* orig_node, NodeBuilder* nb); static void CopyAttrsSplit(const Node* orig_node, NodeBuilder* nb); @@ -3289,6 +3358,8 @@ int MklLayoutRewritePass::SetUpContiguousInputs( // 2nd input (slot 1) of _MklConv2D and _MklConv2DWithBias. for (const Edge* e : filter_node->out_edges()) { if ((e->dst()->type_string() == csinfo_.mkl_conv2d || + // add check for mkl_pad_with_conv2d + e->dst()->type_string() == csinfo_.mkl_pad_with_conv2d || e->dst()->type_string() == csinfo_.mkl_conv2d_with_bias) && e->dst_input() == kConv2DFilterInputSlotIdx /* filter is 2nd input of Conv2D and _MklConv2D. */) { @@ -3598,6 +3669,65 @@ void MklLayoutRewritePass::CopyAttrsConv2D(const Node* orig_node, nb->Attr("use_cudnn_on_gpu", use_cudnn_on_gpu); } +//used in rinfo when replacing __MklDummyPadWithConv2D by _MklPadWithConv2D +void MklLayoutRewritePass::CopyAttrsPadWithConv2D(const Node* orig_node, + NodeBuilder* nb) { + DataType Tpaddings; + DataType T; + string data_format; + string padding; + std::vector strides; + bool use_cudnn_on_gpu; + + // Get all attributes from old node 1. 
+ TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T)); + TF_CHECK_OK(GetNodeAttr(orig_node->def(), "strides", &strides)); + TF_CHECK_OK(GetNodeAttr(orig_node->def(), "padding", &padding)); + TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format)); + TF_CHECK_OK( + GetNodeAttr(orig_node->def(), "use_cudnn_on_gpu", &use_cudnn_on_gpu)); + TF_CHECK_OK(GetNodeAttr(orig_node->def(), "Tpaddings", &Tpaddings)); + + // Add attributes to new node. + nb->Attr("T", T); + nb->Attr("strides", strides); + nb->Attr("padding", padding); + nb->Attr("data_format", data_format); + nb->Attr("use_cudnn_on_gpu", use_cudnn_on_gpu); + nb->Attr("Tpaddings", Tpaddings); +} + +//used with MergePadWithConv2D +void MklLayoutRewritePass::CopyAttrsFromPadAndConv2D(const Node* orig_node1, + const Node* orig_node2, NodeBuilder* nb) { + DataType Tpaddings; + DataType T; + string data_format; + string padding; + std::vector strides; + bool use_cudnn_on_gpu; + + // Get all attributes from old node 1. + TF_CHECK_OK(GetNodeAttr(orig_node1->def(), "T", &T)); + TF_CHECK_OK(GetNodeAttr(orig_node1->def(), "strides", &strides)); + TF_CHECK_OK(GetNodeAttr(orig_node1->def(), "padding", &padding)); + TF_CHECK_OK(GetNodeAttr(orig_node1->def(), "data_format", &data_format)); + TF_CHECK_OK( + GetNodeAttr(orig_node1->def(), "use_cudnn_on_gpu", &use_cudnn_on_gpu)); + // Get all attributes from old node 2. + TF_CHECK_OK(GetNodeAttr(orig_node2->def(), "Tpaddings", &Tpaddings)); + + // Add attributes to new node. + nb->Attr("T", T); + nb->Attr("strides", strides); + nb->Attr("padding", padding); + nb->Attr("data_format", data_format); + nb->Attr("use_cudnn_on_gpu", use_cudnn_on_gpu); + + + // Add attributes to new node. + nb->Attr("Tpaddings", Tpaddings); +} void MklLayoutRewritePass::CopyAttrsAddN(const Node* orig_node, NodeBuilder* nb) { DataType T; @@ -3824,7 +3954,7 @@ Status MklLayoutRewritePass::MergeConv2DWithBiasAdd(std::unique_ptr* g, // If 'm' is BiasAdd, then 'n' is Conv2D. 
Since Conv2D feeds BiasAdd, // BiasAdd is successor node, and Conv2D predecessor node. Node* pred = m->type_string() == csinfo_.bias_add ? n : m; - Node* succ = m->type_string() == csinfo_.bias_add ? m : n; + Node* succ = m->type_string() == csinfo_.bias_add ? m : n; // 1. Get all attributes from input nodes. DataType T_pred, T_succ; @@ -3963,6 +4093,161 @@ Status MklLayoutRewritePass::MergeConv2DWithBiasAdd(std::unique_ptr* g, return Status::OK(); } +Status MklLayoutRewritePass::MergePadWithConv2D(std::unique_ptr* g, + Node* m, Node* n) { + CHECK_EQ(((m->type_string() == csinfo_.pad && + n->type_string() == csinfo_.conv2d)) || + ((n->type_string() == csinfo_.pad && + m->type_string() == csinfo_.conv2d)), + true); + + // Conv2D is successor node, and Pad predecessor node. + Node* pred = m->type_string() == csinfo_.pad ? m : n; + Node* succ = m->type_string() == csinfo_.pad ? n : m; + + // 1. Get all attributes from input nodes. + DataType T_pred, T_succ; + string padding; + std::vector strides; + std::vector dilations; + string data_format_pred, data_format_succ; + bool use_cudnn_on_gnu; + TF_CHECK_OK(GetNodeAttr(pred->def(), "T", &T_pred)); + TF_CHECK_OK(GetNodeAttr(succ->def(), "T", &T_succ)); + TF_CHECK_OK(GetNodeAttr(succ->def(), "padding", &padding)); + TF_CHECK_OK(GetNodeAttr(succ->def(), "strides", &strides)); + TF_CHECK_OK(GetNodeAttr(succ->def(), "dilations", &dilations)); + // data format for pad is not available and not necessary, thus + // we dont need to match data format + // TF_CHECK_OK(GetNodeAttr(pred->def(), "data_format", &data_format_pred)); + TF_CHECK_OK(GetNodeAttr(succ->def(), "data_format", &data_format_succ)); + TF_CHECK_OK(GetNodeAttr(succ->def(), "use_cudnn_on_gpu", &use_cudnn_on_gnu)); + // We check to ensure that data formats of both succ and pred are same. + // We expect them to be same, so we can enforce this as assert. + // But assert can be too strict, so we enforce this as a check. 
+ // If the check fails, then we do not merge two nodes. + // We also do same check for devices. + // if (data_format_pred != data_format_succ || T_pred != T_succ || + if (T_pred != T_succ || + pred->assigned_device_name() != succ->assigned_device_name() || + pred->def().device() != succ->def().device()) { + return Status(error::Code::INVALID_ARGUMENT, + "data_format or T attribute or devices of Conv2D and " + "Pad do not match. Will skip node merge optimization"); + } + + const int succ_num = succ->num_inputs(); + gtl::InlinedVector succ_control_edges; + gtl::InlinedVector, 4> succ_in(succ_num); + FillInputs(succ, &succ_control_edges, &succ_in); + + const int pred_num = pred->num_inputs(); + gtl::InlinedVector pred_control_edges; + gtl::InlinedVector, 4> pred_in(pred_num); + FillInputs(pred, &pred_control_edges, &pred_in); + + // We need to ensure that Pad only feeds to Conv2D (some other operator is + // not expecting output of Pad). If this is not the case, then we cannot + // merge Conv2D with Pad. + const int kFirstOutputSlot = 0; + for (const Edge* e : pred->out_edges()) { + if (e->src_output() == kFirstOutputSlot && e->dst() != succ) { + return Status(error::Code::INVALID_ARGUMENT, + "Pad does not feed to Conv2D, or " + "it feeds Conv2D but has multiple outputs. " + "Will skip node merge optimization"); + } + } + + // 2. Get inputs from both the nodes. ( ? ? Explanation of the following) + // Find the 2 inputs from the Pad and the Filter input from the Conv2D. + // Get operand 0, 1 of conv2D. + CHECK_EQ(pred->in_edges().size(), 2); // Pad must have 2 inputs. + // Get operand 1 of add_bias??? + // Conv2D must have 2 inputs: pad output and Filter + CHECK_EQ(succ->in_edges().size(), 2); + + // We will use the node name of Conv2D as the name of new node + // Build new node. We use same name as original node, but change the op + // name. 
+ NodeBuilder nb(succ->name(), csinfo_.pad_with_conv2d); + nb.Input(pred_in[0].first, pred_in[0].second); // In1 (input data) of Pad + // pred_in[1] will be 2nd Tensorflow tensor for Conv2D. + nb.Input(succ_in[1].first, succ_in[1].second); // In2 (filter) of conv2d + // In1 of Conv2D is same as output of Pad. + // Thus, only need to add In2 of Conv2D + nb.Input(pred_in[1].first, pred_in[1].second); // In2 (paddings) of Pad + + // Copy attributes from Pad and conv2D to PadWithConv2D. + CopyAttrsFromPadAndConv2D(const_cast(succ), const_cast(pred), + &nb); + + // Copy the device assigned to old node to new node. + nb.Device(succ->def().device()); + + // Create node. + Node* new_node; + TF_CHECK_OK(nb.Finalize(&**g, &new_node)); + CHECK_NOTNULL(new_node); + + // Incoming data edges from 'pred' node and 'succ' node to new 'new_node' + // node are already copied in BuildNode. + // We handle control edges now. + for (const Edge* e : pred->in_edges()) { + if (e->IsControlEdge()) { + // Allow duplicate while adding control edge as it would fail (return + // NULL) if we try to add duplicate edge. + CHECK_NOTNULL((*g)->AddControlEdge(e->src(), new_node, true)); + } + } + for (const Edge* e : succ->in_edges()) { + if (e->IsControlEdge()) { + // Allow duplicate while adding control edge as it would fail (return + // NULL) if we try to add duplicate edge. + CHECK_NOTNULL((*g)->AddControlEdge(e->src(), new_node, true)); + } + } + + // Incoming edges are fixed, we will fix the outgoing edges now. + // First, we will fix outgoing control edges from 'pred' node. + for (const Edge* e : pred->out_edges()) { + if (e->IsControlEdge()) { + // Allow duplicate while adding control edge as it would fail (return + // NULL) if we try to add duplicate edge. + CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst(), true)); + } + } + + // Second, we will fix outgoing control and data edges from 'succ' node. 
+ for (const Edge* e : succ->out_edges()) { + if (e->IsControlEdge()) { + // Allow duplicate while adding control edge as it would fail (return + // NULL) if we try to add duplicate edge. + CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst(), true)); + } else { + // Conv2D has only 1 output (at slot 0) and merged node also has only 1 + // output (at slot 0). + const int kPadWithConv2DOutputSlot = 0; + CHECK_NOTNULL((*g)->AddEdge(new_node, kPadWithConv2DOutputSlot, e->dst(), + e->dst_input())); + } + } + + // Copy device assigned to old node to new node. + // It's ok to use pred or succ as we have enforced a check that + // both have same device assigned. + new_node->set_assigned_device_name(pred->assigned_device_name()); + + VLOG(1) << "MklLayoutRewritePass: Merged old node:" << pred->DebugString() + << ", and node: " << succ->DebugString() + << ", into node:" << new_node->DebugString(); + + (*g)->RemoveNode(succ); + (*g)->RemoveNode(pred); + + return Status::OK(); +} + Status MklLayoutRewritePass::MergeConv2DBackpropFilterWithBiasAddGrad( std::unique_ptr* g, Node* m, Node* n) { CHECK_EQ(((m->type_string() == csinfo_.bias_add_grad && @@ -4096,6 +4381,12 @@ Status MklLayoutRewritePass::MergeNode(std::unique_ptr* g, Node* m, m->type_string() == csinfo_.conv2d))) { return this->MergeConv2DWithBiasAdd(g, m, n); } + if (((m->type_string() == csinfo_.pad && + n->type_string() == csinfo_.conv2d)) || + ((n->type_string() == csinfo_.pad && + m->type_string() == csinfo_.conv2d))) { + return this->MergePadWithConv2D(g, m, n); + } if (((m->type_string() == csinfo_.bias_add_grad && n->type_string() == csinfo_.conv2d_grad_filter)) || @@ -4207,9 +4498,10 @@ MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const { } // We make an exception for __MklDummyConv2DWithBias and - // __MklConv2DBackpropFilterWithBias since their names do not match Mkl node - // names. 
+ // __MklConv2DBackpropFilterWithBias, __MklDummyPadWithConv2D since their names + // do not match Mkl node names. if (n->type_string() != csinfo_.conv2d_with_bias && + n->type_string() != csinfo_.pad_with_conv2d && n->type_string() != csinfo_.conv2d_grad_filter_with_bias && !mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(n->type_string()), T)) { diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc index a41f5861af..020e3c9168 100644 --- a/tensorflow/core/graph/mkl_layout_pass_test.cc +++ b/tensorflow/core/graph/mkl_layout_pass_test.cc @@ -2012,6 +2012,76 @@ TEST_F(MklLayoutPassTest, Basic) { "A->C;A->D;B->C:1;B->D:1"); } +// Test set 0: Pad + Conv2D; padding is VALID +// A = input(image), B = input(paddings), C= Pad = input of conv2D, +// D=input(filter), E = Conv2D, Z = Zeta +// C=Pad(A,B); E=Conv2D(C,D); Z=Zeta(E,Y) +// After layout pass +// _MklPadWithConv2D(A, D, B, DMT/_0, DMT/_1, DMT/_2) +TEST_F(MklLayoutPassTest, NodeMerge_PadWithConv2D_Positive) { + CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); + InitGraph( + "node { name: 'A' op: 'Input'}" + "node { name: 'B' op: 'Int32Input'}" + "node { name: 'C' op: 'Pad'" + " attr { key: 'T' value { type: DT_FLOAT } }" + " attr { key: 'Tpaddings' value { type: DT_INT32 } }" + " input: ['A', 'B']}" + "node { name: 'D' op: 'Input'}" + "node { name: 'E' op: 'Conv2D'" + " attr { key: 'T' value { type: DT_FLOAT } }" + " attr { key: 'data_format' value { s: 'NHWC' } }" + " attr { key: 'use_cudnn_on_gpu' value { b: false } }" + " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" + " attr { key: 'padding' value { s: 'VALID' } }" + " attr { key: 'dilations' value { list: {i: 1, i:1, i:1, i:1} } }" + " input: ['C', 'D'] }" + "node { name: 'Y' op: 'Input'}" + "node { name: 'Z' op: 'Zeta'" + " attr {key: 'T' value { type: DT_FLOAT } }" + " input: ['E', 'Y']}"); + EXPECT_EQ(DoMklLayoutOptimizationPass(), + 
"A(Input);B(Int32Input);D(Input);DMT/_0(Const);DMT/_1(Const);" + "DMT/_2(Const);E(_MklPadWithConv2D);Y(Input);Z(Zeta)|A->E;" + "A:control->DMT/_0:control;A:control->DMT/_1:control;" + "A:control->DMT/_2:control;B->E:2;D->E:1;DMT/_0->E:3;DMT/_1->E:4;" + "DMT/_2->E:5;E->Z;Y->Z:1"); +} + +// Test set 0: Pad + Conv2D; padding is SAME +// A = input(image), B = input(paddings), C= Pad = input of conv2D, +// D=input(filter), E = Conv2D, Z = Zeta +// C=Pad(A,B); E=Conv2D(C,D); Z=Zeta(E,Y) +// After layout pass - No merging +TEST_F(MklLayoutPassTest, NodeMerge_PadWithConv2D_Negative) { + CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); + InitGraph( + "node { name: 'A' op: 'Input'}" + "node { name: 'B' op: 'Int32Input'}" + "node { name: 'C' op: 'Pad'" + " attr { key: 'T' value { type: DT_FLOAT } }" + " attr { key: 'Tpaddings' value { type: DT_INT32 } }" + " input: ['A', 'B']}" + "node { name: 'D' op: 'Input'}" + "node { name: 'E' op: 'Conv2D'" + " attr { key: 'T' value { type: DT_FLOAT } }" + " attr { key: 'data_format' value { s: 'NHWC' } }" + " attr { key: 'use_cudnn_on_gpu' value { b: false } }" + " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" + " attr { key: 'padding' value { s: 'SAME' } }" + " attr { key: 'dilations' value { list: {i: 1, i:1, i:1, i:1} } }" + " input: ['C', 'D'] }" + "node { name: 'Y' op: 'Input'}" + "node { name: 'Z' op: 'Zeta'" + " attr {key: 'T' value { type: DT_FLOAT } }" + " input: ['E', 'Y']}"); + EXPECT_EQ(DoMklLayoutOptimizationPass(), + "A(Input);B(Int32Input);C(Pad);D(Input);DMT/_0(Const);DMT/_1(Const);" + "E(_MklConv2D);Y(Input);Z(Zeta)|A->C;B->C:1;C->E;" + "C:control->DMT/_0:control;C:control->DMT/_1:control;" + "D->E:1;DMT/_0->E:2;DMT/_1->E:3;E->Z;Y->Z:1"); +} + // Test set 1: Conv2D + AddBias // C=Conv2D(A,B); E=BiasAdd(C,D); Z=Zeta(E,Y) diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 6126e8b7ba..f14542068d 100644 --- a/tensorflow/core/kernels/BUILD +++ 
b/tensorflow/core/kernels/BUILD @@ -39,6 +39,7 @@ load( "cc_header_only_library", "if_not_windows", "if_override_eigen_strong_inline", + "tf_cc_test_mkl", ) load("@local_config_sycl//sycl:build_defs.bzl", "if_sycl") load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test") @@ -1129,6 +1130,7 @@ tf_cuda_cc_test( ], ) + tf_cc_test( name = "decode_wav_op_test", size = "small", @@ -6124,6 +6126,7 @@ tf_mkl_kernel_library( ] + if_mkl(["@mkl_dnn"]), ) + tf_mkl_kernel_library( name = "mkl_tfconv_op", prefix = "mkl_tfconv", @@ -6269,6 +6272,31 @@ tf_mkl_kernel_library( ], ) +tf_cc_test_mkl( + name = "mkl_fused_ops_test", + size = "small", + srcs = ["mkl_fused_ops_test.cc"], + linkstatic = 1, + deps = [ + ":mkl_conv_op", + ":mkl_tfconv_op", + ":conv_ops", + ":image", + ":ops_testutil", + ":ops_util", + "//tensorflow/cc:cc_ops", + "//tensorflow/core:core_cpu", + "//tensorflow/core:framework", + "//tensorflow/core:framework_internal", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:tensorflow", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + ] +) + # NOTE(lespeholt): This rule is deprecated, please use: # tensorflow/core/util/batch_util.h cc_library( diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc index 62396eeb8b..d4ec831cf2 100644 --- a/tensorflow/core/kernels/mkl_conv_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_ops.cc @@ -753,9 +753,31 @@ class MklConv2DOp : public OpKernel { TensorFormat data_format_; }; + +#define REGISTER_MKL_CPU(T) \ + REGISTER_KERNEL_BUILDER(Name("_MklConv2D") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .Label(mkl_op_registry::kMklOpLabel), \ + MklConv2DOp); \ + REGISTER_KERNEL_BUILDER(Name("_MklConv2DWithBias") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .Label(mkl_op_registry::kMklOpLabel), \ + MklConv2DOp); \ + REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DWithBias") \ + .Device(DEVICE_CPU) \ + 
.TypeConstraint("T") \ + .Label(mkl_op_registry::kMklOpLabel), \ + MklDummyOp); + +TF_CALL_float(REGISTER_MKL_CPU); +#undef REGISTER_MKL_CPU + #else -template +template class MklConv2DOp : public OpKernel { public: ~MklConv2DOp() {} @@ -814,6 +836,11 @@ class MklConv2DOp : public OpKernel { dilations, strides; memory::dims dst_dims_tf_order, dst_dims_mkl_order; + // If pad with conv2d fusion is enabled + if (padEnabled) { + PadWithConvFusion(context, padding_left, padding_right); + } + // Get shapes of input tensors in MKL-DNN order MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_, dilations_); @@ -822,7 +849,7 @@ class MklConv2DOp : public OpKernel { conv_utl.GetConvFwdSizesInMklOrder( src_tf_shape, filter_tf_shape, &src_dims, &filter_dims, &strides, &dilations, &dst_dims_tf_order, &dst_dims_mkl_order, - &padding_left, &padding_right); + &padding_left, &padding_right, padEnabled); if (!context->status().ok()) return; // Check for corner case - if there is nothing to compute, return. @@ -869,7 +896,6 @@ class MklConv2DOp : public OpKernel { // MKLDNN dilation starts from 0. 
dilations[kDilationH] -= 1; dilations[kDilationW] -= 1; - // get a conv2d fwd from primitive pool MklConv2DFwdPrimitive* conv2d_fwd = nullptr; if (biasEnabled) { @@ -937,13 +963,53 @@ class MklConv2DOp : public OpKernel { errors::Aborted("Operation received an exception:", error_msg)); } } + + void PadWithConvFusion(OpKernelContext* context, memory::dims &padding_left, + memory::dims &padding_right){ + const Tensor& paddings_tf = MklGetInput(context, 2); + OP_REQUIRES(context, paddings_tf.dims() == 2, + errors::InvalidArgument("paddings must be 2-dimensional: ", + paddings_tf.shape().DebugString())); + Tpadding* paddings = nullptr; + // To get individual pad, need to flatten the tensor + paddings = static_cast(const_cast + (paddings_tf.flat().data())); + // For NHWC format: + // paddings[0], paddings[1], paddings[6], paddings[7] should be zero + // if the paddings_tf is [ [0, 0] [1,2] [3,4] [0,0] ] + // paddings = {0, 0, 1, 2, 3, 4, 0, 0} ; flat method is row major + // then, values are: top = 1, bottom =2, left=3, right=4 + // For NCHW format, + // paddings[0], paddings[1], paddings[2], paddings[3] should be zero + // similar explanation as NHWC format will apply. + string data_format = ToString(data_format_); + if(data_format == "NHWC"){ + pad_top = paddings[2]; + pad_bottom = paddings[3]; + pad_left = paddings[4]; + pad_right = paddings[5]; + } + else if (data_format == "NCHW"){ + pad_top = paddings[4]; + pad_bottom = paddings[5]; + pad_left = paddings[6]; + pad_right = paddings[7]; + } + // Create padding arrays for MKL DNN convolutions. + // MKL-DNN uses asymetric padding. 
+ padding_left = {static_cast(pad_top), static_cast(pad_left)}; + padding_right = {static_cast(pad_bottom), static_cast(pad_right)}; + } private: std::vector strides_; std::vector dilations_; + int64 pad_top, pad_left; + int64 pad_bottom, pad_right; Padding padding_; TensorFormat data_format_; const int kInputIndex_Src = 0, kInputIndex_Filter = 1, kInputIndex_Bias = 2; + const int kInputIndex_Pad = 2; const int kOutputIndex_Dst = 0, kOutputIndex_Filter = 1; const int kDilationH = 0, kDilationW = 1; engine cpu_engine = engine(engine::cpu, 0); @@ -1036,26 +1102,44 @@ class MklConv2DOp : public OpKernel { } }; -#endif #define REGISTER_MKL_CPU(T) \ REGISTER_KERNEL_BUILDER(Name("_MklConv2D") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .Label(mkl_op_registry::kMklOpLabel), \ - MklConv2DOp); \ + MklConv2DOp); \ REGISTER_KERNEL_BUILDER(Name("_MklConv2DWithBias") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .Label(mkl_op_registry::kMklOpLabel), \ - MklConv2DOp); \ + MklConv2DOp); \ REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DWithBias") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .Label(mkl_op_registry::kMklOpLabel), \ + MklDummyOp); \ + REGISTER_KERNEL_BUILDER(Name("_MklPadWithConv2D") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tpaddings") \ + .Label(mkl_op_registry::kMklOpLabel), \ + MklConv2DOp); \ + REGISTER_KERNEL_BUILDER(Name("_MklPadWithConv2D") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tpaddings") \ + .Label(mkl_op_registry::kMklOpLabel), \ + MklConv2DOp); \ + REGISTER_KERNEL_BUILDER(Name("__MklDummyPadWithConv2D") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tpaddings") \ + .Label(mkl_op_registry::kMklOpLabel), \ MklDummyOp); TF_CALL_float(REGISTER_MKL_CPU); +#endif } // namespace tensorflow #endif // INTEL_MKL diff --git a/tensorflow/core/kernels/mkl_conv_ops.h b/tensorflow/core/kernels/mkl_conv_ops.h index 3f154ff33b..c6487a4512 100644 --- 
a/tensorflow/core/kernels/mkl_conv_ops.h +++ b/tensorflow/core/kernels/mkl_conv_ops.h @@ -232,7 +232,7 @@ class MklDnnConvUtil { const memory::dims& strides, const memory::dims& dilations, memory::dims* output_dims_tf_order, memory::dims* output_dims_mkl_order, memory::dims* pad_l, - memory::dims* pad_r) { + memory::dims* pad_r, bool padEnabled=false) { CHECK_NOTNULL(output_dims_tf_order); CHECK_NOTNULL(output_dims_mkl_order); CHECK_NOTNULL(pad_l); @@ -268,7 +268,19 @@ class MklDnnConvUtil { GetWindowedOutputSizeVerboseV2(input_cols, filter_cols, dilation_cols, stride_cols, padding_, &out_cols, &pad_left, &pad_right)); - + // If padEnabled, i.e., pad and conv op are fused, then + // all pads are already passed from pad op through + // *pad_l and *pad_r + if(padEnabled) { + pad_top = static_cast((*pad_l)[0]); + pad_left = static_cast((*pad_l)[1]); + pad_bottom = static_cast((*pad_r)[0]); + pad_right = static_cast((*pad_r)[1]); + // update the out_rows and out_cols based on all + // sides of the pads coming from pad op. + out_rows = out_rows + (pad_top + pad_bottom ) / stride_rows; + out_cols = out_cols + (pad_left + pad_right ) / stride_cols; + } // Tensorflow output is in data_format order. (NHWC or NCHW) TensorShape out_shape = ShapeFromFormat(data_format_, out_batch, out_rows, out_cols, out_depth); @@ -283,8 +295,12 @@ class MklDnnConvUtil { *output_dims_mkl_order = mkldnn_sizes; // Now handle padding. MKL-DNN uses asymetric padding. - *pad_l = {static_cast(pad_top), static_cast(pad_left)}; - *pad_r = {static_cast(pad_bottom), static_cast(pad_right)}; + // But, if padEnabled, i.e., pad and conv op are fused, + // then, *pad_l and *pad_r are already set from pad op + if(!padEnabled) { + *pad_l = {static_cast(pad_top), static_cast(pad_left)}; + *pad_r = {static_cast(pad_bottom), static_cast(pad_right)}; + } } // Calculate output and pad size of forward Convolution operator. 
@@ -325,7 +341,7 @@ class MklDnnConvUtil { memory::dims* strides, memory::dims *dilations, memory::dims* output_dims_tf_order, memory::dims* output_dims_mkl_order, memory::dims* pad_l, - memory::dims* pad_r) { + memory::dims* pad_r, bool padEnabled=false) { CHECK_NOTNULL(input_dims); CHECK_NOTNULL(filter_dims); CHECK_NOTNULL(strides); @@ -344,7 +360,7 @@ class MklDnnConvUtil { GetOutputAndPadSizeInMklOrder(input_shape, filter_shape, *strides, *dilations, output_dims_tf_order, output_dims_mkl_order, - pad_l, pad_r); + pad_l, pad_r, padEnabled); if (!context_->status().ok()) return; } }; diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc index f947d4c30d..8bb22a8372 100644 --- a/tensorflow/core/ops/nn_ops.cc +++ b/tensorflow/core/ops/nn_ops.cc @@ -1573,6 +1573,55 @@ NOTE Do not invoke this operator directly in Python. Graph rewrite pass is expected to invoke these operators. )doc"); +REGISTER_OP("__MklDummyPadWithConv2D") + .Input("input: T") + .Input("filter: T") + .Input("paddings: Tpaddings") + .Output("output: T") + .Attr("T: {half, float, double}") + .Attr("strides: list(int)") + .Attr("use_cudnn_on_gpu: bool = true") + .Attr(GetPaddingAttrString()) + .Attr(GetConvnetDataFormatAttrString()) + .Attr("dilations: list(int) = [1, 1, 1, 1]") + .Attr("Tpaddings: {int32, int64} = DT_INT32") + .SetShapeFn(shape_inference::Conv2DShape) + .Doc(R"doc( +Dummy node that enables fusing Pad and Conv2D operator for MKL. This node +does not perform anything. It is just created as an intermediate output of +merging Pad and Conv2D. + +NOTE Do not invoke this operator directly in Python. Graph rewrite pass is +expected to invoke these operators. 
+)doc"); + +REGISTER_OP("_MklPadWithConv2D") + .Input("input: T") + .Input("filter: T") + .Input("paddings: Tpaddings") + .Input("mkl_input: uint8") + .Input("mkl_filter: uint8") + .Input("mkl_paddings: uint8") + .Output("output: T") + .Output("filter_output: T") + .Output("mkl_output: uint8") + .Output("mkl_filter_output: uint8") + .Attr("T: {half, float, double}") + .Attr("strides: list(int)") + .Attr("use_cudnn_on_gpu: bool = true") + .Attr(GetPaddingAttrString()) + .Attr(GetConvnetDataFormatAttrString()) + .Attr("dilations: list(int) = [1, 1, 1, 1]") + .Attr("Tpaddings: {int32, int64} = DT_INT32") + .SetShapeFn(shape_inference::Conv2DShape) + .Doc(R"doc( +MKL version of Pad and Conv2D operator. Uses MKL DNN APIs to perform +Pad and 2D convolution to the output of convolution. + +NOTE Do not invoke this operator directly in Python. Graph rewrite pass is +expected to invoke these operators. +)doc"); + REGISTER_OP("_MklConv2DBackpropFilter") .Input("input: T") .Input("filter_sizes: int32") -- GitLab From dd63093a599081accfe2a2d2ca8c029d413a15d7 Mon Sep 17 00:00:00 2001 From: mbhuiyan Date: Thu, 2 Aug 2018 08:43:06 -0700 Subject: [PATCH 006/405] adding unit test for pad+conv2d fusion op --- tensorflow/core/kernels/mkl_fused_ops_test.cc | 164 ++++++++++++++++++ 1 file changed, 164 insertions(+) create mode 100644 tensorflow/core/kernels/mkl_fused_ops_test.cc diff --git a/tensorflow/core/kernels/mkl_fused_ops_test.cc b/tensorflow/core/kernels/mkl_fused_ops_test.cc new file mode 100644 index 0000000000..216e8d0206 --- /dev/null +++ b/tensorflow/core/kernels/mkl_fused_ops_test.cc @@ -0,0 +1,164 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifdef INTEL_MKL +#ifndef INTEL_MKL_ML // We don't support fusion in MKL ML +#include "tensorflow/cc/ops/const_op.h" +#include "tensorflow/cc/ops/image_ops.h" +#include "tensorflow/cc/ops/nn_ops.h" +#include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/kernels/conv_ops_gpu.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/public/session.h" + +namespace tensorflow { + +// Helper class for converting MKL tesnors to TF tensor and comparing to +// expected values + +const uint8 dummy_tensor[] = {0, 0, 0, 0, 0, 0, 0, 0}; +const TensorShape dummy_shape({8}); + +class ConvMklToTF : public OpsTestBase { + public: + template + void ConvertAndCompare(DataType dtype, const Tensor& first, + const Tensor& second, const Tensor& expected) { + // Create an MKL to TF conversion node and execute it + TF_EXPECT_OK(NodeDefBuilder("mkl_to_tf_op", "_MklToTf") + .Input(FakeInput(dtype)) // Input + .Input(FakeInput(DT_UINT8)) // Mkl second tensor + .Attr("T", dtype) + .Attr("_kernel", "MklOp") + 
.Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + AddInputFromArray(first.shape(), first.flat()); + AddInputFromArray(second.shape(), second.flat()); + TF_ASSERT_OK(RunOpKernel()); + + const Tensor& output = *GetOutput(0); + test::ExpectTensorNear(expected, output, 1e-5); + } + void TestBody(){}; +}; + +// Testing fusion of pad and convolution + +class FusedPadConvOpTest : public OpsTestBase { + public: + template + void Run(DataType dtype, Tensor& image, Tensor& filter, Tensor& padding, + Tensor& expected, const string data_format) { + const int stride = 1; + + // Create a fused pad+conv2d node + TF_EXPECT_OK(NodeDefBuilder("fused_pad_conv_op", "_MklPadWithConv2D") + .Input(FakeInput(dtype)) // Input + .Input(FakeInput(dtype)) // Filter + .Input(FakeInput(DT_INT32)) // Padding + .Input(FakeInput(DT_UINT8)) // MKl second tensor + .Input(FakeInput(DT_UINT8)) // MKl second tensor + .Input(FakeInput(DT_UINT8)) // MKl second tensor + .Attr("padding", "VALID") + .Attr("data_format", data_format) + .Attr("T", dtype) + .Attr("strides", {1, stride, stride, 1}) + .Attr("_kernel", "MklOp") + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + + // Setting up inputs and execute + AddInputFromArray(image.shape(), image.flat()); + AddInputFromArray(filter.shape(), filter.flat()); + AddInputFromArray(padding.shape(), padding.flat()); + AddInputFromArray(dummy_shape, dummy_tensor); + AddInputFromArray(dummy_shape, dummy_tensor); + AddInputFromArray(dummy_shape, dummy_tensor); + TF_ASSERT_OK(RunOpKernel()); + + // Compare output to expected results + const Tensor& first = *GetOutput(0); + const Tensor& second = *GetOutput(2); + ConvMklToTF conv_comp; + conv_comp.ConvertAndCompare(dtype, first, second, expected); + } +}; + +TEST_F(FusedPadConvOpTest, PaddingConvTest) { + const int depth = 1; + const int image_width = 4; + const int image_height = 3; + const int image_batch_count = 1; + Tensor image(DT_FLOAT, {image_batch_count, image_height, image_width, depth}); + 
test::FillValues(&image, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + + const int filter_size = 3; + const int filter_count = 1; + Tensor filter(DT_FLOAT, {filter_size, filter_size, depth, filter_count}); + test::FillValues(&filter, {1, 4, 7, 2, 5, 8, 3, 6, 9}); + + const int padding_height = 4; + const int padding_width = 2; + Tensor padding(DT_INT32, {padding_height, padding_width}); + test::FillValues(&padding, {0, 0, 3, 4, 1, 2, 0, 0}); + + Tensor expected(DT_FLOAT, TensorShape({1, 8, 5, 1})); + test::FillValues( + &expected, + {0, 0, 0, 0, 0, 24, 42, 60, 33, 12, 105, 150, 183, 95, + 32, 235, 312, 357, 178, 56, 187, 234, 261, 121, 32, 106, 126, 138, + 59, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}); + + Run(DT_FLOAT, image, filter, padding, expected, "NHWC"); +} + +TEST_F(FusedPadConvOpTest, PaddingConvTestNchw) { + const int depth = 1; + const int image_width = 4; + const int image_height = 3; + const int image_batch_count = 1; + Tensor image(DT_FLOAT, {image_batch_count, depth, image_height, image_width}); + test::FillValues(&image, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + + const int filter_size = 3; + const int filter_count = 1; + Tensor filter(DT_FLOAT, {filter_size, filter_size, depth, filter_count}); + test::FillValues(&filter, {1, 4, 7, 2, 5, 8, 3, 6, 9}); + + const int padding_height = 4; + const int padding_width = 2; + Tensor padding(DT_INT32, {padding_height, padding_width}); + test::FillValues(&padding, {0, 0, 0, 0, 3, 4, 1, 2}); + + Tensor expected(DT_FLOAT, TensorShape({1, 1, 8, 5})); + test::FillValues( + &expected, + {0, 0, 0, 0, 0, 24, 42, 60, 33, 12, 105, 150, 183, 95, + 32, 235, 312, 357, 178, 56, 187, 234, 261, 121, 32, 106, 126, 138, + 59, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}); + + Run(DT_FLOAT, image, filter, padding, expected, "NCHW"); +} +} // namespace tensorflow +#endif // INTEL_MKL_ML +#endif // INTEL_MKL -- GitLab From 7f94025fe72369117bf32d69156f0bd947402c96 Mon Sep 17 00:00:00 2001 From: mbhuiyan Date: Fri, 10 Aug 2018 20:00:27 -0700 
Subject: [PATCH 007/405] Addressing the reviews for fused PAD and Conv2d PR --- tensorflow/core/graph/mkl_layout_pass.cc | 75 +++++++++---------- tensorflow/core/kernels/BUILD | 22 +++++- tensorflow/core/kernels/mkl_conv_ops.cc | 8 +- tensorflow/core/kernels/mkl_conv_ops.h | 12 ++- tensorflow/core/kernels/mkl_fused_ops_test.cc | 8 +- 5 files changed, 69 insertions(+), 56 deletions(-) diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc index d0abe5da35..1e85b50d99 100644 --- a/tensorflow/core/graph/mkl_layout_pass.cc +++ b/tensorflow/core/graph/mkl_layout_pass.cc @@ -2786,50 +2786,48 @@ class MklLayoutRewritePass : public GraphOptimizationPass { // Find Pad or Conv2D node that can be merged with input node 'm'. // If input 'm' is Pad, then check if there exists Conv2D node that can be - // merged with 'm'. If input 'm' is Conv2D, then check if there exists BiasAdd + // merged with 'm'. If input 'm' is Conv2D, then check if there exists Pad // node that can be merged with 'm'. static Node* GetPadOrConv2D(const Node* m) { CHECK_NOTNULL(m); Node* n = nullptr; + const Node* conv_node; if (m->type_string() == csinfo_.pad) { // If m is Pad, then Conv2D is the output of Pad. for (const Edge* e : m->out_edges()) { if (!e->IsControlEdge() && e->dst()->type_string() == csinfo_.conv2d) { n = e->dst(); + conv_node = n; break; } } } else { CHECK_EQ(m->type_string(), csinfo_.conv2d); - // If m is conv2D, Go over all input edges + // If m is conv2D, Go over all input edges // and search for Pad Node. for (const Edge* e : m->in_edges()) { if (!e->IsControlEdge() && e->src()->type_string() == csinfo_.pad) { n = e->src(); + conv_node = m; break; } } } - // Check if only VALID type of padding is used - // or not. + // Check if only VALID type of padding is used + // or not. 
if (n != nullptr) { - const Node* conv_node; - if (m->type_string() == csinfo_.conv2d) - conv_node = m; - else - conv_node = n; string padding; TF_CHECK_OK(GetNodeAttr(conv_node->def(), "padding", &padding)); - if (padding != "VALID") - // Then do not merge. - // Only VALID type of padding in conv op can be + if (padding != "VALID") + // Then do not merge. + // Only VALID type of padding in conv op can be // merged with Pad op. n = nullptr; } - if (n == nullptr) { + else { VLOG(1) << "MklLayoutRewritePass: Could not find matching " << "Pad and Conv2D node for merging. Input node: " << m->DebugString(); @@ -3669,7 +3667,7 @@ void MklLayoutRewritePass::CopyAttrsConv2D(const Node* orig_node, nb->Attr("use_cudnn_on_gpu", use_cudnn_on_gpu); } -//used in rinfo when replacing __MklDummyPadWithConv2D by _MklPadWithConv2D +// Used in rinfo when replacing __MklDummyPadWithConv2D by _MklPadWithConv2D void MklLayoutRewritePass::CopyAttrsPadWithConv2D(const Node* orig_node, NodeBuilder* nb) { DataType Tpaddings; @@ -3677,11 +3675,13 @@ void MklLayoutRewritePass::CopyAttrsPadWithConv2D(const Node* orig_node, string data_format; string padding; std::vector strides; + std::vector dilations; bool use_cudnn_on_gpu; - // Get all attributes from old node 1. + // Get all attributes from old node. TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T)); TF_CHECK_OK(GetNodeAttr(orig_node->def(), "strides", &strides)); + TF_CHECK_OK(GetNodeAttr(orig_node->def(), "dilations", &dilations)); TF_CHECK_OK(GetNodeAttr(orig_node->def(), "padding", &padding)); TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format)); TF_CHECK_OK( @@ -3691,13 +3691,14 @@ void MklLayoutRewritePass::CopyAttrsPadWithConv2D(const Node* orig_node, // Add attributes to new node. 
nb->Attr("T", T); nb->Attr("strides", strides); + nb->Attr("dilations", dilations); nb->Attr("padding", padding); nb->Attr("data_format", data_format); nb->Attr("use_cudnn_on_gpu", use_cudnn_on_gpu); nb->Attr("Tpaddings", Tpaddings); } -//used with MergePadWithConv2D +// Used with MergePadWithConv2D void MklLayoutRewritePass::CopyAttrsFromPadAndConv2D(const Node* orig_node1, const Node* orig_node2, NodeBuilder* nb) { DataType Tpaddings; @@ -3705,11 +3706,13 @@ void MklLayoutRewritePass::CopyAttrsFromPadAndConv2D(const Node* orig_node1, string data_format; string padding; std::vector strides; + std::vector dilations; bool use_cudnn_on_gpu; // Get all attributes from old node 1. TF_CHECK_OK(GetNodeAttr(orig_node1->def(), "T", &T)); TF_CHECK_OK(GetNodeAttr(orig_node1->def(), "strides", &strides)); + TF_CHECK_OK(GetNodeAttr(orig_node1->def(), "dilations", &dilations)); TF_CHECK_OK(GetNodeAttr(orig_node1->def(), "padding", &padding)); TF_CHECK_OK(GetNodeAttr(orig_node1->def(), "data_format", &data_format)); TF_CHECK_OK( @@ -3720,12 +3723,10 @@ void MklLayoutRewritePass::CopyAttrsFromPadAndConv2D(const Node* orig_node1, // Add attributes to new node. nb->Attr("T", T); nb->Attr("strides", strides); + nb->Attr("dilations", dilations); nb->Attr("padding", padding); nb->Attr("data_format", data_format); nb->Attr("use_cudnn_on_gpu", use_cudnn_on_gpu); - - - // Add attributes to new node. nb->Attr("Tpaddings", Tpaddings); } void MklLayoutRewritePass::CopyAttrsAddN(const Node* orig_node, @@ -3954,7 +3955,7 @@ Status MklLayoutRewritePass::MergeConv2DWithBiasAdd(std::unique_ptr* g, // If 'm' is BiasAdd, then 'n' is Conv2D. Since Conv2D feeds BiasAdd, // BiasAdd is successor node, and Conv2D predecessor node. Node* pred = m->type_string() == csinfo_.bias_add ? n : m; - Node* succ = m->type_string() == csinfo_.bias_add ? m : n; + Node* succ = m->type_string() == csinfo_.bias_add ? m : n; // 1. Get all attributes from input nodes. 
DataType T_pred, T_succ; @@ -4095,11 +4096,10 @@ Status MklLayoutRewritePass::MergeConv2DWithBiasAdd(std::unique_ptr* g, Status MklLayoutRewritePass::MergePadWithConv2D(std::unique_ptr* g, Node* m, Node* n) { - CHECK_EQ(((m->type_string() == csinfo_.pad && + CHECK(((m->type_string() == csinfo_.pad && n->type_string() == csinfo_.conv2d)) || ((n->type_string() == csinfo_.pad && - m->type_string() == csinfo_.conv2d)), - true); + m->type_string() == csinfo_.conv2d))); // Conv2D is successor node, and Pad predecessor node. Node* pred = m->type_string() == csinfo_.pad ? m : n; @@ -4117,22 +4117,18 @@ Status MklLayoutRewritePass::MergePadWithConv2D(std::unique_ptr* g, TF_CHECK_OK(GetNodeAttr(succ->def(), "padding", &padding)); TF_CHECK_OK(GetNodeAttr(succ->def(), "strides", &strides)); TF_CHECK_OK(GetNodeAttr(succ->def(), "dilations", &dilations)); - // data format for pad is not available and not necessary, thus - // we dont need to match data format - // TF_CHECK_OK(GetNodeAttr(pred->def(), "data_format", &data_format_pred)); + // Data format for pad is not available and not necessary, thus + // dont need to match data format for Pad TF_CHECK_OK(GetNodeAttr(succ->def(), "data_format", &data_format_succ)); TF_CHECK_OK(GetNodeAttr(succ->def(), "use_cudnn_on_gpu", &use_cudnn_on_gnu)); - // We check to ensure that data formats of both succ and pred are same. - // We expect them to be same, so we can enforce this as assert. - // But assert can be too strict, so we enforce this as a check. - // If the check fails, then we do not merge two nodes. - // We also do same check for devices. - // if (data_format_pred != data_format_succ || T_pred != T_succ || + // Check if the data types and devices of both succ and pred are the same. + // Assert is not used, because it can be too strict. + // Don't need to check for data formats because it is not available in Pad. 
if (T_pred != T_succ || pred->assigned_device_name() != succ->assigned_device_name() || pred->def().device() != succ->def().device()) { return Status(error::Code::INVALID_ARGUMENT, - "data_format or T attribute or devices of Conv2D and " + "T attribute or devices of Conv2D and " "Pad do not match. Will skip node merge optimization"); } @@ -4159,11 +4155,10 @@ Status MklLayoutRewritePass::MergePadWithConv2D(std::unique_ptr* g, } } - // 2. Get inputs from both the nodes. ( ? ? Explanation of the following) - // Find the 2 inputs from the Pad and the Filter input from the Conv2D. - // Get operand 0, 1 of conv2D. - CHECK_EQ(pred->in_edges().size(), 2); // Pad must have 2 inputs. - // Get operand 1 of add_bias??? + // 2. Get inputs from both the nodes. + + // Pad must have 2 inputs: "input" and paddings. + CHECK_EQ(pred->in_edges().size(), 2); // Conv2D must have 2 inputs: pad output and Filter CHECK_EQ(succ->in_edges().size(), 2); @@ -4497,8 +4492,8 @@ MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const { return nullptr; } - // We make an exception for __MklDummyConv2DWithBias and - // __MklConv2DBackpropFilterWithBias, __MklDummyPadWithConv2D since their names + // We make an exception for __MklDummyConv2DWithBias, + // __MklConv2DBackpropFilterWithBias, and __MklDummyPadWithConv2D since their names // do not match Mkl node names. 
if (n->type_string() != csinfo_.conv2d_with_bias && n->type_string() != csinfo_.pad_with_conv2d && diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index f14542068d..b057b78ace 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -22,6 +22,7 @@ package_group( "//learning/brain/research/sparse_matrix/...", "//learning/faster_training/...", "//tensorflow/...", + "//third_party/car/...", ], ) @@ -783,7 +784,7 @@ tf_kernel_library( tf_kernel_library( name = "quantize_and_dequantize_op", prefix = "quantize_and_dequantize_op", - deps = ARRAY_DEPS, + deps = ARRAY_DEPS + [":cwise_op"], ) tf_kernel_library( @@ -1130,7 +1131,6 @@ tf_cuda_cc_test( ], ) - tf_cc_test( name = "decode_wav_op_test", size = "small", @@ -2855,6 +2855,8 @@ tf_kernel_library( srcs = [] + if_mkl([ "mkl_batch_matmul_op.cc", ]), + # *impl.h are excluded by default from the CPU build, add explicitly. + hdrs = ["batch_matmul_op_impl.h"], # Override EIGEN_STRONG_INLINE to inline when --define=override_eigen_strong_inline=true, # to avoid long compiling time. 
See https://github.com/tensorflow/tensorflow/issues/10521 copts = if_override_eigen_strong_inline(["/DEIGEN_STRONG_INLINE=inline"]), @@ -3791,7 +3793,7 @@ tf_kernel_library( "spacetodepth_op.h", "spacetodepth_op_gpu.cu.cc", ], - visibility = ["//visibility:private"], + visibility = [":friends"], deps = [ "//tensorflow/core:framework", "//tensorflow/core:lib", @@ -4888,6 +4890,7 @@ filegroup( "fill_functor.cc", "fill_functor.h", "function_ops.cc", + "function_ops.h", "gather_functor.h", "gather_nd_op.cc", "gather_nd_op.h", @@ -5379,6 +5382,18 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "android_whole_file_read_ops", + srcs = if_android(["whole_file_read_ops.cc"]), + copts = tf_copts(), + linkopts = ["-ldl"], + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/core:android_tensorflow_lib_lite", + ], + alwayslink = 1, +) + # Quantization-specific OpKernels tf_kernel_library( @@ -6126,7 +6141,6 @@ tf_mkl_kernel_library( ] + if_mkl(["@mkl_dnn"]), ) - tf_mkl_kernel_library( name = "mkl_tfconv_op", prefix = "mkl_tfconv", diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc index d4ec831cf2..b5ae312fa5 100644 --- a/tensorflow/core/kernels/mkl_conv_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_ops.cc @@ -979,9 +979,11 @@ class MklConv2DOp : public OpKernel { // if the paddings_tf is [ [0, 0] [1,2] [3,4] [0,0] ] // paddings = {0, 0, 1, 2, 3, 4, 0, 0} ; flat method is row major // then, values are: top = 1, bottom =2, left=3, right=4 - // For NCHW format, + // For NCHW format: // paddings[0], paddings[1], paddings[2], paddings[3] should be zero // similar explanation as NHWC format will apply. 
+ int64 pad_top, pad_left; + int64 pad_bottom, pad_right; string data_format = ToString(data_format_); if(data_format == "NHWC"){ pad_top = paddings[2]; @@ -1004,8 +1006,6 @@ class MklConv2DOp : public OpKernel { private: std::vector strides_; std::vector dilations_; - int64 pad_top, pad_left; - int64 pad_bottom, pad_right; Padding padding_; TensorFormat data_format_; const int kInputIndex_Src = 0, kInputIndex_Filter = 1, kInputIndex_Bias = 2; @@ -1139,7 +1139,7 @@ class MklConv2DOp : public OpKernel { MklDummyOp); TF_CALL_float(REGISTER_MKL_CPU); -#endif +#endif // INTEL_MKL_ML } // namespace tensorflow #endif // INTEL_MKL diff --git a/tensorflow/core/kernels/mkl_conv_ops.h b/tensorflow/core/kernels/mkl_conv_ops.h index c6487a4512..aae4d767a2 100644 --- a/tensorflow/core/kernels/mkl_conv_ops.h +++ b/tensorflow/core/kernels/mkl_conv_ops.h @@ -281,6 +281,14 @@ class MklDnnConvUtil { out_rows = out_rows + (pad_top + pad_bottom ) / stride_rows; out_cols = out_cols + (pad_left + pad_right ) / stride_cols; } + // Handle padding. MKL-DNN uses asymetric padding. + // But, if padEnabled, i.e., pad and conv op are fused, + // then, *pad_l and *pad_r are already set from pad op. + // In that case they need not set here. + else { + *pad_l = {static_cast(pad_top), static_cast(pad_left)}; + *pad_r = {static_cast(pad_bottom), static_cast(pad_right)}; + } // Tensorflow output is in data_format order. (NHWC or NCHW) TensorShape out_shape = ShapeFromFormat(data_format_, out_batch, out_rows, out_cols, out_depth); @@ -297,10 +305,6 @@ class MklDnnConvUtil { // Now handle padding. MKL-DNN uses asymetric padding. // But, if padEnabled, i.e., pad and conv op are fused, // then, *pad_l and *pad_r are already set from pad op - if(!padEnabled) { - *pad_l = {static_cast(pad_top), static_cast(pad_left)}; - *pad_r = {static_cast(pad_bottom), static_cast(pad_right)}; - } } // Calculate output and pad size of forward Convolution operator. 
diff --git a/tensorflow/core/kernels/mkl_fused_ops_test.cc b/tensorflow/core/kernels/mkl_fused_ops_test.cc index 216e8d0206..e408886861 100644 --- a/tensorflow/core/kernels/mkl_fused_ops_test.cc +++ b/tensorflow/core/kernels/mkl_fused_ops_test.cc @@ -1,4 +1,4 @@ -/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -33,11 +33,11 @@ limitations under the License. namespace tensorflow { -// Helper class for converting MKL tesnors to TF tensor and comparing to +// Helper class for converting MKL tesnors to TF tensors and comparing to // expected values -const uint8 dummy_tensor[] = {0, 0, 0, 0, 0, 0, 0, 0}; -const TensorShape dummy_shape({8}); +static const uint8 dummy_tensor[] = {0, 0, 0, 0, 0, 0, 0, 0}; +static const TensorShape dummy_shape({8}); class ConvMklToTF : public OpsTestBase { public: -- GitLab From 819afabbeda709a94894c894515b62c85d236d50 Mon Sep 17 00:00:00 2001 From: mbhuiyan Date: Mon, 13 Aug 2018 10:44:36 -0700 Subject: [PATCH 008/405] modifying the ifdef INTEL_MKL_ML to INTEL_MKL_ML_ONLY --- tensorflow/core/kernels/mkl_fused_ops_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/kernels/mkl_fused_ops_test.cc b/tensorflow/core/kernels/mkl_fused_ops_test.cc index e408886861..900325ac91 100644 --- a/tensorflow/core/kernels/mkl_fused_ops_test.cc +++ b/tensorflow/core/kernels/mkl_fused_ops_test.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ #ifdef INTEL_MKL -#ifndef INTEL_MKL_ML // We don't support fusion in MKL ML +#ifndef INTEL_MKL_ML_ONLY // We don't support fusion in MKL ML #include "tensorflow/cc/ops/const_op.h" #include "tensorflow/cc/ops/image_ops.h" #include "tensorflow/cc/ops/nn_ops.h" @@ -160,5 +160,5 @@ TEST_F(FusedPadConvOpTest, PaddingConvTestNchw) { Run(DT_FLOAT, image, filter, padding, expected, "NCHW"); } } // namespace tensorflow -#endif // INTEL_MKL_ML +#endif // INTEL_MKL_ML_ONLY #endif // INTEL_MKL -- GitLab From 6b292c27c7ad09a89c8b75c2505e6472b533a4e1 Mon Sep 17 00:00:00 2001 From: mbhuiyan Date: Mon, 13 Aug 2018 15:03:18 -0700 Subject: [PATCH 009/405] formatted as Clang format for Google code compliance, replaced directive INTEL_MKL_ML by INTEL_MKL_ML_ONLY, and merged with master --- tensorflow/core/graph/mkl_layout_pass.cc | 69 +++++++++---------- tensorflow/core/graph/mkl_layout_pass_test.cc | 19 ++--- tensorflow/core/kernels/mkl_conv_ops.cc | 47 +++++++------ tensorflow/core/kernels/mkl_conv_ops.h | 42 +++++------ 4 files changed, 86 insertions(+), 91 deletions(-) diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc index 65b999b193..84e8ea8f70 100644 --- a/tensorflow/core/graph/mkl_layout_pass.cc +++ b/tensorflow/core/graph/mkl_layout_pass.cc @@ -2551,10 +2551,10 @@ class MklLayoutRewritePass : public GraphOptimizationPass { minfo_.push_back({csinfo_.conv2d_grad_filter, csinfo_.bias_add_grad, csinfo_.conv2d_grad_filter_with_bias, GetConv2DBackpropFilterOrBiasAddGrad}); - minfo_.push_back({csinfo_.pad, csinfo_.conv2d, - csinfo_.pad_with_conv2d, GetPadOrConv2D}); - //TODO : Need to check if pad is with zero or not - // if is zero then replace, if not then do not replace + minfo_.push_back( + {csinfo_.pad, csinfo_.conv2d, csinfo_.pad_with_conv2d, GetPadOrConv2D}); + // Merge Pad and Conv2d, only if the pad op is "Pad" + // Doesn't merge if pad op is 
"PadV2" or "MirrorPad" } // Standard interface to run pass @@ -2792,42 +2792,39 @@ class MklLayoutRewritePass : public GraphOptimizationPass { CHECK_NOTNULL(m); Node* n = nullptr; - const Node* conv_node; + const Node* conv_node; if (m->type_string() == csinfo_.pad) { // If m is Pad, then Conv2D is the output of Pad. for (const Edge* e : m->out_edges()) { - if (!e->IsControlEdge() && - e->dst()->type_string() == csinfo_.conv2d) { + if (!e->IsControlEdge() && e->dst()->type_string() == csinfo_.conv2d) { n = e->dst(); - conv_node = n; + conv_node = n; break; } } } else { CHECK_EQ(m->type_string(), csinfo_.conv2d); - // If m is conv2D, Go over all input edges + // If m is conv2D, Go over all input edges // and search for Pad Node. for (const Edge* e : m->in_edges()) { - if (!e->IsControlEdge() && - e->src()->type_string() == csinfo_.pad) { + if (!e->IsControlEdge() && e->src()->type_string() == csinfo_.pad) { n = e->src(); - conv_node = m; + conv_node = m; break; } } } - // Check if only VALID type of padding is used - // or not. + // Check if only VALID type of padding is used + // or not. if (n != nullptr) { string padding; TF_CHECK_OK(GetNodeAttr(conv_node->def(), "padding", &padding)); - if (padding != "VALID") - // Then do not merge. - // Only VALID type of padding in conv op can be + if (padding != "VALID") + // Then do not merge. + // Only VALID type of padding in conv op can be // merged with Pad op. n = nullptr; - } - else { + } else { VLOG(1) << "MklLayoutRewritePass: Could not find matching " << "Pad and Conv2D node for merging. 
Input node: " << m->DebugString(); @@ -3155,7 +3152,8 @@ class MklLayoutRewritePass : public GraphOptimizationPass { static void CopyAttrsFusedBatchNorm(const Node* orig_node, NodeBuilder* nb); static void CopyAttrsLRN(const Node* orig_node, NodeBuilder* nb); static void CopyAttrsPadWithConv2D(const Node* orig_node, NodeBuilder* nb); - static void CopyAttrsFromPadAndConv2D(const Node* orig_node1, const Node* orig_node2, + static void CopyAttrsFromPadAndConv2D(const Node* orig_node1, + const Node* orig_node2, NodeBuilder* nb); static void CopyAttrsPooling(const Node* orig_node, NodeBuilder* nb); static void CopyAttrsReshape(const Node* orig_node, NodeBuilder* nb); @@ -3356,7 +3354,7 @@ int MklLayoutRewritePass::SetUpContiguousInputs( // 2nd input (slot 1) of _MklConv2D and _MklConv2DWithBias. for (const Edge* e : filter_node->out_edges()) { if ((e->dst()->type_string() == csinfo_.mkl_conv2d || - // add check for mkl_pad_with_conv2d + // add check for mkl_pad_with_conv2d e->dst()->type_string() == csinfo_.mkl_pad_with_conv2d || e->dst()->type_string() == csinfo_.mkl_conv2d_with_bias) && e->dst_input() == kConv2DFilterInputSlotIdx @@ -3669,7 +3667,7 @@ void MklLayoutRewritePass::CopyAttrsConv2D(const Node* orig_node, // Used in rinfo when replacing __MklDummyPadWithConv2D by _MklPadWithConv2D void MklLayoutRewritePass::CopyAttrsPadWithConv2D(const Node* orig_node, - NodeBuilder* nb) { + NodeBuilder* nb) { DataType Tpaddings; DataType T; string data_format; @@ -3700,7 +3698,8 @@ void MklLayoutRewritePass::CopyAttrsPadWithConv2D(const Node* orig_node, // Used with MergePadWithConv2D void MklLayoutRewritePass::CopyAttrsFromPadAndConv2D(const Node* orig_node1, - const Node* orig_node2, NodeBuilder* nb) { + const Node* orig_node2, + NodeBuilder* nb) { DataType Tpaddings; DataType T; string data_format; @@ -4095,12 +4094,12 @@ Status MklLayoutRewritePass::MergeConv2DWithBiasAdd(std::unique_ptr* g, } Status MklLayoutRewritePass::MergePadWithConv2D(std::unique_ptr* g, - Node* 
m, Node* n) { + Node* m, Node* n) { CHECK(((m->type_string() == csinfo_.pad && - n->type_string() == csinfo_.conv2d)) || - ((n->type_string() == csinfo_.pad && - m->type_string() == csinfo_.conv2d))); - + n->type_string() == csinfo_.conv2d)) || + ((n->type_string() == csinfo_.pad && + m->type_string() == csinfo_.conv2d))); + // Conv2D is successor node, and Pad predecessor node. Node* pred = m->type_string() == csinfo_.pad ? m : n; Node* succ = m->type_string() == csinfo_.pad ? n : m; @@ -4158,7 +4157,7 @@ Status MklLayoutRewritePass::MergePadWithConv2D(std::unique_ptr* g, // 2. Get inputs from both the nodes. // Pad must have 2 inputs: "input" and paddings. - CHECK_EQ(pred->in_edges().size(), 2); + CHECK_EQ(pred->in_edges().size(), 2); // Conv2D must have 2 inputs: pad output and Filter CHECK_EQ(succ->in_edges().size(), 2); @@ -4174,8 +4173,8 @@ Status MklLayoutRewritePass::MergePadWithConv2D(std::unique_ptr* g, nb.Input(pred_in[1].first, pred_in[1].second); // In2 (paddings) of Pad // Copy attributes from Pad and conv2D to PadWithConv2D. - CopyAttrsFromPadAndConv2D(const_cast(succ), const_cast(pred), - &nb); + CopyAttrsFromPadAndConv2D(const_cast(succ), + const_cast(pred), &nb); // Copy the device assigned to old node to new node. nb.Device(succ->def().device()); @@ -4186,7 +4185,7 @@ Status MklLayoutRewritePass::MergePadWithConv2D(std::unique_ptr* g, CHECK_NOTNULL(new_node); // Incoming data edges from 'pred' node and 'succ' node to new 'new_node' - // node are already copied in BuildNode. + // node are already copied in BuildNode. // We handle control edges now. for (const Edge* e : pred->in_edges()) { if (e->IsControlEdge()) { @@ -4493,10 +4492,10 @@ MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const { } // We make an exception for __MklDummyConv2DWithBias, - // __MklConv2DBackpropFilterWithBias, and __MklDummyPadWithConv2D since their names - // do not match Mkl node names. 
+ // __MklConv2DBackpropFilterWithBias, and __MklDummyPadWithConv2D since their + // names do not match Mkl node names. if (n->type_string() != csinfo_.conv2d_with_bias && - n->type_string() != csinfo_.pad_with_conv2d && + n->type_string() != csinfo_.pad_with_conv2d && n->type_string() != csinfo_.conv2d_grad_filter_with_bias && !mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(n->type_string()), T)) { diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc index 2925b1bde0..d1b39ceeca 100644 --- a/tensorflow/core/graph/mkl_layout_pass_test.cc +++ b/tensorflow/core/graph/mkl_layout_pass_test.cc @@ -2013,11 +2013,11 @@ TEST_F(MklLayoutPassTest, Basic) { } // Test set 0: Pad + Conv2D; padding is VALID -// A = input(image), B = input(paddings), C= Pad = input of conv2D, +// A = input(image), B = input(paddings), C= Pad = input of conv2D, // D=input(filter), E = Conv2D, Z = Zeta // C=Pad(A,B); E=Conv2D(C,D); Z=Zeta(E,Y) // After layout pass -// _MklPadWithConv2D(A, D, B, DMT/_0, DMT/_1, DMT/_2) +// _MklPadWithConv2D(A, D, B, DMT/_0, DMT/_1, DMT/_2) TEST_F(MklLayoutPassTest, NodeMerge_PadWithConv2D_Positive) { CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); InitGraph( @@ -2049,10 +2049,10 @@ TEST_F(MklLayoutPassTest, NodeMerge_PadWithConv2D_Positive) { } // Test set 0: Pad + Conv2D; padding is SAME -// A = input(image), B = input(paddings), C= Pad = input of conv2D, +// A = input(image), B = input(paddings), C= Pad = input of conv2D, // D=input(filter), E = Conv2D, Z = Zeta // C=Pad(A,B); E=Conv2D(C,D); Z=Zeta(E,Y) -// After layout pass - No merging +// After layout pass - No merging TEST_F(MklLayoutPassTest, NodeMerge_PadWithConv2D_Negative) { CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); InitGraph( @@ -2075,11 +2075,12 @@ TEST_F(MklLayoutPassTest, NodeMerge_PadWithConv2D_Negative) { "node { name: 'Z' op: 'Zeta'" " attr {key: 'T' value { type: DT_FLOAT } }" " input: ['E', 
'Y']}"); - EXPECT_EQ(DoMklLayoutOptimizationPass(), - "A(Input);B(Int32Input);C(Pad);D(Input);DMT/_0(Const);DMT/_1(Const);" - "E(_MklConv2D);Y(Input);Z(Zeta)|A->C;B->C:1;C->E;" - "C:control->DMT/_0:control;C:control->DMT/_1:control;" - "D->E:1;DMT/_0->E:2;DMT/_1->E:3;E->Z;Y->Z:1"); + EXPECT_EQ( + DoMklLayoutOptimizationPass(), + "A(Input);B(Int32Input);C(Pad);D(Input);DMT/_0(Const);DMT/_1(Const);" + "E(_MklConv2D);Y(Input);Z(Zeta)|A->C;B->C:1;C->E;" + "C:control->DMT/_0:control;C:control->DMT/_1:control;" + "D->E:1;DMT/_0->E:2;DMT/_1->E:3;E->Z;Y->Z:1"); } // Test set 1: Conv2D + AddBias diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc index 8c8be197f9..7ee9f66810 100644 --- a/tensorflow/core/kernels/mkl_conv_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_ops.cc @@ -963,39 +963,38 @@ class MklConv2DOp : public OpKernel { errors::Aborted("Operation received an exception:", error_msg)); } } - - void PadWithConvFusion(OpKernelContext* context, memory::dims &padding_left, - memory::dims &padding_right){ + + void PadWithConvFusion(OpKernelContext* context, memory::dims& padding_left, + memory::dims& padding_right) { const Tensor& paddings_tf = MklGetInput(context, 2); OP_REQUIRES(context, paddings_tf.dims() == 2, errors::InvalidArgument("paddings must be 2-dimensional: ", paddings_tf.shape().DebugString())); Tpadding* paddings = nullptr; // To get individual pad, need to flatten the tensor - paddings = static_cast(const_cast - (paddings_tf.flat().data())); + paddings = static_cast( + const_cast(paddings_tf.flat().data())); // For NHWC format: - // paddings[0], paddings[1], paddings[6], paddings[7] should be zero + // paddings[0], paddings[1], paddings[6], paddings[7] should be zero // if the paddings_tf is [ [0, 0] [1,2] [3,4] [0,0] ] // paddings = {0, 0, 1, 2, 3, 4, 0, 0} ; flat method is row major // then, values are: top = 1, bottom =2, left=3, right=4 - // For NCHW format: - // paddings[0], paddings[1], paddings[2], 
paddings[3] should be zero + // For NCHW format: + // paddings[0], paddings[1], paddings[2], paddings[3] should be zero // similar explanation as NHWC format will apply. - int64 pad_top, pad_left; - int64 pad_bottom, pad_right; + int64 pad_top, pad_left; + int64 pad_bottom, pad_right; string data_format = ToString(data_format_); - if(data_format == "NHWC"){ - pad_top = paddings[2]; - pad_bottom = paddings[3]; - pad_left = paddings[4]; - pad_right = paddings[5]; - } - else if (data_format == "NCHW"){ - pad_top = paddings[4]; - pad_bottom = paddings[5]; - pad_left = paddings[6]; - pad_right = paddings[7]; + if (data_format == "NHWC") { + pad_top = paddings[2]; + pad_bottom = paddings[3]; + pad_left = paddings[4]; + pad_right = paddings[5]; + } else if (data_format == "NCHW") { + pad_top = paddings[4]; + pad_bottom = paddings[5]; + pad_left = paddings[6]; + pad_right = paddings[7]; } // Create padding arrays for MKL DNN convolutions. // MKL-DNN uses asymetric padding. @@ -1124,13 +1123,13 @@ class MklConv2DOp : public OpKernel { .TypeConstraint("T") \ .TypeConstraint("Tpaddings") \ .Label(mkl_op_registry::kMklOpLabel), \ - MklConv2DOp); \ + MklConv2DOp); \ REGISTER_KERNEL_BUILDER(Name("_MklPadWithConv2D") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("Tpaddings") \ .Label(mkl_op_registry::kMklOpLabel), \ - MklConv2DOp); \ + MklConv2DOp); \ REGISTER_KERNEL_BUILDER(Name("__MklDummyPadWithConv2D") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ @@ -1139,7 +1138,7 @@ class MklConv2DOp : public OpKernel { MklDummyOp); TF_CALL_float(REGISTER_MKL_CPU); -#endif // INTEL_MKL_ML +#endif // INTEL_MKL_ML_ONLY } // namespace tensorflow #endif // INTEL_MKL diff --git a/tensorflow/core/kernels/mkl_conv_ops.h b/tensorflow/core/kernels/mkl_conv_ops.h index 3955bd919d..cd24ae02c4 100644 --- a/tensorflow/core/kernels/mkl_conv_ops.h +++ b/tensorflow/core/kernels/mkl_conv_ops.h @@ -230,9 +230,8 @@ class MklDnnConvUtil { virtual inline void GetOutputAndPadSizeInMklOrder( 
const TensorShape& input_shape, const TensorShape& filter_shape, const memory::dims& strides, const memory::dims& dilations, - memory::dims* output_dims_tf_order, - memory::dims* output_dims_mkl_order, memory::dims* pad_l, - memory::dims* pad_r, bool padEnabled=false) { + memory::dims* output_dims_tf_order, memory::dims* output_dims_mkl_order, + memory::dims* pad_l, memory::dims* pad_r, bool padEnabled = false) { CHECK_NOTNULL(output_dims_tf_order); CHECK_NOTNULL(output_dims_mkl_order); CHECK_NOTNULL(pad_l); @@ -269,17 +268,17 @@ class MklDnnConvUtil { dilation_cols, stride_cols, padding_, &out_cols, &pad_left, &pad_right)); // If padEnabled, i.e., pad and conv op are fused, then - // all pads are already passed from pad op through - // *pad_l and *pad_r - if(padEnabled) { - pad_top = static_cast((*pad_l)[0]); - pad_left = static_cast((*pad_l)[1]); - pad_bottom = static_cast((*pad_r)[0]); - pad_right = static_cast((*pad_r)[1]); - // update the out_rows and out_cols based on all - // sides of the pads coming from pad op. - out_rows = out_rows + (pad_top + pad_bottom ) / stride_rows; - out_cols = out_cols + (pad_left + pad_right ) / stride_cols; + // all pads are already passed from pad op through + // *pad_l and *pad_r + if (padEnabled) { + pad_top = static_cast((*pad_l)[0]); + pad_left = static_cast((*pad_l)[1]); + pad_bottom = static_cast((*pad_r)[0]); + pad_right = static_cast((*pad_r)[1]); + // update the out_rows and out_cols based on all + // sides of the pads coming from pad op. + out_rows = out_rows + (pad_top + pad_bottom) / stride_rows; + out_cols = out_cols + (pad_left + pad_right) / stride_cols; } // Handle padding. MKL-DNN uses asymetric padding. 
// But, if padEnabled, i.e., pad and conv op are fused, @@ -342,10 +341,9 @@ class MklDnnConvUtil { inline void GetConvFwdSizesInMklOrder( const TensorShape& input_shape, const TensorShape& filter_shape, memory::dims* input_dims, memory::dims* filter_dims, - memory::dims* strides, memory::dims *dilations, - memory::dims* output_dims_tf_order, - memory::dims* output_dims_mkl_order, memory::dims* pad_l, - memory::dims* pad_r, bool padEnabled=false) { + memory::dims* strides, memory::dims* dilations, + memory::dims* output_dims_tf_order, memory::dims* output_dims_mkl_order, + memory::dims* pad_l, memory::dims* pad_r, bool padEnabled = false) { CHECK_NOTNULL(input_dims); CHECK_NOTNULL(filter_dims); CHECK_NOTNULL(strides); @@ -361,15 +359,13 @@ class MklDnnConvUtil { if (!context_->status().ok()) return; GetStridesInMklOrder(strides); GetDilationsInMklOrder(dilations); - GetOutputAndPadSizeInMklOrder(input_shape, filter_shape, - *strides, *dilations, - output_dims_tf_order, output_dims_mkl_order, - pad_l, pad_r, padEnabled); + GetOutputAndPadSizeInMklOrder( + input_shape, filter_shape, *strides, *dilations, output_dims_tf_order, + output_dims_mkl_order, pad_l, pad_r, padEnabled); if (!context_->status().ok()) return; } }; - ///////////////////////////////////////////////////////////////////// /// Common class that implements Conv2DBackpropFilter and Input ///////////////////////////////////////////////////////////////////// -- GitLab From 53f2aefe86f8b50addd4b67eb20eb91135b1fac7 Mon Sep 17 00:00:00 2001 From: mbhuiyan Date: Tue, 14 Aug 2018 13:23:16 -0700 Subject: [PATCH 010/405] fixed so that duplicate control edges are no longer allowed; also cleaning up the comments --- tensorflow/core/graph/mkl_layout_pass.cc | 15 ++++++--------- tensorflow/core/kernels/mkl_conv_ops.h | 4 ---- 2 files changed, 6 insertions(+), 13 deletions(-) diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc index 84e8ea8f70..9157080330 100644 ---
a/tensorflow/core/graph/mkl_layout_pass.cc +++ b/tensorflow/core/graph/mkl_layout_pass.cc @@ -4189,16 +4189,14 @@ Status MklLayoutRewritePass::MergePadWithConv2D(std::unique_ptr* g, // We handle control edges now. for (const Edge* e : pred->in_edges()) { if (e->IsControlEdge()) { - // Allow duplicate while adding control edge as it would fail (return - // NULL) if we try to add duplicate edge. - CHECK_NOTNULL((*g)->AddControlEdge(e->src(), new_node, true)); + //Don't allow duplicate edge + (*g)->AddControlEdge(e->src(), new_node, false); } } for (const Edge* e : succ->in_edges()) { if (e->IsControlEdge()) { - // Allow duplicate while adding control edge as it would fail (return - // NULL) if we try to add duplicate edge. - CHECK_NOTNULL((*g)->AddControlEdge(e->src(), new_node, true)); + //Don't allow duplicate edge + (*g)->AddControlEdge(e->src(), new_node, false); } } @@ -4206,9 +4204,8 @@ Status MklLayoutRewritePass::MergePadWithConv2D(std::unique_ptr* g, // First, we will fix outgoing control edges from 'pred' node. for (const Edge* e : pred->out_edges()) { if (e->IsControlEdge()) { - // Allow duplicate while adding control edge as it would fail (return - // NULL) if we try to add duplicate edge. - CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst(), true)); + //Don't allow duplicate edge + (*g)->AddControlEdge(new_node, e->dst(), false); } } diff --git a/tensorflow/core/kernels/mkl_conv_ops.h b/tensorflow/core/kernels/mkl_conv_ops.h index cd24ae02c4..ebaf1a9947 100644 --- a/tensorflow/core/kernels/mkl_conv_ops.h +++ b/tensorflow/core/kernels/mkl_conv_ops.h @@ -300,10 +300,6 @@ class MklDnnConvUtil { mkldnn_sizes[MklDnnDims::Dim_H] = static_cast(out_rows); mkldnn_sizes[MklDnnDims::Dim_W] = static_cast(out_cols); *output_dims_mkl_order = mkldnn_sizes; - - // Now handle padding. MKL-DNN uses asymetric padding. 
- // But, if padEnabled, i.e., pad and conv op are fused, - // then, *pad_l and *pad_r are already set from pad op } // Calculate output and pad size of forward Convolution operator. -- GitLab From f6c9e054a042bf0f518a740380f3f96a28e8c5be Mon Sep 17 00:00:00 2001 From: mbhuiyan Date: Fri, 17 Aug 2018 12:45:06 -0700 Subject: [PATCH 011/405] not allowing duplicate edges, and adding two unit tests in mkl_layout_pass_test to test if common input and common output of pad and conv2D work correctly for pad+conv2D fusion --- tensorflow/core/graph/mkl_layout_pass.cc | 2 +- tensorflow/core/graph/mkl_layout_pass_test.cc | 212 ++++++++++++------ 2 files changed, 143 insertions(+), 71 deletions(-) diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc index 9157080330..6d99e57417 100644 --- a/tensorflow/core/graph/mkl_layout_pass.cc +++ b/tensorflow/core/graph/mkl_layout_pass.cc @@ -4214,7 +4214,7 @@ Status MklLayoutRewritePass::MergePadWithConv2D(std::unique_ptr* g, if (e->IsControlEdge()) { // Allow duplicate while adding control edge as it would fail (return // NULL) if we try to add duplicate edge. - CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst(), true)); + (*g)->AddControlEdge(new_node, e->dst(), false); } else { // Conv2D has only 1 output (at slot 0) and merged node also has only 1 // output (at slot 0).
diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc index d1b39ceeca..248520a7f4 100644 --- a/tensorflow/core/graph/mkl_layout_pass_test.cc +++ b/tensorflow/core/graph/mkl_layout_pass_test.cc @@ -1994,6 +1994,10 @@ REGISTER_OP("_MklInput2") .Output("o: uint8") .Output("o1: uint8") .SetIsStateful(); +REGISTER_OP("Output2") + .Input("i: float") + .Input("i1: float") + .SetIsStateful(); ///////////////////////////////////////////////////////////////////// // Unit tests related to node merge optiimization @@ -2012,76 +2016,6 @@ TEST_F(MklLayoutPassTest, Basic) { "A->C;A->D;B->C:1;B->D:1"); } -// Test set 0: Pad + Conv2D; padding is VALID -// A = input(image), B = input(paddings), C= Pad = input of conv2D, -// D=input(filter), E = Conv2D, Z = Zeta -// C=Pad(A,B); E=Conv2D(C,D); Z=Zeta(E,Y) -// After layout pass -// _MklPadWithConv2D(A, D, B, DMT/_0, DMT/_1, DMT/_2) -TEST_F(MklLayoutPassTest, NodeMerge_PadWithConv2D_Positive) { - CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); - InitGraph( - "node { name: 'A' op: 'Input'}" - "node { name: 'B' op: 'Int32Input'}" - "node { name: 'C' op: 'Pad'" - " attr { key: 'T' value { type: DT_FLOAT } }" - " attr { key: 'Tpaddings' value { type: DT_INT32 } }" - " input: ['A', 'B']}" - "node { name: 'D' op: 'Input'}" - "node { name: 'E' op: 'Conv2D'" - " attr { key: 'T' value { type: DT_FLOAT } }" - " attr { key: 'data_format' value { s: 'NHWC' } }" - " attr { key: 'use_cudnn_on_gpu' value { b: false } }" - " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" - " attr { key: 'padding' value { s: 'VALID' } }" - " attr { key: 'dilations' value { list: {i: 1, i:1, i:1, i:1} } }" - " input: ['C', 'D'] }" - "node { name: 'Y' op: 'Input'}" - "node { name: 'Z' op: 'Zeta'" - " attr {key: 'T' value { type: DT_FLOAT } }" - " input: ['E', 'Y']}"); - EXPECT_EQ(DoMklLayoutOptimizationPass(), - "A(Input);B(Int32Input);D(Input);DMT/_0(Const);DMT/_1(Const);" - 
"DMT/_2(Const);E(_MklPadWithConv2D);Y(Input);Z(Zeta)|A->E;" - "A:control->DMT/_0:control;A:control->DMT/_1:control;" - "A:control->DMT/_2:control;B->E:2;D->E:1;DMT/_0->E:3;DMT/_1->E:4;" - "DMT/_2->E:5;E->Z;Y->Z:1"); -} - -// Test set 0: Pad + Conv2D; padding is SAME -// A = input(image), B = input(paddings), C= Pad = input of conv2D, -// D=input(filter), E = Conv2D, Z = Zeta -// C=Pad(A,B); E=Conv2D(C,D); Z=Zeta(E,Y) -// After layout pass - No merging -TEST_F(MklLayoutPassTest, NodeMerge_PadWithConv2D_Negative) { - CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); - InitGraph( - "node { name: 'A' op: 'Input'}" - "node { name: 'B' op: 'Int32Input'}" - "node { name: 'C' op: 'Pad'" - " attr { key: 'T' value { type: DT_FLOAT } }" - " attr { key: 'Tpaddings' value { type: DT_INT32 } }" - " input: ['A', 'B']}" - "node { name: 'D' op: 'Input'}" - "node { name: 'E' op: 'Conv2D'" - " attr { key: 'T' value { type: DT_FLOAT } }" - " attr { key: 'data_format' value { s: 'NHWC' } }" - " attr { key: 'use_cudnn_on_gpu' value { b: false } }" - " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" - " attr { key: 'padding' value { s: 'SAME' } }" - " attr { key: 'dilations' value { list: {i: 1, i:1, i:1, i:1} } }" - " input: ['C', 'D'] }" - "node { name: 'Y' op: 'Input'}" - "node { name: 'Z' op: 'Zeta'" - " attr {key: 'T' value { type: DT_FLOAT } }" - " input: ['E', 'Y']}"); - EXPECT_EQ( - DoMklLayoutOptimizationPass(), - "A(Input);B(Int32Input);C(Pad);D(Input);DMT/_0(Const);DMT/_1(Const);" - "E(_MklConv2D);Y(Input);Z(Zeta)|A->C;B->C:1;C->E;" - "C:control->DMT/_0:control;C:control->DMT/_1:control;" - "D->E:1;DMT/_0->E:2;DMT/_1->E:3;E->Z;Y->Z:1"); -} // Test set 1: Conv2D + AddBias @@ -2389,6 +2323,144 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_ConvBpropInput_FilterFwd) { "E:3->G:4;F->G;F:control->DMT/_3:control;G->Z;X->Y:1;X->Z:1"); } +// Test set 3: Pad + Conv2D fusion +// padding is VALID type +// A = input(image), B = input(paddings), C= 
Pad = input of conv2D, +// D=input(filter), E = Conv2D, Z = Zeta +// C=Pad(A,B); E=Conv2D(C,D); Z=Zeta(E,Y) +// After layout pass +// _MklPadWithConv2D(A, D, B, DMT/_0, DMT/_1, DMT/_2) +TEST_F(MklLayoutPassTest, NodeMerge_PadWithConv2D_Positive) { + CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); + InitGraph( + "node { name: 'A' op: 'Input'}" + "node { name: 'B' op: 'Int32Input'}" + "node { name: 'C' op: 'Pad'" + " attr { key: 'T' value { type: DT_FLOAT } }" + " attr { key: 'Tpaddings' value { type: DT_INT32 } }" + " input: ['A', 'B']}" + "node { name: 'D' op: 'Input'}" + "node { name: 'E' op: 'Conv2D'" + " attr { key: 'T' value { type: DT_FLOAT } }" + " attr { key: 'data_format' value { s: 'NHWC' } }" + " attr { key: 'use_cudnn_on_gpu' value { b: false } }" + " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" + " attr { key: 'padding' value { s: 'VALID' } }" + " attr { key: 'dilations' value { list: {i: 1, i:1, i:1, i:1} } }" + " input: ['C', 'D'] }" + "node { name: 'Y' op: 'Input'}" + "node { name: 'Z' op: 'Zeta'" + " attr {key: 'T' value { type: DT_FLOAT } }" + " input: ['E', 'Y']}"); + EXPECT_EQ(DoMklLayoutOptimizationPass(), + "A(Input);B(Int32Input);D(Input);DMT/_0(Const);DMT/_1(Const);" + "DMT/_2(Const);E(_MklPadWithConv2D);Y(Input);Z(Zeta)|A->E;" + "A:control->DMT/_0:control;A:control->DMT/_1:control;" + "A:control->DMT/_2:control;B->E:2;D->E:1;DMT/_0->E:3;DMT/_1->E:4;" + "DMT/_2->E:5;E->Z;Y->Z:1"); +} +// Pad + Conv2D fusion with padding is VALID, +// Input node pointing to both Pad and Conv2D +// A = input(image), B = input(paddings), C= Pad +// E = Conv2D, Z = Zeta +// C=Pad(A,B); E=Conv2D(C,A); Z=Zeta(E,Y) +// After layout pass +// _MklPadWithConv2D(A, A, B, DMT/_0, DMT/_1, DMT/_2) +TEST_F(MklLayoutPassTest, NodeMerge_PadWithConv2D_Common_Input) { + CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); + InitGraph( + "node { name: 'A' op: 'Input'}" + "node { name: 'B' op: 'Int32Input'}" + "node { name: 'C' 
op: 'Pad'" + " attr { key: 'T' value { type: DT_FLOAT } }" + " attr { key: 'Tpaddings' value { type: DT_INT32 } }" + " input: ['A', 'B']}" + "node { name: 'E' op: 'Conv2D'" + " attr { key: 'T' value { type: DT_FLOAT } }" + " attr { key: 'data_format' value { s: 'NHWC' } }" + " attr { key: 'use_cudnn_on_gpu' value { b: false } }" + " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" + " attr { key: 'padding' value { s: 'VALID' } }" + " attr { key: 'dilations' value { list: {i: 1, i:1, i:1, i:1} } }" + " input: ['C', 'A'] }" + "node { name: 'Y' op: 'Input'}" + "node { name: 'Z' op: 'Zeta'" + " attr {key: 'T' value { type: DT_FLOAT } }" + " input: ['E', 'Y']}"); + EXPECT_EQ(DoMklLayoutOptimizationPass(), + "A(Input);B(Int32Input);DMT/_0(Const);DMT/_1(Const);" + "DMT/_2(Const);E(_MklPadWithConv2D);Y(Input);Z(Zeta)|A->E;A->E:1;" + "A:control->DMT/_0:control;A:control->DMT/_1:control;" + "A:control->DMT/_2:control;B->E:2;DMT/_0->E:3;DMT/_1->E:4;" + "DMT/_2->E:5;E->Z;Y->Z:1"); +} +// Pad + Conv2D with padding is VALID, +// Input node pointing to both Pad and Conv2D +// Output of both Pad and Conv2D feeds one node (Z as Output2) +// A = input(as image), B = input(as paddings), C= Pad +// E = Conv2D, Z = Output2 +// C=Pad(A,B); E=Conv2D(C,A); Z=Output(C,E) +// After layout pass - No merging, since Pad and Conv2D both +// feed to the same node (Z) +TEST_F(MklLayoutPassTest, NodeMerge_PadWithConv2D_Common_InOutput) { + CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); + InitGraph( + "node { name: 'A' op: 'Input'}" + "node { name: 'B' op: 'Int32Input'}" + "node { name: 'C' op: 'Pad'" + " attr { key: 'T' value { type: DT_FLOAT } }" + " attr { key: 'Tpaddings' value { type: DT_INT32 } }" + " input: ['A', 'B']}" + "node { name: 'E' op: 'Conv2D'" + " attr { key: 'T' value { type: DT_FLOAT } }" + " attr { key: 'data_format' value { s: 'NHWC' } }" + " attr { key: 'use_cudnn_on_gpu' value { b: false } }" + " attr { key: 'strides' value { list: {i: 1, 
i:1, i:1, i:1} } }" + " attr { key: 'padding' value { s: 'VALID' } }" + " attr { key: 'dilations' value { list: {i: 1, i:1, i:1, i:1} } }" + " input: ['C', 'A'] }" + "node { name: 'Z' op: 'Output2'" + " input: ['C', 'E']}"); + EXPECT_EQ(DoMklLayoutOptimizationPass(), + "A(Input);B(Int32Input);C(Pad);DMT/_0(Const);DMT/_1(Const);" + "E(_MklConv2D);Z(Output2)|A->C;A->E:1;B->C:1;C->E;C->Z;" + "C:control->DMT/_0:control;C:control->DMT/_1:control;" + "DMT/_0->E:2;DMT/_1->E:3;E->Z:1"); +} +// Pad + Conv2D; padding is SAME +// A = input(image), B = input(paddings), C= Pad = input of conv2D, +// D=input(filter), E = Conv2D, Z = Zeta +// C=Pad(A,B); E=Conv2D(C,D); Z=Zeta(E,Y) +// After layout pass - No merging +TEST_F(MklLayoutPassTest, NodeMerge_PadWithConv2D_Negative) { + CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); + InitGraph( + "node { name: 'A' op: 'Input'}" + "node { name: 'B' op: 'Int32Input'}" + "node { name: 'C' op: 'Pad'" + " attr { key: 'T' value { type: DT_FLOAT } }" + " attr { key: 'Tpaddings' value { type: DT_INT32 } }" + " input: ['A', 'B']}" + "node { name: 'D' op: 'Input'}" + "node { name: 'E' op: 'Conv2D'" + " attr { key: 'T' value { type: DT_FLOAT } }" + " attr { key: 'data_format' value { s: 'NHWC' } }" + " attr { key: 'use_cudnn_on_gpu' value { b: false } }" + " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" + " attr { key: 'padding' value { s: 'SAME' } }" + " attr { key: 'dilations' value { list: {i: 1, i:1, i:1, i:1} } }" + " input: ['C', 'D'] }" + "node { name: 'Y' op: 'Input'}" + "node { name: 'Z' op: 'Zeta'" + " attr {key: 'T' value { type: DT_FLOAT } }" + " input: ['E', 'Y']}"); + EXPECT_EQ( + DoMklLayoutOptimizationPass(), + "A(Input);B(Int32Input);C(Pad);D(Input);DMT/_0(Const);DMT/_1(Const);" + "E(_MklConv2D);Y(Input);Z(Zeta)|A->C;B->C:1;C->E;" + "C:control->DMT/_0:control;C:control->DMT/_1:control;" + "D->E:1;DMT/_0->E:2;DMT/_1->E:3;E->Z;Y->Z:1"); +} 
///////////////////////////////////////////////////////////////////// // Unit tests related to rewriting node to Mkl node ///////////////////////////////////////////////////////////////////// -- GitLab From f8ec0f101bac066faa2e917ac714ca9eea310eac Mon Sep 17 00:00:00 2001 From: mbhuiyan Date: Thu, 13 Sep 2018 22:40:49 -0700 Subject: [PATCH 012/405] adding checks that pad fusion works only for Conv2D --- tensorflow/core/kernels/mkl_conv_ops.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc index 54670c8521..4b54ce1d52 100644 --- a/tensorflow/core/kernels/mkl_conv_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_ops.cc @@ -900,7 +900,10 @@ class MklConvOp : public OpKernel { bool isConv2D = (strides_.size() == 4); // TODO(Intel-tf) Add check to make sure padEnabled is true only for 2D - + if(!isConv2D){ + OP_REQUIRES(context, padEnabled, + errors::InvalidArgument("Pad+Conv fusion only works for 2D")); + } // Create memory for user data. // Describe how the inputs and outputs of Convolution look like. Also // specify buffers containing actual input and output data. -- GitLab From 0e87ed82815053b4f1c038975382d72282fdf97f Mon Sep 17 00:00:00 2001 From: mbhuiyan Date: Sun, 16 Sep 2018 11:20:26 -0700 Subject: [PATCH 013/405] Adding two unit tests for pad+conv2d fusion. They test that if the two merging ops get a control edge from a common op, the merged node will have only one control edge.
--- tensorflow/core/graph/mkl_layout_pass.cc | 21 +++- tensorflow/core/graph/mkl_layout_pass_test.cc | 111 ++++++++++++++++++ 2 files changed, 128 insertions(+), 4 deletions(-) diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc index ef8a2b0838..d3a4112ee9 100644 --- a/tensorflow/core/graph/mkl_layout_pass.cc +++ b/tensorflow/core/graph/mkl_layout_pass.cc @@ -4192,10 +4192,23 @@ Status MklLayoutRewritePass::MergePadWithConv2D(std::unique_ptr* g, // 2. Get inputs from both the nodes. - // Pad must have 2 inputs: "input" and paddings. - CHECK_EQ(pred->in_edges().size(), 2); - // Conv2D must have 2 inputs: pad output and Filter - CHECK_EQ(succ->in_edges().size(), 2); + // Pad must have 2 data inputs: "input" and paddings. + int PadDataInputEdges = 0; + for (const Edge* e : pred->in_edges()) { + if (!e->IsControlEdge()) { + PadDataInputEdges++; + } + } + CHECK_EQ(PadDataInputEdges, 2); + + // Conv2D must have 2 data inputs: pad output and Filter + int ConvDataInputEdges = 0; + for (const Edge* e : succ->in_edges()) { + if (!e->IsControlEdge()) { + ConvDataInputEdges++; + } + } + CHECK_EQ(ConvDataInputEdges, 2); // We will use the node name of Conv2D as the name of new node // Build new node. 
We use same name as original node, but change the op diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc index 248520a7f4..e9e234010c 100644 --- a/tensorflow/core/graph/mkl_layout_pass_test.cc +++ b/tensorflow/core/graph/mkl_layout_pass_test.cc @@ -1928,6 +1928,13 @@ static void InitGraph(const string& s, Graph* graph, class MklLayoutPassTest : public ::testing::Test { public: MklLayoutPassTest() : graph_(OpRegistry::Global()) {} + // Return Node* from the Node Name + Node* FindNode(const string& name) { + for (Node* node : graph_.nodes()) { + if (node->name() == name) return node; + } + LOG(FATAL) << name; + } void InitGraph(const string& s, const string& device = kCPUDevice) { ::tensorflow::InitGraph(s, &graph_, device); @@ -1998,6 +2005,9 @@ REGISTER_OP("Output2") .Input("i: float") .Input("i1: float") .SetIsStateful(); +REGISTER_OP("Output") + .Input("i: float") + .SetIsStateful(); ///////////////////////////////////////////////////////////////////// // Unit tests related to node merge optiimization @@ -2359,6 +2369,107 @@ TEST_F(MklLayoutPassTest, NodeMerge_PadWithConv2D_Positive) { "A:control->DMT/_2:control;B->E:2;D->E:1;DMT/_0->E:3;DMT/_1->E:4;" "DMT/_2->E:5;E->Z;Y->Z:1"); } +// Test if input control edges do not duplicate after merge. +// If both the merging ops have input control edge from a common op +// then, the merged op will have only one control edge from that +// common op. 
+// padding is VALID type +// A = input(image), A1 = input, B = input(paddings), +// C= Pad = input of conv2D, +// D=input(filter), E = Conv2D, Z = Zeta +// C=Pad(A,B); E=Conv2D(C,D); Z=Zeta(E,Y) +// C:control->A1:control +// E:control->A1:control +// After layout pass +// _MklPadWithConv2D(A, D, B, DMT/_0, DMT/_1, DMT/_2) +// A1:control->E:control +TEST_F(MklLayoutPassTest, Output_ControlEdge_PadWithConv2D_Positive) { + CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); + InitGraph( + "node { name: 'A1' op: 'Input'}" + "node { name: 'A' op: 'Input'}" + "node { name: 'B' op: 'Int32Input'}" + "node { name: 'C' op: 'Pad'" + " attr { key: 'T' value { type: DT_FLOAT } }" + " attr { key: 'Tpaddings' value { type: DT_INT32 } }" + " input: ['A', 'B']}" + "node { name: 'D' op: 'Input'}" + "node { name: 'E' op: 'Conv2D'" + " attr { key: 'T' value { type: DT_FLOAT } }" + " attr { key: 'data_format' value { s: 'NHWC' } }" + " attr { key: 'use_cudnn_on_gpu' value { b: false } }" + " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" + " attr { key: 'padding' value { s: 'VALID' } }" + " attr { key: 'dilations' value { list: {i: 1, i:1, i:1, i:1} } }" + " input: ['C', 'D'] }" + "node { name: 'Y' op: 'Input'}" + "node { name: 'Z' op: 'Zeta'" + " attr {key: 'T' value { type: DT_FLOAT } }" + " input: ['E', 'Y']}"); + Node* a1 = FindNode("A1"); + Node* c = FindNode("C"); + Node* e = FindNode("E"); + const Edge* edge = graph_.AddControlEdge(a1, c); + const Edge* edge_1 = graph_.AddControlEdge(a1, e); + ASSERT_TRUE(edge != nullptr); + ASSERT_TRUE(edge_1 != nullptr); + EXPECT_EQ(DoMklLayoutOptimizationPass(), + "A(Input);A1(Input);B(Int32Input);D(Input);DMT/_0(Const);DMT/_1(Const);" + "DMT/_2(Const);E(_MklPadWithConv2D);Y(Input);Z(Zeta)|A->E;" + "A1:control->E:control;A:control->DMT/_0:control;A:control->DMT/_1:control;" + "A:control->DMT/_2:control;B->E:2;D->E:1;DMT/_0->E:3;DMT/_1->E:4;" + "DMT/_2->E:5;E->Z;Y->Z:1"); +} +// Test if output control edges 
does not duplicate after merge. +// If both the merging ops have output control edge to a common op, +// then after merge, the merged op will have only one control edge +// to that commom op. +// padding is VALID type +// A = input(image), B = input(paddings), C= Pad = input of conv2D, +// D=input(filter), E = Conv2D, Z = Zeta +// C=Pad(A,B); E=Conv2D(C,D); Z=Zeta(E,Y) +// C:control->A1:control +// E:control->A1:control +// After layout pass +// _MklPadWithConv2D(A, D, B, DMT/_0, DMT/_1, DMT/_2) +// E:control->A1:control (only one control edge) +TEST_F(MklLayoutPassTest, ControlEdge_PadWithConv2D_Positive) { + CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); + InitGraph( + "node { name: 'A1' op: 'Input'}" + "node { name: 'A' op: 'Input'}" + "node { name: 'B' op: 'Int32Input'}" + "node { name: 'C' op: 'Pad'" + " attr { key: 'T' value { type: DT_FLOAT } }" + " attr { key: 'Tpaddings' value { type: DT_INT32 } }" + " input: ['A', 'B']}" + "node { name: 'D' op: 'Input'}" + "node { name: 'E' op: 'Conv2D'" + " attr { key: 'T' value { type: DT_FLOAT } }" + " attr { key: 'data_format' value { s: 'NHWC' } }" + " attr { key: 'use_cudnn_on_gpu' value { b: false } }" + " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" + " attr { key: 'padding' value { s: 'VALID' } }" + " attr { key: 'dilations' value { list: {i: 1, i:1, i:1, i:1} } }" + " input: ['C', 'D'] }" + "node { name: 'Y' op: 'Input'}" + "node { name: 'Z' op: 'Zeta'" + " attr {key: 'T' value { type: DT_FLOAT } }" + " input: ['E', 'Y']}"); + Node* a1 = FindNode("A1"); + Node* c = FindNode("C"); + Node* e = FindNode("E"); + const Edge* edge = graph_.AddControlEdge(c, a1); + const Edge* edge_1 = graph_.AddControlEdge(e, a1); + ASSERT_TRUE(edge != nullptr); + ASSERT_TRUE(edge_1 != nullptr); + EXPECT_EQ(DoMklLayoutOptimizationPass(), + "A(Input);A1(Input);B(Int32Input);D(Input);DMT/_0(Const);DMT/_1(Const);" + "DMT/_2(Const);E(_MklPadWithConv2D);Y(Input);Z(Zeta)|A->E;" + 
"A:control->DMT/_0:control;A:control->DMT/_1:control;" + "A:control->DMT/_2:control;B->E:2;D->E:1;DMT/_0->E:3;DMT/_1->E:4;" + "DMT/_2->E:5;E->Z;E:control->A1:control;Y->Z:1"); +} // Pad + Conv2D fusion with padding is VALID, // Input node pointing to both Pad and Conv2D // A = input(image), B = input(paddings), C= Pad -- GitLab From 4e140eed6b4f6722b94cf85432d4519b8c5ce0bf Mon Sep 17 00:00:00 2001 From: mbhuiyan Date: Mon, 17 Sep 2018 10:05:40 -0700 Subject: [PATCH 014/405] changing the name of the unit tests --- tensorflow/core/graph/mkl_layout_pass_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc index e9e234010c..9ad45a2cfd 100644 --- a/tensorflow/core/graph/mkl_layout_pass_test.cc +++ b/tensorflow/core/graph/mkl_layout_pass_test.cc @@ -1928,7 +1928,7 @@ static void InitGraph(const string& s, Graph* graph, class MklLayoutPassTest : public ::testing::Test { public: MklLayoutPassTest() : graph_(OpRegistry::Global()) {} - // Return Node* from the Node Name + // Ashraf added Node* FindNode(const string& name) { for (Node* node : graph_.nodes()) { if (node->name() == name) return node; @@ -2383,7 +2383,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_PadWithConv2D_Positive) { // After layout pass // _MklPadWithConv2D(A, D, B, DMT/_0, DMT/_1, DMT/_2) // A1:control->E:control -TEST_F(MklLayoutPassTest, Output_ControlEdge_PadWithConv2D_Positive) { +TEST_F(MklLayoutPassTest, Input_ControlEdge_PadWithConv2D_Positive) { CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); InitGraph( "node { name: 'A1' op: 'Input'}" -- GitLab From 2a8f7bcc59bc4e36ea88f4187028b4461f5f1072 Mon Sep 17 00:00:00 2001 From: mbhuiyan Date: Tue, 18 Sep 2018 11:28:21 -0700 Subject: [PATCH 015/405] minor change in the two unit tests --- tensorflow/core/graph/mkl_layout_pass_test.cc | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git 
a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc index 9ad45a2cfd..60a7f138c8 100644 --- a/tensorflow/core/graph/mkl_layout_pass_test.cc +++ b/tensorflow/core/graph/mkl_layout_pass_test.cc @@ -2378,11 +2378,11 @@ TEST_F(MklLayoutPassTest, NodeMerge_PadWithConv2D_Positive) { // C= Pad = input of conv2D, // D=input(filter), E = Conv2D, Z = Zeta // C=Pad(A,B); E=Conv2D(C,D); Z=Zeta(E,Y) -// C:control->A1:control -// E:control->A1:control -// After layout pass -// _MklPadWithConv2D(A, D, B, DMT/_0, DMT/_1, DMT/_2) +// A1:control->C:control // A1:control->E:control +// After layout pass: +// _MklPadWithConv2D(A, D, B, DMT/_0, DMT/_1, DMT/_2) +// A1:control->E:control (only one control edge) TEST_F(MklLayoutPassTest, Input_ControlEdge_PadWithConv2D_Positive) { CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); InitGraph( @@ -2411,8 +2411,8 @@ TEST_F(MklLayoutPassTest, Input_ControlEdge_PadWithConv2D_Positive) { Node* e = FindNode("E"); const Edge* edge = graph_.AddControlEdge(a1, c); const Edge* edge_1 = graph_.AddControlEdge(a1, e); - ASSERT_TRUE(edge != nullptr); - ASSERT_TRUE(edge_1 != nullptr); + ASSERT_NE(edge, nullptr); + ASSERT_NE(edge_1, nullptr); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);A1(Input);B(Int32Input);D(Input);DMT/_0(Const);DMT/_1(Const);" "DMT/_2(Const);E(_MklPadWithConv2D);Y(Input);Z(Zeta)|A->E;" @@ -2430,10 +2430,10 @@ TEST_F(MklLayoutPassTest, Input_ControlEdge_PadWithConv2D_Positive) { // C=Pad(A,B); E=Conv2D(C,D); Z=Zeta(E,Y) // C:control->A1:control // E:control->A1:control -// After layout pass +// After layout pass: // _MklPadWithConv2D(A, D, B, DMT/_0, DMT/_1, DMT/_2) // E:control->A1:control (only one control edge) -TEST_F(MklLayoutPassTest, ControlEdge_PadWithConv2D_Positive) { +TEST_F(MklLayoutPassTest, Output_ControlEdge_PadWithConv2D_Positive) { CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); InitGraph( "node { name: 'A1' op: 'Input'}" @@ -2461,8 
+2461,8 @@ TEST_F(MklLayoutPassTest, ControlEdge_PadWithConv2D_Positive) { Node* e = FindNode("E"); const Edge* edge = graph_.AddControlEdge(c, a1); const Edge* edge_1 = graph_.AddControlEdge(e, a1); - ASSERT_TRUE(edge != nullptr); - ASSERT_TRUE(edge_1 != nullptr); + ASSERT_NE(edge, nullptr); + ASSERT_NE(edge_1, nullptr); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);A1(Input);B(Int32Input);D(Input);DMT/_0(Const);DMT/_1(Const);" "DMT/_2(Const);E(_MklPadWithConv2D);Y(Input);Z(Zeta)|A->E;" -- GitLab From 777e6a4e194e4cc141feb6b250702c0e4946ca2d Mon Sep 17 00:00:00 2001 From: wangsiyu Date: Mon, 1 Oct 2018 13:51:36 +0800 Subject: [PATCH 016/405] Make colocations be compatible with DistributionStrategy in SyncReplicasOptimizer --- tensorflow/python/training/sync_replicas_optimizer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/training/sync_replicas_optimizer.py b/tensorflow/python/training/sync_replicas_optimizer.py index 7afaa92699..99d2563fc6 100644 --- a/tensorflow/python/training/sync_replicas_optimizer.py +++ b/tensorflow/python/training/sync_replicas_optimizer.py @@ -27,6 +27,7 @@ from tensorflow.python.ops import state_ops from tensorflow.python.ops import variable_scope from tensorflow.python.ops import variables from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.training import distribution_strategy_context from tensorflow.python.training import optimizer from tensorflow.python.training import queue_runner from tensorflow.python.training import session_manager @@ -245,7 +246,9 @@ class SyncReplicasOptimizer(optimizer.Optimizer): # local_anchor op will be placed on this worker task by default. local_anchor = control_flow_ops.no_op() # Colocating local_step variable prevents it being placed on the PS. 
- with ops.colocate_with(local_anchor): + distribution_strategy = ( + distribution_strategy_context.get_distribution_strategy()) + with distribution_strategy.colocate_vars_with(local_anchor): self._local_step = variable_scope.variable( initial_value=0, trainable=False, -- GitLab From 650172a574504223ec2bdb328ed7c985389313d7 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sat, 13 Oct 2018 20:30:26 +0000 Subject: [PATCH 017/405] Update test case for complex support of squared difference Signed-off-by: Yong Tang --- tensorflow/python/ops/math_ops_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py index 533a00e737..62645230ee 100644 --- a/tensorflow/python/ops/math_ops_test.py +++ b/tensorflow/python/ops/math_ops_test.py @@ -217,7 +217,7 @@ class SquaredDifferenceTest(test_util.TensorFlowTestCase): for dtype in [np.complex64, np.complex128]: x = np.array([[1+3j, 2+2j, 3+1j], [4-1j, 5-2j, 6-3j]], dtype=dtype) y = np.array([-3+1j, -2+2j, -1+3j], dtype=dtype) - z = (x - y) * (x - y) + z = np.conj(x - y) * (x - y) with test_util.device(use_gpu=False): z_tf = self.evaluate(math_ops.squared_difference(x, y)) self.assertAllClose(z, z_tf) -- GitLab From 3a06e557619ebaa5437d1506af058b858806e9c7 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sat, 13 Oct 2018 20:30:47 +0000 Subject: [PATCH 018/405] Update squared difference implementation for complex types. Signed-off-by: Yong Tang --- tensorflow/core/kernels/cwise_ops.h | 32 +++++++++++++++-------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/tensorflow/core/kernels/cwise_ops.h b/tensorflow/core/kernels/cwise_ops.h index 06918075a4..5afb97dc52 100644 --- a/tensorflow/core/kernels/cwise_ops.h +++ b/tensorflow/core/kernels/cwise_ops.h @@ -296,27 +296,31 @@ struct less_equal : std::binary_function { } }; -// Functor that enables composition of multiple Eigen functors. 
-template -struct scalar_compose_op { +// Functor that enables squared difference functor. +template +struct scalar_squared_difference_op { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a, const Scalar& b) const { - return UnaryFunctor()(BinaryFunctor()(a, b)); + const Scalar v = scalar_difference_op()(a, b); + return scalar_product_op()(v, scalar_conjugate_op()(v)); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const { - return UnaryFunctor().packetOp(BinaryFunctor().packetOp(a, b)); + const Packet v = scalar_difference_op().packetOp(a, b); + return scalar_product_op().packetOp(v, scalar_conjugate_op().packetOp(v)); } }; -template -struct functor_traits> { +template +struct functor_traits> { enum { - Cost = functor_traits::Cost + - functor_traits::Cost, - PacketAccess = functor_traits::PacketAccess && - functor_traits::PacketAccess + Cost = functor_traits>::Cost + + functor_traits>::Cost + + functor_traits>::Cost, + PacketAccess = functor_traits>::PacketAccess && + functor_traits>::PacketAccess && + functor_traits>::PacketAccess }; }; @@ -709,7 +713,7 @@ struct rint : base> {}; // pow(x, y) = x ^ y // maximum(x, y) = x > y ? x : y // minimum(x, y) = x < y ? 
x : y -// squared_difference(x, y) = (x - y) * (x - y) +// squared_difference(x, y) = conj(x - y) * (x - y) template struct add : base> { @@ -812,9 +816,7 @@ struct atan2 : base> {}; template struct squared_difference - : base, - Eigen::internal::scalar_difference_op>> {}; + : base> {}; template struct less : base, bool> {}; -- GitLab From 82642d91dbe6fbba87e6a582e396ca91df1f6440 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sat, 13 Oct 2018 20:43:59 +0000 Subject: [PATCH 019/405] Fix `Experimental clang-format Check` error Signed-off-by: Yong Tang --- tensorflow/core/kernels/cwise_ops.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/kernels/cwise_ops.h b/tensorflow/core/kernels/cwise_ops.h index 5afb97dc52..2682a25868 100644 --- a/tensorflow/core/kernels/cwise_ops.h +++ b/tensorflow/core/kernels/cwise_ops.h @@ -308,7 +308,8 @@ struct scalar_squared_difference_op { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const { const Packet v = scalar_difference_op().packetOp(a, b); - return scalar_product_op().packetOp(v, scalar_conjugate_op().packetOp(v)); + return scalar_product_op().packetOp( + v, scalar_conjugate_op().packetOp(v)); } }; -- GitLab From c0814e3861c0b88caebc24d4ef1ce5e61a213f2e Mon Sep 17 00:00:00 2001 From: Rholais Lii Date: Tue, 23 Oct 2018 13:44:40 +0800 Subject: [PATCH 020/405] Fix comments to match usage Fix comments of `sparse_softmax_cross_entropy_with_logits` to match usage. --- tensorflow/python/ops/nn_ops.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py index e31d162285..2477271a22 100644 --- a/tensorflow/python/ops/nn_ops.py +++ b/tensorflow/python/ops/nn_ops.py @@ -1980,8 +1980,9 @@ def sparse_softmax_cross_entropy_with_logits( on `logits` internally for efficiency. Do not call this op with the output of `softmax`, as it will produce incorrect results. 
- A common use case is to have logits and labels of shape - `[batch_size, num_classes]`, but higher dimensions are supported, in which + A common use case is to have logits of shape + `[batch_size, num_classes]` and have labels of shape + `[batch_size]`, but higher dimensions are supported, in which case the `dim`-th dimension is assumed to be of size `num_classes`. `logits` must have the dtype of `float16`, `float32`, or `float64`, and `labels` must have the dtype of `int32` or `int64`. -- GitLab From 67e2c47e2ff1222e141c51ac2794aa9b9207a573 Mon Sep 17 00:00:00 2001 From: Matt Conley Date: Wed, 24 Oct 2018 13:30:35 -0700 Subject: [PATCH 021/405] Disable denormal test on ARM until the architecture is supported. --- tensorflow/python/kernel_tests/denormal_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/kernel_tests/denormal_test.py b/tensorflow/python/kernel_tests/denormal_test.py index 71a528c4aa..9f65a18c14 100644 --- a/tensorflow/python/kernel_tests/denormal_test.py +++ b/tensorflow/python/kernel_tests/denormal_test.py @@ -35,8 +35,8 @@ class DenormalTest(test.TestCase): self.assertEqual(tiny, tiny / 16 * 16) def _flushDenormalsTest(self, use_gpu, dtypes): - if platform.machine() == "ppc64le" or platform.machine() == "s390x": - # Disabled denormal_test on power/s390x platform + if platform.machine() == "ppc64le" or platform.machine() == "s390x" or platform.machine() == "aarch64": + # Disabled denormal_test on power/s390x/aarch64 platform # Check relevant discussion - https://github.com/tensorflow/tensorflow/issues/11902 return with self.cached_session(use_gpu=use_gpu): -- GitLab From 59617ccaca8c5980f5418a0b612b040ac8d1afba Mon Sep 17 00:00:00 2001 From: Ouwen Huang Date: Tue, 30 Oct 2018 05:37:22 +0000 Subject: [PATCH 022/405] Added note on weight decay for tf.contrib.opt optimizers. 
--- .../python/training/weight_decay_optimizers.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tensorflow/contrib/opt/python/training/weight_decay_optimizers.py b/tensorflow/contrib/opt/python/training/weight_decay_optimizers.py index 200b0d2008..1e8351b70f 100644 --- a/tensorflow/contrib/opt/python/training/weight_decay_optimizers.py +++ b/tensorflow/contrib/opt/python/training/weight_decay_optimizers.py @@ -59,6 +59,23 @@ class DecoupledWeightDecayExtension(object): Note that this extension decays weights BEFORE applying the update based on the gradient, i.e. this extension only has the desired behaviour for optimizers which do not depend on the value of'var' in the update step! + + Note: when applying a decay to the learning rate, be sure to manually apply + the decay to the `weight_decay` as well. For example: + + ```python + decay = tf.train.piecewise_constant(tf.train.get_global_step(), + [10000, 15000], [1e-1, 1e-2, 1e-3]) + lr = 1*decay + wd = 1e-4*decay + + # ... 
+ + optimizer = tf.contrib.opt.MomentumWOptimizer(learning_rate=lr, + weight_decay=wd, + momentum=0.9, + use_nesterov=True) + ``` """ def __init__(self, weight_decay, **kwargs): -- GitLab From 8e4ec9ae62135adbc523470af1546c178a7f97c5 Mon Sep 17 00:00:00 2001 From: frreiss Date: Tue, 6 Nov 2018 12:55:29 -0800 Subject: [PATCH 023/405] Add missing random seed field to OrderedEnqueuer Fix whitespace Simplify changes Simplify changeset --- tensorflow/python/keras/utils/data_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/keras/utils/data_utils.py b/tensorflow/python/keras/utils/data_utils.py index 01a9d61a84..8e36d4dea7 100644 --- a/tensorflow/python/keras/utils/data_utils.py +++ b/tensorflow/python/keras/utils/data_utils.py @@ -598,7 +598,7 @@ class OrderedEnqueuer(SequenceEnqueuer): def pool_fn(seqs): return multiprocessing.Pool(workers, initializer=init_pool_generator, - initargs=(seqs, self.random_seed)) + initargs=(seqs, None)) return pool_fn def _wait_queue(self): -- GitLab From 3ea1267b9758fdc5582948805cdd852b09f21f6b Mon Sep 17 00:00:00 2001 From: dianlujitao Date: Wed, 7 Nov 2018 13:24:28 +0800 Subject: [PATCH 024/405] Install abseil headers to cmake shared library build * Since commit 5f004516 tensorflow::StringPiece is replaced by absl::string_view, so abseil headers should be installed to shared library build to fix compilation error for out-of-source build. * To cleanly copy abseil headers, disable in source build for abseil to avoid src tree being polluted by cmake generated files. * Meanwhile, remove _build suffix from abseil_cpp product name since it's confusing.
--- .../contrib/cmake/external/abseil_cpp.cmake | 15 ++++++--------- tensorflow/contrib/cmake/tf_shared_lib.cmake | 4 ++++ 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/tensorflow/contrib/cmake/external/abseil_cpp.cmake b/tensorflow/contrib/cmake/external/abseil_cpp.cmake index 4546dbdecc..b0fee24448 100644 --- a/tensorflow/contrib/cmake/external/abseil_cpp.cmake +++ b/tensorflow/contrib/cmake/external/abseil_cpp.cmake @@ -31,17 +31,17 @@ if (systemlib_ABSEIL_CPP) message(STATUS " abseil_cpp includes: ${ABSEIL_CPP_INCLUDE_DIR}") message(STATUS " abseil_cpp libraries: ${ABSEIL_CPP_LIBRARIES}") - add_custom_target(abseil_cpp_build) - list(APPEND tensorflow_EXTERNAL_DEPENDENCIES abseil_cpp_build) + add_custom_target(abseil_cpp) + list(APPEND tensorflow_EXTERNAL_DEPENDENCIES abseil_cpp) else (systemlib_ABSEIL_CPP) include (ExternalProject) - set(abseil_cpp_INCLUDE_DIR ${CMAKE_BINARY_DIR}/abseil_cpp/src/abseil_cpp_build) + set(abseil_cpp_INCLUDE_DIR ${CMAKE_BINARY_DIR}/abseil_cpp/src/abseil_cpp) set(abseil_cpp_URL https://github.com/abseil/abseil-cpp/archive/e01d95528ea2137a4a27a88d1f57c6cb260aafed.tar.gz) set(abseil_cpp_HASH SHA256=84043ed402d2a2a6ba4cdddb7e85118b1158fd81fe4ac3a14adc343d054c1e2e) - set(abseil_cpp_BUILD ${CMAKE_BINARY_DIR}/abseil_cpp/src/abseil_cpp_build) + set(abseil_cpp_BUILD ${CMAKE_BINARY_DIR}/abseil_cpp/src/abseil_cpp-build) if(WIN32) if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*") @@ -80,15 +80,12 @@ else (systemlib_ABSEIL_CPP) ${abseil_cpp_BUILD}/absl/types/libabsl_bad_optional_access.a) endif() - ExternalProject_Add(abseil_cpp_build + ExternalProject_Add(abseil_cpp PREFIX abseil_cpp URL ${abseil_cpp_URL} URL_HASH ${abseil_cpp_HASH} DOWNLOAD_DIR "${DOWNLOAD_LOCATION}" - BUILD_IN_SOURCE 1 BUILD_BYPRODUCTS ${abseil_cpp_STATIC_LIBRARIES} - BUILD_COMMAND ${CMAKE_COMMAND} --build . --config Release - COMMAND ${CMAKE_COMMAND} --build . 
--config Release INSTALL_COMMAND "" CMAKE_CACHE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=${tensorflow_ENABLE_POSITION_INDEPENDENT_CODE} @@ -99,6 +96,6 @@ else (systemlib_ABSEIL_CPP) include_directories(${abseil_cpp_INCLUDE_DIR}) list(APPEND tensorflow_EXTERNAL_LIBRARIES ${abseil_cpp_STATIC_LIBRARIES}) - list(APPEND tensorflow_EXTERNAL_DEPENDENCIES abseil_cpp_build) + list(APPEND tensorflow_EXTERNAL_DEPENDENCIES abseil_cpp) endif (systemlib_ABSEIL_CPP) diff --git a/tensorflow/contrib/cmake/tf_shared_lib.cmake b/tensorflow/contrib/cmake/tf_shared_lib.cmake index fdf522f1fd..c1bdc35fc6 100644 --- a/tensorflow/contrib/cmake/tf_shared_lib.cmake +++ b/tensorflow/contrib/cmake/tf_shared_lib.cmake @@ -145,6 +145,10 @@ install(DIRECTORY ${tensorflow_source_dir}/third_party/eigen3/ # unsupported Eigen directory install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/eigen/src/eigen/unsupported/Eigen/ DESTINATION include/unsupported/Eigen) +# absl directory +install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/abseil_cpp/src/abseil_cpp/absl/ + DESTINATION include/absl + FILES_MATCHING PATTERN "*.h") # mkl if (tensorflow_ENABLE_MKL_SUPPORT) install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/mkl/src/mkl/include/ -- GitLab From c806b163c4d52cf80daecf2d63e3b76c7dc696e6 Mon Sep 17 00:00:00 2001 From: Nutti Date: Sat, 10 Nov 2018 10:41:09 +0900 Subject: [PATCH 025/405] OptimizationPass::POST_REWRITE_FOR_EXEC after Grappler optimization in PartitionedCallOp --- tensorflow/core/kernels/partitioned_function_ops.cc | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tensorflow/core/kernels/partitioned_function_ops.cc b/tensorflow/core/kernels/partitioned_function_ops.cc index 71e506e5e6..72310f33ae 100644 --- a/tensorflow/core/kernels/partitioned_function_ops.cc +++ b/tensorflow/core/kernels/partitioned_function_ops.cc @@ -166,12 +166,6 @@ class PartitionedCallOp : public AsyncOpKernel { OptimizationPassRegistry::Global()->RunGrouping( OptimizationPassRegistry::POST_PLACEMENT, 
optimization_options), done); - OP_REQUIRES_OK_ASYNC( - ctx, - OptimizationPassRegistry::Global()->RunGrouping( - OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, - optimization_options), - done); Device* cpu_device; OP_REQUIRES_OK_ASYNC( @@ -184,6 +178,13 @@ class PartitionedCallOp : public AsyncOpKernel { device_set, cpu_device, &graph), done); + OP_REQUIRES_OK_ASYNC( + ctx, + OptimizationPassRegistry::Global()->RunGrouping( + OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, + optimization_options), + done); + std::unordered_map> subgraphs; OP_REQUIRES_OK_ASYNC( ctx, PartitionHelper(device_set, std::move(graph), &subgraphs), -- GitLab From 7d96d6fbd3cdbe215c9dce78f8227ef273b5d37a Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sat, 10 Nov 2018 20:28:31 +0000 Subject: [PATCH 026/405] Fix ValueError by image.transform in eager mode This fix tries to address the issue raised in 23654 where in eager mode tf.contrib.image.transform will throw out ``` ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all() ``` This fix addresses the issue. This fix fixes 23654. 
Signed-off-by: Yong Tang --- tensorflow/contrib/image/python/ops/image_ops.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tensorflow/contrib/image/python/ops/image_ops.py b/tensorflow/contrib/image/python/ops/image_ops.py index d4fb99a017..b25a6f7b57 100644 --- a/tensorflow/contrib/image/python/ops/image_ops.py +++ b/tensorflow/contrib/image/python/ops/image_ops.py @@ -17,6 +17,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from tensorflow.python.eager import context from tensorflow.contrib.image.ops import gen_image_ops from tensorflow.contrib.util import loader from tensorflow.python.framework import common_shapes @@ -271,8 +272,11 @@ def transform(images, raise TypeError("Images should have rank between 2 and 4.") if output_shape is None: - output_shape = tensor_util.constant_value( - array_ops.shape(images)[1:3]) or array_ops.shape(images)[1:3] + output_shape = array_ops.shape(images)[1:3] + if not context.executing_eagerly(): + output_shape_value = tensor_util.constant_value(output_shape) + if output_shape_value is not None: + output_shape = output_shape_value output_shape = ops.convert_to_tensor( output_shape, dtypes.int32, name="output_shape") -- GitLab From 38455b2e111fa1acef3dd7dd00517b3fc1f1c38f Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sat, 10 Nov 2018 20:30:22 +0000 Subject: [PATCH 027/405] Add test case for image.transform in eager mode Signed-off-by: Yong Tang --- .../contrib/image/python/kernel_tests/image_ops_test.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py b/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py index 4997c31a7f..ebf8a8adb3 100644 --- a/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py +++ b/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py @@ -281,6 +281,14 @@ class ImageOpsTest(test_util.TensorFlowTestCase): 
value.eval(), np.array([[4, 4], [4, 4]]).astype(dtype.as_numpy_dtype())) + @test_util.run_in_graph_and_eager_modes + def test_transform_eager(self): + image = constant_op.constant([[1., 2.], [3., 4.]]) + value = image_ops.transform(image, [1] * 8) + with self.test_session(use_gpu=True): + self.assertAllEqual( + self.evaluate(value), np.array([[4, 4], [4, 4]])) + class BipartiteMatchTest(test_util.TensorFlowTestCase): -- GitLab From c50685de2d680d7e76e4b586e14138f33272a9cb Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sat, 10 Nov 2018 20:33:28 +0000 Subject: [PATCH 028/405] Pylint fix Signed-off-by: Yong Tang --- tensorflow/contrib/image/python/kernel_tests/image_ops_test.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py b/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py index ebf8a8adb3..ba5cdfebf9 100644 --- a/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py +++ b/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py @@ -286,8 +286,7 @@ class ImageOpsTest(test_util.TensorFlowTestCase): image = constant_op.constant([[1., 2.], [3., 4.]]) value = image_ops.transform(image, [1] * 8) with self.test_session(use_gpu=True): - self.assertAllEqual( - self.evaluate(value), np.array([[4, 4], [4, 4]])) + self.assertAllEqual(self.evaluate(value), np.array([[4, 4], [4, 4]])) class BipartiteMatchTest(test_util.TensorFlowTestCase): -- GitLab From ada03a97ab77bdd58cd96d3a3fdca490278165d4 Mon Sep 17 00:00:00 2001 From: Abhinav Upadhyay Date: Wed, 14 Nov 2018 18:34:43 +0530 Subject: [PATCH 029/405] Fix a TypeError We cannot concatenate string and FailedPreConditionError --- tensorflow/contrib/tpu/python/tpu/keras_support.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/contrib/tpu/python/tpu/keras_support.py b/tensorflow/contrib/tpu/python/tpu/keras_support.py index 08f58a5f5b..ac72e3f55d 100644 --- 
a/tensorflow/contrib/tpu/python/tpu/keras_support.py +++ b/tensorflow/contrib/tpu/python/tpu/keras_support.py @@ -132,7 +132,7 @@ def _tpu_session_context(): An error occurred connecting or initializing your TPU. The session has been reset. re-run keras_to_tpu_model to create a new session. -""" + e) +""" + str(e)) def setup_tpu_session(cluster_resolver): -- GitLab From 71ba0ec86e7cde759006f17979f71e863602182c Mon Sep 17 00:00:00 2001 From: Ouwen Huang Date: Thu, 15 Nov 2018 14:28:48 -0500 Subject: [PATCH 030/405] Update weight_decay_optimizers.py --- .../opt/python/training/weight_decay_optimizers.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/contrib/opt/python/training/weight_decay_optimizers.py b/tensorflow/contrib/opt/python/training/weight_decay_optimizers.py index 1e8351b70f..8b8065c678 100644 --- a/tensorflow/contrib/opt/python/training/weight_decay_optimizers.py +++ b/tensorflow/contrib/opt/python/training/weight_decay_optimizers.py @@ -64,10 +64,10 @@ class DecoupledWeightDecayExtension(object): the decay to the `weight_decay` as well. For example: ```python - decay = tf.train.piecewise_constant(tf.train.get_global_step(), - [10000, 15000], [1e-1, 1e-2, 1e-3]) - lr = 1*decay - wd = 1e-4*decay + schedule = tf.train.piecewise_constant(tf.train.get_global_step(), + [10000, 15000], [1e-0, 1e-1, 1e-2]) + lr = 1e-1 * schedule() + wd = lambda: 1e-4 * schedule() # ... 
-- GitLab From 489e181be77b83b2b631f48968aaf40897001838 Mon Sep 17 00:00:00 2001 From: Siju Date: Fri, 16 Nov 2018 11:56:50 +0530 Subject: [PATCH 031/405] Update graph_transformations.h --- .../lite/toco/graph_transformations/graph_transformations.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/toco/graph_transformations/graph_transformations.h b/tensorflow/lite/toco/graph_transformations/graph_transformations.h index 73a90c8239..187b584b69 100644 --- a/tensorflow/lite/toco/graph_transformations/graph_transformations.h +++ b/tensorflow/lite/toco/graph_transformations/graph_transformations.h @@ -139,7 +139,7 @@ DECLARE_GRAPH_TRANSFORMATION(MakeInitialDequantizeOperator) DECLARE_GRAPH_TRANSFORMATION(MoveBinaryOperatorBeforeReshape) DECLARE_GRAPH_TRANSFORMATION(PropagateActivationFunctionIntoConstants) DECLARE_GRAPH_TRANSFORMATION(PropagateArrayDataTypes) -DECLARE_GRAPH_TRANSFORMATION(PropagateFakeQuantNumBits); +DECLARE_GRAPH_TRANSFORMATION(PropagateFakeQuantNumBits) DECLARE_GRAPH_TRANSFORMATION(PropagateFixedSizes) DECLARE_GRAPH_TRANSFORMATION(HardcodeMinMax) DECLARE_GRAPH_TRANSFORMATION(Quantize) -- GitLab From 27171a09e5812d3c8d237c69aa5d53250e7f1696 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sun, 18 Nov 2018 01:28:50 +0000 Subject: [PATCH 032/405] Fix deprecated div While running test I noticed the following warning: ``` WARNING:tensorflow:From /usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/nn_ops.py:2744: div (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version. Instructions for updating: Deprecated in favor of operator or tf.math.divide. ``` This fix fixes the deprecated warning. 
Signed-off-by: Yong Tang --- tensorflow/python/ops/nn_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py index 21008fc392..223a37c87e 100644 --- a/tensorflow/python/ops/nn_ops.py +++ b/tensorflow/python/ops/nn_ops.py @@ -2741,7 +2741,7 @@ def dropout(x, keep_prob, noise_shape=None, seed=None, name=None): # pylint: di noise_shape, seed=seed, dtype=x.dtype) # 0. if [keep_prob, 1.0) and 1. if [1.0, 1.0 + keep_prob) binary_tensor = math_ops.floor(random_tensor) - ret = math_ops.div(x, keep_prob) * binary_tensor + ret = math_ops.divide(x, keep_prob) * binary_tensor if not context.executing_eagerly(): ret.set_shape(x.get_shape()) return ret -- GitLab From e320fba1e9349dee60ba1e06e1f6bbc08c2a85c1 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Thu, 22 Nov 2018 20:47:59 +0000 Subject: [PATCH 033/405] Update re2 library to 2018-10-01 This fix updates re2 library to the latest release of 2018-10-01 Signed-off-by: Yong Tang --- tensorflow/workspace.bzl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 7ad094c507..065a695453 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -168,12 +168,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): tf_http_archive( name = "com_googlesource_code_re2", - sha256 = "803c7811146edeef8f91064de37c6f19136ff01a2a8cdb3230e940b2fd9f07fe", - strip_prefix = "re2-2018-07-01", + sha256 = "a31397714a353587413d307337d0b58f8a2e20e2b9d02f2e24e3463fa4eeda81", + strip_prefix = "re2-2018-10-01", system_build_file = clean_dep("//third_party/systemlibs:re2.BUILD"), urls = [ - "https://mirror.bazel.build/github.com/google/re2/archive/2018-07-01.tar.gz", - "https://github.com/google/re2/archive/2018-07-01.tar.gz", + "https://mirror.bazel.build/github.com/google/re2/archive/2018-10-01.tar.gz", + "https://github.com/google/re2/archive/2018-10-01.tar.gz", ], ) -- GitLab 
From 48809b87793882266f01b7b40bc9e4a6e0f18f57 Mon Sep 17 00:00:00 2001 From: AG Ramesh Date: Fri, 23 Nov 2018 11:44:19 -0800 Subject: [PATCH 034/405] Fixed merge errors and clang format issues --- tensorflow/core/graph/mkl_layout_pass.cc | 6 +- tensorflow/core/graph/mkl_layout_pass_test.cc | 47 +++-- tensorflow/core/kernels/mkl_conv_ops.cc | 180 +++++++++--------- tensorflow/core/kernels/mkl_conv_ops.h | 14 +- tensorflow/core/kernels/mkl_fused_ops_test.cc | 2 - tensorflow/core/ops/nn_ops.cc | 3 +- 6 files changed, 121 insertions(+), 131 deletions(-) diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc index 8de0fc6083..de1a982b9d 100644 --- a/tensorflow/core/graph/mkl_layout_pass.cc +++ b/tensorflow/core/graph/mkl_layout_pass.cc @@ -2289,13 +2289,13 @@ Status MklLayoutRewritePass::MergePadWithConv2D(std::unique_ptr* g, // We handle control edges now. for (const Edge* e : pred->in_edges()) { if (e->IsControlEdge()) { - //Don't allow duplicate edge + // Don't allow duplicate edge (*g)->AddControlEdge(e->src(), new_node, false); } } for (const Edge* e : succ->in_edges()) { if (e->IsControlEdge()) { - //Don't allow duplicate edge + // Don't allow duplicate edge (*g)->AddControlEdge(e->src(), new_node, false); } } @@ -2304,7 +2304,7 @@ Status MklLayoutRewritePass::MergePadWithConv2D(std::unique_ptr* g, // First, we will fix outgoing control edges from 'pred' node. 
for (const Edge* e : pred->out_edges()) { if (e->IsControlEdge()) { - //Don't allow duplicate edge + // Don't allow duplicate edge (*g)->AddControlEdge(new_node, e->dst(), false); } } diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc index 0c8d7f7dbb..fa059f1194 100644 --- a/tensorflow/core/graph/mkl_layout_pass_test.cc +++ b/tensorflow/core/graph/mkl_layout_pass_test.cc @@ -138,13 +138,8 @@ REGISTER_OP("_MklInput2") .Output("o: uint8") .Output("o1: uint8") .SetIsStateful(); -REGISTER_OP("Output2") - .Input("i: float") - .Input("i1: float") - .SetIsStateful(); -REGISTER_OP("Output") - .Input("i: float") - .SetIsStateful(); +REGISTER_OP("Output2").Input("i: float").Input("i1: float").SetIsStateful(); +REGISTER_OP("Output").Input("i: float").SetIsStateful(); ///////////////////////////////////////////////////////////////////// // Unit tests related to node merge optiimization @@ -163,7 +158,6 @@ TEST_F(MklLayoutPassTest, Basic) { "A->C;A->D;B->C:1;B->D:1"); } - // Test set 1: Conv2D + AddBias // C=Conv2D(A,B); E=BiasAdd(C,D); Z=Zeta(E,Y) @@ -470,7 +464,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_ConvBpropInput_FilterFwd) { "E:3->G:4;F->G;F:control->DMT/_3:control;G->Z;X->Y:1;X->Z:1"); } -// Test set 3: Pad + Conv2D fusion +// Test set 3: Pad + Conv2D fusion // padding is VALID type // A = input(image), B = input(paddings), C= Pad = input of conv2D, // D=input(filter), E = Conv2D, Z = Zeta @@ -508,10 +502,10 @@ TEST_F(MklLayoutPassTest, NodeMerge_PadWithConv2D_Positive) { } // Test if input control edges do not duplicate after merge. // If both the merging ops have input control edge from a common op -// then, the merged op will have only one control edge from that +// then, the merged op will have only one control edge from that // common op. 
// padding is VALID type -// A = input(image), A1 = input, B = input(paddings), +// A = input(image), A1 = input, B = input(paddings), // C= Pad = input of conv2D, // D=input(filter), E = Conv2D, Z = Zeta // C=Pad(A,B); E=Conv2D(C,D); Z=Zeta(E,Y) @@ -550,12 +544,14 @@ TEST_F(MklLayoutPassTest, Input_ControlEdge_PadWithConv2D_Positive) { const Edge* edge_1 = graph_.AddControlEdge(a1, e); ASSERT_NE(edge, nullptr); ASSERT_NE(edge_1, nullptr); - EXPECT_EQ(DoMklLayoutOptimizationPass(), - "A(Input);A1(Input);B(Int32Input);D(Input);DMT/_0(Const);DMT/_1(Const);" - "DMT/_2(Const);E(_MklPadWithConv2D);Y(Input);Z(Zeta)|A->E;" - "A1:control->E:control;A:control->DMT/_0:control;A:control->DMT/_1:control;" - "A:control->DMT/_2:control;B->E:2;D->E:1;DMT/_0->E:3;DMT/_1->E:4;" - "DMT/_2->E:5;E->Z;Y->Z:1"); + EXPECT_EQ( + DoMklLayoutOptimizationPass(), + "A(Input);A1(Input);B(Int32Input);D(Input);DMT/_0(Const);DMT/_1(Const);" + "DMT/_2(Const);E(_MklPadWithConv2D);Y(Input);Z(Zeta)|A->E;" + "A1:control->E:control;A:control->DMT/_0:control;A:control->DMT/" + "_1:control;" + "A:control->DMT/_2:control;B->E:2;D->E:1;DMT/_0->E:3;DMT/_1->E:4;" + "DMT/_2->E:5;E->Z;Y->Z:1"); } // Test if output control edges does not duplicate after merge. 
// If both the merging ops have output control edge to a common op, @@ -600,16 +596,17 @@ TEST_F(MklLayoutPassTest, Output_ControlEdge_PadWithConv2D_Positive) { const Edge* edge_1 = graph_.AddControlEdge(e, a1); ASSERT_NE(edge, nullptr); ASSERT_NE(edge_1, nullptr); - EXPECT_EQ(DoMklLayoutOptimizationPass(), - "A(Input);A1(Input);B(Int32Input);D(Input);DMT/_0(Const);DMT/_1(Const);" - "DMT/_2(Const);E(_MklPadWithConv2D);Y(Input);Z(Zeta)|A->E;" - "A:control->DMT/_0:control;A:control->DMT/_1:control;" - "A:control->DMT/_2:control;B->E:2;D->E:1;DMT/_0->E:3;DMT/_1->E:4;" - "DMT/_2->E:5;E->Z;E:control->A1:control;Y->Z:1"); + EXPECT_EQ( + DoMklLayoutOptimizationPass(), + "A(Input);A1(Input);B(Int32Input);D(Input);DMT/_0(Const);DMT/_1(Const);" + "DMT/_2(Const);E(_MklPadWithConv2D);Y(Input);Z(Zeta)|A->E;" + "A:control->DMT/_0:control;A:control->DMT/_1:control;" + "A:control->DMT/_2:control;B->E:2;D->E:1;DMT/_0->E:3;DMT/_1->E:4;" + "DMT/_2->E:5;E->Z;E:control->A1:control;Y->Z:1"); } // Pad + Conv2D fusion with padding is VALID, // Input node pointing to both Pad and Conv2D -// A = input(image), B = input(paddings), C= Pad +// A = input(image), B = input(paddings), C= Pad // E = Conv2D, Z = Zeta // C=Pad(A,B); E=Conv2D(C,A); Z=Zeta(E,Y) // After layout pass @@ -645,7 +642,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_PadWithConv2D_Common_Input) { // Pad + Conv2D with padding is VALID, // Input node pointing to both Pad and Conv2D // Output of both Pad and Conv2D feeds one node (Z as Output2) -// A = input(as image), B = input(as paddings), C= Pad +// A = input(as image), B = input(as paddings), C= Pad // E = Conv2D, Z = Output2 // C=Pad(A,B); E=Conv2D(C,A); Z=Output(C,E) // After layout pass - No merging, since Pad and Conv2D both diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc index cfc36d1495..9193d00592 100644 --- a/tensorflow/core/kernels/mkl_conv_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_ops.cc @@ -465,19 +465,18 @@ class 
MklConvOp : public OpKernel { filter.shape().DebugString())); for (int i = 0; i < 3; i++) { - OP_REQUIRES( - context, - FastBoundsCheck(filter.dim_size(i), std::numeric_limits::max()), - errors::InvalidArgument("filter too large")); + OP_REQUIRES(context, FastBoundsCheck(filter.dim_size(i), + std::numeric_limits::max()), + errors::InvalidArgument("filter too large")); } const int64 input_depth = input_in_mkl_format ? GetMklTensorDim(mkl_context.input_shape, 'C') : GetTensorDim(input, data_format_, 'C'); - OP_REQUIRES(context, input_depth == filter.dim_size(2), - errors::InvalidArgument( - "input and filter must have the same depth: ", input_depth, - " vs ", filter.dim_size(2))); + OP_REQUIRES( + context, input_depth == filter.dim_size(2), + errors::InvalidArgument("input and filter must have the same depth: ", + input_depth, " vs ", filter.dim_size(2))); // The last dimension for filter is out_depth. const int out_depth = static_cast(filter.dim_size(3)); @@ -486,10 +485,9 @@ class MklConvOp : public OpKernel { const int64 input_rows_raw = input_in_mkl_format ? GetMklTensorDim(mkl_context.input_shape, 'H') : GetTensorDim(input, data_format_, 'H'); - OP_REQUIRES( - context, - FastBoundsCheck(input_rows_raw, std::numeric_limits::max()), - errors::InvalidArgument("Input rows too large")); + OP_REQUIRES(context, FastBoundsCheck(input_rows_raw, + std::numeric_limits::max()), + errors::InvalidArgument("Input rows too large")); const int input_rows = static_cast(input_rows_raw); const int filter_rows = static_cast(filter.dim_size(0)); @@ -498,10 +496,9 @@ class MklConvOp : public OpKernel { const int64 input_cols_raw = input_in_mkl_format ? 
GetMklTensorDim(mkl_context.input_shape, 'W') : GetTensorDim(input, data_format_, 'W'); - OP_REQUIRES( - context, - FastBoundsCheck(input_cols_raw, std::numeric_limits::max()), - errors::InvalidArgument("Input cols too large")); + OP_REQUIRES(context, FastBoundsCheck(input_cols_raw, + std::numeric_limits::max()), + errors::InvalidArgument("Input cols too large")); const int input_cols = static_cast(input_cols_raw); const int filter_cols = static_cast(filter.dim_size(1)); @@ -509,10 +506,9 @@ class MklConvOp : public OpKernel { const int64 input_batch_raw = input_in_mkl_format ? GetMklTensorDim(mkl_context.input_shape, 'N') : GetTensorDim(input, data_format_, 'N'); - OP_REQUIRES( - context, - FastBoundsCheck(input_batch_raw, std::numeric_limits::max()), - errors::InvalidArgument("batch is too large")); + OP_REQUIRES(context, FastBoundsCheck(input_batch_raw, + std::numeric_limits::max()), + errors::InvalidArgument("batch is too large")); const int batch = static_cast(input_batch_raw); // For now we take the stride from the second and third dimensions only (we @@ -850,8 +846,8 @@ REGISTER_KERNEL_BUILDER(Name("_MklConv2DWithBias") // Base class for convolution forward operations template + typename Toutput, typename Ttemp_output, typename Tpadding, + bool biasEnabled, bool padEnabled> class MklConvOp : public OpKernel { public: ~MklConvOp() {} @@ -894,17 +890,15 @@ class MklConvOp : public OpKernel { OP_REQUIRES(context, dilations_.size() == 5, errors::InvalidArgument("Dilation rates field must " "specify 5 dimensions")); - OP_REQUIRES(context, - (GetTensorDim(dilations_, data_format_, 'N') == 1 && - GetTensorDim(dilations_, data_format_, 'C') == 1), + OP_REQUIRES(context, (GetTensorDim(dilations_, data_format_, 'N') == 1 && + GetTensorDim(dilations_, data_format_, 'C') == 1), errors::InvalidArgument( "Current implementation does not yet support " "dilations rates in the batch and depth dimensions.")); OP_REQUIRES( - context, - (GetTensorDim(dilations_, data_format_, 
'0') > 0 && - GetTensorDim(dilations_, data_format_, '1') > 0 && - GetTensorDim(dilations_, data_format_, '2') > 0), + context, (GetTensorDim(dilations_, data_format_, '0') > 0 && + GetTensorDim(dilations_, data_format_, '1') > 0 && + GetTensorDim(dilations_, data_format_, '2') > 0), errors::InvalidArgument("Dilated rates should be larger than 0.")); } } @@ -940,9 +934,9 @@ class MklConvOp : public OpKernel { auto src_tf_shape = GetTfShape(context, kInputIndex_Src); auto filter_tf_shape = GetTfShape(context, kInputIndex_Filter); conv_utl.GetConvFwdSizesInMklOrder( - src_tf_shape, filter_tf_shape, &src_dims, &filter_dims, - &strides, &dilations, &dst_dims_tf_order, &dst_dims_mkl_order, - &padding_left, &padding_right, padEnabled); + src_tf_shape, filter_tf_shape, &src_dims, &filter_dims, &strides, + &dilations, &dst_dims_tf_order, &dst_dims_mkl_order, &padding_left, + &padding_right, padEnabled); if (!context->status().ok()) return; // Check for corner case - if there is nothing to compute, return. @@ -974,9 +968,10 @@ class MklConvOp : public OpKernel { bool isConv2D = (strides_.size() == 4); // TODO(Intel-tf) Add check to make sure padEnabled is true only for 2D - if(!isConv2D){ - OP_REQUIRES(context, padEnabled, - errors::InvalidArgument("Pad+Conv fusion only works for 2D")); + if (!isConv2D) { + OP_REQUIRES( + context, !padEnabled, + errors::InvalidArgument("Pad+Conv fusion only works for 2D")); } // Create memory for user data. // Describe how the inputs and outputs of Convolution look like. Also @@ -1211,7 +1206,6 @@ class MklConvOp : public OpKernel { const int kInputIndex_Pad = 2; const int kOutputIndex_Dst = 0, kOutputIndex_Filter = 1; const int kDilationH = 0, kDilationW = 1; - // Allocate filter output tensor. 
void AllocateFilterOutputTensor( @@ -1282,7 +1276,7 @@ template class MklQuantizedConv2DOp : public MklConvOp { + int32, biasEnabled, false> { public: virtual ~MklQuantizedConv2DOp() { if (this->input_bias_ != nullptr) { @@ -1297,13 +1291,13 @@ class MklQuantizedConv2DOp } explicit MklQuantizedConv2DOp(OpKernelConstruction* context) - : MklConvOp(context) {} + : MklConvOp(context) {} void Compute(OpKernelContext* context) override { // Compute int32 output tensor - MklConvOp::Compute(context); + MklConvOp::Compute(context); // Compute additional outputs: min/max scalars. int bias_index_offset; @@ -1349,8 +1343,8 @@ class MklQuantizedConv2DOp protected: void ExtendConvFwdParams(OpKernelContext* context, MklConvFwdParams& params) override { - MklConvOp::ExtendConvFwdParams(context, params); + MklConvOp::ExtendConvFwdParams(context, params); // When the output type is quint8, the output data id requantized // into quint8. A post_op "output_scale" is added to do the conversion. @@ -1561,11 +1555,11 @@ class MklQuantizedConv2DSumReluOp } } // TODO(mdfaijul): Add cleaner code for non-mkl tensor - MklConvOp::AllocateOutputTensor(context, conv_prim_desc, - output_dims_mkl_order, - output_tf_format, - output_tensor); + MklConvOp::AllocateOutputTensor(context, conv_prim_desc, + output_dims_mkl_order, + output_tf_format, + output_tensor); const Tensor& summand = MklGetInput(context, summand_idx); if (summand.dtype() != DT_FLOAT) TF_CHECK_OK(Status(error::Code::FAILED_PRECONDITION, @@ -1583,8 +1577,8 @@ class MklQuantizedConv2DSumReluOp const float max_filter = context->input(5 + bias_index_offset).flat()(0); - reorder_sum_scale = 255.0 * 127.0 / - (std::max(std::abs(max_input), std::abs(min_input)) * + reorder_sum_scale = + 255.0 * 127.0 / (std::max(std::abs(max_input), std::abs(min_input)) * std::max(std::abs(max_filter), std::abs(min_filter))); std::vector scales; scales.push_back(reorder_sum_scale); @@ -1833,52 +1827,56 @@ REGISTER_KERNEL_BUILDER( 
MklQuantizedConv2DSumReluOp); #endif // INTEL_MKL_ML - // Register 2D operations -#define REGISTER_MKL_CPU_2D(T) \ - REGISTER_KERNEL_BUILDER(Name("_MklConv2D") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .Label(mkl_op_registry::kMklOpLabel), \ - MklConvOp); \ - REGISTER_KERNEL_BUILDER(Name("_MklConv2DWithBias") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .Label(mkl_op_registry::kMklOpLabel), \ - MklConvOp); \ - REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DWithBias") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .Label(mkl_op_registry::kMklOpLabel), \ - MklDummyOp); \ - REGISTER_KERNEL_BUILDER(Name("_MklPadWithConv2D") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .TypeConstraint("Tpaddings") \ - .Label(mkl_op_registry::kMklOpLabel), \ - MklConvOp); \ - REGISTER_KERNEL_BUILDER(Name("_MklPadWithConv2D") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .TypeConstraint("Tpaddings") \ - .Label(mkl_op_registry::kMklOpLabel), \ - MklConvOp); \ - REGISTER_KERNEL_BUILDER(Name("__MklDummyPadWithConv2D") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .TypeConstraint("Tpaddings") \ - .Label(mkl_op_registry::kMklOpLabel), \ +#define REGISTER_MKL_CPU_2D(T) \ + REGISTER_KERNEL_BUILDER(Name("_MklConv2D") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .Label(mkl_op_registry::kMklOpLabel), \ + MklConvOp); \ + REGISTER_KERNEL_BUILDER(Name("_MklConv2DWithBias") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .Label(mkl_op_registry::kMklOpLabel), \ + MklConvOp); \ + REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DWithBias") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .Label(mkl_op_registry::kMklOpLabel), \ + MklDummyOp); \ + REGISTER_KERNEL_BUILDER(Name("_MklPadWithConv2D") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tpaddings") \ + .Label(mkl_op_registry::kMklOpLabel), \ + MklConvOp); \ + REGISTER_KERNEL_BUILDER(Name("_MklPadWithConv2D") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + 
.TypeConstraint("Tpaddings") \ + .Label(mkl_op_registry::kMklOpLabel), \ + MklConvOp); \ + REGISTER_KERNEL_BUILDER(Name("__MklDummyPadWithConv2D") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tpaddings") \ + .Label(mkl_op_registry::kMklOpLabel), \ MklDummyOp); -TF_CALL_float(REGISTER_MKL_CPU); +TF_CALL_float(REGISTER_MKL_CPU_2D); // Register 3D operations -#define REGISTER_MKL_CPU_3D(T) \ - REGISTER_KERNEL_BUILDER(Name("_MklConv3D") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .Label(mkl_op_registry::kMklOpLabel), \ - MklConvOp); +#define REGISTER_MKL_CPU_3D(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("_MklConv3D") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .Label(mkl_op_registry::kMklOpLabel), \ + MklConvOp); TF_CALL_float(REGISTER_MKL_CPU_3D); } // namespace tensorflow diff --git a/tensorflow/core/kernels/mkl_conv_ops.h b/tensorflow/core/kernels/mkl_conv_ops.h index 8c71c20879..963826a73a 100644 --- a/tensorflow/core/kernels/mkl_conv_ops.h +++ b/tensorflow/core/kernels/mkl_conv_ops.h @@ -17,8 +17,8 @@ limitations under the License. #define TENSORFLOW_CORE_KERNELS_MKL_CONV_OPS_H_ #include -#include #include +#include #include "mkldnn.hpp" #include "tensorflow/core/framework/numeric_op.h" @@ -85,7 +85,7 @@ class MklDnnConvUtil { } // Calculate Convolution dilations - virtual inline void GetDilationsInMklOrder(memory::dims *dilations) { + virtual inline void GetDilationsInMklOrder(memory::dims* dilations) { // For now we take the dilation from the second and third dimensions only // (we do not support dilation on the batch or depth dimension). CHECK_NOTNULL(dilations); @@ -195,9 +195,8 @@ class MklDnnConvUtil { filter_shape.DebugString())); for (int i = 0; i < ((strides_.size() == 4) ? 
3 : 5); i++) { - OP_REQUIRES(context_, - FastBoundsCheck(filter_shape.dim_size(i), - std::numeric_limits::max()), + OP_REQUIRES(context_, FastBoundsCheck(filter_shape.dim_size(i), + std::numeric_limits::max()), errors::InvalidArgument("filter too large")); } @@ -463,8 +462,8 @@ class MklDnnConvUtil { input_tf_shape.DebugString())); } - GetOutputAndPadSizeInMklOrder(input_tf_shape, filter_tf_shape, - strides, dilations, output_dims_tf_order, + GetOutputAndPadSizeInMklOrder(input_tf_shape, filter_tf_shape, strides, + dilations, output_dims_tf_order, output_dims_mkl_order, pad_l, pad_r); } @@ -556,7 +555,6 @@ class MklConvBackpropCommonOp : public OpKernel { TensorFormat data_format_; // NCHW or NHWC }; - ///////////////////////////////////////////////////////////////////// /// Dummy Mkl op that is just used for operators that are intermediate /// output of node fusion in the graph diff --git a/tensorflow/core/kernels/mkl_fused_ops_test.cc b/tensorflow/core/kernels/mkl_fused_ops_test.cc index 900325ac91..991fb08093 100644 --- a/tensorflow/core/kernels/mkl_fused_ops_test.cc +++ b/tensorflow/core/kernels/mkl_fused_ops_test.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ #ifdef INTEL_MKL -#ifndef INTEL_MKL_ML_ONLY // We don't support fusion in MKL ML #include "tensorflow/cc/ops/const_op.h" #include "tensorflow/cc/ops/image_ops.h" #include "tensorflow/cc/ops/nn_ops.h" @@ -160,5 +159,4 @@ TEST_F(FusedPadConvOpTest, PaddingConvTestNchw) { Run(DT_FLOAT, image, filter, padding, expected, "NCHW"); } } // namespace tensorflow -#endif // INTEL_MKL_ML_ONLY #endif // INTEL_MKL diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc index 8afbe0333a..0b99542c5c 100644 --- a/tensorflow/core/ops/nn_ops.cc +++ b/tensorflow/core/ops/nn_ops.cc @@ -1649,7 +1649,7 @@ REGISTER_OP("_MklPadWithConv2D") .Attr(GetConvnetDataFormatAttrString()) .Attr("dilations: list(int) = [1, 1, 1, 1]") .Attr("Tpaddings: {int32, int64} = DT_INT32") - .SetShapeFn(shape_inference::Conv2DShape) + .SetShapeFn(shape_inference::Conv2DShape) .Doc(R"doc( MKL version of Pad and Conv2D operator. Uses MKL DNN APIs to perform Pad and 2D convolution to the output of convolution. @@ -2159,7 +2159,6 @@ NOTE Do not invoke this operator directly in Python. Graph rewrite pass is expected to invoke these operators. 
)doc"); - REGISTER_OP("_MklAvgPool3DGrad") .Input("orig_input_shape: int32") .Input("grad: T") -- GitLab From 768b36822e0d5b988a697e0c9e3b65302b051630 Mon Sep 17 00:00:00 2001 From: mbhuiyan Date: Mon, 26 Nov 2018 15:45:08 -0800 Subject: [PATCH 035/405] fixing the buildifer error of tensorflow/core/kernels/BUILD --- tensorflow/core/kernels/BUILD | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 1759a7f790..efa571b23b 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -29,26 +29,26 @@ package_group( load( "//tensorflow:tensorflow.bzl", + "cc_header_only_library", "if_android", + "if_not_windows", + "tf_cc_binary", "tf_cc_test", "tf_cc_test_mkl", "tf_cc_tests", - "tf_cc_binary", "tf_copts", "tf_cuda_library", - "tf_opts_nortti_if_android", "tf_kernel_library", "tf_mkl_kernel_library", - "cc_header_only_library", - "if_not_windows", + "tf_opts_nortti_if_android", ) load("@local_config_sycl//sycl:build_defs.bzl", "if_sycl") load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test") load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_tests") load( "//tensorflow/core:platform/default/build_config.bzl", - "tf_proto_library", "tf_kernel_tests_linkstatic", + "tf_proto_library", ) load( "//tensorflow/core:platform/default/build_config_root.bzl", @@ -6714,10 +6714,10 @@ tf_cc_test_mkl( srcs = ["mkl_fused_ops_test.cc"], linkstatic = 1, deps = [ - ":mkl_conv_op", - ":mkl_tfconv_op", ":conv_ops", ":image", + ":mkl_conv_op", + ":mkl_tfconv_op", ":ops_testutil", ":ops_util", "//tensorflow/cc:cc_ops", @@ -6730,8 +6730,9 @@ tf_cc_test_mkl( "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core:testlib", - ] + ], ) + tf_mkl_kernel_library( name = "mkl_transpose_op", srcs = [ -- GitLab From 59ebe545b3385c4c36d3b1602671d109e44ea38c Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Fri, 16 Nov 2018 16:20:05 -0800 Subject: [PATCH 036/405] Add 
CancellationManager to LoopCondOp --- tensorflow/core/kernels/control_flow_ops.cc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tensorflow/core/kernels/control_flow_ops.cc b/tensorflow/core/kernels/control_flow_ops.cc index 1587eb5114..21aabd9295 100644 --- a/tensorflow/core/kernels/control_flow_ops.cc +++ b/tensorflow/core/kernels/control_flow_ops.cc @@ -600,6 +600,15 @@ LoopCondOp::LoopCondOp(OpKernelConstruction* context) : OpKernel(context) {} LoopCondOp::~LoopCondOp() = default; void LoopCondOp::Compute(OpKernelContext* context) { + CancellationManager* cm = context->cancellation_manager(); + bool already_cancelled = cm->IsCancelled(); + + if (already_cancelled) { + Tensor continue_running(false); + context->set_output(0, continue_running); + return; + } + context->set_output(0, context->input(0)); } -- GitLab From c0b128c45396560f26769a293525c60f76850a3f Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Fri, 16 Nov 2018 16:20:52 -0800 Subject: [PATCH 037/405] Add a python test for while_loop timeout --- .../python/kernel_tests/control_flow_ops_py_test.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py index 37654abd18..595d4ff37b 100644 --- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py +++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py @@ -1988,6 +1988,16 @@ class ControlFlowTest(test.TestCase): for i in xrange(10): self.assertEqual([i], q.dequeue().eval()) + def testWhileTimeOut(self): + run_options = config_pb2.RunOptions(timeout_in_ms=1) + with self.cached_session() as sess: + n = constant_op.constant(0) + c = lambda x: True + b = lambda x: math_ops.add(x, 1) + r = control_flow_ops.while_loop(c, b, [n]) + with self.assertRaises(errors_impl.DeadlineExceededError): + sess.run(r, options=run_options) + @test_util.disable_control_flow_v2("b/117119329 (stack)") def testWhileStack_1(self): with 
self.cached_session(): -- GitLab From fc5392dedec126f988788a597edb55021fb07b60 Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Tue, 20 Nov 2018 22:25:51 -0500 Subject: [PATCH 038/405] Raise an error when loop execution is cancelled --- tensorflow/core/kernels/control_flow_ops.cc | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tensorflow/core/kernels/control_flow_ops.cc b/tensorflow/core/kernels/control_flow_ops.cc index 21aabd9295..61547adb73 100644 --- a/tensorflow/core/kernels/control_flow_ops.cc +++ b/tensorflow/core/kernels/control_flow_ops.cc @@ -602,12 +602,8 @@ LoopCondOp::~LoopCondOp() = default; void LoopCondOp::Compute(OpKernelContext* context) { CancellationManager* cm = context->cancellation_manager(); bool already_cancelled = cm->IsCancelled(); - - if (already_cancelled) { - Tensor continue_running(false); - context->set_output(0, continue_running); - return; - } + OP_REQUIRES(context, !already_cancelled, + errors::Cancelled("Loop execution was cancelled.")); context->set_output(0, context->input(0)); } -- GitLab From 5bb0553a1b3bd580bd1502ab6b339ca3a1f0b5df Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Tue, 27 Nov 2018 22:05:43 -0800 Subject: [PATCH 039/405] Handle the case that CancellationManager is null in eager mode --- tensorflow/core/kernels/control_flow_ops.cc | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tensorflow/core/kernels/control_flow_ops.cc b/tensorflow/core/kernels/control_flow_ops.cc index 61547adb73..b3bdff2575 100644 --- a/tensorflow/core/kernels/control_flow_ops.cc +++ b/tensorflow/core/kernels/control_flow_ops.cc @@ -601,9 +601,11 @@ LoopCondOp::~LoopCondOp() = default; void LoopCondOp::Compute(OpKernelContext* context) { CancellationManager* cm = context->cancellation_manager(); - bool already_cancelled = cm->IsCancelled(); - OP_REQUIRES(context, !already_cancelled, - errors::Cancelled("Loop execution was cancelled.")); + if (cm != nullptr) { + bool already_cancelled = 
cm->IsCancelled(); + OP_REQUIRES(context, !already_cancelled, + errors::Cancelled("Loop execution was cancelled.")); + } context->set_output(0, context->input(0)); } -- GitLab From 0c7a31a168aae02e323c97bc6b81f2d3f19cbb2a Mon Sep 17 00:00:00 2001 From: Nutti Date: Thu, 29 Nov 2018 22:32:54 +0900 Subject: [PATCH 040/405] Fix: clang-format error --- tensorflow/core/kernels/partitioned_function_ops.cc | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tensorflow/core/kernels/partitioned_function_ops.cc b/tensorflow/core/kernels/partitioned_function_ops.cc index 72310f33ae..8aac78f56f 100644 --- a/tensorflow/core/kernels/partitioned_function_ops.cc +++ b/tensorflow/core/kernels/partitioned_function_ops.cc @@ -179,10 +179,9 @@ class PartitionedCallOp : public AsyncOpKernel { done); OP_REQUIRES_OK_ASYNC( - ctx, - OptimizationPassRegistry::Global()->RunGrouping( - OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, - optimization_options), + ctx, OptimizationPassRegistry::Global()->RunGrouping( + OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, + optimization_options), done); std::unordered_map> subgraphs; -- GitLab From d00013e72cd3a1a4805395eb8e66748dcf387295 Mon Sep 17 00:00:00 2001 From: Wen yun Date: Wed, 31 Oct 2018 15:22:14 +0800 Subject: [PATCH 041/405] fix the case when input value are MirroredVariable for assign_moving_average --- .../distribute/python/moving_averages_test.py | 20 +++++++++++++++++++ .../python/distribute/mirrored_strategy.py | 3 +++ 2 files changed, 23 insertions(+) diff --git a/tensorflow/contrib/distribute/python/moving_averages_test.py b/tensorflow/contrib/distribute/python/moving_averages_test.py index c492d8bafc..da3353b2d5 100644 --- a/tensorflow/contrib/distribute/python/moving_averages_test.py +++ b/tensorflow/contrib/distribute/python/moving_averages_test.py @@ -139,6 +139,26 @@ class AssignMovingAveragesTest(test.TestCase, parameterized.TestCase): (2.0 * 0.25 + 0.0) / (1.0 * 0.25 + 1.0)], var.eval()) + 
@combinations.generate(all_combinations) + def testAssignVariable(self, distribution): + def replica_fn(): + var = variables.Variable([10.0, 11.0]) + # Here we expect to check the case when input value are variable. + val = variables.Variable([1., 2.]) + decay = 0.25 + assign = moving_averages.assign_moving_average( + var, val, decay, zero_debias=False) + return var, assign + + with distribution.scope(), self.cached_session() as sess: + var, assign = distribution.call_for_each_replica(replica_fn) + variables.global_variables_initializer().run() + self.assertAllClose([10.0, 11.0], var.eval()) + sess.run(distribution.unwrap(assign)) + self.assertAllClose( + [10 * 0.25 + 1. * (1 - 0.25), + 11 * 0.25 + 2. * (1 - 0.25)], + var.eval()) if __name__ == "__main__": test.main() diff --git a/tensorflow/python/distribute/mirrored_strategy.py b/tensorflow/python/distribute/mirrored_strategy.py index d6d40df5ce..3cd5cf09c0 100644 --- a/tensorflow/python/distribute/mirrored_strategy.py +++ b/tensorflow/python/distribute/mirrored_strategy.py @@ -598,6 +598,9 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended): return self._cross_device_ops def _reduce_to(self, reduce_op, value, destinations): + if (isinstance(value, values.Mirrored) and + reduce_op == reduce_util.ReduceOp.MEAN): + return value assert not isinstance(value, values.Mirrored) if not isinstance(value, values.DistributedValues): # This function handles reducing values that are not PerReplica or -- GitLab From 7d931d7b85f65b6145643fdc638aabbce779ab21 Mon Sep 17 00:00:00 2001 From: Neargye Date: Mon, 23 Apr 2018 00:12:31 +0500 Subject: [PATCH 042/405] change toolchain to clang --- tensorflow/contrib/android/cmake/build.gradle | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/contrib/android/cmake/build.gradle b/tensorflow/contrib/android/cmake/build.gradle index 17a57b99fd..9e7fd317f3 100644 --- a/tensorflow/contrib/android/cmake/build.gradle +++ 
b/tensorflow/contrib/android/cmake/build.gradle @@ -22,8 +22,8 @@ android { } externalNativeBuild { cmake { - arguments '-DANDROID_TOOLCHAIN=gcc', - '-DANDROID_STL=gnustl_static' + arguments '-DANDROID_TOOLCHAIN=clang', + '-DANDROID_STL=c++_static' } } } -- GitLab From f56a6058a44edb2dc6172a0723fb92fd63e7a36f Mon Sep 17 00:00:00 2001 From: neargye Date: Fri, 30 Nov 2018 16:57:33 +0500 Subject: [PATCH 043/405] clean-up --- tensorflow/contrib/android/cmake/build.gradle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/contrib/android/cmake/build.gradle b/tensorflow/contrib/android/cmake/build.gradle index 9e7fd317f3..ddec08894f 100644 --- a/tensorflow/contrib/android/cmake/build.gradle +++ b/tensorflow/contrib/android/cmake/build.gradle @@ -70,7 +70,7 @@ if (ndkDir == null || ndkDir == "") { ndkDir = System.getenv('ANDROID_NDK_HOME') } -if(! Os.isFamily(Os.FAMILY_WINDOWS)) { +if (!Os.isFamily(Os.FAMILY_WINDOWS)) { // This script is for non-Windows OS. For Windows OS, MANUALLY build // (or copy the built) libs/headers to the // ${TENSORFLOW_ROOT_DIR}/tensorflow/contrib/makefile/gen -- GitLab From 29c1506fecd37c853f4c5d53e0a199c04f213436 Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Fri, 30 Nov 2018 16:10:51 -0800 Subject: [PATCH 044/405] Change the variable names to be more readable --- .../data/kernel_tests/list_files_test.py | 33 ++++++++++--------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/tensorflow/python/data/kernel_tests/list_files_test.py b/tensorflow/python/data/kernel_tests/list_files_test.py index 789f1ab6de..d4e0cdeb97 100644 --- a/tensorflow/python/data/kernel_tests/list_files_test.py +++ b/tensorflow/python/data/kernel_tests/list_files_test.py @@ -82,27 +82,27 @@ class ListFilesTest(test_base.DatasetTestBase): dataset = dataset_ops.Dataset.list_files( path.join(self.tmp_dir, '*'), shuffle=True, seed=37) - full_filenames = [compat.as_bytes(path.join(self.tmp_dir, filename)) + expected_filenames = 
[compat.as_bytes(path.join(self.tmp_dir, filename)) for filename in filenames] - all_produced_filenames = [] + all_actual_filenames = [] for _ in range(3): - produced_filenames = [] + actual_filenames = [] next_element = self.getNext(dataset, requires_initialization=True) try: while True: - produced_filenames.append(self.evaluate(next_element())) + actual_filenames.append(self.evaluate(next_element())) except errors.OutOfRangeError: pass - all_produced_filenames.append(produced_filenames) + all_actual_filenames.append(actual_filenames) # Each run should produce the same set of filenames, which may be - # different from the order of `full_filenames`. - self.assertItemsEqual(full_filenames, all_produced_filenames[0]) + # different from the order of `expected_filenames`. + self.assertItemsEqual(expected_filenames, all_actual_filenames[0]) # However, the different runs should produce filenames in the same order # as each other. - self.assertEqual(all_produced_filenames[0], all_produced_filenames[1]) - self.assertEqual(all_produced_filenames[0], all_produced_filenames[2]) + self.assertEqual(all_actual_filenames[0], all_actual_filenames[1]) + self.assertEqual(all_actual_filenames[0], all_actual_filenames[2]) # TODO(b/117581999): eager mode assertion fail wrapped, debug. 
def tesSkipEagerEmptyDirectoryInitializer(self): @@ -169,16 +169,17 @@ class ListFilesTest(test_base.DatasetTestBase): path.join(self.tmp_dir, '*'), shuffle=False).repeat(2) next_element = self.getNext(dataset) - full_filenames = [] - produced_filenames = [] + expected_filenames = [] + actual_filenames = [] for filename in filenames * 2: - full_filenames.append(compat.as_bytes(path.join(self.tmp_dir, filename))) - produced_filenames.append(compat.as_bytes(self.evaluate(next_element()))) + expected_filenames.append( + compat.as_bytes(path.join(self.tmp_dir, filename))) + actual_filenames.append(compat.as_bytes(self.evaluate(next_element()))) with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) - self.assertItemsEqual(full_filenames, produced_filenames) - self.assertEqual(produced_filenames[:len(filenames)], - produced_filenames[len(filenames):]) + self.assertItemsEqual(expected_filenames, actual_filenames) + self.assertEqual(actual_filenames[:len(filenames)], + actual_filenames[len(filenames):]) def testMultiplePatternsAsList(self): filenames = ['a.txt', 'b.py', 'c.py', 'd.pyc'] -- GitLab From cb57c4dd8d9cadcb0c25c0940d7cb4f08d5aa530 Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Fri, 30 Nov 2018 16:32:34 -0800 Subject: [PATCH 045/405] Fix the coding style --- tensorflow/python/data/kernel_tests/list_files_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/data/kernel_tests/list_files_test.py b/tensorflow/python/data/kernel_tests/list_files_test.py index d4e0cdeb97..093d6cb3ca 100644 --- a/tensorflow/python/data/kernel_tests/list_files_test.py +++ b/tensorflow/python/data/kernel_tests/list_files_test.py @@ -83,7 +83,7 @@ class ListFilesTest(test_base.DatasetTestBase): path.join(self.tmp_dir, '*'), shuffle=True, seed=37) expected_filenames = [compat.as_bytes(path.join(self.tmp_dir, filename)) - for filename in filenames] + for filename in filenames] all_actual_filenames = [] for _ in range(3): -- GitLab 
From fe9a9dbb8d0ef118b15beb4724f256190fd04d13 Mon Sep 17 00:00:00 2001 From: "Wen-Heng (Jack) Chung" Date: Fri, 30 Nov 2018 16:30:48 -0600 Subject: [PATCH 046/405] Split convolution invocation into preparation and actual invocation - split DoConvolve into: PrepareForConvolution DoConvolve - split DoConvolveBackwardData into: PrepareForConvolutionBackwardData DoConvolveBackwardData - split DoConvolveBackwardFilter into: PrepareForConvolutionBackwardFilter DoConvolveBackwardFilter PrepareForConvolutionXXX would allocate scratch memory. DoConvolveXXX would invoke actual convolution algorithms. Implement forward convolution, backward input convolution, backward filter convolution on CUDA path. --- tensorflow/stream_executor/cuda/cuda_dnn.cc | 459 +++++++++++++++----- tensorflow/stream_executor/cuda/cuda_dnn.h | 196 ++++++++- tensorflow/stream_executor/dnn.h | 180 ++++++-- tensorflow/stream_executor/stream.cc | 172 ++++++-- 4 files changed, 827 insertions(+), 180 deletions(-) diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc index 1f2e2f48bb..387afefc21 100644 --- a/tensorflow/stream_executor/cuda/cuda_dnn.cc +++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc @@ -2380,7 +2380,7 @@ bool ShouldIncludeWinogradNonfusedAlgo( } // namespace template -port::Status CudnnSupport::DoConvolveImpl( +port::Status CudnnSupport::PrepareForConvolutionImpl( Stream* stream, const dnn::BatchDescriptor& input_descriptor, const DeviceMemory& input_data, const dnn::FilterDescriptor& filter_descriptor, @@ -2389,6 +2389,34 @@ port::Status CudnnSupport::DoConvolveImpl( const dnn::BatchDescriptor& output_descriptor, DeviceMemory* output_data, dnn::DataType accumulator_type, ScratchAllocator* scratch_allocator, const dnn::AlgorithmConfig& algorithm_config, + dnn::AlgorithmDesc* algorithm_desc, DeviceMemory* scratch_memory) { + cudnnDataType_t cudnn_type = GetCudnnDataType(); + CudnnTensorDescriptor input_nd(input_descriptor, cudnn_type); +
CudnnTensorDescriptor output_nd(output_descriptor, cudnn_type); + CudnnFilterDescriptor filter(filter_descriptor, cudnn_type); + CudnnConvolutionDescriptor conv(convolution_descriptor, + ToCudnnDataType(accumulator_type)); + + auto cudnn = cudnn_->GetHandle(parent_, stream); + + SE_ASSIGN_OR_RETURN(*algorithm_desc, + GetCudnnConvolutionForwardAlgorithm( + stream, cudnn, algorithm_config, input_nd, filter, + conv, output_nd, scratch_allocator, scratch_memory)); + + return port::Status::OK(); +} + +template +port::Status CudnnSupport::DoConvolveImpl( + Stream* stream, const dnn::BatchDescriptor& input_descriptor, + const DeviceMemory& input_data, + const dnn::FilterDescriptor& filter_descriptor, + const DeviceMemory& filter_data, + const dnn::ConvolutionDescriptor& convolution_descriptor, + const dnn::BatchDescriptor& output_descriptor, DeviceMemory* output_data, + dnn::DataType accumulator_type, const dnn::AlgorithmDesc& algorithm_desc, + DeviceMemory* scratch_memory, dnn::ProfileResult* output_profile_result) { cudnnDataType_t cudnn_type = GetCudnnDataType(); CudnnTensorDescriptor input_nd(input_descriptor, cudnn_type); @@ -2411,12 +2439,6 @@ port::Status CudnnSupport::DoConvolveImpl( const bool is_profiling = output_profile_result != nullptr; - DeviceMemory scratch; - SE_ASSIGN_OR_RETURN(dnn::AlgorithmDesc algo_desc, - GetCudnnConvolutionForwardAlgorithm( - stream, cudnn, algorithm_config, input_nd, filter, - conv, output_nd, scratch_allocator, &scratch)); - std::unique_ptr timer; if (is_profiling) { timer.reset(new CUDATimer(parent_)); // NOLINT @@ -2432,7 +2454,7 @@ port::Status CudnnSupport::DoConvolveImpl( // memory. See nvbugs/2138754, b/80018418. 
if (CUDNN_VERSION < 7300) { SE_RETURN_IF_ERROR([&] { - if (algo_desc.algo_id() != CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) { + if (algorithm_desc.algo_id() != CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) { return port::Status::OK(); } if (input_descriptor.ndims() < 3) { @@ -2457,7 +2479,8 @@ port::Status CudnnSupport::DoConvolveImpl( }()); } - if (algo_desc.algo_id() == CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED && + if (algorithm_desc.algo_id() == + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED && !ShouldIncludeWinogradNonfusedAlgo(input_descriptor, output_descriptor)) { return port::Status(port::error::FAILED_PRECONDITION, "This configuration has potential integer overflow in " @@ -2469,18 +2492,19 @@ port::Status CudnnSupport::DoConvolveImpl( /*alpha=*/alpha, /*srcDesc=*/input_nd.handle(), /*srcData=*/input_data.opaque(), /*filterDesc=*/filter.handle(), /*filterData=*/filter_data.opaque(), /*convDesc=*/conv.handle(), - /*algo=*/ToConvForwardAlgo(algo_desc), /*workSpace=*/scratch.opaque(), - /*workSpaceSizeInBytes=*/scratch.size(), /*beta=*/beta, + /*algo=*/ToConvForwardAlgo(algorithm_desc), + /*workSpace=*/scratch_memory->opaque(), + /*workSpaceSizeInBytes=*/scratch_memory->size(), /*beta=*/beta, /*yDesc=*/output_nd.handle(), /*y=*/output_data->opaque())); if (is_profiling) { if (!timer->Stop(AsCUDAStream(stream))) { return port::Status(port::error::INTERNAL, "Failed to stop timer"); } - output_profile_result->set_algorithm(algo_desc); + output_profile_result->set_algorithm(algorithm_desc); output_profile_result->set_elapsed_time_in_ms( timer->GetElapsedMilliseconds()); - output_profile_result->set_scratch_size(scratch.size()); + output_profile_result->set_scratch_size(scratch_memory->size()); } return port::Status::OK(); @@ -2877,7 +2901,7 @@ port::Status CudnnSupport::DoBatchNormalizationBackwardImpl( return port::Status::OK(); } -bool CudnnSupport::DoConvolve( +bool CudnnSupport::PrepareForConvolution( Stream* stream, const dnn::BatchDescriptor& batch_descriptor, 
const DeviceMemory& input_data, const dnn::FilterDescriptor& filter_descriptor, @@ -2886,12 +2910,70 @@ bool CudnnSupport::DoConvolve( const dnn::BatchDescriptor& output_descriptor, DeviceMemory* output_data, ScratchAllocator* scratch_allocator, const dnn::AlgorithmConfig& algorithm_config, + dnn::AlgorithmDesc* algorithm_desc, DeviceMemory* scratch_memory) { + return IsStatusOk(PrepareForConvolutionImpl( + stream, batch_descriptor, input_data, filter_descriptor, + filter_data, convolution_descriptor, output_descriptor, + output_data, dnn::DataType::kFloat, scratch_allocator, + algorithm_config, algorithm_desc, scratch_memory), + /*report_error=*/true); +} + +bool CudnnSupport::PrepareForConvolution( + Stream* stream, const dnn::BatchDescriptor& batch_descriptor, + const DeviceMemory& input_data, + const dnn::FilterDescriptor& filter_descriptor, + const DeviceMemory& filter_data, + const dnn::ConvolutionDescriptor& convolution_descriptor, + const dnn::BatchDescriptor& output_descriptor, + DeviceMemory* output_data, ScratchAllocator* scratch_allocator, + const dnn::AlgorithmConfig& algorithm_config, + dnn::AlgorithmDesc* algorithm_desc, DeviceMemory* scratch_memory) { + return IsStatusOk(PrepareForConvolutionImpl( + stream, batch_descriptor, input_data, filter_descriptor, + filter_data, convolution_descriptor, output_descriptor, + output_data, dnn::DataType::kDouble, scratch_allocator, + algorithm_config, algorithm_desc, scratch_memory), + /*report_error=*/true); +} + +bool CudnnSupport::PrepareForConvolution( + Stream* stream, const dnn::BatchDescriptor& batch_descriptor, + const DeviceMemory& input_data, + const dnn::FilterDescriptor& filter_descriptor, + const DeviceMemory& filter_data, + const dnn::ConvolutionDescriptor& convolution_descriptor, + const dnn::BatchDescriptor& output_descriptor, + DeviceMemory* output_data, ScratchAllocator* scratch_allocator, + const dnn::AlgorithmConfig& algorithm_config, + dnn::AlgorithmDesc* algorithm_desc, DeviceMemory* 
scratch_memory) { + dnn::DataType acc_type = + CudnnEnvVar::IsEnabled() + ? dnn::DataType::kFloat + : dnn::DataType::kHalf; + return IsStatusOk( + PrepareForConvolutionImpl( + stream, batch_descriptor, input_data, filter_descriptor, filter_data, + convolution_descriptor, output_descriptor, output_data, acc_type, + scratch_allocator, algorithm_config, algorithm_desc, scratch_memory), + /*report_error=*/true); +} + +bool CudnnSupport::DoConvolve( + Stream* stream, const dnn::BatchDescriptor& batch_descriptor, + const DeviceMemory& input_data, + const dnn::FilterDescriptor& filter_descriptor, + const DeviceMemory& filter_data, + const dnn::ConvolutionDescriptor& convolution_descriptor, + const dnn::BatchDescriptor& output_descriptor, + DeviceMemory* output_data, const dnn::AlgorithmDesc& algorithm_desc, + DeviceMemory* scratch_memory, dnn::ProfileResult* output_profile_result) { return IsStatusOk( DoConvolveImpl(stream, batch_descriptor, input_data, filter_descriptor, filter_data, convolution_descriptor, output_descriptor, - output_data, dnn::DataType::kFloat, scratch_allocator, - algorithm_config, output_profile_result), + output_data, dnn::DataType::kFloat, algorithm_desc, + scratch_memory, output_profile_result), /*report_error=*/!output_profile_result); } @@ -2902,14 +2984,14 @@ bool CudnnSupport::DoConvolve( const DeviceMemory& filter_data, const dnn::ConvolutionDescriptor& convolution_descriptor, const dnn::BatchDescriptor& output_descriptor, - DeviceMemory* output_data, ScratchAllocator* scratch_allocator, - const dnn::AlgorithmConfig& algorithm_config, + DeviceMemory* output_data, const dnn::AlgorithmDesc& algorithm_desc, + DeviceMemory* scratch_memory, dnn::ProfileResult* output_profile_result) { return IsStatusOk( DoConvolveImpl(stream, batch_descriptor, input_data, filter_descriptor, filter_data, convolution_descriptor, output_descriptor, - output_data, dnn::DataType::kDouble, scratch_allocator, - algorithm_config, output_profile_result), + output_data, 
dnn::DataType::kDouble, algorithm_desc, + scratch_memory, output_profile_result), /*report_error=*/!output_profile_result); } @@ -2920,8 +3002,9 @@ bool CudnnSupport::DoConvolve( const DeviceMemory& filter_data, const dnn::ConvolutionDescriptor& convolution_descriptor, const dnn::BatchDescriptor& output_descriptor, - DeviceMemory* output_data, ScratchAllocator* scratch_allocator, - const dnn::AlgorithmConfig& algorithm_config, + DeviceMemory* output_data, + const dnn::AlgorithmDesc& algorithm_desc, + DeviceMemory* scratch_memory, dnn::ProfileResult* output_profile_result) { dnn::DataType acc_type = CudnnEnvVar::IsEnabled() @@ -2930,7 +3013,7 @@ bool CudnnSupport::DoConvolve( return IsStatusOk( DoConvolveImpl(stream, batch_descriptor, input_data, filter_descriptor, filter_data, convolution_descriptor, output_descriptor, - output_data, acc_type, scratch_allocator, algorithm_config, + output_data, acc_type, algorithm_desc, scratch_memory, output_profile_result), /*report_error=*/!output_profile_result); } @@ -3066,7 +3149,7 @@ bool CudnnSupport::DoTransformTensor(Stream* stream, } template -port::Status CudnnSupport::DoConvolveBackwardDataImpl( +port::Status CudnnSupport::PrepareForConvolutionBackwardDataImpl( Stream* stream, const dnn::FilterDescriptor& filter_descriptor, const DeviceMemory& filter_data, const dnn::BatchDescriptor& output_descriptor, @@ -3076,6 +3159,36 @@ port::Status CudnnSupport::DoConvolveBackwardDataImpl( DeviceMemory* backward_input_data, dnn::DataType accumulator_type, ScratchAllocator* scratch_allocator, const dnn::AlgorithmConfig& algorithm_config, + dnn::AlgorithmDesc* algorithm_desc, DeviceMemory* scratch_memory) { + cudnnDataType_t cudnn_type = GetCudnnDataType(); + auto cudnn = cudnn_->GetHandle(parent_, stream); + + CudnnTensorDescriptor out_back_nd(output_descriptor, cudnn_type); + CudnnTensorDescriptor in_back_nd(input_descriptor, cudnn_type); + CudnnFilterDescriptor filter(filter_descriptor, cudnn_type); + CudnnConvolutionDescriptor 
conv(convolution_descriptor, + ToCudnnDataType(accumulator_type)); + + SE_ASSIGN_OR_RETURN( + *algorithm_desc, + GetCudnnConvolutionBackwardDataAlgorithm( + stream, cudnn, algorithm_config, in_back_nd, filter, conv, + out_back_nd, scratch_allocator, scratch_memory)); + + return port::Status::OK(); +} + +template +port::Status CudnnSupport::DoConvolveBackwardDataImpl( + Stream* stream, const dnn::FilterDescriptor& filter_descriptor, + const DeviceMemory& filter_data, + const dnn::BatchDescriptor& output_descriptor, + DeviceMemory backward_output_data, + const dnn::ConvolutionDescriptor& convolution_descriptor, + const dnn::BatchDescriptor& input_descriptor, + DeviceMemory* backward_input_data, dnn::DataType accumulator_type, + const dnn::AlgorithmDesc& algorithm_desc, + DeviceMemory* scratch_memory, dnn::ProfileResult* output_profile_result) { cudnnDataType_t cudnn_type = GetCudnnDataType(); // Alpha is the scaling factor for input. @@ -3099,12 +3212,6 @@ port::Status CudnnSupport::DoConvolveBackwardDataImpl( const bool is_profiling = output_profile_result != nullptr; - DeviceMemory scratch; - SE_ASSIGN_OR_RETURN(dnn::AlgorithmDesc algo_desc, - GetCudnnConvolutionBackwardDataAlgorithm( - stream, cudnn, algorithm_config, in_back_nd, filter, - conv, out_back_nd, scratch_allocator, &scratch)); - std::unique_ptr timer; if (is_profiling) { timer.reset(new CUDATimer(parent_)); // NOLINT @@ -3116,7 +3223,8 @@ port::Status CudnnSupport::DoConvolveBackwardDataImpl( } } - if (algo_desc.algo_id() == CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED && + if (algorithm_desc.algo_id() == + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED && !ShouldIncludeWinogradNonfusedAlgo(input_descriptor, output_descriptor)) { return port::Status(port::error::FAILED_PRECONDITION, "This configuration has potential integer overflow in " @@ -3126,44 +3234,44 @@ port::Status CudnnSupport::DoConvolveBackwardDataImpl( // Cudnn 7.1.4 has a bug if the workspace of the following convolution is not // 
zero-initialized, nvbugs/2254619. if (CUDNN_VERSION >= 7000 && CUDNN_VERSION < 7300 && - algo_desc.algo_id() == CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 && - cudnn_type == CUDNN_DATA_HALF && algo_desc.tensor_ops_enabled() && + algorithm_desc.algo_id() == CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 && + cudnn_type == CUDNN_DATA_HALF && algorithm_desc.tensor_ops_enabled() && input_descriptor.layout() == dnn::DataLayout::kBatchYXDepth && filter_descriptor.layout() == dnn::FilterLayout::kOutputInputYX && output_descriptor.layout() == dnn::DataLayout::kBatchDepthYX && (convolution_descriptor.vertical_filter_stride() > 1 || convolution_descriptor.horizontal_filter_stride() > 1)) { - stream->ThenMemZero(&scratch, scratch.size()); + stream->ThenMemZero(scratch_memory, scratch_memory->size()); } - RETURN_IF_CUDNN_ERROR( - cudnnConvolutionBackwardData(cudnn.handle(), - /*alpha=*/alpha, - /*wDesc=*/filter.handle(), - /*w=*/filter_data.opaque(), - /*dyDesc=*/out_back_nd.handle(), - /*dy=*/backward_output_data.opaque(), - /*convDesc=*/conv.handle(), - /*algo=*/ToConvBackwardDataAlgo(algo_desc), - /*workSpace=*/scratch.opaque(), - /*workSpaceSizeInBytes=*/scratch.size(), - /*beta=*/beta, - /*dxDesc=*/in_back_nd.handle(), - /*dx=*/backward_input_data->opaque())); + RETURN_IF_CUDNN_ERROR(cudnnConvolutionBackwardData( + cudnn.handle(), + /*alpha=*/alpha, + /*wDesc=*/filter.handle(), + /*w=*/filter_data.opaque(), + /*dyDesc=*/out_back_nd.handle(), + /*dy=*/backward_output_data.opaque(), + /*convDesc=*/conv.handle(), + /*algo=*/ToConvBackwardDataAlgo(algorithm_desc), + /*workSpace=*/scratch_memory->opaque(), + /*workSpaceSizeInBytes=*/scratch_memory->size(), + /*beta=*/beta, + /*dxDesc=*/in_back_nd.handle(), + /*dx=*/backward_input_data->opaque())); if (is_profiling) { if (!timer->Stop(AsCUDAStream(stream))) { return port::Status(port::error::INTERNAL, "Failed to stop timer"); } - output_profile_result->set_algorithm(algo_desc); + output_profile_result->set_algorithm(algorithm_desc); 
output_profile_result->set_elapsed_time_in_ms( timer->GetElapsedMilliseconds()); - output_profile_result->set_scratch_size(scratch.size()); + output_profile_result->set_scratch_size(scratch_memory->size()); } return port::Status::OK(); } -bool CudnnSupport::DoConvolveBackwardData( +bool CudnnSupport::PrepareForConvolutionBackwardData( Stream* stream, const dnn::FilterDescriptor& filter_descriptor, const DeviceMemory& filter_data, const dnn::BatchDescriptor& output_descriptor, @@ -3173,17 +3281,17 @@ bool CudnnSupport::DoConvolveBackwardData( DeviceMemory* backward_input_data, ScratchAllocator* scratch_allocator, const dnn::AlgorithmConfig& algorithm_config, - dnn::ProfileResult* output_profile_result) { + dnn::AlgorithmDesc* algorithm_desc, DeviceMemory* scratch_memory) { return IsStatusOk( - DoConvolveBackwardDataImpl( + PrepareForConvolutionBackwardDataImpl( stream, filter_descriptor, filter_data, output_descriptor, backward_output_data, convolution_descriptor, input_descriptor, backward_input_data, dnn::DataType::kDouble, scratch_allocator, - algorithm_config, output_profile_result), - /*report_error=*/!output_profile_result); + algorithm_config, algorithm_desc, scratch_memory), + /*report_error=*/true); } -bool CudnnSupport::DoConvolveBackwardData( +bool CudnnSupport::PrepareForConvolutionBackwardData( Stream* stream, const dnn::FilterDescriptor& filter_descriptor, const DeviceMemory& filter_data, const dnn::BatchDescriptor& output_descriptor, @@ -3193,17 +3301,17 @@ bool CudnnSupport::DoConvolveBackwardData( DeviceMemory* backward_input_data, ScratchAllocator* scratch_allocator, const dnn::AlgorithmConfig& algorithm_config, - dnn::ProfileResult* output_profile_result) { + dnn::AlgorithmDesc* algorithm_desc, DeviceMemory* scratch_memory) { return IsStatusOk( - DoConvolveBackwardDataImpl( + PrepareForConvolutionBackwardDataImpl( stream, filter_descriptor, filter_data, output_descriptor, backward_output_data, convolution_descriptor, input_descriptor, 
backward_input_data, dnn::DataType::kFloat, scratch_allocator, - algorithm_config, output_profile_result), - /*report_error=*/!output_profile_result); + algorithm_config, algorithm_desc, scratch_memory), + /*report_error=*/true); } -bool CudnnSupport::DoConvolveBackwardData( +bool CudnnSupport::PrepareForConvolutionBackwardData( Stream* stream, const dnn::FilterDescriptor& filter_descriptor, const DeviceMemory& filter_data, const dnn::BatchDescriptor& output_descriptor, @@ -3213,22 +3321,86 @@ bool CudnnSupport::DoConvolveBackwardData( DeviceMemory* backward_input_data, ScratchAllocator* scratch_allocator, const dnn::AlgorithmConfig& algorithm_config, - dnn::ProfileResult* output_profile_result) { + dnn::AlgorithmDesc* algorithm_desc, DeviceMemory* scratch_memory) { dnn::DataType acc_type = CudnnEnvVar::IsEnabled() ? dnn::DataType::kFloat : dnn::DataType::kHalf; return IsStatusOk( - DoConvolveBackwardDataImpl( + PrepareForConvolutionBackwardDataImpl( stream, filter_descriptor, filter_data, output_descriptor, backward_output_data, convolution_descriptor, input_descriptor, backward_input_data, acc_type, scratch_allocator, algorithm_config, - output_profile_result), + algorithm_desc, scratch_memory), + /*report_error=*/true); +} + +bool CudnnSupport::DoConvolveBackwardData( + Stream* stream, const dnn::FilterDescriptor& filter_descriptor, + const DeviceMemory& filter_data, + const dnn::BatchDescriptor& output_descriptor, + DeviceMemory backward_output_data, + const dnn::ConvolutionDescriptor& convolution_descriptor, + const dnn::BatchDescriptor& input_descriptor, + DeviceMemory* backward_input_data, + const dnn::AlgorithmDesc& algorithm_desc, + DeviceMemory* scratch_memory, + dnn::ProfileResult* output_profile_result) { + return IsStatusOk( + DoConvolveBackwardDataImpl( + stream, filter_descriptor, filter_data, output_descriptor, + backward_output_data, convolution_descriptor, input_descriptor, + backward_input_data, dnn::DataType::kDouble, algorithm_desc, + 
scratch_memory, output_profile_result), + /*report_error=*/!output_profile_result); +} + +bool CudnnSupport::DoConvolveBackwardData( + Stream* stream, const dnn::FilterDescriptor& filter_descriptor, + const DeviceMemory& filter_data, + const dnn::BatchDescriptor& output_descriptor, + DeviceMemory backward_output_data, + const dnn::ConvolutionDescriptor& convolution_descriptor, + const dnn::BatchDescriptor& input_descriptor, + DeviceMemory* backward_input_data, + const dnn::AlgorithmDesc& algorithm_desc, + DeviceMemory* scratch_memory, + dnn::ProfileResult* output_profile_result) { + return IsStatusOk( + DoConvolveBackwardDataImpl( + stream, filter_descriptor, filter_data, output_descriptor, + backward_output_data, convolution_descriptor, input_descriptor, + backward_input_data, dnn::DataType::kFloat, algorithm_desc, + scratch_memory, output_profile_result), + /*report_error=*/!output_profile_result); +} + +bool CudnnSupport::DoConvolveBackwardData( + Stream* stream, const dnn::FilterDescriptor& filter_descriptor, + const DeviceMemory& filter_data, + const dnn::BatchDescriptor& output_descriptor, + DeviceMemory backward_output_data, + const dnn::ConvolutionDescriptor& convolution_descriptor, + const dnn::BatchDescriptor& input_descriptor, + DeviceMemory* backward_input_data, + const dnn::AlgorithmDesc& algorithm_desc, + DeviceMemory* scratch_memory, + dnn::ProfileResult* output_profile_result) { + dnn::DataType acc_type = + CudnnEnvVar::IsEnabled() + ? 
dnn::DataType::kFloat + : dnn::DataType::kHalf; + return IsStatusOk( + DoConvolveBackwardDataImpl(stream, filter_descriptor, filter_data, + output_descriptor, backward_output_data, + convolution_descriptor, input_descriptor, + backward_input_data, acc_type, algorithm_desc, + scratch_memory, output_profile_result), /*report_error=*/!output_profile_result); } template -port::Status CudnnSupport::DoConvolveBackwardFilterImpl( +port::Status CudnnSupport::PrepareForConvolutionBackwardFilterImpl( Stream* stream, const dnn::BatchDescriptor& input_descriptor, const DeviceMemory& input_data, const dnn::BatchDescriptor& output_descriptor, @@ -3238,6 +3410,36 @@ port::Status CudnnSupport::DoConvolveBackwardFilterImpl( DeviceMemory* backward_filter_data, dnn::DataType accumulator_type, ScratchAllocator* scratch_allocator, const dnn::AlgorithmConfig& algorithm_config, + dnn::AlgorithmDesc* algorithm_desc, DeviceMemory* scratch_memory) { + cudnnDataType_t cudnn_type = GetCudnnDataType(); + auto cudnn = cudnn_->GetHandle(parent_, stream); + + CudnnTensorDescriptor out_back_nd(output_descriptor, cudnn_type); + CudnnTensorDescriptor input_nd(input_descriptor, cudnn_type); + CudnnFilterDescriptor filter(filter_descriptor, cudnn_type); + CudnnConvolutionDescriptor conv(convolution_descriptor, + ToCudnnDataType(accumulator_type)); + + SE_ASSIGN_OR_RETURN( + *algorithm_desc, + GetCudnnConvolutionBackwardFilterAlgorithm( + stream, cudnn, algorithm_config, input_nd, filter, conv, out_back_nd, + scratch_allocator, scratch_memory)); + + return port::Status::OK(); +} + +template +port::Status CudnnSupport::DoConvolveBackwardFilterImpl( + Stream* stream, const dnn::BatchDescriptor& input_descriptor, + const DeviceMemory& input_data, + const dnn::BatchDescriptor& output_descriptor, + DeviceMemory backward_output_data, + const dnn::ConvolutionDescriptor& convolution_descriptor, + const dnn::FilterDescriptor& filter_descriptor, + DeviceMemory* backward_filter_data, dnn::DataType 
accumulator_type, + const dnn::AlgorithmDesc& algorithm_desc, + DeviceMemory* scratch_memory, dnn::ProfileResult* output_profile_result) { cudnnDataType_t cudnn_type = GetCudnnDataType(); // Alpha is the scaling factor for input. @@ -3261,12 +3463,6 @@ port::Status CudnnSupport::DoConvolveBackwardFilterImpl( const bool is_profiling = output_profile_result != nullptr; - DeviceMemory scratch; - SE_ASSIGN_OR_RETURN(dnn::AlgorithmDesc algo_desc, - GetCudnnConvolutionBackwardFilterAlgorithm( - stream, cudnn, algorithm_config, input_nd, filter, - conv, out_back_nd, scratch_allocator, &scratch)); - std::unique_ptr timer; if (is_profiling) { timer.reset(new CUDATimer(parent_)); // NOLINT @@ -3282,7 +3478,8 @@ port::Status CudnnSupport::DoConvolveBackwardFilterImpl( // results. See nvbugs/2072856 if (CUDNN_VERSION < 7300) { SE_RETURN_IF_ERROR([&] { - if (algo_desc.algo_id() != CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING) { + if (algorithm_desc.algo_id() != + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING) { return port::Status::OK(); } if (output_descriptor.height() > 1 && output_descriptor.width() > 1) { @@ -3308,7 +3505,8 @@ port::Status CudnnSupport::DoConvolveBackwardFilterImpl( }()); } - if (algo_desc.algo_id() == CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED && + if (algorithm_desc.algo_id() == + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED && !ShouldIncludeWinogradNonfusedAlgo(input_descriptor, output_descriptor)) { return port::Status(port::error::FAILED_PRECONDITION, "This configuration has potential integer overflow in " @@ -3324,7 +3522,7 @@ port::Status CudnnSupport::DoConvolveBackwardFilterImpl( // // See nvbugs/2379553. 
if (CUDNN_VERSION >= 7100 && CUDNN_VERSION < 7300 && - algo_desc.algo_id() == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 && + algorithm_desc.algo_id() == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 && cudnn_type == CUDNN_DATA_HALF && input_descriptor.layout() == dnn::DataLayout::kBatchYXDepth && filter_descriptor.layout() == dnn::FilterLayout::kOutputYXInput && @@ -3342,9 +3540,9 @@ port::Status CudnnSupport::DoConvolveBackwardFilterImpl( /*diffDesc=*/out_back_nd.handle(), /*diffData=*/backward_output_data.opaque(), /*convDesc=*/conv.handle(), - /*algo=*/ToConvBackwardFilterAlgo(algo_desc), - /*workSpace=*/scratch.opaque(), - /*workSpaceSizeInBytes=*/scratch.size(), + /*algo=*/ToConvBackwardFilterAlgo(algorithm_desc), + /*workSpace=*/scratch_memory->opaque(), + /*workSpaceSizeInBytes=*/scratch_memory->size(), /*beta=*/beta, /*gradDesc=*/filter.handle(), /*dw=*/backward_filter_data->opaque())); @@ -3352,16 +3550,16 @@ port::Status CudnnSupport::DoConvolveBackwardFilterImpl( if (!timer->Stop(AsCUDAStream(stream))) { return port::Status(port::error::INTERNAL, "Failed to stop timer"); } - output_profile_result->set_algorithm(algo_desc); + output_profile_result->set_algorithm(algorithm_desc); output_profile_result->set_elapsed_time_in_ms( timer->GetElapsedMilliseconds()); - output_profile_result->set_scratch_size(scratch.size()); + output_profile_result->set_scratch_size(scratch_memory->size()); } return port::Status::OK(); } -bool CudnnSupport::DoConvolveBackwardFilter( +bool CudnnSupport::PrepareForConvolutionBackwardFilter( Stream* stream, const dnn::BatchDescriptor& input_descriptor, const DeviceMemory& input_data, const dnn::BatchDescriptor& output_descriptor, @@ -3371,18 +3569,17 @@ bool CudnnSupport::DoConvolveBackwardFilter( DeviceMemory* backward_filter_data, ScratchAllocator* scratch_allocator, const dnn::AlgorithmConfig& algorithm_config, - dnn::ProfileResult* output_profile_result) { + dnn::AlgorithmDesc* algorithm_desc, DeviceMemory* scratch_memory) { return IsStatusOk( - 
DoConvolveBackwardFilterImpl( + PrepareForConvolutionBackwardFilterImpl( stream, input_descriptor, input_data, output_descriptor, backward_output_data, convolution_descriptor, filter_descriptor, - backward_filter_data, dnn::DataType::kDouble, - - scratch_allocator, algorithm_config, output_profile_result), - /*report_error=*/!output_profile_result); + backward_filter_data, dnn::DataType::kDouble, scratch_allocator, + algorithm_config, algorithm_desc, scratch_memory), + /*report_error=*/true); } -bool CudnnSupport::DoConvolveBackwardFilter( +bool CudnnSupport::PrepareForConvolutionBackwardFilter( Stream* stream, const dnn::BatchDescriptor& input_descriptor, const DeviceMemory& input_data, const dnn::BatchDescriptor& output_descriptor, @@ -3392,18 +3589,17 @@ bool CudnnSupport::DoConvolveBackwardFilter( DeviceMemory* backward_filter_data, ScratchAllocator* scratch_allocator, const dnn::AlgorithmConfig& algorithm_config, - dnn::ProfileResult* output_profile_result) { - return IsStatusOk(DoConvolveBackwardFilterImpl( - stream, input_descriptor, input_data, output_descriptor, - backward_output_data, convolution_descriptor, - filter_descriptor, backward_filter_data, - - dnn::DataType::kFloat, scratch_allocator, - algorithm_config, output_profile_result), - /*report_error=*/!output_profile_result); + dnn::AlgorithmDesc* algorithm_desc, DeviceMemory* scratch_memory) { + return IsStatusOk( + PrepareForConvolutionBackwardFilterImpl( + stream, input_descriptor, input_data, output_descriptor, + backward_output_data, convolution_descriptor, filter_descriptor, + backward_filter_data, dnn::DataType::kFloat, scratch_allocator, + algorithm_config, algorithm_desc, scratch_memory), + /*report_error=*/true); } -bool CudnnSupport::DoConvolveBackwardFilter( +bool CudnnSupport::PrepareForConvolutionBackwardFilter( Stream* stream, const dnn::BatchDescriptor& input_descriptor, const DeviceMemory& input_data, const dnn::BatchDescriptor& output_descriptor, @@ -3413,20 +3609,83 @@ bool 
CudnnSupport::DoConvolveBackwardFilter( DeviceMemory* backward_filter_data, ScratchAllocator* scratch_allocator, const dnn::AlgorithmConfig& algorithm_config, - dnn::ProfileResult* output_profile_result) { + dnn::AlgorithmDesc* algorithm_desc, DeviceMemory* scratch_memory) { dnn::DataType acc_type = CudnnEnvVar::IsEnabled() ? dnn::DataType::kFloat : dnn::DataType::kHalf; return IsStatusOk( - DoConvolveBackwardFilterImpl( + PrepareForConvolutionBackwardFilterImpl( stream, input_descriptor, input_data, output_descriptor, backward_output_data, convolution_descriptor, filter_descriptor, backward_filter_data, acc_type, scratch_allocator, algorithm_config, - output_profile_result), + algorithm_desc, scratch_memory), + /*report_error=*/true); +} + +bool CudnnSupport::DoConvolveBackwardFilter( + Stream* stream, const dnn::BatchDescriptor& input_descriptor, + const DeviceMemory& input_data, + const dnn::BatchDescriptor& output_descriptor, + DeviceMemory backward_output_data, + const dnn::ConvolutionDescriptor& convolution_descriptor, + const dnn::FilterDescriptor& filter_descriptor, + DeviceMemory* backward_filter_data, + const dnn::AlgorithmDesc& algorithm_desc, + DeviceMemory* scratch_memory, + dnn::ProfileResult* output_profile_result) { + return IsStatusOk( + DoConvolveBackwardFilterImpl( + stream, input_descriptor, input_data, output_descriptor, + backward_output_data, convolution_descriptor, filter_descriptor, + backward_filter_data, dnn::DataType::kDouble, algorithm_desc, + scratch_memory, output_profile_result), /*report_error=*/!output_profile_result); } +bool CudnnSupport::DoConvolveBackwardFilter( + Stream* stream, const dnn::BatchDescriptor& input_descriptor, + const DeviceMemory& input_data, + const dnn::BatchDescriptor& output_descriptor, + DeviceMemory backward_output_data, + const dnn::ConvolutionDescriptor& convolution_descriptor, + const dnn::FilterDescriptor& filter_descriptor, + DeviceMemory* backward_filter_data, + const dnn::AlgorithmDesc& 
algorithm_desc, + DeviceMemory* scratch_memory, + dnn::ProfileResult* output_profile_result) { + return IsStatusOk( + DoConvolveBackwardFilterImpl( + stream, input_descriptor, input_data, output_descriptor, + backward_output_data, convolution_descriptor, filter_descriptor, + backward_filter_data, dnn::DataType::kFloat, algorithm_desc, + scratch_memory, output_profile_result), + /*report_error=*/!output_profile_result); +} + +bool CudnnSupport::DoConvolveBackwardFilter( + Stream* stream, const dnn::BatchDescriptor& input_descriptor, + const DeviceMemory& input_data, + const dnn::BatchDescriptor& output_descriptor, + DeviceMemory backward_output_data, + const dnn::ConvolutionDescriptor& convolution_descriptor, + const dnn::FilterDescriptor& filter_descriptor, + DeviceMemory* backward_filter_data, + const dnn::AlgorithmDesc& algorithm_desc, + DeviceMemory* scratch_memory, + dnn::ProfileResult* output_profile_result) { + dnn::DataType acc_type = + CudnnEnvVar::IsEnabled() + ? dnn::DataType::kFloat + : dnn::DataType::kHalf; + return IsStatusOk(DoConvolveBackwardFilterImpl( + stream, input_descriptor, input_data, output_descriptor, + backward_output_data, convolution_descriptor, + filter_descriptor, backward_filter_data, acc_type, + algorithm_desc, scratch_memory, output_profile_result), + /*report_error=*/!output_profile_result); +} + template port::Status CudnnSupport::DoConvolveBackwardBiasImpl( Stream* stream, const dnn::BatchDescriptor& input_descriptor, diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h index 0641be140d..0b8f4d035c 100644 --- a/tensorflow/stream_executor/cuda/cuda_dnn.h +++ b/tensorflow/stream_executor/cuda/cuda_dnn.h @@ -252,6 +252,43 @@ class CudnnSupport : public dnn::DnnSupport { DeviceMemory* scale_backprop, DeviceMemory* offset_backprop) override; + bool PrepareForConvolution( + Stream* stream, const dnn::BatchDescriptor& batch_descriptor, + const DeviceMemory& input_data, + const 
dnn::FilterDescriptor& filter_descriptor, + const DeviceMemory& filter_data, + const dnn::ConvolutionDescriptor& convolution_descriptor, + const dnn::BatchDescriptor& output_descriptor, + DeviceMemory* output_data, ScratchAllocator* scratch_allocator, + const dnn::AlgorithmConfig& algorithm_config, + dnn::AlgorithmDesc* algorithm_desc, + DeviceMemory* scratch_memory) override; + + bool PrepareForConvolution( + Stream* stream, const dnn::BatchDescriptor& batch_descriptor, + const DeviceMemory& input_data, + const dnn::FilterDescriptor& filter_descriptor, + const DeviceMemory& filter_data, + const dnn::ConvolutionDescriptor& convolution_descriptor, + const dnn::BatchDescriptor& output_descriptor, + DeviceMemory* output_data, ScratchAllocator* scratch_allocator, + const dnn::AlgorithmConfig& algorithm_config, + dnn::AlgorithmDesc* algorithm_desc, + DeviceMemory* scratch_memory) override; + + bool PrepareForConvolution( + Stream* stream, const dnn::BatchDescriptor& batch_descriptor, + const DeviceMemory& input_data, + const dnn::FilterDescriptor& filter_descriptor, + const DeviceMemory& filter_data, + const dnn::ConvolutionDescriptor& convolution_descriptor, + const dnn::BatchDescriptor& output_descriptor, + DeviceMemory* output_data, + ScratchAllocator* scratch_allocator, + const dnn::AlgorithmConfig& algorithm_config, + dnn::AlgorithmDesc* algorithm_desc, + DeviceMemory* scratch_memory) override; + bool DoConvolve(Stream* stream, const dnn::BatchDescriptor& batch_descriptor, const DeviceMemory& input_data, const dnn::FilterDescriptor& filter_descriptor, @@ -259,8 +296,8 @@ class CudnnSupport : public dnn::DnnSupport { const dnn::ConvolutionDescriptor& convolution_descriptor, const dnn::BatchDescriptor& output_descriptor, DeviceMemory* output_data, - ScratchAllocator* scratch_allocator, - const dnn::AlgorithmConfig& algorithm_config, + const dnn::AlgorithmDesc& algorithm_desc, + DeviceMemory* scratch_memory, dnn::ProfileResult* output_profile_result) override; bool 
DoConvolve(Stream* stream, const dnn::BatchDescriptor& batch_descriptor, @@ -270,8 +307,8 @@ class CudnnSupport : public dnn::DnnSupport { const dnn::ConvolutionDescriptor& convolution_descriptor, const dnn::BatchDescriptor& output_descriptor, DeviceMemory* output_data, - ScratchAllocator* scratch_allocator, - const dnn::AlgorithmConfig& algorithm_config, + const dnn::AlgorithmDesc& algorithm_desc, + DeviceMemory* scratch_memory, dnn::ProfileResult* output_profile_result) override; bool DoConvolve(Stream* stream, const dnn::BatchDescriptor& batch_descriptor, @@ -281,8 +318,8 @@ class CudnnSupport : public dnn::DnnSupport { const dnn::ConvolutionDescriptor& convolution_descriptor, const dnn::BatchDescriptor& output_descriptor, DeviceMemory* output_data, - ScratchAllocator* scratch_allocator, - const dnn::AlgorithmConfig& algorithm_config, + const dnn::AlgorithmDesc& algorithm_desc, + DeviceMemory* scratch_memory, dnn::ProfileResult* output_profile_result) override; bool DoFusedConvolve( @@ -384,7 +421,20 @@ class CudnnSupport : public dnn::DnnSupport { return false; } - bool DoConvolveBackwardData( + bool PrepareForConvolutionBackwardData( + Stream* stream, const dnn::FilterDescriptor& filter_descriptor, + const DeviceMemory& filter_data, + const dnn::BatchDescriptor& output_descriptor, + DeviceMemory backward_output_data, + const dnn::ConvolutionDescriptor& convolution_descriptor, + const dnn::BatchDescriptor& input_descriptor, + DeviceMemory* backward_input_data, + ScratchAllocator* scratch_allocator, + const dnn::AlgorithmConfig& algorithm_config, + dnn::AlgorithmDesc* algorithm_desc, + DeviceMemory* scratch_memory) override; + + bool PrepareForConvolutionBackwardData( Stream* stream, const dnn::FilterDescriptor& filter_descriptor, const DeviceMemory& filter_data, const dnn::BatchDescriptor& output_descriptor, @@ -394,6 +444,32 @@ class CudnnSupport : public dnn::DnnSupport { DeviceMemory* backward_input_data, ScratchAllocator* scratch_allocator, const 
dnn::AlgorithmConfig& algorithm_config, + dnn::AlgorithmDesc* algorithm_desc, + DeviceMemory* scratch_memory) override; + + bool PrepareForConvolutionBackwardData( + Stream* stream, const dnn::FilterDescriptor& filter_descriptor, + const DeviceMemory& filter_data, + const dnn::BatchDescriptor& output_descriptor, + DeviceMemory backward_output_data, + const dnn::ConvolutionDescriptor& convolution_descriptor, + const dnn::BatchDescriptor& input_descriptor, + DeviceMemory* backward_input_data, + ScratchAllocator* scratch_allocator, + const dnn::AlgorithmConfig& algorithm_config, + dnn::AlgorithmDesc* algorithm_desc, + DeviceMemory* scratch_memory) override; + + bool DoConvolveBackwardData( + Stream* stream, const dnn::FilterDescriptor& filter_descriptor, + const DeviceMemory& filter_data, + const dnn::BatchDescriptor& output_descriptor, + DeviceMemory backward_output_data, + const dnn::ConvolutionDescriptor& convolution_descriptor, + const dnn::BatchDescriptor& input_descriptor, + DeviceMemory* backward_input_data, + const dnn::AlgorithmDesc& algorithm_desc, + DeviceMemory* scratch_memory, dnn::ProfileResult* output_profile_result) override; bool DoConvolveBackwardData( @@ -404,8 +480,8 @@ class CudnnSupport : public dnn::DnnSupport { const dnn::ConvolutionDescriptor& convolution_descriptor, const dnn::BatchDescriptor& input_descriptor, DeviceMemory* backward_input_data, - ScratchAllocator* scratch_allocator, - const dnn::AlgorithmConfig& algorithm_config, + const dnn::AlgorithmDesc& algorithm_desc, + DeviceMemory* scratch_memory, dnn::ProfileResult* output_profile_result) override; bool DoConvolveBackwardData( @@ -416,11 +492,11 @@ class CudnnSupport : public dnn::DnnSupport { const dnn::ConvolutionDescriptor& convolution_descriptor, const dnn::BatchDescriptor& input_descriptor, DeviceMemory* backward_input_data, - ScratchAllocator* scratch_allocator, - const dnn::AlgorithmConfig& algorithm_config, + const dnn::AlgorithmDesc& algorithm_desc, + DeviceMemory* 
scratch_memory, dnn::ProfileResult* output_profile_result) override; - bool DoConvolveBackwardFilter( + bool PrepareForConvolutionBackwardFilter( Stream* stream, const dnn::BatchDescriptor& input_descriptor, const DeviceMemory& input_data, const dnn::BatchDescriptor& output_descriptor, @@ -430,9 +506,10 @@ class CudnnSupport : public dnn::DnnSupport { DeviceMemory* backward_filter_data, ScratchAllocator* scratch_allocator, const dnn::AlgorithmConfig& algorithm_config, - dnn::ProfileResult* output_profile_result) override; + dnn::AlgorithmDesc* algorithm_desc, + DeviceMemory* scratch_memory) override; - bool DoConvolveBackwardFilter( + bool PrepareForConvolutionBackwardFilter( Stream* stream, const dnn::BatchDescriptor& input_descriptor, const DeviceMemory& input_data, const dnn::BatchDescriptor& output_descriptor, @@ -442,9 +519,10 @@ class CudnnSupport : public dnn::DnnSupport { DeviceMemory* backward_filter_data, ScratchAllocator* scratch_allocator, const dnn::AlgorithmConfig& algorithm_config, - dnn::ProfileResult* output_profile_result) override; + dnn::AlgorithmDesc* algorithm_desc, + DeviceMemory* scratch_memory) override; - bool DoConvolveBackwardFilter( + bool PrepareForConvolutionBackwardFilter( Stream* stream, const dnn::BatchDescriptor& input_descriptor, const DeviceMemory& input_data, const dnn::BatchDescriptor& output_descriptor, @@ -454,6 +532,43 @@ class CudnnSupport : public dnn::DnnSupport { DeviceMemory* backward_filter_data, ScratchAllocator* scratch_allocator, const dnn::AlgorithmConfig& algorithm_config, + dnn::AlgorithmDesc* algorithm_desc, + DeviceMemory* scratch_memory) override; + + bool DoConvolveBackwardFilter( + Stream* stream, const dnn::BatchDescriptor& input_descriptor, + const DeviceMemory& input_data, + const dnn::BatchDescriptor& output_descriptor, + DeviceMemory backward_output_data, + const dnn::ConvolutionDescriptor& convolution_descriptor, + const dnn::FilterDescriptor& filter_descriptor, + DeviceMemory* backward_filter_data, + 
const dnn::AlgorithmDesc& algorithm_desc, + DeviceMemory* scratch_memory, + dnn::ProfileResult* output_profile_result) override; + + bool DoConvolveBackwardFilter( + Stream* stream, const dnn::BatchDescriptor& input_descriptor, + const DeviceMemory& input_data, + const dnn::BatchDescriptor& output_descriptor, + DeviceMemory backward_output_data, + const dnn::ConvolutionDescriptor& convolution_descriptor, + const dnn::FilterDescriptor& filter_descriptor, + DeviceMemory* backward_filter_data, + const dnn::AlgorithmDesc& algorithm_desc, + DeviceMemory* scratch_memory, + dnn::ProfileResult* output_profile_result) override; + + bool DoConvolveBackwardFilter( + Stream* stream, const dnn::BatchDescriptor& input_descriptor, + const DeviceMemory& input_data, + const dnn::BatchDescriptor& output_descriptor, + DeviceMemory backward_output_data, + const dnn::ConvolutionDescriptor& convolution_descriptor, + const dnn::FilterDescriptor& filter_descriptor, + DeviceMemory* backward_filter_data, + const dnn::AlgorithmDesc& algorithm_desc, + DeviceMemory* scratch_memory, dnn::ProfileResult* output_profile_result) override; bool DoConvolveBackwardBias( @@ -663,8 +778,8 @@ class CudnnSupport : public dnn::DnnSupport { DeviceMemory* offset_backprop); template - port::Status DoConvolveImpl( - Stream* stream, const dnn::BatchDescriptor& input_descriptor, + port::Status PrepareForConvolutionImpl( + Stream* stream, const dnn::BatchDescriptor& batch_descriptor, const DeviceMemory& input_data, const dnn::FilterDescriptor& filter_descriptor, const DeviceMemory& filter_data, @@ -673,6 +788,19 @@ class CudnnSupport : public dnn::DnnSupport { DeviceMemory* output_data, dnn::DataType accumulator_type, ScratchAllocator* scratch_allocator, const dnn::AlgorithmConfig& algorithm_config, + dnn::AlgorithmDesc* algorithm_desc, DeviceMemory* scratch_memory); + + template + port::Status DoConvolveImpl( + Stream* stream, const dnn::BatchDescriptor& input_descriptor, + const DeviceMemory& input_data, + 
const dnn::FilterDescriptor& filter_descriptor, + const DeviceMemory& filter_data, + const dnn::ConvolutionDescriptor& convolution_descriptor, + const dnn::BatchDescriptor& output_descriptor, + DeviceMemory* output_data, dnn::DataType accumulator_type, + const dnn::AlgorithmDesc& algorithm_desc, + DeviceMemory* scratch_memory, dnn::ProfileResult* output_profile_result); template @@ -693,7 +821,7 @@ class CudnnSupport : public dnn::DnnSupport { dnn::ProfileResult* output_profile_result); template - port::Status DoConvolveBackwardDataImpl( + port::Status PrepareForConvolutionBackwardDataImpl( Stream* stream, const dnn::FilterDescriptor& filter_descriptor, const DeviceMemory& filter_data, const dnn::BatchDescriptor& output_descriptor, @@ -703,19 +831,45 @@ class CudnnSupport : public dnn::DnnSupport { DeviceMemory* backward_input_data, dnn::DataType accumulator_type, ScratchAllocator* scratch_allocator, const dnn::AlgorithmConfig& algorithm_config, + dnn::AlgorithmDesc* algorithm_desc, DeviceMemory* scratch_memory); + + template + port::Status DoConvolveBackwardDataImpl( + Stream* stream, const dnn::FilterDescriptor& filter_descriptor, + const DeviceMemory& filter_data, + const dnn::BatchDescriptor& output_descriptor, + DeviceMemory backward_output_data, + const dnn::ConvolutionDescriptor& convolution_descriptor, + const dnn::BatchDescriptor& input_descriptor, + DeviceMemory* backward_input_data, dnn::DataType accumulator_type, + const dnn::AlgorithmDesc& algorithm_desc, + DeviceMemory* scratch_memory, dnn::ProfileResult* output_profile_result); template - port::Status DoConvolveBackwardFilterImpl( + port::Status PrepareForConvolutionBackwardFilterImpl( Stream* stream, const dnn::BatchDescriptor& input_descriptor, const DeviceMemory& input_data, - const dnn::BatchDescriptor& output_descriptor, + const dnn::BatchDescriptor& output_descriptor_in, DeviceMemory backward_output_data, const dnn::ConvolutionDescriptor& convolution_descriptor, const dnn::FilterDescriptor& 
filter_descriptor, DeviceMemory* backward_filter_data, dnn::DataType accumulator_type, ScratchAllocator* scratch_allocator, const dnn::AlgorithmConfig& algorithm_config, + dnn::AlgorithmDesc* algorithm_desc, DeviceMemory* scratch_memory); + + template + port::Status DoConvolveBackwardFilterImpl( + Stream* stream, const dnn::BatchDescriptor& input_descriptor, + const DeviceMemory& input_data, + const dnn::BatchDescriptor& output_descriptor, + DeviceMemory backward_output_data, + const dnn::ConvolutionDescriptor& convolution_descriptor, + const dnn::FilterDescriptor& filter_descriptor, + DeviceMemory* backward_filter_data, dnn::DataType accumulator_type, + const dnn::AlgorithmDesc& algorithm_desc, + DeviceMemory* scratch_memory, dnn::ProfileResult* output_profile_result); template diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h index c044a356ef..322232d263 100644 --- a/tensorflow/stream_executor/dnn.h +++ b/tensorflow/stream_executor/dnn.h @@ -730,6 +730,7 @@ class PoolingDescriptor { class AlgorithmDesc { public: typedef int64 Index; + AlgorithmDesc() : AlgorithmDesc(0, false) {} AlgorithmDesc(Index a, bool use_tensor_ops) { proto_.set_algo_id(a); proto_.set_math_type(use_tensor_ops ? 
AlgorithmProto::TENSOR_OP_MATH @@ -1175,6 +1176,43 @@ class DnnSupport { return false; } + virtual bool PrepareForConvolution( + Stream* stream, const BatchDescriptor& batch_descriptor, + const DeviceMemory& input_data, + const FilterDescriptor& filter_descriptor, + const DeviceMemory& filter_data, + const ConvolutionDescriptor& convolution_descriptor, + const BatchDescriptor& output_descriptor, + DeviceMemory* output_data, ScratchAllocator* scratch_allocator, + const dnn::AlgorithmConfig& algorithm_config, + dnn::AlgorithmDesc* algorithm_desc, + DeviceMemory* scratch_memory) = 0; + + virtual bool PrepareForConvolution( + Stream* stream, const BatchDescriptor& batch_descriptor, + const DeviceMemory& input_data, + const FilterDescriptor& filter_descriptor, + const DeviceMemory& filter_data, + const ConvolutionDescriptor& convolution_descriptor, + const BatchDescriptor& output_descriptor, + DeviceMemory* output_data, ScratchAllocator* scratch_allocator, + const dnn::AlgorithmConfig& algorithm_config, + dnn::AlgorithmDesc* algorithm_desc, + DeviceMemory* scratch_memory) = 0; + + virtual bool PrepareForConvolution( + Stream* stream, const BatchDescriptor& batch_descriptor, + const DeviceMemory& input_data, + const FilterDescriptor& filter_descriptor, + const DeviceMemory& filter_data, + const ConvolutionDescriptor& convolution_descriptor, + const BatchDescriptor& output_descriptor, + DeviceMemory* output_data, + ScratchAllocator* scratch_allocator, + const dnn::AlgorithmConfig& algorithm_config, + dnn::AlgorithmDesc* algorithm_desc, + DeviceMemory* scratch_memory) = 0; + // Enqueues a single-precision convolution operation onto the stream. // // Arguments (all borrowed): @@ -1188,10 +1226,10 @@ class DnnSupport { // output_descriptor: dimensions of the output layer. // output_data: un-owned device memory region in which to place the // convolution result. 
- // scratch_allocator: un-owned, may-be-null object that may allocate scratch - // space in order to speed up the convolution operation. - // algorithm_config: specifies which algorithm should be used for the + // algorithm_desc: specifies which algorithm should be used for the // operation. + // scratch: un-owned device memory for scratch space in order to speed up + // the convolution operation. // output_profile_result: the output profile result for this call. The // profiling is only enabled when this is not nullptr. // @@ -1216,8 +1254,9 @@ class DnnSupport { const DeviceMemory& filter_data, const dnn::ConvolutionDescriptor& convolution_descriptor, const dnn::BatchDescriptor& output_descriptor, - DeviceMemory* output_data, ScratchAllocator* scratch_allocator, - const dnn::AlgorithmConfig& algorithm_config, + DeviceMemory* output_data, + const dnn::AlgorithmDesc& algorithm_desc, + DeviceMemory* scratch_memory, ProfileResult* output_profile_result) = 0; // Enqueues a double-precision convolution operation onto the stream. @@ -1229,8 +1268,9 @@ class DnnSupport { const DeviceMemory& filter_data, const dnn::ConvolutionDescriptor& convolution_descriptor, const dnn::BatchDescriptor& output_descriptor, - DeviceMemory* output_data, ScratchAllocator* scratch_allocator, - const dnn::AlgorithmConfig& algorithm_config, + DeviceMemory* output_data, + const dnn::AlgorithmDesc& algorithm_desc, + DeviceMemory* scratch_memory, dnn::ProfileResult* output_profile_result) = 0; // Enqueues a half-precision convolution operation onto the stream. 
@@ -1243,8 +1283,8 @@ class DnnSupport { const dnn::ConvolutionDescriptor& convolution_descriptor, const dnn::BatchDescriptor& output_descriptor, DeviceMemory* output_data, - ScratchAllocator* scratch_allocator, - const dnn::AlgorithmConfig& algorithm_config, + const dnn::AlgorithmDesc& algorithm_desc, + DeviceMemory* scratch_memory, ProfileResult* output_profile_result) = 0; // Return a list of algorithms supported by the forward convolution pass. @@ -1300,6 +1340,45 @@ class DnnSupport { const BatchDescriptor& output_descriptor, DeviceMemory* output_data) = 0; + virtual bool PrepareForConvolutionBackwardData( + Stream* stream, const dnn::FilterDescriptor& filter_descriptor, + const DeviceMemory& filter_data, + const dnn::BatchDescriptor& output_descriptor, + DeviceMemory backward_output_data, + const dnn::ConvolutionDescriptor& convolution_descriptor, + const dnn::BatchDescriptor& input_descriptor, + DeviceMemory* backward_input_data, + ScratchAllocator* scratch_allocator, + const dnn::AlgorithmConfig& algorithm_config, + dnn::AlgorithmDesc* algorithm_desc, + DeviceMemory* scratch_memory) = 0; + + virtual bool PrepareForConvolutionBackwardData( + Stream* stream, const dnn::FilterDescriptor& filter_descriptor, + const DeviceMemory& filter_data, + const dnn::BatchDescriptor& output_descriptor, + DeviceMemory backward_output_data, + const dnn::ConvolutionDescriptor& convolution_descriptor, + const dnn::BatchDescriptor& input_descriptor, + DeviceMemory* backward_input_data, + ScratchAllocator* scratch_allocator, + const dnn::AlgorithmConfig& algorithm_config, + dnn::AlgorithmDesc* algorithm_desc, + DeviceMemory* scratch_memory) = 0; + + virtual bool PrepareForConvolutionBackwardData( + Stream* stream, const dnn::FilterDescriptor& filter_descriptor, + const DeviceMemory& filter_data, + const dnn::BatchDescriptor& output_descriptor, + DeviceMemory backward_output_data, + const dnn::ConvolutionDescriptor& convolution_descriptor, + const dnn::BatchDescriptor& 
input_descriptor, + DeviceMemory* backward_input_data, + ScratchAllocator* scratch_allocator, + const dnn::AlgorithmConfig& algorithm_config, + dnn::AlgorithmDesc* algorithm_desc, + DeviceMemory* scratch_memory) = 0; + // Enqueues a single-precision backward convolution (for data) operation onto // the stream. // @@ -1319,15 +1398,15 @@ class DnnSupport { // scratch_allocator: un-owned, may-be-null object that may allocate scratch // space in order to speed up the convolution operation. virtual bool DoConvolveBackwardData( - Stream* stream, const FilterDescriptor& filter_descriptor, + Stream* stream, const dnn::FilterDescriptor& filter_descriptor, const DeviceMemory& filter_data, - const BatchDescriptor& output_descriptor, + const dnn::BatchDescriptor& output_descriptor, DeviceMemory backward_output_data, - const ConvolutionDescriptor& convolution_descriptor, - const BatchDescriptor& input_descriptor, + const dnn::ConvolutionDescriptor& convolution_descriptor, + const dnn::BatchDescriptor& input_descriptor, DeviceMemory* backward_input_data, - ScratchAllocator* scratch_allocator, - const dnn::AlgorithmConfig& algorithm_config, + const dnn::AlgorithmDesc& algorithm_desc, + DeviceMemory* scratch_memory, ProfileResult* output_profile_result) = 0; // Return a list of algorithms supported by the backward convolution pass for @@ -1337,28 +1416,67 @@ class DnnSupport { std::vector* out_algorithms); virtual bool DoConvolveBackwardData( - Stream* stream, const FilterDescriptor& filter_descriptor, + Stream* stream, const dnn::FilterDescriptor& filter_descriptor, const DeviceMemory& filter_data, - const BatchDescriptor& output_descriptor, + const dnn::BatchDescriptor& output_descriptor, DeviceMemory backward_output_data, - const ConvolutionDescriptor& convolution_descriptor, - const BatchDescriptor& input_descriptor, + const dnn::ConvolutionDescriptor& convolution_descriptor, + const dnn::BatchDescriptor& input_descriptor, DeviceMemory* backward_input_data, - 
ScratchAllocator* scratch_allocator, - const dnn::AlgorithmConfig& algorithm_config, + const dnn::AlgorithmDesc& algorithm_desc, + DeviceMemory* scratch_memory, ProfileResult* output_profile_result) = 0; virtual bool DoConvolveBackwardData( - Stream* stream, const FilterDescriptor& filter_descriptor, + Stream* stream, const dnn::FilterDescriptor& filter_descriptor, const DeviceMemory& filter_data, + const dnn::BatchDescriptor& output_descriptor, + DeviceMemory backward_output_data, + const dnn::ConvolutionDescriptor& convolution_descriptor, + const dnn::BatchDescriptor& input_descriptor, + DeviceMemory* backward_input_data, + const dnn::AlgorithmDesc& algorithm_desc, + DeviceMemory* scratch_memory, + ProfileResult* output_profile_result) = 0; + + virtual bool PrepareForConvolutionBackwardFilter( + Stream* stream, const BatchDescriptor& input_descriptor, + const DeviceMemory& input_data, + const BatchDescriptor& output_descriptor, + DeviceMemory backward_output_data, + const ConvolutionDescriptor& convolution_descriptor, + const FilterDescriptor& filter_descriptor, + DeviceMemory* backward_filter_data, + ScratchAllocator* scratch_allocator, + const dnn::AlgorithmConfig& algorithm_config, + dnn::AlgorithmDesc* algorithm_desc, + DeviceMemory* scratch_memory) = 0; + + virtual bool PrepareForConvolutionBackwardFilter( + Stream* stream, const BatchDescriptor& input_descriptor, + const DeviceMemory& input_data, + const BatchDescriptor& output_descriptor, + DeviceMemory backward_output_data, + const ConvolutionDescriptor& convolution_descriptor, + const FilterDescriptor& filter_descriptor, + DeviceMemory* backward_filter_data, + ScratchAllocator* scratch_allocator, + const dnn::AlgorithmConfig& algorithm_config, + dnn::AlgorithmDesc* algorithm_desc, + DeviceMemory* scratch_memory) = 0; + + virtual bool PrepareForConvolutionBackwardFilter( + Stream* stream, const BatchDescriptor& input_descriptor, + const DeviceMemory& input_data, const BatchDescriptor& output_descriptor, 
DeviceMemory backward_output_data, const ConvolutionDescriptor& convolution_descriptor, - const BatchDescriptor& input_descriptor, - DeviceMemory* backward_input_data, + const FilterDescriptor& filter_descriptor, + DeviceMemory* backward_filter_data, ScratchAllocator* scratch_allocator, const dnn::AlgorithmConfig& algorithm_config, - ProfileResult* output_profile_result) = 0; + dnn::AlgorithmDesc* algorithm_desc, + DeviceMemory* scratch_memory) = 0; // Enqueues a single-precision backward convolution (for filter) operation // onto the stream. @@ -1387,8 +1505,8 @@ class DnnSupport { const ConvolutionDescriptor& convolution_descriptor, const FilterDescriptor& filter_descriptor, DeviceMemory* backward_filter_data, - ScratchAllocator* scratch_allocator, - const dnn::AlgorithmConfig& algorithm_config, + const dnn::AlgorithmDesc& algorithm_desc, + DeviceMemory* scratch_memory, ProfileResult* output_profile_result) = 0; // Return a list of algorithms supported by the backward convolution pass for @@ -1405,8 +1523,8 @@ class DnnSupport { const ConvolutionDescriptor& convolution_descriptor, const FilterDescriptor& filter_descriptor, DeviceMemory* backward_filter_data, - ScratchAllocator* scratch_allocator, - const dnn::AlgorithmConfig& algorithm_config, + const dnn::AlgorithmDesc& algorithm_desc, + DeviceMemory* scratch_memory, ProfileResult* output_profile_result) = 0; virtual bool DoConvolveBackwardFilter( @@ -1417,8 +1535,8 @@ class DnnSupport { const ConvolutionDescriptor& convolution_descriptor, const FilterDescriptor& filter_descriptor, DeviceMemory* backward_filter_data, - ScratchAllocator* scratch_allocator, - const dnn::AlgorithmConfig& algorithm_config, + const dnn::AlgorithmDesc& algorithm_desc, + DeviceMemory* scratch_memory, ProfileResult* output_profile_result) = 0; // Enqueues a single-precision backward convolution (for bias) operation onto diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc index 3edc66cde8..4503127bee 
100644 --- a/tensorflow/stream_executor/stream.cc +++ b/tensorflow/stream_executor/stream.cc @@ -549,11 +549,16 @@ Stream &Stream::ThenConvolveWithScratch( if (ok()) { if (dnn::DnnSupport *dnn = parent_->AsDnn()) { - CheckError(dnn->DoConvolve( + DeviceMemory scratch_memory; + dnn::AlgorithmDesc algorithm_desc; + CheckError(dnn->PrepareForConvolution( this, input_descriptor, input_data, filter_descriptor, filter_data, convolution_descriptor, output_descriptor, output, scratch_allocator, - dnn::AlgorithmConfig(), - /*output_profile_result=*/nullptr)); + dnn::AlgorithmConfig(), &algorithm_desc, &scratch_memory)); + CheckError(dnn->DoConvolve( + this, input_descriptor, input_data, filter_descriptor, filter_data, + convolution_descriptor, output_descriptor, output, algorithm_desc, + &scratch_memory, nullptr)); } else { SetErrorAndLogNoDnnSupport(); } @@ -576,11 +581,16 @@ Stream &Stream::ThenConvolveWithScratch( if (ok()) { if (dnn::DnnSupport *dnn = parent_->AsDnn()) { - CheckError(dnn->DoConvolve( + DeviceMemory scratch_memory; + dnn::AlgorithmDesc algorithm_desc; + CheckError(dnn->PrepareForConvolution( this, input_descriptor, input_data, filter_descriptor, filter_data, convolution_descriptor, output_descriptor, output, scratch_allocator, - dnn::AlgorithmConfig(), - /*output_profile_result=*/nullptr)); + dnn::AlgorithmConfig(), &algorithm_desc, &scratch_memory)); + CheckError(dnn->DoConvolve( + this, input_descriptor, input_data, filter_descriptor, filter_data, + convolution_descriptor, output_descriptor, output, algorithm_desc, + &scratch_memory, nullptr)); } else { SetErrorAndLogNoDnnSupport(); } @@ -758,10 +768,18 @@ Stream &Stream::ThenConvolveWithAlgorithm( if (ok()) { if (dnn::DnnSupport *dnn = parent_->AsDnn()) { - auto status = dnn->DoConvolve( + DeviceMemory scratch_memory; + dnn::AlgorithmDesc algorithm_desc; + auto status = dnn->PrepareForConvolution( this, input_descriptor, input_data, filter_descriptor, filter_data, convolution_descriptor, 
output_descriptor, output, scratch_allocator, - algorithm_config, output_profile_result); + algorithm_config, &algorithm_desc, &scratch_memory); + if (status) { + status = dnn->DoConvolve( + this, input_descriptor, input_data, filter_descriptor, filter_data, + convolution_descriptor, output_descriptor, output, algorithm_desc, + &scratch_memory, output_profile_result); + } if (!status && !output_profile_result) { SetError(); } @@ -789,10 +807,18 @@ Stream &Stream::ThenConvolveWithAlgorithm( if (ok()) { if (dnn::DnnSupport *dnn = parent_->AsDnn()) { - auto status = dnn->DoConvolve( + DeviceMemory scratch_memory; + dnn::AlgorithmDesc algorithm_desc; + auto status = dnn->PrepareForConvolution( this, input_descriptor, input_data, filter_descriptor, filter_data, convolution_descriptor, output_descriptor, output, scratch_allocator, - algorithm_config, output_profile_result); + algorithm_config, &algorithm_desc, &scratch_memory); + if (status) { + status = dnn->DoConvolve( + this, input_descriptor, input_data, filter_descriptor, filter_data, + convolution_descriptor, output_descriptor, output, algorithm_desc, + &scratch_memory, output_profile_result); + } if (!status && !output_profile_result) { SetError(); } @@ -820,10 +846,18 @@ Stream &Stream::ThenConvolveWithAlgorithm( if (ok()) { if (dnn::DnnSupport *dnn = parent_->AsDnn()) { - auto status = dnn->DoConvolve( + DeviceMemory scratch_memory; + dnn::AlgorithmDesc algorithm_desc; + auto status = dnn->PrepareForConvolution( this, input_descriptor, input_data, filter_descriptor, filter_data, convolution_descriptor, output_descriptor, output, scratch_allocator, - algorithm_config, output_profile_result); + algorithm_config, &algorithm_desc, &scratch_memory); + if (status) { + status = dnn->DoConvolve( + this, input_descriptor, input_data, filter_descriptor, filter_data, + convolution_descriptor, output_descriptor, output, algorithm_desc, + &scratch_memory, output_profile_result); + } if (!status && !output_profile_result) { 
SetError(); } @@ -969,10 +1003,17 @@ Stream &Stream::ThenConvolveBackwardDataWithScratch( if (ok()) { if (dnn::DnnSupport *dnn = parent_->AsDnn()) { - CheckError(dnn->DoConvolveBackwardData( + DeviceMemory scratch_memory; + dnn::AlgorithmDesc algorithm_desc; + CheckError(dnn->PrepareForConvolutionBackwardData( this, filter_descriptor, filter_data, output_descriptor, backward_output_data, convolution_descriptor, input_descriptor, backward_input_data, scratch_allocator, dnn::AlgorithmConfig(), + &algorithm_desc, &scratch_memory)); + CheckError(dnn->DoConvolveBackwardData( + this, filter_descriptor, filter_data, output_descriptor, + backward_output_data, convolution_descriptor, input_descriptor, + backward_input_data, algorithm_desc, &scratch_memory, /*output_profile_result=*/nullptr)); } else { SetErrorAndLogNoDnnSupport(); @@ -999,11 +1040,20 @@ Stream &Stream::ThenConvolveBackwardDataWithAlgorithm( if (ok()) { if (dnn::DnnSupport *dnn = parent_->AsDnn()) { - auto status = dnn->DoConvolveBackwardData( + DeviceMemory scratch_memory; + dnn::AlgorithmDesc algorithm_desc; + auto status = dnn->PrepareForConvolutionBackwardData( this, filter_descriptor, filter_data, output_descriptor, backward_output_data, convolution_descriptor, input_descriptor, backward_input_data, scratch_allocator, algorithm_config, - output_profile_result); + &algorithm_desc, &scratch_memory); + if (status) { + status = dnn->DoConvolveBackwardData( + this, filter_descriptor, filter_data, output_descriptor, + backward_output_data, convolution_descriptor, input_descriptor, + backward_input_data, algorithm_desc, &scratch_memory, + output_profile_result); + } if (!status && !output_profile_result) { SetError(); } @@ -1032,11 +1082,20 @@ Stream &Stream::ThenConvolveBackwardDataWithAlgorithm( if (ok()) { if (dnn::DnnSupport *dnn = parent_->AsDnn()) { - auto status = dnn->DoConvolveBackwardData( + DeviceMemory scratch_memory; + dnn::AlgorithmDesc algorithm_desc; + auto status = 
dnn->PrepareForConvolutionBackwardData( this, filter_descriptor, filter_data, output_descriptor, backward_output_data, convolution_descriptor, input_descriptor, backward_input_data, scratch_allocator, algorithm_config, - output_profile_result); + &algorithm_desc, &scratch_memory); + if (status) { + status = dnn->DoConvolveBackwardData( + this, filter_descriptor, filter_data, output_descriptor, + backward_output_data, convolution_descriptor, input_descriptor, + backward_input_data, algorithm_desc, &scratch_memory, + output_profile_result); + } if (!status && !output_profile_result) { SetError(); } @@ -1065,11 +1124,20 @@ Stream &Stream::ThenConvolveBackwardDataWithAlgorithm( if (ok()) { if (dnn::DnnSupport *dnn = parent_->AsDnn()) { - auto status = dnn->DoConvolveBackwardData( + DeviceMemory scratch_memory; + dnn::AlgorithmDesc algorithm_desc; + auto status = dnn->PrepareForConvolutionBackwardData( this, filter_descriptor, filter_data, output_descriptor, backward_output_data, convolution_descriptor, input_descriptor, backward_input_data, scratch_allocator, algorithm_config, - output_profile_result); + &algorithm_desc, &scratch_memory); + if (status) { + status = dnn->DoConvolveBackwardData( + this, filter_descriptor, filter_data, output_descriptor, + backward_output_data, convolution_descriptor, input_descriptor, + backward_input_data, algorithm_desc, &scratch_memory, + output_profile_result); + } if (!status && !output_profile_result) { SetError(); } @@ -1096,10 +1164,17 @@ Stream &Stream::ThenConvolveBackwardDataWithScratch( if (ok()) { if (dnn::DnnSupport *dnn = parent_->AsDnn()) { - CheckError(dnn->DoConvolveBackwardData( + DeviceMemory scratch_memory; + dnn::AlgorithmDesc algorithm_desc; + CheckError(dnn->PrepareForConvolutionBackwardData( this, filter_descriptor, filter_data, output_descriptor, backward_output_data, convolution_descriptor, input_descriptor, backward_input_data, scratch_allocator, dnn::AlgorithmConfig(), + &algorithm_desc, &scratch_memory)); + 
CheckError(dnn->DoConvolveBackwardData( + this, filter_descriptor, filter_data, output_descriptor, + backward_output_data, convolution_descriptor, input_descriptor, + backward_input_data, algorithm_desc, &scratch_memory, /*output_profile_result=*/nullptr)); } else { SetErrorAndLogNoDnnSupport(); @@ -1138,10 +1213,17 @@ Stream &Stream::ThenConvolveBackwardFilterWithScratch( if (ok()) { if (dnn::DnnSupport *dnn = parent_->AsDnn()) { - CheckError(dnn->DoConvolveBackwardFilter( + DeviceMemory scratch_memory; + dnn::AlgorithmDesc algorithm_desc; + CheckError(dnn->PrepareForConvolutionBackwardFilter( this, input_descriptor, input_data, output_descriptor, backward_output_data, convolution_descriptor, filter_descriptor, backward_filter_data, scratch_allocator, dnn::AlgorithmConfig(), + &algorithm_desc, &scratch_memory)); + CheckError(dnn->DoConvolveBackwardFilter( + this, input_descriptor, input_data, output_descriptor, + backward_output_data, convolution_descriptor, filter_descriptor, + backward_filter_data, algorithm_desc, &scratch_memory, /*output_profile_result=*/nullptr)); } else { SetErrorAndLogNoDnnSupport(); @@ -1168,11 +1250,20 @@ Stream &Stream::ThenConvolveBackwardFilterWithAlgorithm( if (ok()) { if (dnn::DnnSupport *dnn = parent_->AsDnn()) { - auto status = dnn->DoConvolveBackwardFilter( + DeviceMemory scratch_memory; + dnn::AlgorithmDesc algorithm_desc; + auto status = dnn->PrepareForConvolutionBackwardFilter( this, input_descriptor, input_data, output_descriptor, backward_output_data, convolution_descriptor, filter_descriptor, backward_filter_data, scratch_allocator, algorithm_config, - output_profile_result); + &algorithm_desc, &scratch_memory); + if (status) { + status = dnn->DoConvolveBackwardFilter( + this, input_descriptor, input_data, output_descriptor, + backward_output_data, convolution_descriptor, filter_descriptor, + backward_filter_data, algorithm_desc, &scratch_memory, + output_profile_result); + } if (!status && !output_profile_result) { 
SetError(); } @@ -1201,11 +1292,20 @@ Stream &Stream::ThenConvolveBackwardFilterWithAlgorithm( if (ok()) { if (dnn::DnnSupport *dnn = parent_->AsDnn()) { - auto status = dnn->DoConvolveBackwardFilter( + DeviceMemory scratch_memory; + dnn::AlgorithmDesc algorithm_desc; + auto status = dnn->PrepareForConvolutionBackwardFilter( this, input_descriptor, input_data, output_descriptor, backward_output_data, convolution_descriptor, filter_descriptor, backward_filter_data, scratch_allocator, algorithm_config, - output_profile_result); + &algorithm_desc, &scratch_memory); + if (status) { + status = dnn->DoConvolveBackwardFilter( + this, input_descriptor, input_data, output_descriptor, + backward_output_data, convolution_descriptor, filter_descriptor, + backward_filter_data, algorithm_desc, &scratch_memory, + output_profile_result); + } if (!status && !output_profile_result) { SetError(); } @@ -1232,10 +1332,17 @@ Stream &Stream::ThenConvolveBackwardFilterWithScratch( if (ok()) { if (dnn::DnnSupport *dnn = parent_->AsDnn()) { - CheckError(dnn->DoConvolveBackwardFilter( + DeviceMemory scratch_memory; + dnn::AlgorithmDesc algorithm_desc; + CheckError(dnn->PrepareForConvolutionBackwardFilter( this, input_descriptor, input_data, output_descriptor, backward_output_data, convolution_descriptor, filter_descriptor, backward_filter_data, scratch_allocator, dnn::AlgorithmConfig(), + &algorithm_desc, &scratch_memory)); + CheckError(dnn->DoConvolveBackwardFilter( + this, input_descriptor, input_data, output_descriptor, + backward_output_data, convolution_descriptor, filter_descriptor, + backward_filter_data, algorithm_desc, &scratch_memory, /*output_profile_result=*/nullptr)); } else { SetErrorAndLogNoDnnSupport(); @@ -1262,11 +1369,20 @@ Stream &Stream::ThenConvolveBackwardFilterWithAlgorithm( if (ok()) { if (dnn::DnnSupport *dnn = parent_->AsDnn()) { - auto status = dnn->DoConvolveBackwardFilter( + DeviceMemory scratch_memory; + dnn::AlgorithmDesc algorithm_desc; + auto status = 
dnn->PrepareForConvolutionBackwardFilter( this, input_descriptor, input_data, output_descriptor, backward_output_data, convolution_descriptor, filter_descriptor, backward_filter_data, scratch_allocator, algorithm_config, - output_profile_result); + &algorithm_desc, &scratch_memory); + if (status) { + status = dnn->DoConvolveBackwardFilter( + this, input_descriptor, input_data, output_descriptor, + backward_output_data, convolution_descriptor, filter_descriptor, + backward_filter_data, algorithm_desc, &scratch_memory, + output_profile_result); + } if (!status && !output_profile_result) { SetError(); } -- GitLab From 8584f21392772170b007ee3b3fbfed17fe19e32f Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sat, 1 Dec 2018 20:58:09 +0000 Subject: [PATCH 047/405] Fix SparseDenseCwise's broadcasting issue This fix tries to address the issue raised in 24072. In `sparse_dense_cwise_mul/add` operations the broadcasting only support dense to sparse, though the validation was not captured. This fix fixes the validation in SparseDenseBinaryOpShared so that error could be thrown correctly. This fix fixes 24072. Signed-off-by: Yong Tang --- tensorflow/core/kernels/sparse_dense_binary_op_shared.cc | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc b/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc index ac48202ada..3a6b66302f 100644 --- a/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc +++ b/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc @@ -88,12 +88,11 @@ class SparseDenseBinaryOpShared : public OpKernel { const auto rhs_dims = BCast::FromShape(dense_t->shape()); BCast b(lhs_dims, rhs_dims, false); // false for keeping the same num dims. - // True iff (size(lhs) > size(rhs)), or (sizes equal, lhs cwise rhs). + // True iff (size(lhs) >= size(rhs)) and all dims in lhs is smaller or equal to dims in rhs (from right to left). 
auto VecGreaterEq = [](ArraySlice lhs, ArraySlice rhs) { - if (lhs.size() > rhs.size()) return true; if (lhs.size() < rhs.size()) return false; - for (size_t i = 0; i < lhs.size(); ++i) { - if (lhs[i] < rhs[i]) return false; + for (size_t i = 0; i < rhs.size(); ++i) { + if (lhs[lhs.size() - 1 - i] < rhs[rhs.size() - 1 - i]) return false; } return true; }; -- GitLab From a7cd4dbea9f276160ebadf82178e77ae5c8d557e Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sat, 1 Dec 2018 21:02:04 +0000 Subject: [PATCH 048/405] Add test case for sparse_dense_cwise shape validation. Signed-off-by: Yong Tang --- tensorflow/python/kernel_tests/sparse_ops_test.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tensorflow/python/kernel_tests/sparse_ops_test.py b/tensorflow/python/kernel_tests/sparse_ops_test.py index 75f65e6251..f58832a89e 100644 --- a/tensorflow/python/kernel_tests/sparse_ops_test.py +++ b/tensorflow/python/kernel_tests/sparse_ops_test.py @@ -22,6 +22,7 @@ import numpy as np from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import test_util @@ -798,6 +799,17 @@ class SparseMathOpsTest(test_util.TensorFlowTestCase): result_tensor.values).eval() self.assertAllEqual(result_np, res_densified) + @test_util.run_deprecated_v1 + def testCwiseShapeValidation(self): + # Test case for GitHub 24072. 
+ with self.session(use_gpu=False): + a = array_ops.ones([3, 4, 1], dtype=dtypes.int32) + b = sparse_tensor.SparseTensor([[0, 0, 1, 0], [0, 0, 3, 0]], [10, 20], [1, 1, 4, 2]) + c = a * b + with self.assertRaisesRegexp(errors.InvalidArgumentError, + "broadcasts dense to sparse only; got incompatible shapes"): + c.eval() + @test_util.run_deprecated_v1 def testCwiseDivAndMul(self): np.random.seed(1618) -- GitLab From 676c6ea316f4dda962d8b2e29855c040020533f5 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sat, 1 Dec 2018 21:04:41 +0000 Subject: [PATCH 049/405] Pylint fix Signed-off-by: Yong Tang --- tensorflow/python/kernel_tests/sparse_ops_test.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/kernel_tests/sparse_ops_test.py b/tensorflow/python/kernel_tests/sparse_ops_test.py index f58832a89e..50b6239185 100644 --- a/tensorflow/python/kernel_tests/sparse_ops_test.py +++ b/tensorflow/python/kernel_tests/sparse_ops_test.py @@ -804,10 +804,12 @@ class SparseMathOpsTest(test_util.TensorFlowTestCase): # Test case for GitHub 24072. 
with self.session(use_gpu=False): a = array_ops.ones([3, 4, 1], dtype=dtypes.int32) - b = sparse_tensor.SparseTensor([[0, 0, 1, 0], [0, 0, 3, 0]], [10, 20], [1, 1, 4, 2]) + b = sparse_tensor.SparseTensor( + [[0, 0, 1, 0], [0, 0, 3, 0]], [10, 20], [1, 1, 4, 2]) c = a * b - with self.assertRaisesRegexp(errors.InvalidArgumentError, - "broadcasts dense to sparse only; got incompatible shapes"): + with self.assertRaisesRegexp( + errors.InvalidArgumentError, + "broadcasts dense to sparse only; got incompatible shapes"): c.eval() @test_util.run_deprecated_v1 -- GitLab From 28e034bd293db09a0ecc707f71f65fe6c5dd2943 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sat, 1 Dec 2018 21:28:19 +0000 Subject: [PATCH 050/405] Fix `Experimental clang-format Check` Signed-off-by: Yong Tang --- tensorflow/core/kernels/sparse_dense_binary_op_shared.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc b/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc index 3a6b66302f..d7460363fc 100644 --- a/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc +++ b/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc @@ -88,7 +88,8 @@ class SparseDenseBinaryOpShared : public OpKernel { const auto rhs_dims = BCast::FromShape(dense_t->shape()); BCast b(lhs_dims, rhs_dims, false); // false for keeping the same num dims. - // True iff (size(lhs) >= size(rhs)) and all dims in lhs is smaller or equal to dims in rhs (from right to left). + // True iff (size(lhs) >= size(rhs)) and all dims in lhs is smaller or equal + // to dims in rhs (from right to left). 
auto VecGreaterEq = [](ArraySlice lhs, ArraySlice rhs) { if (lhs.size() < rhs.size()) return false; for (size_t i = 0; i < rhs.size(); ++i) { -- GitLab From cc8e28b72b52718db5f22830e1d529d8e077b537 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sun, 2 Dec 2018 01:55:11 +0000 Subject: [PATCH 051/405] Fix broken link in lite api docs This fix fixes broken link in lite apis docs Signed-off-by: Yong Tang --- tensorflow/lite/g3doc/apis.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/g3doc/apis.md b/tensorflow/lite/g3doc/apis.md index e9fa24bff1..60ce11d685 100644 --- a/tensorflow/lite/g3doc/apis.md +++ b/tensorflow/lite/g3doc/apis.md @@ -347,7 +347,7 @@ interpreter.runForMultipleInputsOutputs(inputs, map_of_indices_to_outputs); where each entry in `inputs` corresponds to an input tensor and `map_of_indices_to_outputs` maps indices of output tensors to the corresponding output data. In both cases the tensor indices should correspond to -the values given to the [TensorFlow Lite Optimized Converter](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/toco/g3doc/cmdline_examples.md) +the values given to the [TensorFlow Lite Optimized Converter](convert/cmdline_examples.md) when the model was created. Be aware that the order of tensors in `input` must match the order given to the `TensorFlow Lite Optimized Converter`. 
-- GitLab From bd9a4ff87899e71c95739b45fcc1bb9f0c10fbe3 Mon Sep 17 00:00:00 2001 From: tomguluson92 <314913739@qq.com> Date: Mon, 3 Dec 2018 10:50:10 +0800 Subject: [PATCH 052/405] [Proto/decode]word order fixed --- tensorflow/core/util/proto/decode.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/util/proto/decode.h b/tensorflow/core/util/proto/decode.h index cbcb203ee7..248490a2fa 100644 --- a/tensorflow/core/util/proto/decode.h +++ b/tensorflow/core/util/proto/decode.h @@ -318,7 +318,7 @@ inline int ReadPackedPrimitives(const void* bufp, const size_t len, return count; } -// Reads a primitive value field from a serialized proto. +// Reads a primitive field value from a serialized proto. // The value is parsed from the serialized format, then static_cast // to the desired type for TensorFlow and stored. template Date: Mon, 3 Dec 2018 16:57:57 +0000 Subject: [PATCH 053/405] Update comment based on feedback Signed-off-by: Yong Tang --- tensorflow/core/kernels/sparse_dense_binary_op_shared.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc b/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc index d7460363fc..a4e89f439e 100644 --- a/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc +++ b/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc @@ -88,7 +88,7 @@ class SparseDenseBinaryOpShared : public OpKernel { const auto rhs_dims = BCast::FromShape(dense_t->shape()); BCast b(lhs_dims, rhs_dims, false); // false for keeping the same num dims. - // True iff (size(lhs) >= size(rhs)) and all dims in lhs is smaller or equal + // True iff (size(lhs) >= size(rhs)) and all dims in lhs is greater or equal // to dims in rhs (from right to left). 
auto VecGreaterEq = [](ArraySlice lhs, ArraySlice rhs) { if (lhs.size() < rhs.size()) return false; -- GitLab From 3667478d73da230972b9a6432753e329ecfe7aef Mon Sep 17 00:00:00 2001 From: mbhuiyan Date: Mon, 3 Dec 2018 09:42:29 -0800 Subject: [PATCH 054/405] replacing CHECK by DCHECK --- tensorflow/core/graph/mkl_layout_pass.cc | 16 ++++++++-------- tensorflow/core/graph/mkl_layout_pass_test.cc | 10 +++++----- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc index de1a982b9d..eca9afeada 100644 --- a/tensorflow/core/graph/mkl_layout_pass.cc +++ b/tensorflow/core/graph/mkl_layout_pass.cc @@ -776,7 +776,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass { // merged with 'm'. If input 'm' is Conv2D, then check if there exists Pad // node that can be merged with 'm'. static Node* GetPadOrConv2D(const Node* m) { - CHECK_NOTNULL(m); + DCHECK(m); Node* n = nullptr; const Node* conv_node; @@ -790,7 +790,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass { } } } else { - CHECK_EQ(m->type_string(), csinfo_.conv2d); + DCHECK_EQ(m->type_string(), csinfo_.conv2d); // If m is conv2D, Go over all input edges // and search for Pad Node. 
for (const Edge* e : m->in_edges()) { @@ -2182,7 +2182,7 @@ Status MklLayoutRewritePass::MergeConv2DWithBiasAdd(std::unique_ptr* g, Status MklLayoutRewritePass::MergePadWithConv2D(std::unique_ptr* g, Node* m, Node* n) { - CHECK(((m->type_string() == csinfo_.pad && + DCHECK(((m->type_string() == csinfo_.pad && n->type_string() == csinfo_.conv2d)) || ((n->type_string() == csinfo_.pad && m->type_string() == csinfo_.conv2d))); @@ -2250,7 +2250,7 @@ Status MklLayoutRewritePass::MergePadWithConv2D(std::unique_ptr* g, PadDataInputEdges++; } } - CHECK_EQ(PadDataInputEdges, 2); + DCHECK_EQ(PadDataInputEdges, 2); // Conv2D must have 2 data inputs: pad output and Filter int ConvDataInputEdges = 0; @@ -2259,7 +2259,7 @@ Status MklLayoutRewritePass::MergePadWithConv2D(std::unique_ptr* g, ConvDataInputEdges++; } } - CHECK_EQ(ConvDataInputEdges, 2); + DCHECK_EQ(ConvDataInputEdges, 2); // We will use the node name of Conv2D as the name of new node // Build new node. We use same name as original node, but change the op @@ -2282,7 +2282,7 @@ Status MklLayoutRewritePass::MergePadWithConv2D(std::unique_ptr* g, // Create node. Node* new_node; TF_CHECK_OK(nb.Finalize(&**g, &new_node)); - CHECK_NOTNULL(new_node); + DCHECK(new_node); // Incoming data edges from 'pred' node and 'succ' node to new 'new_node' // node are already copied in BuildNode. @@ -2319,8 +2319,8 @@ Status MklLayoutRewritePass::MergePadWithConv2D(std::unique_ptr* g, // Conv2D has only 1 output (at slot 0) and merged node also has only 1 // output (at slot 0). 
const int kPadWithConv2DOutputSlot = 0; - CHECK_NOTNULL((*g)->AddEdge(new_node, kPadWithConv2DOutputSlot, e->dst(), - e->dst_input())); + (*g)->AddEdge(new_node, kPadWithConv2DOutputSlot, e->dst(), + e->dst_input()); } } diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc index fa059f1194..5ba98308ae 100644 --- a/tensorflow/core/graph/mkl_layout_pass_test.cc +++ b/tensorflow/core/graph/mkl_layout_pass_test.cc @@ -515,7 +515,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_PadWithConv2D_Positive) { // _MklPadWithConv2D(A, D, B, DMT/_0, DMT/_1, DMT/_2) // A1:control->E:control (only one control edge) TEST_F(MklLayoutPassTest, Input_ControlEdge_PadWithConv2D_Positive) { - CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); + DCHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); InitGraph( "node { name: 'A1' op: 'Input'}" "node { name: 'A' op: 'Input'}" @@ -567,7 +567,7 @@ TEST_F(MklLayoutPassTest, Input_ControlEdge_PadWithConv2D_Positive) { // _MklPadWithConv2D(A, D, B, DMT/_0, DMT/_1, DMT/_2) // E:control->A1:control (only one control edge) TEST_F(MklLayoutPassTest, Output_ControlEdge_PadWithConv2D_Positive) { - CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); + DCHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); InitGraph( "node { name: 'A1' op: 'Input'}" "node { name: 'A' op: 'Input'}" @@ -612,7 +612,7 @@ TEST_F(MklLayoutPassTest, Output_ControlEdge_PadWithConv2D_Positive) { // After layout pass // _MklPadWithConv2D(A, A, B, DMT/_0, DMT/_1, DMT/_2) TEST_F(MklLayoutPassTest, NodeMerge_PadWithConv2D_Common_Input) { - CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); + DCHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); InitGraph( "node { name: 'A' op: 'Input'}" "node { name: 'B' op: 'Int32Input'}" @@ -648,7 +648,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_PadWithConv2D_Common_Input) { // After layout pass - No merging, 
since Pad and Conv2D both // feed to the same node (Z) TEST_F(MklLayoutPassTest, NodeMerge_PadWithConv2D_Common_InOutput) { - CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); + DCHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); InitGraph( "node { name: 'A' op: 'Input'}" "node { name: 'B' op: 'Int32Input'}" @@ -678,7 +678,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_PadWithConv2D_Common_InOutput) { // C=Pad(A,B); E=Conv2D(C,D); Z=Zeta(E,Y) // After layout pass - No merging TEST_F(MklLayoutPassTest, NodeMerge_PadWithConv2D_Negative) { - CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); + DCHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); InitGraph( "node { name: 'A' op: 'Input'}" "node { name: 'B' op: 'Int32Input'}" -- GitLab From 2ff90ad8d1599b9c5207d813b5e07ac480d588ab Mon Sep 17 00:00:00 2001 From: mbhuiyan Date: Mon, 3 Dec 2018 09:45:37 -0800 Subject: [PATCH 055/405] replacing CHECK BY DCHECK --- tensorflow/core/graph/mkl_layout_pass_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc index 88b592a5fc..082278ee3e 100644 --- a/tensorflow/core/graph/mkl_layout_pass_test.cc +++ b/tensorflow/core/graph/mkl_layout_pass_test.cc @@ -472,7 +472,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_ConvBpropInput_FilterFwd) { // After layout pass // _MklPadWithConv2D(A, D, B, DMT/_0, DMT/_1, DMT/_2) TEST_F(MklLayoutPassTest, NodeMerge_PadWithConv2D_Positive) { - CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); + DCHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); InitGraph( "node { name: 'A' op: 'Input'}" "node { name: 'B' op: 'Int32Input'}" -- GitLab From 8b9636d1d7201f369fbfdb3e079d595888143be6 Mon Sep 17 00:00:00 2001 From: Pooya Davoodi Date: Mon, 3 Dec 2018 11:04:40 -0800 Subject: [PATCH 056/405] Update README.md --- 
tensorflow/contrib/tensorrt/README.md | 57 ++++++++++++++++++++------- 1 file changed, 43 insertions(+), 14 deletions(-) diff --git a/tensorflow/contrib/tensorrt/README.md b/tensorflow/contrib/tensorrt/README.md index caf8b6db0d..09ef7f459f 100644 --- a/tensorflow/contrib/tensorrt/README.md +++ b/tensorflow/contrib/tensorrt/README.md @@ -1,8 +1,47 @@ -# Using TensorRT in TensorFlow +# Using TensorRT in TensorFlow (TF-TRT) -This module provides necessary bindings and introduces TRT_engine_op operator -that wraps a subgraph in TensorRT. This is still a work in progress but should -be useable with most common graphs. +This module provides necessary bindings and introduces +`TRTEngineOp` operator that wraps a subgraph in TensorRT. +This is still a work in progress but should be useable +with most common graphs. + +## Installing TF-TRT + +Currently Tensorflow nightly builds include TF-TRT by default, +which means you don't need to install TF-TRT separately. +You can pull the latest TF containers from docker hub or +install the latest TF pip package to get access to the latest TF-TRT. + +If you want to use TF-TRT on NVIDIA Jetson platform, you can find +the download links for the relevant Tensorflow pip packages here: +https://docs.nvidia.com/deeplearning/dgx/index.html#installing-frameworks-for-jetson + +## Installing TensorRT + +In order to make use of TF-TRT, you will need a local installation +of TensorRT from the +[NVIDIA Developer website](https://developer.nvidia.com/tensorrt). +Installation instructions for compatibility with TensorFlow are provided on the +[TensorFlow GPU support](https://www.tensorflow.org/install/gpu) guide. + +## Tests + +TF-TRT includes both Python tests and C++ unit tests. +Most of Python tests are located in the test directory +and they can be executed uring `bazel test` or directly +with the Python command. Most of the C++ unit tests are +used to test the conversion functions that convert each TF op to +a number of TensorRT layers. 
+ +## Examples + +You can find example scripts for running inference on deep learning models +in this repository: https://github.com/tensorflow/tensorrt + +## Documentation + +You can find documentation for TF-TRT here: +https://docs.nvidia.com/deeplearning/dgx/integrate-tf-trt/index.html ## Compilation @@ -17,13 +56,3 @@ has to set path to location where the library is installed during configuration. bazel build --config=cuda --config=opt //tensorflow/tools/pip_package:build_pip_package bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/ ``` - -After the installation of tensorflow package, TensorRT transformation will be -available. An example use can be found in test/test_tftrt.py script - -## Installing TensorRT 3.0.4 - -In order to make use of TensorRT integration, you will need a local installation -of TensorRT 3.0.4 from the [NVIDIA Developer website](https://developer.nvidia.com/tensorrt). -Installation instructions for compatibility with TensorFlow are provided on the -[TensorFlow GPU support](https://www.tensorflow.org/install/gpu) guide. -- GitLab From 215beaca8c28f22a2d6b66b16b4770851491a792 Mon Sep 17 00:00:00 2001 From: Trevor Morris Date: Mon, 3 Dec 2018 11:26:56 -0800 Subject: [PATCH 057/405] Prevent segments with no inputs --- tensorflow/contrib/tensorrt/convert/convert_graph.cc | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc index 3b32f72bc1..3e599b9174 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc +++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc @@ -585,6 +585,13 @@ tensorflow::Status CreateTRTNode(const std::vector& infos, int pos, } } } + // We don't support segments with no inputs. Fall back to native TF here to + // avoid crash later. Constant folding should've folded the ops that make up + // these segments. 
+ if (inputs.size() == 0) { + return tensorflow::errors::Internal("Segment has no inputs (possible " + "constfold failure)"); + } const bool calibrate_int8 = (info.precision_mode == INT8MODE && info.use_calibration); -- GitLab From 3661ae91df08852ebb49db0463a21628026fcdcc Mon Sep 17 00:00:00 2001 From: Sami Kama Date: Mon, 3 Dec 2018 15:15:55 -0800 Subject: [PATCH 058/405] Use session config when creating XLA devices --- tensorflow/compiler/jit/xla_gpu_device.cc | 31 +++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/tensorflow/compiler/jit/xla_gpu_device.cc b/tensorflow/compiler/jit/xla_gpu_device.cc index 944f732b99..fb13635414 100644 --- a/tensorflow/compiler/jit/xla_gpu_device.cc +++ b/tensorflow/compiler/jit/xla_gpu_device.cc @@ -23,6 +23,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/core/common_runtime/device_factory.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/strings/str_util.h" namespace tensorflow { @@ -52,8 +53,34 @@ Status XlaGpuDeviceFactory::CreateDevices( VLOG(1) << "Failed to create XLA_GPU device: " << platform.status(); return Status::OK(); } - - for (int i = 0; i < platform.ValueOrDie()->VisibleDeviceCount(); ++i) { + const auto& allowed_gpus = + session_options.config.gpu_options().visible_device_list(); + std::unordered_set gpu_ids; + int num_visible_devices = platform.ValueOrDie()->VisibleDeviceCount(); + if (allowed_gpus.empty()) { + for (int i = 0; i < num_visible_devices; ++i) gpu_ids.insert(i); + } else { + const std::vector visible_devices = + str_util::Split(allowed_gpus, ','); + // copied from gpu/gpu_device.cc Should be redundant since code would fail + // there before it gets to here. 
+ for (const string& platform_gpu_id_str : visible_devices) { + int32 platform_gpu_id; + if (!strings::safe_strto32(platform_gpu_id_str, &platform_gpu_id)) { + return errors::InvalidArgument( + "Could not parse entry in 'visible_device_list': '", + platform_gpu_id_str, "'. visible_device_list = ", allowed_gpus); + } + if (platform_gpu_id < 0 || platform_gpu_id >= num_visible_devices) { + return errors::InvalidArgument( + "'visible_device_list' listed an invalid GPU id '", platform_gpu_id, + "' but visible device count is ", num_visible_devices); + } + gpu_ids.insert(platform_gpu_id); + } + } + for (int i = 0; i < num_visible_devices; ++i) { + if (gpu_ids.count(i) == 0) continue; XlaDevice::Options options; options.platform = platform.ValueOrDie(); options.device_name_prefix = name_prefix; -- GitLab From 8ecabf5888da937f61d1b8cc0aedc99b453b821c Mon Sep 17 00:00:00 2001 From: tomguluson92 <314913739@qq.com> Date: Tue, 4 Dec 2018 09:01:18 +0800 Subject: [PATCH 059/405] Update decode.h --- tensorflow/core/util/proto/decode.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/util/proto/decode.h b/tensorflow/core/util/proto/decode.h index 248490a2fa..8dde14dffc 100644 --- a/tensorflow/core/util/proto/decode.h +++ b/tensorflow/core/util/proto/decode.h @@ -318,7 +318,7 @@ inline int ReadPackedPrimitives(const void* bufp, const size_t len, return count; } -// Reads a primitive field value from a serialized proto. +// Reads a value of a primitive type field from a serialized proto. // The value is parsed from the serialized format, then static_cast // to the desired type for TensorFlow and stored. template Date: Mon, 3 Dec 2018 17:59:44 -0800 Subject: [PATCH 060/405] Correctly reduce loss metric in Dist strategy + Keras + eager case. 
PiperOrigin-RevId: 223897668 --- tensorflow/contrib/distribute/python/keras_test.py | 14 +++++--------- .../python/keras/engine/training_distributed.py | 3 ++- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/tensorflow/contrib/distribute/python/keras_test.py b/tensorflow/contrib/distribute/python/keras_test.py index 00a220880b..2b2c840daa 100644 --- a/tensorflow/contrib/distribute/python/keras_test.py +++ b/tensorflow/contrib/distribute/python/keras_test.py @@ -27,7 +27,6 @@ from tensorflow.contrib.distribute.python import tpu_strategy from tensorflow.python import keras from tensorflow.python.data.ops import dataset_ops from tensorflow.python.distribute import values -from tensorflow.python.eager import context from tensorflow.python.eager import test from tensorflow.python.estimator import keras as keras_lib from tensorflow.python.estimator import run_config as run_config_lib @@ -1210,8 +1209,7 @@ class TestDistributionStrategyWithNormalizationLayer( class TestDistributionStrategyCorrectness(test.TestCase, parameterized.TestCase): - # TODO(b/120186218): Enable this for eager once metrics are working. - @combinations.generate(strategy_for_numpy_input_combinations()) + @combinations.generate(all_strategy_combinations()) def test_metric_correctness(self, distribution): with self.cached_session(): keras.backend.set_image_data_format('channels_last') @@ -1309,12 +1307,10 @@ class TestDistributionStrategyCorrectness(test.TestCase, self.assertAllClose( wts_with_ds, wts_without_ds, atol=tolerance, rtol=tolerance) - # TODO(b/120186218): Enable this once metrics are working with eager. 
- if not context.executing_eagerly(): - self.assertAllClose( - eval_with_ds, eval_without_ds, atol=tolerance, rtol=tolerance) - self.assertAllClose( - predict_with_ds, predict_without_ds, atol=tolerance, rtol=tolerance) + self.assertAllClose( + eval_with_ds, eval_without_ds, atol=tolerance, rtol=tolerance) + self.assertAllClose( + predict_with_ds, predict_without_ds, atol=tolerance, rtol=tolerance) if __name__ == '__main__': diff --git a/tensorflow/python/keras/engine/training_distributed.py b/tensorflow/python/keras/engine/training_distributed.py index 7cf961b9ec..b0a2dfe00d 100644 --- a/tensorflow/python/keras/engine/training_distributed.py +++ b/tensorflow/python/keras/engine/training_distributed.py @@ -657,7 +657,8 @@ def _get_eager_execution_function(model, mode): (all_inputs, all_outputs, _, _) = distributed_training_utils.unwrap_values( strategy, grouped_inputs, - grouped_outputs) + grouped_outputs, + with_loss_tensor=(mode != 'predict')) return K.function( all_inputs, -- GitLab From 0781147372a4b0e85203088380bb803735c7b6f1 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 3 Dec 2018 18:03:37 -0800 Subject: [PATCH 061/405] Fix a typo -- invalid function name. PiperOrigin-RevId: 223898343 --- tensorflow/python/ops/random_ops.py | 4 ++-- tensorflow/tools/api/golden/v1/tensorflow.random.pbtxt | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/ops/random_ops.py b/tensorflow/python/ops/random_ops.py index f2df87cf2d..62e2f6d102 100644 --- a/tensorflow/python/ops/random_ops.py +++ b/tensorflow/python/ops/random_ops.py @@ -357,9 +357,9 @@ def multinomial(logits, num_samples, seed=None, name=None, output_dtype=None): return multinomial_categorical_impl(logits, num_samples, output_dtype, seed) -@tf_export("random.categorical", v1=[]) +@tf_export("random.categorical") def categorical(logits, num_samples, dtype=None, seed=None, name=None): - """Draws samples from a multinomial distribution. 
+ """Draws samples from a categorical distribution. Example: diff --git a/tensorflow/tools/api/golden/v1/tensorflow.random.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.random.pbtxt index d788f6dfca..107534e086 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.random.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.random.pbtxt @@ -1,5 +1,9 @@ path: "tensorflow.random" tf_module { + member_method { + name: "categorical" + argspec: "args=[\'logits\', \'num_samples\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + } member_method { name: "gamma" argspec: "args=[\'shape\', \'alpha\', \'beta\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"\", \'None\', \'None\'], " -- GitLab From 4b6547557148ac0e8ca8eb6ac401a96c1e5b8a44 Mon Sep 17 00:00:00 2001 From: Justin Lebar Date: Mon, 3 Dec 2018 18:06:57 -0800 Subject: [PATCH 062/405] [XLA] Add WithOneUse(r) matchers to pattern_matcher.h These are useful for guarding certain complex algebraic simplifications that would make the graph more complex if an instruction has more than one use(r). PiperOrigin-RevId: 223898824 --- .../compiler/xla/service/pattern_matcher.h | 77 +++++++++++++++++++ .../xla/service/pattern_matcher_test.cc | 55 +++++++++++++ 2 files changed, 132 insertions(+) diff --git a/tensorflow/compiler/xla/service/pattern_matcher.h b/tensorflow/compiler/xla/service/pattern_matcher.h index cc06cf6ec7..90ea5dc51e 100644 --- a/tensorflow/compiler/xla/service/pattern_matcher.h +++ b/tensorflow/compiler/xla/service/pattern_matcher.h @@ -64,6 +64,9 @@ namespace xla { // e.g. IsConstantScalar() or IsConstantScalar(42). // - WithFusionKind // - WithTupleIndex: get-tuple-element operations with the given tuple index +// - WithOneUse: Instruction is used as an operand exactly once. +// - WithOneUser: Instruction is used by exactly one other instruction, but +// is possibly used more than once as an operand (e.g. 
multiply(x,x)). // // Shape(): // - EqualTo @@ -1603,6 +1606,64 @@ class HloInstructionPatternParameterNumImpl { int64 parameter_num_; }; +// Superclass that contains common code used by Op::WithOneUse() and +// Op::WithOneUser(). +class HloInstructionPatternOneUseOrUserImpl { + protected: + bool MatchOneUser(const HloInstruction* inst, MatchOption option) const { + if (inst->user_count() != 1) { + EXPLAIN << "HloInstruction has " << inst->user_count() + << " users, but expected exactly one."; + if (inst->user_count() > 1) { + EXPLAIN << "\nAll users:"; + for (const HloInstruction* user : inst->users()) { + EXPLAIN << "\n - " << user->ToShortString(); + } + } + return false; + } + return true; + } +}; + +class HloInstructionPatternOneUseImpl + : public HloInstructionPatternOneUseOrUserImpl { + public: + bool Match(const HloInstruction* inst, MatchOption option) const { + if (!MatchOneUser(inst, option)) { + return false; + } + + int64 use_count = absl::c_count_if( + inst->users()[0]->operands(), + [&](const HloInstruction* operand) { return operand == inst; }); + if (use_count != 1) { + EXPLAIN << "HloInstruction is used " << use_count + << " times by its user, but is expected to be used just once: " + << inst->users()[0]->ToShortString(); + return false; + } + return true; + } + + void DescribeTo(std::ostream* os, int64 indent = 0) const { + *os << "which has exactly one use"; + } +}; + +class HloInstructionPatternOneUserImpl + : public HloInstructionPatternOneUseOrUserImpl { + public: + bool Match(const HloInstruction* inst, MatchOption option) const { + return MatchOneUser(inst, option); + } + + void DescribeTo(std::ostream* os, int64 indent = 0) const { + *os << "which has exactly one user (but possibly is used multiple times by " + "that instruction)"; + } +}; + // Matches a constant scalar or effective scalar, optionally with a given value. 
template class HloConstantScalarImpl { @@ -1877,6 +1938,22 @@ class HloInstructionPattern { return AppendImpl(HloInstructionPatternParameterNumImpl(parameter_num)); } + // Modifies the pattern to match if the instruction is used exactly once. + // Does not match if the instruction is used twice by the same user (e.g. + // multiply(x,x)). + constexpr auto WithOneUse() const + -> decltype(this->AppendImpl(HloInstructionPatternOneUseImpl())) { + return AppendImpl(HloInstructionPatternOneUseImpl()); + } + + // Modifies the pattern to match if the instruction is used by exactly one + // other instruction. Will match if the instruction is used twice, so long as + // it's by the same user (e.g. multiply(x,x)). + constexpr auto WithOneUser() const + -> decltype(this->AppendImpl(HloInstructionPatternOneUserImpl())) { + return AppendImpl(HloInstructionPatternOneUserImpl()); + } + void DescribeTo(std::ostream* os, int64 indent = 0) const { impl_.DescribeTo(os, indent); } diff --git a/tensorflow/compiler/xla/service/pattern_matcher_test.cc b/tensorflow/compiler/xla/service/pattern_matcher_test.cc index 13886fa6f5..3dd18898c8 100644 --- a/tensorflow/compiler/xla/service/pattern_matcher_test.cc +++ b/tensorflow/compiler/xla/service/pattern_matcher_test.cc @@ -875,5 +875,60 @@ TEST(PatternMatcherTest, Parameter) { "in p0 = f32[] parameter(0)"); } +TEST(PatternMatcherTest, OneUseAndOneUser) { + auto param = + HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {}), "p0"); + + EXPECT_FALSE(Match(param.get(), m::Op().WithOneUse())); + EXPECT_DESC_AND_EXPLANATION( + param, m::Op().WithOneUse(), + "an HloInstruction which has exactly one use", + "HloInstruction has 0 users, but expected exactly one.\n" + "in p0 = f32[] parameter(0)"); + + EXPECT_FALSE(Match(param.get(), m::Op().WithOneUser())); + EXPECT_DESC_AND_EXPLANATION( + param, m::Op().WithOneUser(), + "an HloInstruction which has exactly one user (but possibly is used " + "multiple times by that instruction)", + 
"HloInstruction has 0 users, but expected exactly one.\n" + "in p0 = f32[] parameter(0)"); + + { + auto reshape = + SetName("r", HloInstruction::CreateReshape( + ShapeUtil::MakeShape(F32, {1}), param.get())); + EXPECT_TRUE(Match(param.get(), m::Op().WithOneUse())); + EXPECT_TRUE(Match(param.get(), m::Op().WithOneUser())); + + auto reshape1 = + SetName("r1", HloInstruction::CreateReshape( + ShapeUtil::MakeShape(F32, {1}), param.get())); + EXPECT_FALSE(Match(param.get(), m::Op().WithOneUse())); + EXPECT_FALSE(Match(param.get(), m::Op().WithOneUser())); + + const char* kMultipleUserExplanation = + "HloInstruction has 2 users, but expected exactly one.\n" + "All users:\n" + " - %r = reshape(%p0)\n" + " - %r1 = reshape(%p0)\n" + "in p0 = f32[] parameter(0)"; + EXPECT_EQ(Explanation(param.get(), m::Op().WithOneUse()), + kMultipleUserExplanation); + EXPECT_EQ(Explanation(param.get(), m::Op().WithOneUser()), + kMultipleUserExplanation); + } + + auto add = SetName("add", HloInstruction::CreateBinary( + ShapeUtil::MakeShape(F32, {}), HloOpcode::kAdd, + param.get(), param.get())); + EXPECT_TRUE(Match(param.get(), m::Op().WithOneUser())); + EXPECT_FALSE(Match(param.get(), m::Op().WithOneUse())); + EXPECT_EQ(Explanation(param.get(), m::Op().WithOneUse()), + "HloInstruction is used 2 times by its user, but is expected to be " + "used just once: %add = add(%p0, %p0)\n" + "in p0 = f32[] parameter(0)"); +} + } // namespace } // namespace xla -- GitLab From b9a02e33c5917c9aa8040592cc469613ab1a657d Mon Sep 17 00:00:00 2001 From: Justin Lebar Date: Mon, 3 Dec 2018 18:09:45 -0800 Subject: [PATCH 063/405] [XLA] Simplify A*C + B*C => (A+B)*C when safe. "Safe" means: - A, B, and C are integers, OR - A, B, and C are floating-point types, and C is a power of 2 (including 2^(-k)). 
PiperOrigin-RevId: 223899195 --- .../xla/service/algebraic_simplifier.cc | 74 +++++++++++ ..._simplifier_proof_distributive_property.py | 82 ++++++++++++ .../xla/service/algebraic_simplifier_test.cc | 122 ++++++++++++++++++ 3 files changed, 278 insertions(+) create mode 100644 tensorflow/compiler/xla/service/algebraic_simplifier_proof_distributive_property.py diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc index a348bcf0a2..985c5af1c4 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/algebraic_simplifier.h" #include +#include #include #include #include @@ -68,6 +69,45 @@ bool IsAll(const HloInstruction* op, int8 value) { } } +// Checks whether `op` is a floating-point constant or broadcast of a constant +// of the form +/- 2^k for some integer k positive, negative, or zero. Such +// values are interesting because multiplying by a power of 2 just moves the +// exponent. +bool IsAllFpConstantPowerOf2(const HloInstruction* op) { + // Unwrap the broadcast if necessary. + const HloInstruction* c; + if (!Match(op, m::ConstantEffectiveScalar(&c)) && + !Match(op, m::Broadcast(m::Constant(&c).WithShape( + m::Shape().IsEffectiveScalar())))) { + return false; + } + auto val = [&]() -> absl::optional { + switch (c->shape().element_type()) { + case BF16: + return static_cast(c->literal().GetFirstElement()); + case F16: + return static_cast(c->literal().GetFirstElement()); + case F32: + return c->literal().GetFirstElement(); + case F64: + return c->literal().GetFirstElement(); + default: + // Cowardly refuse to consider complex types. + return absl::nullopt; + } + }(); + if (!val) { + return false; + } + + int exp; + double mantissa = std::frexp(*val, &exp); + // frexp returns a value in the range (-1, -0.5] U [0.5, 1). 
A return value + // of +/-0.5 therefore indicates that the floating point value is a power of + // 2. + return mantissa == 0.5 || mantissa == -0.5; +} + // Returns whether the given transpose produces a result which is bit-wise // identical to its operand and thus may be replaced with a bitcast. bool TransposeIsBitcast(const HloInstruction* transpose) { @@ -415,6 +455,40 @@ Status AlgebraicSimplifierVisitor::HandleAdd(HloInstruction* add) { sum_of_constants)); } + // A*C + B*C => (A+B)*C + // + // - If A, B, and C are integers, do this unconditionally. Proof of + // correctness: https://rise4fun.com/Alive/u9X. + // + // - If A, B, and C are floating point, do this if C is a scalar constant or + // broadcast of scalar constant and is equal to +/- 2^k for some (possibly + // negative) integer k. + // + // Multiplying by a power of 2 just moves the exponent, so our answer is + // exact modulo rounding of intermediate results so long as + // + // - none of the three products has an exponent which underflows (so the + // result is 0 or denormal), and + // - none of the three products overflows to inf. + // + // Proof: See algebraic_simplifier_proof_distributive_property.py. + // + // We deem these differences in rounding, underflow, and overflow + // acceptable in the ML context. 
+ HloInstruction *b, *c; + if (((Match(lhs, m::Multiply(m::Op(&a), m::Op(&c))) && + Match(rhs, m::MultiplyAnyOrder(m::Op().Is(c), m::Op(&b)))) || + (Match(lhs, m::Multiply(m::Op(&c), m::Op(&a))) && + Match(rhs, m::MultiplyAnyOrder(m::Op().Is(c), m::Op(&b))))) && + (ShapeUtil::ElementIsIntegral(add->shape()) || + IsAllFpConstantPowerOf2(c))) { + return ReplaceWithNewInstruction( + add, HloInstruction::CreateBinary( + add->shape(), HloOpcode::kMultiply, + computation_->AddInstruction(HloInstruction::CreateBinary( + add->shape(), HloOpcode::kAdd, a, b)), + c)); + } return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_proof_distributive_property.py b/tensorflow/compiler/xla/service/algebraic_simplifier_proof_distributive_property.py new file mode 100644 index 0000000000..5da13da041 --- /dev/null +++ b/tensorflow/compiler/xla/service/algebraic_simplifier_proof_distributive_property.py @@ -0,0 +1,82 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Proof that transforming (A*C)+(B*C) <=> (A+B)*C is "safe" if C=2^k. 
+ +Specifically, for all floating-point values A, B, and C, if + + - C is equal to +/- 2^k for some (possibly negative) integer k, and + - A, B, C, A*C, B*C, and A+B are not subnormal, zero, or inf, + +then there exists a rounding mode rm in [RTZ, RNE] such that + + (A*C) + (B*C) == (A+B) * C (computed with rounding mode rm). + +Informally, this means that the equivalence holds when C is a power of 2, +modulo flushing to zero or inf, and modulo rounding of intermediate results. + +Requires z3 python bindings; try `pip install z3-solver`. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import z3 + +# We do float16 because it lets the solver run much faster. These results +# should generalize to fp32 and fp64, and you can verify this by changing the +# value of FLOAT_TY (and then waiting a while). +FLOAT_TY = z3.Float16 + +a = z3.FP("a", FLOAT_TY()) +b = z3.FP("b", FLOAT_TY()) +c = z3.FP("c", FLOAT_TY()) + +s = z3.Solver() + +# C must be a power of 2, i.e. significand bits must all be 0. NOTE(review): sbits() counts the implicit leading bit, so bits [sbits-1:0] appear to also include the lowest exponent bit -- confirm the Extract range. +s.add(z3.Extract(FLOAT_TY().sbits() - 1, 0, z3.fpToIEEEBV(c)) == 0) + +for rm in [z3.RTZ(), z3.RNE()]: + z3.set_default_rounding_mode(rm) + before = a * c + b * c + after = (a + b) * c + + # Check that before == after, allowing that 0 == -0. 
+ s.add( + z3.Not( + z3.Or( + before == after, # + z3.And(z3.fpIsZero(before), z3.fpIsZero(after))))) + + for x in [ + (a * c), + (b * c), + (a + b), + ]: + s.add(z3.Not(z3.fpIsSubnormal(x))) + s.add(z3.Not(z3.fpIsZero(x))) + s.add(z3.Not(z3.fpIsInf(x))) + +if s.check() == z3.sat: + m = s.model() + print("Counterexample found!") + print(m) + print("a*c: ", z3.simplify(m[a] * m[c])) + print("b*c: ", z3.simplify(m[b] * m[c])) + print("a+b: ", z3.simplify(m[a] + m[b])) + print("a*c + b*c: ", z3.simplify(m[a] * m[c] + m[b] * m[c])) + print("(a+b) * c: ", z3.simplify((m[a] + m[b]) * m[c])) +else: + print("Proved!") diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc index 628d805f1c..14ce519b6a 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc @@ -80,6 +80,128 @@ TEST_F(AlgebraicSimplifierTest, AddZero) { EXPECT_EQ(root, param0); } +TEST_F(AlgebraicSimplifierTest, FactorIntegerAddition) { + const char* kModuleStr = R"( + HloModule m + test { + p0 = s32[8] parameter(0) + p1 = s32[8] parameter(1) + p2 = s32[8] parameter(2) + x = s32[8] multiply(p0, p2) + y = s32[8] multiply(p1, p2) + ROOT sum = s32[8] add(x, y) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + AlgebraicSimplifier simplifier(default_options_); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); + EXPECT_THAT( + m->entry_computation()->root_instruction(), + GmockMatch(m::MultiplyAnyOrder( + m::AddAnyOrder(m::Parameter(0), m::Parameter(1)), m::Parameter(2)))); +} + +// A*C + B*C => (A+B)*C if C is a floating-point power of 2. 
+TEST_F(AlgebraicSimplifierTest, FactorFpAddition) { + const char* kModuleStr = R"( + HloModule m + test { + p0 = f32[] parameter(0) + p1 = f32[] parameter(1) + c = f32[] constant(0.125) + x = f32[] multiply(p0, c) + y = f32[] multiply(p1, c) + ROOT sum = f32[] add(x, y) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); + EXPECT_THAT(m->entry_computation()->root_instruction(), + GmockMatch(m::MultiplyAnyOrder( + m::AddAnyOrder(m::Parameter(0), m::Parameter(1)), + m::ConstantScalar(0.125)))); +} + +// A*C + B*C => (A+B)*C if C is a broadcast of a floating-point power of 2. +TEST_F(AlgebraicSimplifierTest, FactorFpAdditionWithBroadcast) { + const char* kModuleStr = R"( + HloModule m + test { + p0 = f32[4] parameter(0) + p1 = f32[4] parameter(1) + c = f32[] constant(0.125) + b = f32[4] broadcast(c), dimensions={} + x = f32[4] multiply(p0, b) + y = f32[4] multiply(p1, b) + ROOT sum = f32[4] add(x, y) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); + EXPECT_THAT(m->entry_computation()->root_instruction(), + GmockMatch(m::MultiplyAnyOrder( + m::AddAnyOrder(m::Parameter(0), m::Parameter(1)), + m::Broadcast(m::ConstantScalar(0.125))))); +} + +// A*C + B*C => (A+B)*C simplification should not happen if C is not a +// floating-point power of 2. 
+TEST_F(AlgebraicSimplifierTest, FactorFpAdditionNotPowerOf2) { + const char* kModuleStr = R"( + HloModule m + test { + p0 = f32[] parameter(0) + p1 = f32[] parameter(1) + c = f32[] constant(0.3) + x = f32[] multiply(p0, c) + y = f32[] multiply(p1, c) + ROOT sum = f32[] add(x, y) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + EXPECT_FALSE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); +} + +// A*C + B*C => (A+B)*C simplification should not happen if A, B, and C are +// complex numbers. +TEST_F(AlgebraicSimplifierTest, FactorFpAdditionComplex) { + const char* kModuleStr = R"( + HloModule m + test { + p0 = c64[8] parameter(0) + p1 = c64[8] parameter(1) + p2 = c64[8] parameter(2) + x = c64[8] multiply(p0, p2) + y = c64[8] multiply(p1, p2) + ROOT sum = c64[8] add(x, y) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + EXPECT_FALSE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); +} + +// A*C + B*C => (A+B)*C simplification is OK if A, B, and C are bfloat16. 
+TEST_F(AlgebraicSimplifierTest, FactorFpAdditionBfloat16) { + const char* kModuleStr = R"( + HloModule m + test { + p0 = bf16[4] parameter(0) + p1 = bf16[4] parameter(1) + c = bf16[] constant(0.125) + b = bf16[4] broadcast(c), dimensions={} + x = bf16[4] multiply(p0, b) + y = bf16[4] multiply(p1, b) + ROOT sum = bf16[4] add(x, y) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); + EXPECT_THAT(m->entry_computation()->root_instruction(), + GmockMatch(m::MultiplyAnyOrder( + m::AddAnyOrder(m::Parameter(0), m::Parameter(1)), + m::Broadcast(m::ConstantScalar(0.125))))); +} + // Test that A * 0 is simplified to 0 TEST_F(AlgebraicSimplifierTest, MulZero) { auto m = CreateNewVerifiedModule(); -- GitLab From 48a49df614f0392c2c548e6d01d14d01f7c47714 Mon Sep 17 00:00:00 2001 From: Justin Lebar Date: Mon, 3 Dec 2018 18:40:22 -0800 Subject: [PATCH 064/405] [XLA] Use a uniform HloInstruction stringification function within pattern_matcher.h. Previously we used HloInstruction::ToString with custom options in some places, and HloInstruction::ToShortString() in other places. PiperOrigin-RevId: 223902223 --- .../compiler/xla/service/pattern_matcher.h | 25 ++++++++++--------- .../xla/service/pattern_matcher_test.cc | 13 +++++----- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/tensorflow/compiler/xla/service/pattern_matcher.h b/tensorflow/compiler/xla/service/pattern_matcher.h index 90ea5dc51e..432aa1ea0b 100644 --- a/tensorflow/compiler/xla/service/pattern_matcher.h +++ b/tensorflow/compiler/xla/service/pattern_matcher.h @@ -1136,6 +1136,13 @@ inline const HloInstruction* HloOperand(const HloInstruction* instr, return instr->operand(idx); } +// Pretty-printer for HloInstruction. Sort of like ToShortString, but with +// fewer %s and more shapes. 
+inline string InstToString(const HloInstruction* inst) { + return inst->ToString( + HloPrintOptions().set_print_metadata(false).set_print_percent(false)); +} + template class HloInstructionPattern; @@ -1190,14 +1197,14 @@ class HloInstructionIsImpl { bool Match(const ::xla::HloInstruction* inst, MatchOption option) const { if (inst != inst_) { EXPLAIN << "HloInstruction " << inst << " is not " << inst_ << " (" - << inst_->ToShortString() << ")"; + << InstToString(inst_) << ")"; return false; } return true; } void DescribeTo(std::ostream* os, int64 indent = 0) const { - *os << "which is " << inst_ << " (" << inst_->ToShortString() << ")"; + *os << "which is " << inst_ << " (" << InstToString(inst_) << ")"; } private: @@ -1617,7 +1624,7 @@ class HloInstructionPatternOneUseOrUserImpl { if (inst->user_count() > 1) { EXPLAIN << "\nAll users:"; for (const HloInstruction* user : inst->users()) { - EXPLAIN << "\n - " << user->ToShortString(); + EXPLAIN << "\n - " << InstToString(user); } } return false; @@ -1640,7 +1647,7 @@ class HloInstructionPatternOneUseImpl if (use_count != 1) { EXPLAIN << "HloInstruction is used " << use_count << " times by its user, but is expected to be used just once: " - << inst->users()[0]->ToShortString(); + << InstToString(inst->users()[0]); return false; } return true; @@ -1767,10 +1774,7 @@ class HloInstructionPattern { return true; } if (inst != nullptr) { - EXPLAIN << "\nin " - << inst->ToString(HloPrintOptions() - .set_print_metadata(false) - .set_print_percent(false)); + EXPLAIN << "\nin " << InstToString(inst); } return false; } @@ -1783,10 +1787,7 @@ class HloInstructionPattern { } return true; } - EXPLAIN << "\nin " - << inst->ToString(HloPrintOptions() - .set_print_metadata(false) - .set_print_percent(false)); + EXPLAIN << "\nin " << InstToString(inst); return false; } diff --git a/tensorflow/compiler/xla/service/pattern_matcher_test.cc b/tensorflow/compiler/xla/service/pattern_matcher_test.cc index 3dd18898c8..186ef0c791 100644 --- 
a/tensorflow/compiler/xla/service/pattern_matcher_test.cc +++ b/tensorflow/compiler/xla/service/pattern_matcher_test.cc @@ -767,10 +767,11 @@ TEST(PatternMatcherTest, HloInstructionDescribeToAndExplain) { "in c = f64[] constant(2.25)"); EXPECT_DESC_AND_EXPLANATION( constant, m::Op().Is(iota.get()), - absl::StrCat("an HloInstruction which is 0x", absl::Hex(iota.get()), " (", - iota->ToShortString(), ")"), + absl::StrCat("an HloInstruction which is 0x", absl::Hex(iota.get()), + " (i = s32[42]{0} iota(), iota_dimension=0)"), absl::StrCat("HloInstruction 0x", absl::Hex(constant.get()), " is not 0x", - absl::Hex(iota.get()), " (", iota->ToShortString(), ")\n", + absl::Hex(iota.get()), + " (i = s32[42]{0} iota(), iota_dimension=0)\n" "in c = s32[] constant(0)")); } @@ -910,8 +911,8 @@ TEST(PatternMatcherTest, OneUseAndOneUser) { const char* kMultipleUserExplanation = "HloInstruction has 2 users, but expected exactly one.\n" "All users:\n" - " - %r = reshape(%p0)\n" - " - %r1 = reshape(%p0)\n" + " - r = f32[1]{0} reshape(f32[] p0)\n" + " - r1 = f32[1]{0} reshape(f32[] p0)\n" "in p0 = f32[] parameter(0)"; EXPECT_EQ(Explanation(param.get(), m::Op().WithOneUse()), kMultipleUserExplanation); @@ -926,7 +927,7 @@ TEST(PatternMatcherTest, OneUseAndOneUser) { EXPECT_FALSE(Match(param.get(), m::Op().WithOneUse())); EXPECT_EQ(Explanation(param.get(), m::Op().WithOneUse()), "HloInstruction is used 2 times by its user, but is expected to be " - "used just once: %add = add(%p0, %p0)\n" + "used just once: add = f32[] add(f32[] p0, f32[] p0)\n" "in p0 = f32[] parameter(0)"); } -- GitLab From 85c4d28aed41682a2d0a7290d934184d5dbad283 Mon Sep 17 00:00:00 2001 From: Derek Murray Date: Mon, 3 Dec 2018 20:26:35 -0800 Subject: [PATCH 065/405] [tf.data] Remove `Dataset.make_initializable_iterator()` from the V2 API. Add `tf.compat.v1.data.make_initializable_iterator(dataset)` to enable the use of V2 `Dataset` objects in a legacy V1 pipeline. 
PiperOrigin-RevId: 223911205 --- tensorflow/contrib/bigtable/README.md | 23 +-- .../python/kernel_tests/bigtable_ops_test.py | 18 +-- .../kernel_tests/assert_element_shape_test.py | 18 +-- .../kernel_tests/lmdb_dataset_op_test.py | 3 +- .../kernel_tests/slide_dataset_op_test.py | 34 ++--- .../linear_regression_graph_test.py | 2 +- .../hadoop/python/kernel_tests/hadoop_test.py | 3 +- .../contrib/tpu/python/tpu/datasets_test.py | 10 +- .../contrib/tpu/python/tpu/keras_support.py | 2 +- .../contrib/tpu/python/tpu/tpu_estimator.py | 2 +- .../autograph/operators/control_flow.py | 2 +- tensorflow/python/data/__init__.py | 1 + .../python/data/benchmarks/batch_benchmark.py | 2 +- .../from_tensor_slices_benchmark.py | 8 +- .../python/data/benchmarks/range_benchmark.py | 2 +- .../benchmarks/map_and_batch_benchmark.py | 2 +- .../benchmarks/unbatch_benchmark.py | 4 +- .../kernel_tests/copy_to_device_test.py | 26 ++-- .../dense_to_sparse_batch_test.py | 20 ++- .../directed_interleave_dataset_test.py | 2 +- .../kernel_tests/enumerate_dataset_test.py | 5 +- .../kernel_tests/group_by_window_test.py | 35 ++--- .../kernel_tests/ignore_errors_test.py | 6 +- .../make_batched_features_dataset_test.py | 2 +- .../make_tf_record_dataset_test.py | 2 +- .../kernel_tests/map_and_batch_test.py | 10 +- .../optimization/optimize_dataset_test.py | 4 +- .../kernel_tests/override_threadpool_test.py | 2 +- .../kernel_tests/parallel_interleave_test.py | 13 +- .../kernel_tests/prefetch_to_device_test.py | 6 +- .../experimental/kernel_tests/scan_test.py | 9 +- .../dataset_serialization_test_base.py | 3 +- .../range_dataset_serialization_test.py | 4 +- .../experimental/kernel_tests/sleep_test.py | 2 +- .../kernel_tests/sql_dataset_test_base.py | 3 +- .../kernel_tests/stats_dataset_ops_test.py | 28 ++-- .../kernel_tests/stats_dataset_test_base.py | 3 +- .../experimental/kernel_tests/unbatch_test.py | 4 +- .../experimental/kernel_tests/unique_test.py | 2 +- .../kernel_tests/dataset_checkpoint_test.py | 
22 +-- .../data/kernel_tests/from_generator_test.py | 77 +++++----- .../from_sparse_tensor_slices_test.py | 4 +- .../data/kernel_tests/from_tensors_test.py | 2 +- .../kernel_tests/iterator_checkpoint_test.py | 2 +- .../kernel_tests/iterator_cluster_test.py | 2 +- .../python/data/kernel_tests/iterator_test.py | 17 +-- .../python/data/kernel_tests/map_test.py | 138 ++++++++---------- .../python/data/kernel_tests/shuffle_test.py | 8 +- .../python/data/kernel_tests/test_base.py | 2 +- tensorflow/python/data/ops/dataset_ops.py | 129 ++++++++++------ tensorflow/python/distribute/values.py | 5 +- tensorflow/python/keras/engine/training.py | 2 +- .../tools/api/golden/v1/tensorflow.data.pbtxt | 4 + .../golden/v2/tensorflow.data.-dataset.pbtxt | 4 - ...ow.data.-fixed-length-record-dataset.pbtxt | 4 - .../tensorflow.data.-t-f-record-dataset.pbtxt | 4 - .../tensorflow.data.-text-line-dataset.pbtxt | 4 - ...rflow.data.experimental.-csv-dataset.pbtxt | 4 - ...ow.data.experimental.-random-dataset.pbtxt | 4 - ...rflow.data.experimental.-sql-dataset.pbtxt | 4 - tensorflow/tools/compatibility/renames_v2.py | 1 + 61 files changed, 376 insertions(+), 394 deletions(-) diff --git a/tensorflow/contrib/bigtable/README.md b/tensorflow/contrib/bigtable/README.md index 2c44abed5e..79052bee35 100644 --- a/tensorflow/contrib/bigtable/README.md +++ b/tensorflow/contrib/bigtable/README.md @@ -51,25 +51,18 @@ BIGTABLE_TABLE_NAME = '' PREFIX = 'train-' def main(): + tf.enable_eager_execution() + client = tf.contrib.cloud.BigtableClient(GCP_PROJECT_ID, BIGTABLE_INSTANCE_ID) table = client.table(BIGTABLE_TABLE_NAME) dataset = table.keys_by_prefix_dataset(PREFIX) - iterator = dataset.make_initializable_iterator() - get_next_op = iterator.get_next() - with tf.Session() as sess: - print('Initializing the iterator.') - sess.run(iterator.initializer) - print('Retrieving rows:') - row_index = 0 - while True: - try: - row_key = sess.run(get_next_op) - print('Row key %d: %s' % (row_index, row_key)) - 
row_index += 1 - except tf.errors.OutOfRangeError: - print('Finished reading data!') - break + print('Retrieving rows:') + row_index = 0 + for row_key in dataset: + print('Row key %d: %s' % (row_index, row_key)) + row_index += 1 + print('Finished reading data!') if __name__ == '__main__': main() diff --git a/tensorflow/contrib/bigtable/python/kernel_tests/bigtable_ops_test.py b/tensorflow/contrib/bigtable/python/kernel_tests/bigtable_ops_test.py index 316da9ebe1..197f5578eb 100644 --- a/tensorflow/contrib/bigtable/python/kernel_tests/bigtable_ops_test.py +++ b/tensorflow/contrib/bigtable/python/kernel_tests/bigtable_ops_test.py @@ -57,7 +57,7 @@ class BigtableOpsTest(test.TestCase): sess.run(write_op) def runReadKeyTest(self, read_ds): - itr = read_ds.make_initializable_iterator() + itr = dataset_ops.make_initializable_iterator(read_ds) n = itr.get_next() expected = list(self.COMMON_ROW_KEYS) expected.reverse() @@ -78,7 +78,7 @@ class BigtableOpsTest(test.TestCase): self.runReadKeyTest(self._table.keys_by_range_dataset("r1", "r4")) def runScanTest(self, read_ds): - itr = read_ds.make_initializable_iterator() + itr = dataset_ops.make_initializable_iterator(read_ds) n = itr.get_next() expected_keys = list(self.COMMON_ROW_KEYS) expected_keys.reverse() @@ -120,7 +120,7 @@ class BigtableOpsTest(test.TestCase): def testLookup(self): ds = self._table.keys_by_prefix_dataset("r") ds = ds.apply(self._table.lookup_columns(cf1="c1")) - itr = ds.make_initializable_iterator() + itr = dataset_ops.make_initializable_iterator(ds) n = itr.get_next() expected_keys = list(self.COMMON_ROW_KEYS) expected_values = list(self.COMMON_VALUES) @@ -141,7 +141,7 @@ class BigtableOpsTest(test.TestCase): def testSampleKeys(self): ds = self._table.sample_keys() - itr = ds.make_initializable_iterator() + itr = dataset_ops.make_initializable_iterator(ds) n = itr.get_next() expected_key = self.COMMON_ROW_KEYS[0] with self.cached_session() as sess: @@ -161,7 +161,7 @@ class 
BigtableOpsTest(test.TestCase): sess.run(n) def runSampleKeyPairsTest(self, ds, expected_key_pairs): - itr = ds.make_initializable_iterator() + itr = dataset_ops.make_initializable_iterator(ds) n = itr.get_next() with self.cached_session() as sess: self._writeCommonValues(sess) @@ -218,7 +218,7 @@ class BigtableOpsTest(test.TestCase): def testSampleKeyPairsPrefixAndStartKey(self): ds = bigtable_api._BigtableSampleKeyPairsDataset( self._table, prefix="r", start="r1", end="") - itr = ds.make_initializable_iterator() + itr = dataset_ops.make_initializable_iterator(ds) with self.cached_session() as sess: with self.assertRaises(errors.InvalidArgumentError): sess.run(itr.initializer) @@ -226,14 +226,14 @@ class BigtableOpsTest(test.TestCase): def testSampleKeyPairsPrefixAndEndKey(self): ds = bigtable_api._BigtableSampleKeyPairsDataset( self._table, prefix="r", start="", end="r3") - itr = ds.make_initializable_iterator() + itr = dataset_ops.make_initializable_iterator(ds) with self.cached_session() as sess: with self.assertRaises(errors.InvalidArgumentError): sess.run(itr.initializer) def testParallelScanPrefix(self): ds = self._table.parallel_scan_prefix(prefix="r", cf1="c1") - itr = ds.make_initializable_iterator() + itr = dataset_ops.make_initializable_iterator(ds) n = itr.get_next() with self.cached_session() as sess: self._writeCommonValues(sess) @@ -251,7 +251,7 @@ class BigtableOpsTest(test.TestCase): def testParallelScanRange(self): ds = self._table.parallel_scan_range(start="r1", end="r4", cf1="c1") - itr = ds.make_initializable_iterator() + itr = dataset_ops.make_initializable_iterator(ds) n = itr.get_next() with self.cached_session() as sess: self._writeCommonValues(sess) diff --git a/tensorflow/contrib/data/python/kernel_tests/assert_element_shape_test.py b/tensorflow/contrib/data/python/kernel_tests/assert_element_shape_test.py index 0456463a19..6c5f8c6b00 100644 --- a/tensorflow/contrib/data/python/kernel_tests/assert_element_shape_test.py +++ 
b/tensorflow/contrib/data/python/kernel_tests/assert_element_shape_test.py @@ -46,7 +46,7 @@ class AssertElementShapeTest(test_base.DatasetTestBase): result = dataset.apply(batching.assert_element_shape(expected_shapes)) self.assertEqual(expected_shapes, result.output_shapes) - iterator = result.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(result) init_op = iterator.initializer get_next = iterator.get_next() with self.cached_session() as sess: @@ -88,7 +88,7 @@ class AssertElementShapeTest(test_base.DatasetTestBase): result = dataset.apply(batching.assert_element_shape(expected_shapes)) self.assertEqual(expected_shapes, result.output_shapes) - iterator = result.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(result) init_op = iterator.initializer get_next = iterator.get_next() with self.cached_session() as sess: @@ -115,9 +115,8 @@ class AssertElementShapeTest(test_base.DatasetTestBase): wrong_shapes = (tensor_shape.TensorShape(2), tensor_shape.TensorShape((3, 10))) - iterator = ( - dataset.apply(batching.assert_element_shape(wrong_shapes)) - .make_initializable_iterator()) + iterator = dataset_ops.make_initializable_iterator( + dataset.apply(batching.assert_element_shape(wrong_shapes))) init_op = iterator.initializer get_next = iterator.get_next() with self.cached_session() as sess: @@ -142,7 +141,7 @@ class AssertElementShapeTest(test_base.DatasetTestBase): tensor_shape.TensorShape((3, 4))) self.assertEqual(actual_shapes, result.output_shapes) - iterator = result.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(result) init_op = iterator.initializer get_next = iterator.get_next() with self.cached_session() as sess: @@ -184,7 +183,7 @@ class AssertElementShapeTest(test_base.DatasetTestBase): result = dataset.apply(batching.assert_element_shape(expected_shapes)) self.assertEqual(expected_shapes, result.output_shapes) - iterator = 
result.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(result) init_op = iterator.initializer get_next = iterator.get_next() with self.cached_session() as sess: @@ -211,9 +210,8 @@ class AssertElementShapeTest(test_base.DatasetTestBase): wrong_shapes = (tensor_shape.TensorShape(2), tensor_shape.TensorShape((None, 10))) - iterator = ( - dataset.apply(batching.assert_element_shape(wrong_shapes)) - .make_initializable_iterator()) + iterator = dataset_ops.make_initializable_iterator( + dataset.apply(batching.assert_element_shape(wrong_shapes))) init_op = iterator.initializer get_next = iterator.get_next() with self.cached_session() as sess: diff --git a/tensorflow/contrib/data/python/kernel_tests/lmdb_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/lmdb_dataset_op_test.py index d2a72272db..b9840b1ff1 100644 --- a/tensorflow/contrib/data/python/kernel_tests/lmdb_dataset_op_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/lmdb_dataset_op_test.py @@ -23,6 +23,7 @@ import shutil from tensorflow.contrib.data.python.ops import readers from tensorflow.python.data.kernel_tests import test_base +from tensorflow.python.data.ops import dataset_ops from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors @@ -48,7 +49,7 @@ class LMDBDatasetTest(test_base.DatasetTestBase): num_repeats = 2 dataset = readers.LMDBDataset(filenames).repeat(num_repeats) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) init_op = iterator.initializer get_next = iterator.get_next() diff --git a/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py index c5a7862322..2527706709 100644 --- a/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py +++ 
b/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py @@ -63,13 +63,13 @@ class SlideDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): # The pipeline is TensorSliceDataset -> MapDataset(square_3) -> # RepeatDataset(count) -> # _SlideDataset(window_size, window_shift, window_stride). - iterator = ( + iterator = dataset_ops.make_initializable_iterator( dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn) .repeat(count).apply( sliding.sliding_window_batch( window_size=window_size_t, window_shift=window_shift_t, - window_stride=window_stride_t)).make_initializable_iterator()) + window_stride=window_stride_t))) init_op = iterator.initializer get_next = iterator.get_next() @@ -127,13 +127,13 @@ class SlideDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): # The pipeline is TensorSliceDataset -> MapDataset(square_3) -> # RepeatDataset(count) -> _SlideDataset(window_size, stride, window_stride). - iterator = ( + iterator = dataset_ops.make_initializable_iterator( dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn) .repeat(count).apply( sliding.sliding_window_batch( window_size=window_size_t, stride=stride_t, - window_stride=window_stride_t)).make_initializable_iterator()) + window_stride=window_stride_t))) init_op = iterator.initializer get_next = iterator.get_next() @@ -173,12 +173,12 @@ class SlideDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): window_shift_t = array_ops.placeholder(dtypes.int64, shape=[]) window_stride_t = array_ops.placeholder(dtypes.int64, shape=[]) - iterator = ( + iterator = dataset_ops.make_initializable_iterator( dataset_ops.Dataset.range(10).map(lambda x: x).repeat(count_t).apply( sliding.sliding_window_batch( window_size=window_size_t, window_shift=window_shift_t, - window_stride=window_stride_t)).make_initializable_iterator()) + window_stride=window_stride_t))) init_op = iterator.initializer with self.cached_session() as sess: @@ -204,9 +204,9 @@ class 
SlideDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): return sparse_tensor.SparseTensorValue( indices=[[0]], values=(i * [1]), dense_shape=[1]) - iterator = dataset_ops.Dataset.range(10).map(_sparse).apply( - sliding.sliding_window_batch( - window_size=5, window_shift=3)).make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator( + dataset_ops.Dataset.range(10).map(_sparse).apply( + sliding.sliding_window_batch(window_size=5, window_shift=3))) init_op = iterator.initializer get_next = iterator.get_next() @@ -233,9 +233,9 @@ class SlideDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): values=array_ops.fill([math_ops.to_int32(i)], i), dense_shape=[i]) - iterator = dataset_ops.Dataset.range(10).map(_sparse).apply( - sliding.sliding_window_batch( - window_size=5, window_shift=3)).make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator( + dataset_ops.Dataset.range(10).map(_sparse).apply( + sliding.sliding_window_batch(window_size=5, window_shift=3))) init_op = iterator.initializer get_next = iterator.get_next() @@ -265,11 +265,10 @@ class SlideDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): return sparse_tensor.SparseTensorValue( indices=[[0]], values=(i * [1]), dense_shape=[1]) - iterator = ( + iterator = dataset_ops.make_initializable_iterator( dataset_ops.Dataset.range(10).map(_sparse).apply( sliding.sliding_window_batch(window_size=4, window_shift=2)).apply( - sliding.sliding_window_batch(window_size=3, window_shift=1)) - .make_initializable_iterator()) + sliding.sliding_window_batch(window_size=3, window_shift=1))) init_op = iterator.initializer get_next = iterator.get_next() @@ -305,11 +304,10 @@ class SlideDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): yield [4.0, 5.0, 6.0] yield [7.0, 8.0, 9.0, 10.0] - iterator = ( + iterator = dataset_ops.make_initializable_iterator( dataset_ops.Dataset.from_generator( generator, dtypes.float32, 
output_shapes=[None]).apply( - sliding.sliding_window_batch(window_size=3, window_shift=1)) - .make_initializable_iterator()) + sliding.sliding_window_batch(window_size=3, window_shift=1))) next_element = iterator.get_next() with self.cached_session() as sess: diff --git a/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression_graph_test.py b/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression_graph_test.py index 557ad42752..d412b25b36 100644 --- a/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression_graph_test.py +++ b/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression_graph_test.py @@ -36,7 +36,7 @@ class GraphLinearRegressionBenchmark(tf.test.Benchmark): noise_level=0.01, batch_size=batch_size, num_batches=num_batches) - iterator = dataset.make_initializable_iterator() + iterator = tf.compat.v1.data.make_initializable_iterator(dataset) x, y = iterator.get_next() model = linear_regression.LinearModel() diff --git a/tensorflow/contrib/hadoop/python/kernel_tests/hadoop_test.py b/tensorflow/contrib/hadoop/python/kernel_tests/hadoop_test.py index f7f1189bb9..bc941ae9f2 100644 --- a/tensorflow/contrib/hadoop/python/kernel_tests/hadoop_test.py +++ b/tensorflow/contrib/hadoop/python/kernel_tests/hadoop_test.py @@ -21,6 +21,7 @@ from __future__ import print_function import os from tensorflow.contrib.hadoop.python.ops import hadoop_dataset_ops +from tensorflow.python.data.ops import dataset_ops from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors @@ -47,7 +48,7 @@ class SequenceFileDatasetTest(test.TestCase): dataset = hadoop_dataset_ops.SequenceFileDataset(filenames).repeat( num_repeats) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) init_op = iterator.initializer get_next = iterator.get_next() diff --git 
a/tensorflow/contrib/tpu/python/tpu/datasets_test.py b/tensorflow/contrib/tpu/python/tpu/datasets_test.py index b58d05eac5..52d87b8004 100644 --- a/tensorflow/contrib/tpu/python/tpu/datasets_test.py +++ b/tensorflow/contrib/tpu/python/tpu/datasets_test.py @@ -70,7 +70,7 @@ class DatasetsTest(test.TestCase): dataset = datasets.StreamingFilesDataset( os.path.join(self.get_temp_dir(), 'text_line.*.txt'), filetype='text') - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) self._sess.run(iterator.initializer) get_next = iterator.get_next() @@ -94,7 +94,7 @@ class DatasetsTest(test.TestCase): dataset = datasets.StreamingFilesDataset( os.path.join(self.get_temp_dir(), 'tf_record*'), filetype='tfrecord') - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) self._sess.run(iterator.initializer) get_next = iterator.get_next() @@ -121,7 +121,7 @@ class DatasetsTest(test.TestCase): dataset = datasets.StreamingFilesDataset(filenames, filetype='tfrecord') - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) self._sess.run(iterator.initializer) get_next = iterator.get_next() @@ -154,7 +154,7 @@ class DatasetsTest(test.TestCase): os.path.join(self.get_temp_dir(), 'fixed_length*'), filetype=FixedLengthFile) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) self._sess.run(iterator.initializer) get_next = iterator.get_next() @@ -177,7 +177,7 @@ class DatasetsTest(test.TestCase): dataset = datasets.StreamingFilesDataset( dataset_ops.Dataset.range(10), filetype=gen_dataset) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) self._sess.run(iterator.initializer) get_next = iterator.get_next() diff --git a/tensorflow/contrib/tpu/python/tpu/keras_support.py 
b/tensorflow/contrib/tpu/python/tpu/keras_support.py index cf3b2e68e9..7f280eabd2 100644 --- a/tensorflow/contrib/tpu/python/tpu/keras_support.py +++ b/tensorflow/contrib/tpu/python/tpu/keras_support.py @@ -729,7 +729,7 @@ class TPUDatasetInfeedManager(TPUInfeedManager): dummy_x_shape[0] *= tpu_assignment.num_towers dummy_y_shape = dataset.output_shapes[1].as_list() dummy_y_shape[0] *= tpu_assignment.num_towers - self._iterator = dataset.make_initializable_iterator() + self._iterator = dataset_ops.make_initializable_iterator(dataset) K.get_session().run(self._iterator.initializer) self._get_next_ops = [] diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py index 7171587ff7..a9dc542ae5 100644 --- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py +++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py @@ -3081,7 +3081,7 @@ class _Inputs(object): The initializer must be run before calling `features_and_labels`. """ - self._iterator = self._dataset.make_initializable_iterator() + self._iterator = dataset_ops.make_initializable_iterator(self._dataset) return self._iterator.initializer def features_and_labels(self): diff --git a/tensorflow/python/autograph/operators/control_flow.py b/tensorflow/python/autograph/operators/control_flow.py index 1a35efedfa..670897744a 100644 --- a/tensorflow/python/autograph/operators/control_flow.py +++ b/tensorflow/python/autograph/operators/control_flow.py @@ -123,7 +123,7 @@ def _dataset_for_stmt(ds, extra_test, body, init_state): (dataset_ops.Dataset.from_tensors(tag).repeat(), ds)) ds_with_epoch = epoch_numbers.flat_map(lambda i: tag_with(ds, i)) - iterator = ds_with_epoch.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(ds_with_epoch) with ops.control_dependencies((iterator.initializer,)): epoch_number, iterate = iterator.get_next() diff --git a/tensorflow/python/data/__init__.py b/tensorflow/python/data/__init__.py index 
2da98bbb8e..75ba88f303 100644 --- a/tensorflow/python/data/__init__.py +++ b/tensorflow/python/data/__init__.py @@ -24,6 +24,7 @@ from __future__ import print_function # pylint: disable=unused-import from tensorflow.python.data import experimental from tensorflow.python.data.ops.dataset_ops import Dataset +from tensorflow.python.data.ops.dataset_ops import make_initializable_iterator from tensorflow.python.data.ops.dataset_ops import make_one_shot_iterator from tensorflow.python.data.ops.iterator_ops import Iterator from tensorflow.python.data.ops.readers import FixedLengthRecordDataset diff --git a/tensorflow/python/data/benchmarks/batch_benchmark.py b/tensorflow/python/data/benchmarks/batch_benchmark.py index b61ac86eb5..e063849f70 100644 --- a/tensorflow/python/data/benchmarks/batch_benchmark.py +++ b/tensorflow/python/data/benchmarks/batch_benchmark.py @@ -42,7 +42,7 @@ class BatchBenchmark(test.Benchmark): dataset = dataset_ops.Dataset.from_tensors(sparse_placeholder).repeat( ).batch(batch_size_placeholder) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) next_element = iterator.get_next() for non_zeros_per_row in non_zeros_per_row_values: diff --git a/tensorflow/python/data/benchmarks/from_tensor_slices_benchmark.py b/tensorflow/python/data/benchmarks/from_tensor_slices_benchmark.py index 74a2d271ad..d7f1a4e7af 100644 --- a/tensorflow/python/data/benchmarks/from_tensor_slices_benchmark.py +++ b/tensorflow/python/data/benchmarks/from_tensor_slices_benchmark.py @@ -41,7 +41,7 @@ class FromTensorSlicesBenchmark(test.Benchmark): dataset = ( dataset_ops.Dataset.from_tensor_slices(input_data) .repeat(num_epochs + 1).batch(batch_size)) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) next_element = iterator.get_next() with session.Session() as sess: @@ -77,7 +77,7 @@ class FromTensorSlicesBenchmark(test.Benchmark): dataset = ( 
dataset_ops.Dataset.from_tensor_slices(input_data) .repeat(num_epochs + 1).batch(batch_size)) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) next_element = iterator.get_next() with session.Session() as sess: @@ -116,7 +116,7 @@ class FromTensorSlicesBenchmark(test.Benchmark): dataset = ( dataset_ops.Dataset.from_tensor_slices(input_data.reshape(100, 100)) .repeat(num_epochs + 1)) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) next_element = iterator.get_next() with session.Session() as sess: @@ -154,7 +154,7 @@ class FromTensorSlicesBenchmark(test.Benchmark): dataset = ( dataset_ops.Dataset.from_tensor_slices(input_data).batch(batch_size) .cache().repeat(num_epochs + 1)) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) next_element = iterator.get_next() with session.Session() as sess: diff --git a/tensorflow/python/data/benchmarks/range_benchmark.py b/tensorflow/python/data/benchmarks/range_benchmark.py index 25f63b79a2..a5020e2873 100644 --- a/tensorflow/python/data/benchmarks/range_benchmark.py +++ b/tensorflow/python/data/benchmarks/range_benchmark.py @@ -39,7 +39,7 @@ class RangeBenchmark(test.Benchmark): # costs). 
dataset = dataset_ops.Dataset.range(num_elements).skip( num_elements - 1).take(1).with_options(options) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) next_element = iterator.get_next() with session.Session() as sess: diff --git a/tensorflow/python/data/experimental/benchmarks/map_and_batch_benchmark.py b/tensorflow/python/data/experimental/benchmarks/map_and_batch_benchmark.py index 3e0701ee4f..fbd06a5a78 100644 --- a/tensorflow/python/data/experimental/benchmarks/map_and_batch_benchmark.py +++ b/tensorflow/python/data/experimental/benchmarks/map_and_batch_benchmark.py @@ -54,7 +54,7 @@ class MapAndBatchBenchmark(test.Benchmark): dataset = dataset.apply(batching.map_and_batch( lambda _: dense_value, batch_size_placeholder)) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) next_element = iterator.get_next() for shape in shapes: diff --git a/tensorflow/python/data/experimental/benchmarks/unbatch_benchmark.py b/tensorflow/python/data/experimental/benchmarks/unbatch_benchmark.py index c40d479823..c36a32534d 100644 --- a/tensorflow/python/data/experimental/benchmarks/unbatch_benchmark.py +++ b/tensorflow/python/data/experimental/benchmarks/unbatch_benchmark.py @@ -42,7 +42,7 @@ class UnbatchBenchmark(test.Benchmark): dataset = dataset.batch(batch_size_placeholder) dataset = dataset.apply(batching.unbatch()) dataset = dataset.skip(elems_per_trial) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) next_element = iterator.get_next() with session.Session() as sess: @@ -78,7 +78,7 @@ class UnbatchBenchmark(test.Benchmark): dataset = dataset.batch(batch_size_placeholder) dataset = dataset.flat_map(dataset_ops.Dataset.from_tensor_slices) dataset = dataset.skip(elems_per_trial) - iterator = dataset.make_initializable_iterator() + iterator = 
dataset_ops.make_initializable_iterator(dataset) next_element = iterator.get_next() with session.Session() as sess: diff --git a/tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py b/tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py index 78b8bda1b7..b8166fe833 100644 --- a/tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py @@ -275,7 +275,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase): prefetching_ops.copy_to_device("/gpu:0")) with ops.device("/gpu:0"): - iterator = device_dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(device_dataset) next_element = iterator.get_next() with self.cached_session( @@ -295,7 +295,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase): prefetching_ops.copy_to_device("/gpu:0")).prefetch(1) with ops.device("/gpu:0"): - iterator = device_dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(device_dataset) next_element = iterator.get_next() with self.cached_session( @@ -329,7 +329,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase): device_dataset = device_dataset.with_options(options) with ops.device("/gpu:0"): - iterator = device_dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(device_dataset) next_element = iterator.get_next() with self.cached_session( @@ -352,7 +352,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase): prefetching_ops.copy_to_device("/gpu:0")) with ops.device("/gpu:0"): - iterator = device_dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(device_dataset) next_element = iterator.get_next() with self.cached_session( @@ -371,7 +371,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase): prefetching_ops.copy_to_device("/gpu:0")).prefetch(1) with ops.device("/gpu:0"): - iterator = 
device_dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(device_dataset) next_element = iterator.get_next() with self.cached_session( @@ -390,7 +390,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase): prefetching_ops.copy_to_device("/gpu:0")) with ops.device("/gpu:0"): - iterator = device_dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(device_dataset) next_element = iterator.get_next() with self.cached_session( @@ -409,7 +409,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase): prefetching_ops.copy_to_device("/gpu:0")) with ops.device("/gpu:0"): - iterator = device_dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(device_dataset) next_element = iterator.get_next() with self.cached_session( @@ -431,7 +431,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase): prefetching_ops.copy_to_device("/cpu:0", source_device="/gpu:0")) with ops.device("/cpu:0"): - iterator = back_to_cpu_dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(back_to_cpu_dataset) next_element = iterator.get_next() with self.cached_session( @@ -449,7 +449,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase): prefetching_ops.copy_to_device("/cpu:1")) with ops.device("/cpu:1"): - iterator = device_dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(device_dataset) next_element = iterator.get_next() self.assertEqual(host_dataset.output_types, device_dataset.output_types) @@ -480,7 +480,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase): prefetching_ops.copy_to_device("/cpu:1")).prefetch(1) with ops.device("/cpu:1"): - iterator = device_dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(device_dataset) next_element = iterator.get_next() self.assertEqual(host_dataset.output_types, device_dataset.output_types) @@ -513,7 +513,7 @@ class 
CopyToDeviceTest(test_base.DatasetTestBase): prefetching_ops.copy_to_device("/gpu:0")) with ops.device("/gpu:0"): - iterator = device_dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(device_dataset) next_element = iterator.get_next() with self.cached_session( @@ -536,7 +536,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase): prefetching_ops.copy_to_device("/gpu:0")).prefetch(1) with ops.device("/gpu:0"): - iterator = device_dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(device_dataset) next_element = iterator.get_next() with self.cached_session( @@ -558,7 +558,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase): device_dataset = host_dataset.apply( prefetching_ops.copy_to_device("/gpu:0")) with ops.device("/gpu:0"): - iterator = device_dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(device_dataset) next_elem = iterator_ops.get_next_as_optional(iterator) elem_has_value_t = next_elem.has_value() elem_value_t = next_elem.get_value() diff --git a/tensorflow/python/data/experimental/kernel_tests/dense_to_sparse_batch_test.py b/tensorflow/python/data/experimental/kernel_tests/dense_to_sparse_batch_test.py index 4b84446be8..22e057a284 100644 --- a/tensorflow/python/data/experimental/kernel_tests/dense_to_sparse_batch_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/dense_to_sparse_batch_test.py @@ -34,11 +34,10 @@ class DenseToSparseBatchTest(test_base.DatasetTestBase): @test_util.run_deprecated_v1 def testDenseToSparseBatchDataset(self): components = np.random.randint(12, size=(100,)).astype(np.int32) - iterator = ( + iterator = dataset_ops.make_initializable_iterator( dataset_ops.Dataset.from_tensor_slices(components) .map(lambda x: array_ops.fill([x], x)).apply( - batching.dense_to_sparse_batch(4, [12])) - .make_initializable_iterator()) + batching.dense_to_sparse_batch(4, [12]))) init_op = iterator.initializer get_next = 
iterator.get_next() @@ -63,11 +62,10 @@ class DenseToSparseBatchTest(test_base.DatasetTestBase): @test_util.run_deprecated_v1 def testDenseToSparseBatchDatasetWithUnknownShape(self): components = np.random.randint(5, size=(40,)).astype(np.int32) - iterator = ( + iterator = dataset_ops.make_initializable_iterator( dataset_ops.Dataset.from_tensor_slices(components) .map(lambda x: array_ops.fill([x, x], x)).apply( - batching.dense_to_sparse_batch( - 4, [5, None])).make_initializable_iterator()) + batching.dense_to_sparse_batch(4, [5, None]))) init_op = iterator.initializer get_next = iterator.get_next() @@ -98,16 +96,16 @@ class DenseToSparseBatchTest(test_base.DatasetTestBase): def testDenseToSparseBatchDatasetWithInvalidShape(self): input_tensor = array_ops.constant([[1]]) with self.assertRaisesRegexp(ValueError, "Dimension -2 must be >= 0"): - dataset_ops.Dataset.from_tensors(input_tensor).apply( - batching.dense_to_sparse_batch(4, [-2])).make_initializable_iterator() + dataset_ops.make_initializable_iterator( + dataset_ops.Dataset.from_tensors(input_tensor).apply( + batching.dense_to_sparse_batch(4, [-2]))) @test_util.run_deprecated_v1 def testDenseToSparseBatchDatasetShapeErrors(self): input_tensor = array_ops.placeholder(dtypes.int32) - iterator = ( + iterator = dataset_ops.make_initializable_iterator( dataset_ops.Dataset.from_tensors(input_tensor).apply( - batching.dense_to_sparse_batch(4, [12])) - .make_initializable_iterator()) + batching.dense_to_sparse_batch(4, [12]))) init_op = iterator.initializer get_next = iterator.get_next() diff --git a/tensorflow/python/data/experimental/kernel_tests/directed_interleave_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/directed_interleave_dataset_test.py index ec2c5407b4..2144342066 100644 --- a/tensorflow/python/data/experimental/kernel_tests/directed_interleave_dataset_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/directed_interleave_dataset_test.py @@ -38,7 +38,7 @@ class 
DirectedInterleaveDatasetTest(test_base.DatasetTestBase): ] dataset = interleave_ops._DirectedInterleaveDataset(selector_dataset, input_datasets) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) next_element = iterator.get_next() with self.cached_session() as sess: diff --git a/tensorflow/python/data/experimental/kernel_tests/enumerate_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/enumerate_dataset_test.py index 3c2e1bb7f3..25742098f1 100644 --- a/tensorflow/python/data/experimental/kernel_tests/enumerate_dataset_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/enumerate_dataset_test.py @@ -35,8 +35,9 @@ class EnumerateDatasetTest(test_base.DatasetTestBase): components = (["a", "b"], [1, 2], [37.0, 38]) start = constant_op.constant(20, dtype=dtypes.int64) - iterator = (dataset_ops.Dataset.from_tensor_slices(components).apply( - enumerate_ops.enumerate_dataset(start)).make_initializable_iterator()) + iterator = dataset_ops.make_initializable_iterator( + dataset_ops.Dataset.from_tensor_slices(components).apply( + enumerate_ops.enumerate_dataset(start))) init_op = iterator.initializer get_next = iterator.get_next() diff --git a/tensorflow/python/data/experimental/kernel_tests/group_by_window_test.py b/tensorflow/python/data/experimental/kernel_tests/group_by_window_test.py index 1e54091c7d..cbb79e55f5 100644 --- a/tensorflow/python/data/experimental/kernel_tests/group_by_window_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/group_by_window_test.py @@ -65,7 +65,7 @@ class GroupByWindowTest(test_base.DatasetTestBase): lambda x, y, z: 0, lambda k, bucket: self._dynamicPad(k, bucket, 32), 32)) - iterator = bucketed_dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(bucketed_dataset) init_op = iterator.initializer get_next = iterator.get_next() @@ -101,7 +101,7 @@ class GroupByWindowTest(test_base.DatasetTestBase): 
lambda x, y, z: math_ops.cast(x % 2, dtypes.int64), lambda k, bucket: self._dynamicPad(k, bucket, 32), 32)) - iterator = bucketed_dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(bucketed_dataset) init_op = iterator.initializer get_next = iterator.get_next() @@ -173,7 +173,7 @@ class GroupByWindowTest(test_base.DatasetTestBase): lambda d: math_ops.cast(d["x"] % 2, dtypes.int64), lambda k, bucket: _dynamic_pad_fn(k, bucket, 32), 32)) - iterator = bucketed_dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(bucketed_dataset) init_op = iterator.initializer get_next = iterator.get_next() @@ -207,7 +207,7 @@ class GroupByWindowTest(test_base.DatasetTestBase): dataset = dataset_ops.Dataset.from_tensor_slices(components).apply( grouping.group_by_window(lambda x: x % 2, lambda _, xs: xs.batch(20), None, window_size_func)) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) init_op = iterator.initializer get_next = iterator.get_next() @@ -229,11 +229,11 @@ class GroupByWindowTest(test_base.DatasetTestBase): @test_util.run_deprecated_v1 def testSimple(self): components = np.random.randint(100, size=(200,)).astype(np.int64) - iterator = ( + iterator = dataset_ops.make_initializable_iterator( dataset_ops.Dataset.from_tensor_slices(components).map(lambda x: x * x) .apply( grouping.group_by_window(lambda x: x % 2, lambda _, xs: xs.batch(4), - 4)).make_initializable_iterator()) + 4))) init_op = iterator.initializer get_next = iterator.get_next() @@ -258,10 +258,10 @@ class GroupByWindowTest(test_base.DatasetTestBase): def testImmediateOutput(self): components = np.array( [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 0, 0, 2, 2, 0, 0], dtype=np.int64) - iterator = ( + iterator = dataset_ops.make_initializable_iterator( dataset_ops.Dataset.from_tensor_slices(components).repeat(-1).apply( grouping.group_by_window(lambda x: x % 3, lambda _, xs: xs.batch(4), - 
4)).make_initializable_iterator()) + 4))) init_op = iterator.initializer get_next = iterator.get_next() @@ -280,10 +280,10 @@ class GroupByWindowTest(test_base.DatasetTestBase): @test_util.run_deprecated_v1 def testSmallGroups(self): components = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0], dtype=np.int64) - iterator = ( + iterator = dataset_ops.make_initializable_iterator( dataset_ops.Dataset.from_tensor_slices(components).apply( grouping.group_by_window(lambda x: x % 2, lambda _, xs: xs.batch(4), - 4)).make_initializable_iterator()) + 4))) init_op = iterator.initializer get_next = iterator.get_next() @@ -298,10 +298,9 @@ class GroupByWindowTest(test_base.DatasetTestBase): @test_util.run_deprecated_v1 def testEmpty(self): - iterator = ( + iterator = dataset_ops.make_initializable_iterator( dataset_ops.Dataset.range(4).apply( - grouping.group_by_window(lambda _: 0, lambda _, xs: xs, 0)) - .make_initializable_iterator()) + grouping.group_by_window(lambda _: 0, lambda _, xs: xs, 0))) init_op = iterator.initializer get_next = iterator.get_next() @@ -324,11 +323,10 @@ class GroupByWindowTest(test_base.DatasetTestBase): padded_shapes=(tensor_shape.TensorShape([]), constant_op.constant([5], dtype=dtypes.int64) * -1)) - iterator = ( + iterator = dataset_ops.make_initializable_iterator( dataset_ops.Dataset.from_tensor_slices(components) .map(lambda x: (x, ops.convert_to_tensor([x * x]))).apply( - grouping.group_by_window(lambda x, _: x % 2, reduce_func, - 32)).make_initializable_iterator()) + grouping.group_by_window(lambda x, _: x % 2, reduce_func, 32))) init_op = iterator.initializer get_next = iterator.get_next() @@ -351,13 +349,12 @@ class GroupByWindowTest(test_base.DatasetTestBase): 4, padded_shapes=ops.convert_to_tensor([(key + 1) * 10])), )) - iterator = ( + iterator = dataset_ops.make_initializable_iterator( dataset_ops.Dataset.from_tensor_slices(components) .map(lambda x: array_ops.fill([math_ops.cast(x, dtypes.int32)], x)) .apply(grouping.group_by_window( 
lambda x: math_ops.cast(array_ops.shape(x)[0] // 10, dtypes.int64), - reduce_func, 4)) - .make_initializable_iterator()) + reduce_func, 4))) init_op = iterator.initializer get_next = iterator.get_next() diff --git a/tensorflow/python/data/experimental/kernel_tests/ignore_errors_test.py b/tensorflow/python/data/experimental/kernel_tests/ignore_errors_test.py index bd323592e4..81f580fccb 100644 --- a/tensorflow/python/data/experimental/kernel_tests/ignore_errors_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/ignore_errors_test.py @@ -44,7 +44,7 @@ class IgnoreErrorsTest(test_base.DatasetTestBase): dataset_ops.Dataset.from_tensor_slices(components) .map(lambda x: array_ops.check_numerics(x, "message")).apply( error_ops.ignore_errors())) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) init_op = iterator.initializer get_next = iterator.get_next() @@ -63,7 +63,7 @@ class IgnoreErrorsTest(test_base.DatasetTestBase): dataset_ops.Dataset.from_tensor_slices(components).map( lambda x: array_ops.check_numerics(x, "message"), num_parallel_calls=2).prefetch(2).apply(error_ops.ignore_errors())) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) init_op = iterator.initializer get_next = iterator.get_next() @@ -91,7 +91,7 @@ class IgnoreErrorsTest(test_base.DatasetTestBase): dataset_ops.Dataset.from_tensor_slices(filenames).map( io_ops.read_file, num_parallel_calls=2).prefetch(2).apply(error_ops.ignore_errors())) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) init_op = iterator.initializer get_next = iterator.get_next() diff --git a/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py index 3a0ff018e8..7c78810494 100644 --- 
a/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py @@ -114,7 +114,7 @@ class MakeBatchedFeaturesDatasetTest( core_readers.TFRecordDataset(self.test_filenames) .map(lambda x: parsing_ops.parse_single_example(x, features)) .repeat(10).batch(2)) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) init_op = iterator.initializer next_element = iterator.get_next() diff --git a/tensorflow/python/data/experimental/kernel_tests/make_tf_record_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/make_tf_record_dataset_test.py index 64798feaf8..ab2feb6426 100644 --- a/tensorflow/python/data/experimental/kernel_tests/make_tf_record_dataset_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/make_tf_record_dataset_test.py @@ -187,7 +187,7 @@ class MakeTFRecordDatasetTest( num_parallel_reads=num_parallel_reads, shuffle=True, shuffle_seed=seed) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) next_element = iterator.get_next() self.evaluate(iterator.initializer) diff --git a/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py b/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py index cad04a70e3..e6e24c3db1 100644 --- a/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py @@ -78,7 +78,7 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase): options.experimental_numa_aware = True dataset = dataset.with_options(options) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) init_op = iterator.initializer get_next = iterator.get_next() @@ -262,7 +262,7 @@ class 
MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase): options = dataset_ops.Options() options.experimental_numa_aware = True dataset = dataset.with_options(options) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) init_op = iterator.initializer get_next = iterator.get_next() @@ -296,7 +296,7 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase): options = dataset_ops.Options() options.experimental_numa_aware = True dataset = dataset.with_options(options) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) init_op = iterator.initializer with self.cached_session() as sess: @@ -325,7 +325,7 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase): options = dataset_ops.Options() options.experimental_numa_aware = True dataset = dataset.with_options(options) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) init_op = iterator.initializer get_next = iterator.get_next() @@ -482,7 +482,7 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase): captured_t = array_ops.placeholder(dtypes.int64, shape=[]) dataset = self.structuredDataset(None).repeat().apply( batching.map_and_batch(lambda x: captured_t, batch_size=10)) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) get_next = iterator.get_next() with self.cached_session() as sess: diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/optimize_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/optimize_dataset_test.py index 22705e20bf..2e31df716a 100644 --- a/tensorflow/python/data/experimental/kernel_tests/optimization/optimize_dataset_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/optimization/optimize_dataset_test.py @@ -45,7 +45,7 @@ 
class OptimizeDatasetTest(test_base.DatasetTestBase): input_t = array_ops.placeholder(dtypes.int32, (None, None, None)) dataset = dataset_ops.Dataset.from_tensors(input_t) dataset = dataset_ops._OptimizeDataset(dataset, []) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) init_op = iterator.initializer get_next = iterator.get_next() @@ -58,7 +58,7 @@ class OptimizeDatasetTest(test_base.DatasetTestBase): input_t = array_ops.placeholder(dtypes.int32, (None, None, None, None)) dataset = dataset_ops.Dataset.from_tensor_slices(input_t) dataset = dataset_ops._OptimizeDataset(dataset, []) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) init_op = iterator.initializer get_next = iterator.get_next() diff --git a/tensorflow/python/data/experimental/kernel_tests/override_threadpool_test.py b/tensorflow/python/data/experimental/kernel_tests/override_threadpool_test.py index de2788dfcb..aa81663a18 100644 --- a/tensorflow/python/data/experimental/kernel_tests/override_threadpool_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/override_threadpool_test.py @@ -53,7 +53,7 @@ class OverrideThreadpoolTest(test_base.DatasetTestBase, lambda x: script_ops.py_func(get_thread_id, [x], dtypes.int64), num_parallel_calls=32).apply(unique.unique())) dataset = override_threadpool_fn(dataset) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) next_element = iterator.get_next() self.evaluate(iterator.initializer) diff --git a/tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py b/tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py index c09bfa695e..113326c028 100644 --- a/tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py @@ -86,7 +86,7 @@ 
class ParallelInterleaveTest(test_base.DatasetTestBase): self.block_length, self.sloppy, self.buffer_output_elements, self.prefetch_input_elements))) - self.iterator = self.dataset.make_initializable_iterator() + self.iterator = dataset_ops.make_initializable_iterator(self.dataset) self.init_op = self.iterator.initializer self.next_element = self.iterator.get_next() @@ -630,9 +630,8 @@ class ParallelInterleaveTest(test_base.DatasetTestBase): sparse_ops.sparse_to_dense(x.indices, x.dense_shape, x.values)) dataset = dataset_ops.Dataset.range(10).map(_map_fn) - iterator = dataset.apply( - interleave_ops.parallel_interleave( - _interleave_fn, cycle_length=1)).make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset.apply( + interleave_ops.parallel_interleave(_interleave_fn, cycle_length=1))) init_op = iterator.initializer get_next = iterator.get_next() @@ -701,7 +700,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase): self.buffer_output_elements, self.prefetch_input_elements))) - self.iterator = self.dataset.make_initializable_iterator() + self.iterator = dataset_ops.make_initializable_iterator(self.dataset) self.init_op = self.iterator.initializer self.next_element = self.iterator.get_next() @@ -750,7 +749,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase): self.buffer_output_elements, self.prefetch_input_elements))) - self.iterator = self.dataset.make_initializable_iterator() + self.iterator = dataset_ops.make_initializable_iterator(self.dataset) self.init_op = self.iterator.initializer self.next_element = self.iterator.get_next() @@ -789,7 +788,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase): buffer_output_elements=1, prefetch_input_elements=0)) dataset = dataset.batch(32) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) next_element = iterator.get_next() results = [] diff --git 
a/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py b/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py index 19d5cdb80c..80bd43e9ad 100644 --- a/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py @@ -153,7 +153,7 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase): device_dataset = host_dataset.apply( prefetching_ops.prefetch_to_device("/gpu:0")) - iterator = device_dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(device_dataset) next_element = iterator.get_next() with self.cached_session( @@ -171,7 +171,7 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase): prefetching_ops.prefetch_to_device("/cpu:1")) with ops.device("/cpu:1"): - iterator = device_dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(device_dataset) next_element = iterator.get_next() self.assertEqual(host_dataset.output_types, device_dataset.output_types) @@ -203,7 +203,7 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase): device_dataset = host_dataset.apply( prefetching_ops.prefetch_to_device("/gpu:0")) - iterator = device_dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(device_dataset) next_element = iterator.get_next() with self.cached_session( diff --git a/tensorflow/python/data/experimental/kernel_tests/scan_test.py b/tensorflow/python/data/experimental/kernel_tests/scan_test.py index 3b978b004b..03af7ecd2f 100644 --- a/tensorflow/python/data/experimental/kernel_tests/scan_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/scan_test.py @@ -48,8 +48,8 @@ class ScanTest(test_base.DatasetTestBase): start = array_ops.placeholder(dtypes.int32, shape=[]) step = array_ops.placeholder(dtypes.int32, shape=[]) take = array_ops.placeholder(dtypes.int64, shape=[]) - iterator = self._counting_dataset( 
- start, make_scan_fn(step)).take(take).make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(self._counting_dataset( + start, make_scan_fn(step)).take(take)) next_element = iterator.get_next() with self.cached_session() as sess: @@ -98,9 +98,8 @@ class ScanTest(test_base.DatasetTestBase): start = array_ops.placeholder(dtypes.int32, shape=[]) step = array_ops.placeholder(dtypes.int32, shape=[]) take = array_ops.placeholder(dtypes.int64, shape=[]) - iterator = self._counting_dataset( - _sparse(start), - make_scan_fn(step)).take(take).make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(self._counting_dataset( + _sparse(start), make_scan_fn(step)).take(take)) next_element = iterator.get_next() with self.cached_session() as sess: diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/dataset_serialization_test_base.py b/tensorflow/python/data/experimental/kernel_tests/serialization/dataset_serialization_test_base.py index 7f435b8239..e65aa44d06 100644 --- a/tensorflow/python/data/experimental/kernel_tests/serialization/dataset_serialization_test_base.py +++ b/tensorflow/python/data/experimental/kernel_tests/serialization/dataset_serialization_test_base.py @@ -23,6 +23,7 @@ import os import numpy as np from tensorflow.python.data.experimental.ops import iterator_ops as contrib_iterator_ops +from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.ops import iterator_ops from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors @@ -578,7 +579,7 @@ class DatasetSerializationTestBase(test.TestCase): return np.linspace(0, num_outputs, num_samples, dtype=int) def _build_graph(self, ds_fn, sparse_tensors=False): - iterator = ds_fn().make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(ds_fn()) saveable = contrib_iterator_ops.make_saveable_from_iterator(iterator) 
ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable) diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/range_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/range_dataset_serialization_test.py index aeb338dfd5..34419a3149 100644 --- a/tensorflow/python/data/experimental/kernel_tests/serialization/range_dataset_serialization_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/serialization/range_dataset_serialization_test.py @@ -56,8 +56,8 @@ class RangeDatasetSerializationTest( def testSaveRestore(self): def _build_graph(start, stop): - iterator = dataset_ops.Dataset.range(start, - stop).make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator( + dataset_ops.Dataset.range(start, stop)) init_op = iterator.initializer get_next = iterator.get_next() save_op = self._save_op(iterator._iterator_resource) diff --git a/tensorflow/python/data/experimental/kernel_tests/sleep_test.py b/tensorflow/python/data/experimental/kernel_tests/sleep_test.py index 8c9e8225d6..46b22f80b6 100644 --- a/tensorflow/python/data/experimental/kernel_tests/sleep_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/sleep_test.py @@ -36,7 +36,7 @@ class SleepTest(test_base.DatasetTestBase): sleep_microseconds = 100 dataset = dataset_ops.Dataset.range(10).apply( sleep.sleep(sleep_microseconds)) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) next_element = iterator.get_next() with self.cached_session() as sess: diff --git a/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test_base.py b/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test_base.py index 6aaaa90c65..809e09c804 100644 --- a/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test_base.py +++ b/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test_base.py @@ -24,6 +24,7 @@ import sqlite3 from 
tensorflow.python.data.experimental.ops import readers from tensorflow.python.data.kernel_tests import test_base +from tensorflow.python.data.ops import dataset_ops from tensorflow.python.framework import dtypes from tensorflow.python.ops import array_ops from tensorflow.python.platform import test @@ -35,7 +36,7 @@ class SqlDatasetTestBase(test_base.DatasetTestBase): def _createSqlDataset(self, output_types, num_repeats=1): dataset = readers.SqlDataset(self.driver_name, self.data_source_name, self.query, output_types).repeat(num_repeats) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) init_op = iterator.initializer get_next = iterator.get_next() return init_op, get_next diff --git a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py index b89aa20432..8a300364f9 100644 --- a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py @@ -67,7 +67,7 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): lambda x: array_ops.tile([x], ops.convert_to_tensor([x]))).apply( stats_ops.bytes_produced_stats("bytes_produced")) dataset = dataset_transformation(dataset, aggregator) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) next_element = iterator.get_next() summary_t = aggregator.get_summary() @@ -93,7 +93,7 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): dataset = dataset_ops.Dataset.range(100).apply( stats_ops.latency_stats("record_latency")) dataset = dataset_transformation(dataset, aggregator) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) next_element = iterator.get_next() summary_t = aggregator.get_summary() @@ -114,7 +114,7 @@ class 
StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): dataset = dataset_ops.Dataset.range(100).map( lambda x: array_ops.tile([x], ops.convert_to_tensor([x]))).prefetch(-1) dataset = dataset_transformation(dataset, aggregator) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) next_element = iterator.get_next() summary_t = aggregator.get_summary() @@ -142,7 +142,7 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): dataset = dataset_ops.Dataset.range(10).map( lambda x: array_ops.tile([x], ops.convert_to_tensor([x]))).prefetch(0) dataset = dataset_transformation(dataset, aggregator) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) next_element = iterator.get_next() summary_t = aggregator.get_summary() @@ -165,7 +165,7 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): dataset = dataset_ops.Dataset.range(101).filter( lambda x: math_ops.equal(math_ops.mod(x, 3), 0)) dataset = dataset_transformation(dataset, aggregator) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) next_element = iterator.get_next() summary_t = aggregator.get_summary() @@ -264,7 +264,7 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): dataset = dataset_ops.Dataset.range(100).apply( stats_ops.latency_stats("record_latency")) dataset = dataset_transformation(dataset, aggregator) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) next_element = iterator.get_next() summary_t = aggregator.get_summary() @@ -285,7 +285,7 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): def testNoAggregatorRegistered(self, dataset_transformation): dataset = dataset_ops.Dataset.range(100).apply( stats_ops.latency_stats("record_latency")) - iterator = 
dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) next_element = iterator.get_next() with self.cached_session() as sess: @@ -302,7 +302,7 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): stats_ops.latency_stats("record_latency")).apply( stats_ops.latency_stats("record_latency_2")) dataset = dataset_transformation(dataset, aggregator) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) next_element = iterator.get_next() summary_t = aggregator.get_summary() @@ -328,7 +328,7 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): stats_ops.latency_stats("record_latency")).apply( stats_ops.latency_stats("record_latency")) dataset = dataset_transformation(dataset, aggregator) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) next_element = iterator.get_next() summary_t = aggregator.get_summary() @@ -349,8 +349,8 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): dataset = dataset_ops.Dataset.range(100).apply( stats_ops.latency_stats("record_latency")) dataset = dataset_transformation(dataset, aggregator) - iterator_0 = dataset.make_initializable_iterator() - iterator_1 = dataset.make_initializable_iterator() + iterator_0 = dataset_ops.make_initializable_iterator(dataset) + iterator_1 = dataset_ops.make_initializable_iterator(dataset) next_element = iterator_0.get_next() + iterator_1.get_next() summary_t = aggregator.get_summary() @@ -374,8 +374,8 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): dataset2 = dataset_ops.Dataset.range(100).apply( stats_ops.latency_stats("record_latency")) dataset2 = dataset_transformation(dataset2, aggregator, prefix="dataset2") - iterator_0 = dataset.make_initializable_iterator() - iterator_1 = dataset2.make_initializable_iterator() + iterator_0 = 
dataset_ops.make_initializable_iterator(dataset) + iterator_1 = dataset_ops.make_initializable_iterator(dataset2) next_element = iterator_0.get_next() + iterator_1.get_next() summary_t = aggregator.get_summary() @@ -435,7 +435,7 @@ class FeatureStatsDatasetTest( dataset = dataset_transformation( dataset_fn(), aggregator, prefix="record_stats") - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) next_element = iterator.get_next() summary_t = aggregator.get_summary() diff --git a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_test_base.py b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_test_base.py index c5bf926759..ab1d1c3028 100644 --- a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_test_base.py +++ b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_test_base.py @@ -22,6 +22,7 @@ import numpy as np from tensorflow.core.framework import summary_pb2 from tensorflow.python.data.experimental.ops import stats_aggregator from tensorflow.python.data.kernel_tests import test_base +from tensorflow.python.data.ops import dataset_ops from tensorflow.python.framework import errors @@ -93,7 +94,7 @@ class StatsDatasetTestBase(test_base.DatasetTestBase): aggregator = stats_aggregator.StatsAggregator() dataset = dataset_fn() dataset = dataset_transformation(dataset, aggregator) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) next_element = iterator.get_next() summary_t = aggregator.get_summary() diff --git a/tensorflow/python/data/experimental/kernel_tests/unbatch_test.py b/tensorflow/python/data/experimental/kernel_tests/unbatch_test.py index b6b12bd155..cef5e8d269 100644 --- a/tensorflow/python/data/experimental/kernel_tests/unbatch_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/unbatch_test.py @@ -43,7 +43,7 @@ class UnbatchTest(test_base.DatasetTestBase, 
parameterized.TestCase): placeholder = array_ops.placeholder(dtypes.int32) dataset = dataset_ops.Dataset.from_tensors(placeholder).apply( batching.unbatch()) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) next_elem = iterator.get_next() with self.cached_session() as sess: @@ -206,7 +206,7 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase): ph2 = array_ops.placeholder(dtypes.int32, shape=None) data = dataset_ops.Dataset.from_tensors((ph1, ph2)) data = data.apply(batching.unbatch()) - iterator = data.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(data) next_element = iterator.get_next() with self.cached_session() as sess: diff --git a/tensorflow/python/data/experimental/kernel_tests/unique_test.py b/tensorflow/python/data/experimental/kernel_tests/unique_test.py index ddec968858..1d9941d7f4 100644 --- a/tensorflow/python/data/experimental/kernel_tests/unique_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/unique_test.py @@ -44,7 +44,7 @@ class UniqueTest(test_base.DatasetTestBase): current_test_case = [] dataset = dataset_ops.Dataset.from_generator(lambda: current_test_case, dtype).apply(unique.unique()) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) next_element = iterator.get_next() with self.cached_session() as sess: diff --git a/tensorflow/python/data/kernel_tests/dataset_checkpoint_test.py b/tensorflow/python/data/kernel_tests/dataset_checkpoint_test.py index cdaa4fd4d5..6dcd94ea02 100644 --- a/tensorflow/python/data/kernel_tests/dataset_checkpoint_test.py +++ b/tensorflow/python/data/kernel_tests/dataset_checkpoint_test.py @@ -64,8 +64,8 @@ class DatasetCheckpointTest(test_base.DatasetTestBase): def testSaveRestore(self): def _build_graph(start, stop): - iterator = dataset_ops.Dataset.range(start, - stop).make_initializable_iterator() + iterator = 
dataset_ops.make_initializable_iterator( + dataset_ops.Dataset.range(start, stop)) init_op = iterator.initializer get_next = iterator.get_next() save_op = self._save_op(iterator._iterator_resource) @@ -114,7 +114,7 @@ class DatasetCheckpointTest(test_base.DatasetTestBase): def _build_graph(start, stop, num_epochs): dataset = dataset_ops.Dataset.range(start, stop).repeat(num_epochs) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) init_op = iterator.initializer get_next = iterator.get_next() save_op = self._save_op(iterator._iterator_resource) @@ -161,7 +161,7 @@ class DatasetCheckpointTest(test_base.DatasetTestBase): def _build_graph(start, stop): dataset = dataset_ops.Dataset.range(start, stop) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) init_op = iterator.initializer get_next = iterator.get_next() save_op = self._save_op(iterator._iterator_resource) @@ -200,7 +200,7 @@ class DatasetCheckpointTest(test_base.DatasetTestBase): def _build_graph(start, stop): dataset = dataset_ops.Dataset.range(start, stop) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) init_op = iterator.initializer get_next = iterator.get_next() save_op = self._save_op(iterator._iterator_resource) @@ -233,8 +233,8 @@ class DatasetCheckpointTest(test_base.DatasetTestBase): def testMultipleSaves(self): def _build_graph(start, stop): - iterator = dataset_ops.Dataset.range(start, - stop).make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator( + dataset_ops.Dataset.range(start, stop)) init_op = iterator.initializer get_next = iterator.get_next() save_op = self._save_op(iterator._iterator_resource) @@ -276,8 +276,8 @@ class DatasetCheckpointTest(test_base.DatasetTestBase): def testSaveRestoreWithRepeat(self): def _build_graph(start, stop, num_epochs): - iterator = 
dataset_ops.Dataset.range( - start, stop).repeat(num_epochs).make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator( + dataset_ops.Dataset.range(start, stop).repeat(num_epochs)) init_op = iterator.initializer get_next = iterator.get_next() save_op = self._save_op(iterator._iterator_resource) @@ -321,8 +321,8 @@ class DatasetCheckpointTest(test_base.DatasetTestBase): def testSaveRestoreExhaustedIterator(self): def _build_graph(start, stop, num_epochs): - iterator = dataset_ops.Dataset.range( - start, stop).repeat(num_epochs).make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator( + dataset_ops.Dataset.range(start, stop).repeat(num_epochs)) init_op = iterator.initializer get_next = iterator.get_next() save_op = self._save_op(iterator._iterator_resource) diff --git a/tensorflow/python/data/kernel_tests/from_generator_test.py b/tensorflow/python/data/kernel_tests/from_generator_test.py index e113b45496..a6625534e7 100644 --- a/tensorflow/python/data/kernel_tests/from_generator_test.py +++ b/tensorflow/python/data/kernel_tests/from_generator_test.py @@ -38,11 +38,10 @@ class FromGeneratorTest(test_base.DatasetTestBase): output_types=None): if output_types is None: output_types = dtypes.int64 - iterator = ( + iterator = dataset_ops.make_initializable_iterator( dataset_ops.Dataset.from_generator(generator, output_types=output_types) .repeat(num_repeats) - .prefetch(5) - .make_initializable_iterator()) + .prefetch(5)) init_op = iterator.initializer get_next = iterator.get_next() @@ -129,11 +128,10 @@ class FromGeneratorTest(test_base.DatasetTestBase): output_shapes=([None], [3])) .repeat(num_inner_repeats).prefetch(5)) - iterator = ( + iterator = dataset_ops.make_initializable_iterator( dataset_ops.Dataset.range(num_outer_repeats) .interleave(interleave_fn, cycle_length=10, - block_length=len(input_list)) - .make_initializable_iterator()) + block_length=len(input_list))) init_op = iterator.initializer get_next = 
iterator.get_next() @@ -188,11 +186,10 @@ class FromGeneratorTest(test_base.DatasetTestBase): return dataset_ops.Dataset.from_generator( generator, output_types=dtypes.int64, output_shapes=[]).prefetch(2) - iterator = ( + iterator = dataset_ops.make_initializable_iterator( dataset_ops.Dataset.range(num_parallel_iterators) .interleave( - interleave_fn, cycle_length=num_parallel_iterators, block_length=1) - .make_initializable_iterator()) + interleave_fn, cycle_length=num_parallel_iterators, block_length=1)) init_op = iterator.initializer get_next = iterator.get_next() @@ -212,9 +209,9 @@ class FromGeneratorTest(test_base.DatasetTestBase): yield [3] for dtype in [dtypes.int8, dtypes.int32, dtypes.int64]: - iterator = (dataset_ops.Dataset.from_generator( - generator, output_types=dtype, output_shapes=[1]) - .make_initializable_iterator()) + iterator = dataset_ops.make_initializable_iterator( + dataset_ops.Dataset.from_generator( + generator, output_types=dtype, output_shapes=[1])) init_op = iterator.initializer get_next = iterator.get_next() @@ -236,9 +233,9 @@ class FromGeneratorTest(test_base.DatasetTestBase): yield b"bar" yield u"baz" - iterator = (dataset_ops.Dataset.from_generator( - generator, output_types=dtypes.string, output_shapes=[]) - .make_initializable_iterator()) + iterator = dataset_ops.make_initializable_iterator( + dataset_ops.Dataset.from_generator( + generator, output_types=dtypes.string, output_shapes=[])) init_op = iterator.initializer get_next = iterator.get_next() @@ -258,9 +255,9 @@ class FromGeneratorTest(test_base.DatasetTestBase): yield "ERROR" yield np.array([7, 8, 9], dtype=np.int64) - iterator = (dataset_ops.Dataset.from_generator( - generator, output_types=dtypes.int64, output_shapes=[3]) - .make_initializable_iterator()) + iterator = dataset_ops.make_initializable_iterator( + dataset_ops.Dataset.from_generator( + generator, output_types=dtypes.int64, output_shapes=[3])) init_op = iterator.initializer get_next = iterator.get_next() @@ 
-282,9 +279,9 @@ class FromGeneratorTest(test_base.DatasetTestBase): yield np.array([7, 8, 9, 10], dtype=np.int64) yield np.array([11, 12, 13], dtype=np.int64) - iterator = (dataset_ops.Dataset.from_generator( - generator, output_types=dtypes.int64, output_shapes=[3]) - .make_initializable_iterator()) + iterator = dataset_ops.make_initializable_iterator( + dataset_ops.Dataset.from_generator( + generator, output_types=dtypes.int64, output_shapes=[3])) init_op = iterator.initializer get_next = iterator.get_next() @@ -307,9 +304,9 @@ class FromGeneratorTest(test_base.DatasetTestBase): yield 6, 7, 8 yield 9, 10 - iterator = (dataset_ops.Dataset.from_generator( - generator, output_types=(dtypes.int64, dtypes.int64)) - .make_initializable_iterator()) + iterator = dataset_ops.make_initializable_iterator( + dataset_ops.Dataset.from_generator( + generator, output_types=(dtypes.int64, dtypes.int64))) init_op = iterator.initializer get_next = iterator.get_next() @@ -333,9 +330,9 @@ class FromGeneratorTest(test_base.DatasetTestBase): yield 1 yield [2, 3] - iterator = ( + iterator = dataset_ops.make_initializable_iterator( dataset_ops.Dataset.from_generator( - generator, output_types=dtypes.int64).make_initializable_iterator()) + generator, output_types=dtypes.int64)) init_op = iterator.initializer get_next = iterator.get_next() @@ -354,9 +351,9 @@ class FromGeneratorTest(test_base.DatasetTestBase): yield 1 yield 2 - iterator = ( + iterator = dataset_ops.make_initializable_iterator( dataset_ops.Dataset.from_generator( - generator, output_types=dtypes.int64).make_initializable_iterator()) + generator, output_types=dtypes.int64)) init_op = iterator.initializer get_next = iterator.get_next() @@ -384,9 +381,9 @@ class FromGeneratorTest(test_base.DatasetTestBase): def __del__(self): event.set() - iterator = dataset_ops.Dataset.from_generator( - GeneratorWrapper, - output_types=dtypes.int64).take(2).make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator( + 
dataset_ops.Dataset.from_generator( + GeneratorWrapper, output_types=dtypes.int64).take(2)) init_op = iterator.initializer get_next = iterator.get_next() @@ -413,10 +410,8 @@ class FromGeneratorTest(test_base.DatasetTestBase): generator_with_arg, output_types=dtypes.int64, output_shapes=(), args=(elem,)) - iterator = (dataset_ops.Dataset - .range(5) - .flat_map(flat_map_fn) - .make_initializable_iterator()) + iterator = dataset_ops.make_initializable_iterator( + dataset_ops.Dataset.range(5).flat_map(flat_map_fn)) init_op = iterator.initializer get_next = iterator.get_next() @@ -441,12 +436,11 @@ class FromGeneratorTest(test_base.DatasetTestBase): generator_with_arg, output_types=(dtypes.int64, dtypes.string), output_shapes=((), ()), args=(elem, message)) - iterator = ( + iterator = dataset_ops.make_initializable_iterator( dataset_ops.Dataset.zip( (dataset_ops.Dataset.range(5), dataset_ops.Dataset.from_tensors("Hi!").repeat(None))) - .flat_map(flat_map_fn) - .make_initializable_iterator()) + .flat_map(flat_map_fn)) init_op = iterator.initializer get_next = iterator.get_next() @@ -478,10 +472,9 @@ class FromGeneratorTest(test_base.DatasetTestBase): stateful=True) dummy = constant_op.constant(37) - iterator = (dataset_ops._GeneratorDataset(dummy, lambda x: x, - lambda x: x, finalize_fn) - .take(2) - .make_initializable_iterator()) + iterator = dataset_ops.make_initializable_iterator( + dataset_ops._GeneratorDataset( + dummy, lambda x: x, lambda x: x, finalize_fn).take(2)) init_op = iterator.initializer get_next = iterator.get_next() diff --git a/tensorflow/python/data/kernel_tests/from_sparse_tensor_slices_test.py b/tensorflow/python/data/kernel_tests/from_sparse_tensor_slices_test.py index 80ed26e7fb..ef608ebb67 100644 --- a/tensorflow/python/data/kernel_tests/from_sparse_tensor_slices_test.py +++ b/tensorflow/python/data/kernel_tests/from_sparse_tensor_slices_test.py @@ -36,8 +36,8 @@ class FromSparseTensorSlicesTest(test_base.DatasetTestBase): def 
testSkipEagerFromSparseTensorSlices(self): """Test a dataset based on slices of a `tf.SparseTensor`.""" st = array_ops.sparse_placeholder(dtypes.float64) - iterator = (dataset_ops.Dataset.from_sparse_tensor_slices(st) - .make_initializable_iterator()) + iterator = dataset_ops.make_initializable_iterator( + dataset_ops.Dataset.from_sparse_tensor_slices(st)) init_op = iterator.initializer get_next = sparse_tensor.SparseTensor(*iterator.get_next()) diff --git a/tensorflow/python/data/kernel_tests/from_tensors_test.py b/tensorflow/python/data/kernel_tests/from_tensors_test.py index ce70637572..ab3c15263f 100644 --- a/tensorflow/python/data/kernel_tests/from_tensors_test.py +++ b/tensorflow/python/data/kernel_tests/from_tensors_test.py @@ -246,7 +246,7 @@ class FromTensorsTest(test_base.DatasetTestBase): dataset = dataset.map(lambda x: x + var_1.read_value()) sess.run(var_1.initializer) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) sess.run(iterator.initializer) with self.assertRaisesRegexp( diff --git a/tensorflow/python/data/kernel_tests/iterator_checkpoint_test.py b/tensorflow/python/data/kernel_tests/iterator_checkpoint_test.py index 2d9d554e5e..91b356691b 100644 --- a/tensorflow/python/data/kernel_tests/iterator_checkpoint_test.py +++ b/tensorflow/python/data/kernel_tests/iterator_checkpoint_test.py @@ -113,7 +113,7 @@ class IteratorCheckpointingTest(test_base.DatasetTestBase): checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") dataset = dataset_ops.Dataset.range(10) iterator = iter(dataset) if context.executing_eagerly( - ) else dataset.make_initializable_iterator() + ) else dataset_ops.make_initializable_iterator(dataset) get_next = iterator.get_next checkpoint = checkpointable_utils.Checkpoint(iterator=iterator) for i in range(5): diff --git a/tensorflow/python/data/kernel_tests/iterator_cluster_test.py b/tensorflow/python/data/kernel_tests/iterator_cluster_test.py index 
6e8210f904..728bed20a1 100644 --- a/tensorflow/python/data/kernel_tests/iterator_cluster_test.py +++ b/tensorflow/python/data/kernel_tests/iterator_cluster_test.py @@ -161,7 +161,7 @@ class IteratorClusterTest(test.TestCase): dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn) .repeat(None).prefetch(10000)) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) init_op = iterator.initializer get_next = iterator.get_next() diff --git a/tensorflow/python/data/kernel_tests/iterator_test.py b/tensorflow/python/data/kernel_tests/iterator_test.py index 32332a0790..916cf8bb45 100644 --- a/tensorflow/python/data/kernel_tests/iterator_test.py +++ b/tensorflow/python/data/kernel_tests/iterator_test.py @@ -289,9 +289,8 @@ class IteratorTest(test.TestCase, parameterized.TestCase): @test_util.run_deprecated_v1 def testNotInitializedError(self): components = (np.array(1), np.array([1, 2, 3]), np.array(37.0)) - iterator = ( - dataset_ops.Dataset.from_tensors(components) - .make_initializable_iterator()) + iterator = dataset_ops.make_initializable_iterator( + dataset_ops.Dataset.from_tensors(components)) get_next = iterator.get_next() with self.cached_session() as sess: @@ -523,7 +522,7 @@ class IteratorTest(test.TestCase, parameterized.TestCase): def testIteratorStringHandleReuseTensorObject(self): dataset = dataset_ops.Dataset.from_tensor_slices([1, 2, 3]) one_shot_iterator = dataset_ops.make_one_shot_iterator(dataset) - initializable_iterator = dataset.make_initializable_iterator() + initializable_iterator = dataset_ops.make_initializable_iterator(dataset) structure_iterator = iterator_ops.Iterator.from_structure( dataset.output_types) @@ -687,7 +686,7 @@ class IteratorTest(test.TestCase, parameterized.TestCase): with ops.device("/job:client"): client_dataset = dataset_ops.Dataset.zip((targets, handles)).map(map_fn) - itr = client_dataset.make_initializable_iterator() + itr = 
dataset_ops.make_initializable_iterator(client_dataset) n = itr.get_next() with session.Session(s3.target, config=config) as sess: @@ -777,8 +776,8 @@ class IteratorTest(test.TestCase, parameterized.TestCase): def _build_range_dataset_graph(): start = 1 stop = 10 - iterator = dataset_ops.Dataset.range(start, - stop).make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator( + dataset_ops.Dataset.range(start, stop)) init_op = iterator.initializer get_next = iterator.get_next() save_op = _save_op(iterator._iterator_resource) @@ -787,8 +786,8 @@ class IteratorTest(test.TestCase, parameterized.TestCase): def _build_reader_dataset_graph(): filenames = ["test"] # Does not exist but we don't care in this test. - iterator = readers.FixedLengthRecordDataset( - filenames, 1, 0, 0).make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator( + readers.FixedLengthRecordDataset(filenames, 1, 0, 0)) init_op = iterator.initializer get_next_op = iterator.get_next() save_op = _save_op(iterator._iterator_resource) diff --git a/tensorflow/python/data/kernel_tests/map_test.py b/tensorflow/python/data/kernel_tests/map_test.py index df522c4351..fdce791447 100644 --- a/tensorflow/python/data/kernel_tests/map_test.py +++ b/tensorflow/python/data/kernel_tests/map_test.py @@ -100,7 +100,7 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): count = array_ops.placeholder(dtypes.int64, shape=[]) dataset = self._buildMapDataset(components, count) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) init_op = iterator.initializer get_next = iterator.get_next() @@ -166,7 +166,7 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): dataset = self._buildParallelMapDataset( components, count, num_parallel_calls, output_buffer_size) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) 
init_op = iterator.initializer get_next = iterator.get_next() @@ -235,7 +235,7 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): dataset = self._buildParallelMapDataset(components, 1000, 100, 100) # NOTE(mrry): Also test that the prefetching thread is cancelled correctly. dataset = dataset.prefetch(100) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) init_op = iterator.initializer get_next = iterator.get_next() @@ -250,7 +250,7 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): dataset = (dataset_ops.Dataset.from_tensor_slices(components) .map(lambda x: array_ops.check_numerics(x, "message"), num_parallel_calls=2)) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) init_op = iterator.initializer get_next = iterator.get_next() @@ -265,7 +265,7 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): dataset = (dataset_ops.Dataset.from_tensor_slices(components) .map(lambda x: array_ops.check_numerics(x, "message"), num_parallel_calls=2)) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) init_op = iterator.initializer get_next = iterator.get_next() @@ -286,7 +286,7 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): dataset = (dataset_ops.Dataset.from_tensor_slices(components) .map(lambda x: array_ops.check_numerics(x, "message")) .prefetch(2)) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) init_op = iterator.initializer get_next = iterator.get_next() @@ -312,8 +312,8 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): return dataset_ops.Dataset.range(10).map(_map_fn) def _build_graph(): - captured_iterator = dataset_ops.Dataset.range( - 10).make_initializable_iterator() + captured_iterator = 
dataset_ops.make_initializable_iterator( + dataset_ops.Dataset.range(10)) ds = _build_ds(captured_iterator) iterator = ds.make_initializable_iterator() init_op = iterator.initializer @@ -343,10 +343,9 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): input_sentences = dataset_ops.Dataset.from_tensor_slices( ["brain brain tank salad surgery", "surgery brain"]) - iterator = (input_sentences - .map(lambda x: string_ops.string_split([x]).values) - .map(table.lookup) - .make_initializable_iterator()) + iterator = dataset_ops.make_initializable_iterator( + input_sentences + .map(lambda x: string_ops.string_split([x]).values).map(table.lookup)) init_op = iterator.initializer get_next = iterator.get_next() @@ -363,8 +362,9 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): queue = data_flow_ops.FIFOQueue(200, dtypes.int64, shapes=[]) enqueue_op = queue.enqueue_many(elements) close_op = queue.close() - iterator = (dataset_ops.Dataset.from_tensors(0).repeat(-1) - .map(lambda _: queue.dequeue()).make_initializable_iterator()) + iterator = dataset_ops.make_initializable_iterator( + dataset_ops.Dataset.from_tensors(0).repeat(-1) + .map(lambda _: queue.dequeue())) init_op = iterator.initializer get_next = iterator.get_next() @@ -387,9 +387,9 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): enqueue_op = queue.enqueue_many(elements) close_op = queue.close() - iterator = (dataset_ops.Dataset.from_tensors(0).repeat(-1) - .map(lambda _: (queue.dequeue(), queue_2.dequeue())) - .make_initializable_iterator()) + iterator = dataset_ops.make_initializable_iterator( + dataset_ops.Dataset.from_tensors(0).repeat(-1) + .map(lambda _: (queue.dequeue(), queue_2.dequeue()))) init_op = iterator.initializer get_next = iterator.get_next() @@ -406,9 +406,9 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): def testCaptureVariable(self): counter_var = variable_scope.get_variable( "counter", (), 
dtypes.int32, use_resource=True) - iterator = (dataset_ops.Dataset.from_tensors(0).repeat(10) - .map(lambda _: counter_var.assign_add(1)) - .make_initializable_iterator()) + iterator = dataset_ops.make_initializable_iterator( + dataset_ops.Dataset.from_tensors(0).repeat(10) + .map(lambda _: counter_var.assign_add(1))) init_op = iterator.initializer get_next = iterator.get_next() @@ -426,9 +426,9 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): def testCaptureUninitializedVariableError(self): counter_var = variable_scope.get_variable( "counter", (), dtypes.int32, use_resource=True) - iterator = (dataset_ops.Dataset.from_tensors(0).repeat(10) - .map(lambda _: counter_var.assign_add(1)) - .make_initializable_iterator()) + iterator = dataset_ops.make_initializable_iterator( + dataset_ops.Dataset.from_tensors(0).repeat(10) + .map(lambda _: counter_var.assign_add(1))) init_op = iterator.initializer get_next = iterator.get_next() @@ -438,9 +438,9 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): sess.run(get_next) def testSeededStatefulOperatorIsProperlyStateful(self): - iterator = (dataset_ops.Dataset.from_tensors(0).repeat(10) - .map(lambda _: random_ops.random_uniform((), seed=11)).batch(2) - .make_initializable_iterator()) + iterator = dataset_ops.make_initializable_iterator( + dataset_ops.Dataset.from_tensors(0).repeat(10) + .map(lambda _: random_ops.random_uniform((), seed=11)).batch(2)) init_op = iterator.initializer get_next = iterator.get_next() @@ -462,11 +462,11 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): self.assertAllClose(random_values, random_values_2) def testStatefulMapKeepsStateAcrossIterators(self): - iterator = (dataset_ops.Dataset.from_tensors(0).repeat(10) - .map(lambda _: random_ops.random_uniform((), seed=11)) - .repeat(1000) - .batch(10) - .make_initializable_iterator()) + iterator = dataset_ops.make_initializable_iterator( + 
dataset_ops.Dataset.from_tensors(0).repeat(10) + .map(lambda _: random_ops.random_uniform((), seed=11)) + .repeat(1000) + .batch(10)) init_op = iterator.initializer get_next = iterator.get_next() @@ -491,9 +491,8 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): counter_var.assign_add(1) return x - iterator = (dataset_ops.Dataset.range(10) - .map(increment_fn) - .make_initializable_iterator()) + iterator = dataset_ops.make_initializable_iterator( + dataset_ops.Dataset.range(10).map(increment_fn)) init_op = iterator.initializer get_next = iterator.get_next() @@ -509,10 +508,10 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): self.assertEqual(10, sess.run(counter_var)) def testMapDict(self): - iterator = (dataset_ops.Dataset.range(10) - .map(lambda x: {"foo": x * 2, "bar": x ** 2}) - .map(lambda d: d["foo"] + d["bar"]) - .make_initializable_iterator()) + iterator = dataset_ops.make_initializable_iterator( + dataset_ops.Dataset.range(10) + .map(lambda x: {"foo": x * 2, "bar": x ** 2}) + .map(lambda d: d["foo"] + d["bar"])) init_op = iterator.initializer get_next = iterator.get_next() @@ -560,10 +559,9 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): def testUseStepContainerInMap(self): row = np.arange(6) - iterator = ( + iterator = dataset_ops.make_initializable_iterator( dataset_ops.Dataset.from_tensors(row) - .map(lambda elems: functional_ops.map_fn(lambda x: x * x, elems)) - .make_initializable_iterator()) + .map(lambda elems: functional_ops.map_fn(lambda x: x * x, elems))) init_op = iterator.initializer get_next = iterator.get_next() @@ -599,9 +597,9 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): pred_fn_pairs, default=multiply, exclusive=True) def build_dataset(row, num): - iterator = ( + iterator = dataset_ops.make_initializable_iterator( dataset_ops.Dataset.from_tensor_slices(row).map( - lambda x: control_map_fn(x, num)).make_initializable_iterator()) + 
lambda x: control_map_fn(x, num))) init_op = iterator.initializer get_next = iterator.get_next() return init_op, get_next @@ -638,11 +636,10 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): def build_dataset(row, num): # pylint: disable=g-long-lambda - iterator = ( + iterator = dataset_ops.make_initializable_iterator( dataset_ops.Dataset.from_tensors(row).map( - lambda elems: functional_ops.map_fn(lambda x: - control_map_fn(x, num), elems) - ).make_initializable_iterator()) + lambda elems: functional_ops.map_fn( + lambda x: control_map_fn(x, num), elems))) init_op = iterator.initializer get_next = iterator.get_next() return init_op, get_next @@ -686,11 +683,10 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): row = np.arange(6) num = 2 # pylint: disable=g-long-lambda - iterator = ( + iterator = dataset_ops.make_initializable_iterator( dataset_ops.Dataset.from_tensors(row).map( - lambda elems: functional_ops.map_fn(lambda x: - control_map_fn(x, num), elems) - ).make_initializable_iterator()) + lambda elems: functional_ops.map_fn( + lambda x: control_map_fn(x, num), elems))) # pylint: enable=g-long-lambda init_op = iterator.initializer get_next = iterator.get_next() @@ -720,11 +716,10 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): return script_ops.py_func(_map_py_func, [x], x.dtype) buffer_size_placeholder = array_ops.placeholder(dtypes.int64, shape=[]) - iterator = ( + iterator = dataset_ops.make_initializable_iterator( dataset_ops.Dataset.range(100) .map(_map_fn) - .prefetch(buffer_size_placeholder) - .make_initializable_iterator()) + .prefetch(buffer_size_placeholder)) init_op = iterator.initializer get_next = iterator.get_next() @@ -760,9 +755,9 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): sess.run(get_next) def testReturnList(self): - iterator = (dataset_ops.Dataset.range(10) - .map(lambda x: [x, constant_op.constant(37.0)]) - 
.make_initializable_iterator()) + iterator = dataset_ops.make_initializable_iterator( + dataset_ops.Dataset.range(10) + .map(lambda x: [x, constant_op.constant(37.0)])) init_op = iterator.initializer get_next = iterator.get_next() @@ -781,9 +776,8 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): return script_ops.py_func( _map_py_func, [x_tensor], [dtypes.int64, dtypes.float64]) - iterator = (dataset_ops.Dataset.range(10) - .map(_map_fn) - .make_initializable_iterator()) + iterator = dataset_ops.make_initializable_iterator( + dataset_ops.Dataset.range(10).map(_map_fn)) init_op = iterator.initializer get_next = iterator.get_next() @@ -802,9 +796,8 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): values=(i * np.array([1])), dense_shape=np.array([1, 1])) - iterator = (dataset_ops.Dataset.range(10) - .map(_sparse) - .make_initializable_iterator()) + iterator = dataset_ops.make_initializable_iterator( + dataset_ops.Dataset.range(10).map(_sparse)) init_op = iterator.initializer get_next = iterator.get_next() @@ -829,9 +822,8 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): self.assertTrue(sparse_tensor.is_sparse(i)) return sparse_ops.sparse_concat(0, [i, i]) - iterator = ( - dataset_ops.Dataset.range(10).map(_sparse).map(_check) - .make_initializable_iterator()) + iterator = dataset_ops.make_initializable_iterator( + dataset_ops.Dataset.range(10).map(_sparse).map(_check)) init_op = iterator.initializer get_next = iterator.get_next() @@ -851,11 +843,10 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): else: return i - iterator = ( + iterator = dataset_ops.make_initializable_iterator( dataset_ops.Dataset.range(105) .map(lambda x: script_ops.py_func(raising_py_func, [x], dtypes.int64), - num_parallel_calls=2) - .make_initializable_iterator()) + num_parallel_calls=2)) init_op = iterator.initializer get_next = iterator.get_next() @@ -867,9 +858,8 @@ class 
MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): sess.run(get_next) def testConstantOutput(self): - iterator = ( - dataset_ops.Dataset.range(10).map(lambda x: [x, "hello", 10]) - .make_initializable_iterator()) + iterator = dataset_ops.make_initializable_iterator( + dataset_ops.Dataset.range(10).map(lambda x: [x, "hello", 10])) init_op = iterator.initializer get_next = iterator.get_next() @@ -940,7 +930,7 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): return const_tensor dataset = dataset.map(broken_function) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) with self.cached_session() as sess: with self.assertRaisesRegexp(errors.InvalidArgumentError, "BrokenConst"): @@ -1005,7 +995,7 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): captured_t = array_ops.placeholder(dtypes.int64, shape=[]) dataset = self.structuredDataset(None).repeat().map( lambda x: captured_t, num_parallel_calls=num_parallel_calls) - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) get_next = iterator.get_next() with self.cached_session() as sess: diff --git a/tensorflow/python/data/kernel_tests/shuffle_test.py b/tensorflow/python/data/kernel_tests/shuffle_test.py index ae892a7921..13df870938 100644 --- a/tensorflow/python/data/kernel_tests/shuffle_test.py +++ b/tensorflow/python/data/kernel_tests/shuffle_test.py @@ -130,9 +130,8 @@ class ShuffleTest(test_base.DatasetTestBase, parameterized.TestCase): sess.run(get_next) seed_placeholder = array_ops.placeholder(dtypes.int64, shape=[]) - iterator = ( - dataset_ops.Dataset.range(10).shuffle(10, seed=seed_placeholder) - .make_initializable_iterator()) + iterator = dataset_ops.make_initializable_iterator( + dataset_ops.Dataset.range(10).shuffle(10, seed=seed_placeholder)) get_next = iterator.get_next() with self.cached_session() as sess: @@ -223,7 +222,8 
@@ class ShuffleTest(test_base.DatasetTestBase, parameterized.TestCase): 10, reshuffle_each_iteration=reshuffle).repeat(3) if initializable: - iterators = [dataset.make_initializable_iterator() for _ in range(2)] + iterators = [dataset_ops.make_initializable_iterator(dataset) + for _ in range(2)] else: iterators = [dataset_ops.make_one_shot_iterator(dataset) for _ in range(2)] diff --git a/tensorflow/python/data/kernel_tests/test_base.py b/tensorflow/python/data/kernel_tests/test_base.py index ca6ecdf198..85f6c9de23 100644 --- a/tensorflow/python/data/kernel_tests/test_base.py +++ b/tensorflow/python/data/kernel_tests/test_base.py @@ -62,7 +62,7 @@ class DatasetTestBase(test.TestCase): return iterator._next_internal # pylint: disable=protected-access else: if requires_initialization: - iterator = dataset.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(dataset) self.evaluate(iterator.initializer) else: iterator = dataset_ops.make_one_shot_iterator(dataset) diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py index c045299275..f6a5772bfd 100644 --- a/tensorflow/python/data/ops/dataset_ops.py +++ b/tensorflow/python/data/ops/dataset_ops.py @@ -135,51 +135,6 @@ class DatasetV2(object): options.experimental_stats.counter_prefix) return dataset - def make_initializable_iterator(self, shared_name=None): - """Creates an `Iterator` for enumerating the elements of this dataset. - - Note: The returned iterator will be in an uninitialized state, - and you must run the `iterator.initializer` operation before using it: - - ```python - dataset = ... - iterator = dataset.make_initializable_iterator() - # ... - sess.run(iterator.initializer) - ``` - - Args: - shared_name: (Optional.) If non-empty, the returned iterator will be - shared under the given name across multiple sessions that share the - same devices (e.g. when using a remote server). 
- - Returns: - An `Iterator` over the elements of this dataset. - - Raises: - RuntimeError: If eager execution is enabled. - """ - if context.executing_eagerly(): - raise RuntimeError( - "dataset.make_initializable_iterator is not supported when eager " - "execution is enabled.") - dataset = self._apply_options() - if shared_name is None: - shared_name = "" - if compat.forward_compatible(2018, 8, 3): - iterator_resource = gen_dataset_ops.iterator_v2( - container="", shared_name=shared_name, **flat_structure(self)) - else: - iterator_resource = gen_dataset_ops.iterator( - container="", shared_name=shared_name, **flat_structure(self)) - with ops.colocate_with(iterator_resource): - initializer = gen_dataset_ops.make_iterator( - dataset._as_variant_tensor(), # pylint: disable=protected-access - iterator_resource) - return iterator_ops.Iterator(iterator_resource, initializer, - dataset.output_types, dataset.output_shapes, - dataset.output_classes) - def __iter__(self): """Creates an `Iterator` for enumerating the elements of this dataset. @@ -1352,6 +1307,56 @@ class DatasetV1(DatasetV2): dataset_factory=_make_dataset, **flat_structure(self)), None, self.output_types, self.output_shapes, self.output_classes) + @deprecation.deprecated( + None, "Use `for ... in dataset:` to iterate over a dataset. If using " + "`tf.estimator`, return the `Dataset` object directly from your input " + "function. As a last resort, you can use " + "`tf.compat.v1.data.make_initializable_iterator(dataset)`.") + def make_initializable_iterator(self, shared_name=None): + """Creates an `Iterator` for enumerating the elements of this dataset. + + Note: The returned iterator will be in an uninitialized state, + and you must run the `iterator.initializer` operation before using it: + + ```python + dataset = ... + iterator = dataset.make_initializable_iterator() + # ... + sess.run(iterator.initializer) + ``` + + Args: + shared_name: (Optional.) 
If non-empty, the returned iterator will be + shared under the given name across multiple sessions that share the + same devices (e.g. when using a remote server). + + Returns: + An `Iterator` over the elements of this dataset. + + Raises: + RuntimeError: If eager execution is enabled. + """ + if context.executing_eagerly(): + raise RuntimeError( + "dataset.make_initializable_iterator is not supported when eager " + "execution is enabled.") + dataset = self._apply_options() + if shared_name is None: + shared_name = "" + if compat.forward_compatible(2018, 8, 3): + iterator_resource = gen_dataset_ops.iterator_v2( + container="", shared_name=shared_name, **flat_structure(self)) + else: + iterator_resource = gen_dataset_ops.iterator( + container="", shared_name=shared_name, **flat_structure(self)) + with ops.colocate_with(iterator_resource): + initializer = gen_dataset_ops.make_iterator( + dataset._as_variant_tensor(), # pylint: disable=protected-access + iterator_resource) + return iterator_ops.Iterator(iterator_resource, initializer, + dataset.output_types, dataset.output_shapes, + dataset.output_classes) + @staticmethod @functools.wraps(DatasetV2.from_tensors) def from_tensors(tensors): @@ -1563,9 +1568,6 @@ class DatasetV1Adapter(DatasetV1): def output_types(self): return self._dataset.output_types - def make_initializable_iterator(self, shared_name=None): - return self._dataset.make_initializable_iterator(shared_name) - def __iter__(self): return iter(self._dataset) @@ -1591,6 +1593,37 @@ def make_one_shot_iterator(dataset): return DatasetV1Adapter(dataset).make_one_shot_iterator() +@tf_export(v1=["data.make_initializable_iterator"]) +def make_initializable_iterator(dataset): + """Creates a `tf.data.Iterator` for enumerating the elements of a dataset. + + Note: The returned iterator will be in an uninitialized state, + and you must run the `iterator.initializer` operation before using it: + + ```python + dataset = ... 
+ iterator = tf.compat.v1.data.make_initializable_iterator(dataset) + # ... + sess.run(iterator.initializer) + ``` + + Args: + dataset: A `tf.data.Dataset`. + + Returns: + A `tf.data.Iterator` over the elements of `dataset`. + + Raises: + RuntimeError: If eager execution is enabled. + """ + try: + # Call the defined `make_initializable_iterator()` if there is one, because + # some datasets (e.g. for prefetching) override its behavior. + return dataset.make_initializable_iterator() + except AttributeError: + return DatasetV1Adapter(dataset).make_initializable_iterator() + + @tf_export("data.Options") class Options(options_lib.OptionsBase): """Represents options for tf.data.Dataset. diff --git a/tensorflow/python/distribute/values.py b/tensorflow/python/distribute/values.py index 8687d1d945..01a1680a24 100644 --- a/tensorflow/python/distribute/values.py +++ b/tensorflow/python/distribute/values.py @@ -1186,7 +1186,7 @@ class PerReplicaDataset(object): dataset_iterator = multi_device_iterator_ops.MultiDeviceIterator( self._dataset, self._devices) else: - dataset_iterator = self._dataset.make_initializable_iterator() + dataset_iterator = dataset_ops.make_initializable_iterator(self._dataset) return PerReplicaDataIterator( dataset_iterator, self._devices, @@ -1311,7 +1311,8 @@ class MultiWorkerDataset(object): iterators = [] for worker, dataset in self._datasets: with ops.device(worker): - iterators.append((worker, dataset.make_initializable_iterator())) + iterators.append( + (worker, dataset_ops.make_initializable_iterator(dataset))) return MultiWorkerDataIterator(iterators, self._worker_device_pairs) diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py index 38a3928a38..1416f5960f 100644 --- a/tensorflow/python/keras/engine/training.py +++ b/tensorflow/python/keras/engine/training.py @@ -1156,7 +1156,7 @@ class Model(Network): if x in self._dataset_iterator_cache: x = self._dataset_iterator_cache[x] else: - iterator =
x.make_initializable_iterator() + iterator = dataset_ops.make_initializable_iterator(x) self._dataset_iterator_cache[x] = iterator x = iterator K.get_session().run(x.initializer) diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.pbtxt index bb14216e60..aa47468059 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.data.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.data.pbtxt @@ -28,6 +28,10 @@ tf_module { name: "experimental" mtype: "" } + member_method { + name: "make_initializable_iterator" + argspec: "args=[\'dataset\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "make_one_shot_iterator" argspec: "args=[\'dataset\'], varargs=None, keywords=None, defaults=None" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt index 760d56aa06..ac8dd2de7f 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt @@ -61,10 +61,6 @@ tf_class { name: "list_files" argspec: "args=[\'file_pattern\', \'shuffle\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } - member_method { - name: "make_initializable_iterator" - argspec: "args=[\'self\', \'shared_name\'], varargs=None, keywords=None, defaults=[\'None\'], " - } member_method { name: "map" argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], " diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt index c46e276664..f157351243 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt @@ -64,10 +64,6 @@ tf_class { name: "list_files" 
argspec: "args=[\'file_pattern\', \'shuffle\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } - member_method { - name: "make_initializable_iterator" - argspec: "args=[\'self\', \'shared_name\'], varargs=None, keywords=None, defaults=[\'None\'], " - } member_method { name: "map" argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], " diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt index 5a19f474fa..690da98b1a 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt @@ -63,10 +63,6 @@ tf_class { name: "list_files" argspec: "args=[\'file_pattern\', \'shuffle\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } - member_method { - name: "make_initializable_iterator" - argspec: "args=[\'self\', \'shared_name\'], varargs=None, keywords=None, defaults=[\'None\'], " - } member_method { name: "map" argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], " diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt index 71be8ed677..fe0bc1a4db 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt @@ -64,10 +64,6 @@ tf_class { name: "list_files" argspec: "args=[\'file_pattern\', \'shuffle\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } - member_method { - name: "make_initializable_iterator" - argspec: "args=[\'self\', \'shared_name\'], varargs=None, keywords=None, defaults=[\'None\'], " - } member_method { name: "map" argspec: "args=[\'self\', 
\'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], " diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt index 823c5c4d18..261129b132 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt @@ -64,10 +64,6 @@ tf_class { name: "list_files" argspec: "args=[\'file_pattern\', \'shuffle\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } - member_method { - name: "make_initializable_iterator" - argspec: "args=[\'self\', \'shared_name\'], varargs=None, keywords=None, defaults=[\'None\'], " - } member_method { name: "map" argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], " diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt index 28050c2758..0b34bbc942 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt @@ -64,10 +64,6 @@ tf_class { name: "list_files" argspec: "args=[\'file_pattern\', \'shuffle\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } - member_method { - name: "make_initializable_iterator" - argspec: "args=[\'self\', \'shared_name\'], varargs=None, keywords=None, defaults=[\'None\'], " - } member_method { name: "map" argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], " diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt index 
d707ac7320..0e61890eee 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt @@ -64,10 +64,6 @@ tf_class { name: "list_files" argspec: "args=[\'file_pattern\', \'shuffle\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } - member_method { - name: "make_initializable_iterator" - argspec: "args=[\'self\', \'shared_name\'], varargs=None, keywords=None, defaults=[\'None\'], " - } member_method { name: "map" argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], " diff --git a/tensorflow/tools/compatibility/renames_v2.py b/tensorflow/tools/compatibility/renames_v2.py index 696cf7c41f..b8d57d5e11 100644 --- a/tensorflow/tools/compatibility/renames_v2.py +++ b/tensorflow/tools/compatibility/renames_v2.py @@ -108,6 +108,7 @@ renames = { 'tf.create_partitioned_variables': 'tf.compat.v1.create_partitioned_variables', 'tf.cross': 'tf.linalg.cross', 'tf.cumprod': 'tf.math.cumprod', + 'tf.data.make_initializable_iterator': 'tf.compat.v1.data.make_initializable_iterator', 'tf.data.make_one_shot_iterator': 'tf.compat.v1.data.make_one_shot_iterator', 'tf.debugging.is_finite': 'tf.math.is_finite', 'tf.debugging.is_inf': 'tf.math.is_inf', -- GitLab From 8bf493d03d8cf3ca4618e1ba925a95a71bdc9b35 Mon Sep 17 00:00:00 2001 From: Sanjoy Das Date: Mon, 3 Dec 2018 20:44:45 -0800 Subject: [PATCH 066/405] Automated rollback of commit 85e68a20257cdaf176d467fc8a91ba0f1e55462e PiperOrigin-RevId: 223912775 --- tensorflow/workspace.bzl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 55c3dd564a..7e5f84be16 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -123,11 +123,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): tf_http_archive( name = "com_google_absl", build_file = 
clean_dep("//third_party:com_google_absl.BUILD"), - sha256 = "e64ffc34ce6ba78946a6ea828c0ad96ac5bd162ec71c30f701c5cb3ebf8418a7", - strip_prefix = "abseil-cpp-926bfeb9fff223429c12224b7514243886323e8d", + sha256 = "3cf6132129ba87f0781c383bfaf381b7174b5818e81fffcc5d04bb451154f0f2", + strip_prefix = "abseil-cpp-f95179062eb65ce40895cc76f1398cce25394369", urls = [ - "https://mirror.bazel.build/github.com/abseil/abseil-cpp/archive/926bfeb9fff223429c12224b7514243886323e8d.tar.gz", - "https://github.com/abseil/abseil-cpp/archive/926bfeb9fff223429c12224b7514243886323e8d.tar.gz", + "https://mirror.bazel.build/github.com/abseil/abseil-cpp/archive/f95179062eb65ce40895cc76f1398cce25394369.tar.gz", + "https://github.com/abseil/abseil-cpp/archive/f95179062eb65ce40895cc76f1398cce25394369.tar.gz", ], ) -- GitLab From 6eee77214daee11f3188cfa0c6a20bed249a609d Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 3 Dec 2018 21:23:40 -0800 Subject: [PATCH 067/405] [XLA] separate out an Execute from ExecutePerReplica [JAX] reduce the creation of XLA Shape protos on every call (which is slow) PiperOrigin-RevId: 223915944 --- .../xla/python/local_computation_builder.cc | 45 ++++++++++++++++++- .../xla/python/local_computation_builder.h | 8 +++- .../xla/python/local_computation_builder.i | 1 + tensorflow/compiler/xla/python/xla_client.py | 13 ++++-- 4 files changed, 62 insertions(+), 5 deletions(-) diff --git a/tensorflow/compiler/xla/python/local_computation_builder.cc b/tensorflow/compiler/xla/python/local_computation_builder.cc index c1381779c9..6e2ee86632 100644 --- a/tensorflow/compiler/xla/python/local_computation_builder.cc +++ b/tensorflow/compiler/xla/python/local_computation_builder.cc @@ -314,7 +314,50 @@ CompiledLocalComputation::CompiledLocalComputation( std::unique_ptr executable) : executable_(std::move(executable)) {} -StatusOr CompiledLocalComputation::Execute( +StatusOr CompiledLocalComputation::Execute( + absl::Span argument_handles) { + LocalClient* client = 
GetOrCreateLocalClient(); + StatusOr device_ordinal_status = client->ReplicaNumberToDeviceOrdinal(0); + StatusOr result_buffer_status; + if (!device_ordinal_status.ok()) { + result_buffer_status = device_ordinal_status.status(); + } else { + const int device_ordinal = device_ordinal_status.ValueOrDie(); + VLOG(3) << "Replica 0 mapped to device ordinal for execution: " + << device_ordinal; + + std::vector argument_buffers; + argument_buffers.reserve(argument_handles.size()); + for (auto& handle : argument_handles) { + argument_buffers.push_back(handle->shaped_buffer()); + } + + DeviceAssignment device_assignment = + client->backend() + .computation_placer() + ->AssignDevices(1, /*computation_count=*/1) + .ConsumeValueOrDie(); + + ExecutableRunOptions options; + options.set_device_ordinal(device_ordinal); + options.set_allocator(client->backend().memory_allocator()); + options.set_intra_op_thread_pool( + client->backend().eigen_intra_op_thread_pool_device()); + options.set_device_assignment(&device_assignment); + + result_buffer_status = executable_->Run(argument_buffers, options); + } + + if (!result_buffer_status.ok()) { + return InternalError( + "Failed running replica 0 (other replicas may have failed as well): " + "%s.", + result_buffer_status.status().ToString()); + } + return new LocalShapedBuffer(std::move(result_buffer_status).ValueOrDie()); +} + +StatusOr CompiledLocalComputation::ExecutePerReplica( absl::Span> argument_handles) { LocalClient* client = GetOrCreateLocalClient(); const int num_replicas = GetReplicaCount(); diff --git a/tensorflow/compiler/xla/python/local_computation_builder.h b/tensorflow/compiler/xla/python/local_computation_builder.h index 8247b05775..149e44570d 100644 --- a/tensorflow/compiler/xla/python/local_computation_builder.h +++ b/tensorflow/compiler/xla/python/local_computation_builder.h @@ -173,7 +173,13 @@ class CompiledLocalComputation { public: CompiledLocalComputation(std::unique_ptr executable); - StatusOr Execute( + 
StatusOr Execute( + absl::Span argument_handles); + + // Execute on many replicas. Takes a sequence of argument lists (one argument + // list per replica) and returns a tuple of results (one result per replica). + // The number of argument lists must be equal to the replica count. + StatusOr ExecutePerReplica( absl::Span > argument_handles); private: diff --git a/tensorflow/compiler/xla/python/local_computation_builder.i b/tensorflow/compiler/xla/python/local_computation_builder.i index 88349b788e..d23d693c1e 100644 --- a/tensorflow/compiler/xla/python/local_computation_builder.i +++ b/tensorflow/compiler/xla/python/local_computation_builder.i @@ -1029,6 +1029,7 @@ tensorflow::ImportNumpy(); %unignore xla::swig::XrtAllocationTuple::size; %unignore xla::swig::CompiledLocalComputation; %unignore xla::swig::CompiledLocalComputation::Execute; +%unignore xla::swig::CompiledLocalComputation::ExecutePerReplica; %unignore xla::swig::CompiledXrtComputation; %unignore xla::swig::CompiledXrtComputation::Execute; %unignore xla::swig::LocalComputation; diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py index 4dbbeebd0b..c91a2aaf56 100644 --- a/tensorflow/compiler/xla/python/xla_client.py +++ b/tensorflow/compiler/xla/python/xla_client.py @@ -588,9 +588,16 @@ class LocalComputation(object): compile_options=compile_options, layout_fn=layout_fn) - def Execute(self, arguments=()): + def GetReturnValueShape(self): + return _wrap_shape(self._c_computation.GetReturnValueShape()) + + def Execute(self, arguments=(), check_for_deleted_args=True): """Execute on one replica with LocalBuffer arguments and return value.""" - return self.ExecutePerReplica([arguments])[0] + if check_for_deleted_args and any(arg.is_deleted() for arg in arguments): + raise ValueError('Executing with deleted local buffer argument') + raw_args = [arg.c_buffer for arg in arguments] + output_buffer = self._c_computation.Execute(raw_args) + return 
LocalBuffer(output_buffer, backend=self._backend, replica=0) def ExecutePerReplica(self, arguments=None): """Execute on many replicas with LocalBuffer arguments and return value. @@ -633,7 +640,7 @@ class LocalComputation(object): 'Multi-replica execution is not yet supported via the XRT backend.') output_buffers = [self._c_computation.Execute(stripped_args[0])] else: - output_buffer_tup = self._c_computation.Execute(stripped_args) + output_buffer_tup = self._c_computation.ExecutePerReplica(stripped_args) size = output_buffer_tup.size() output_buffers = [output_buffer_tup.Release(i) for i in xrange(size)] -- GitLab From 7018e50143526de7a58703cb0b25babbc9bd48db Mon Sep 17 00:00:00 2001 From: Karim Nosir Date: Mon, 3 Dec 2018 21:42:44 -0800 Subject: [PATCH 068/405] Add MirrorPad op. PiperOrigin-RevId: 223917407 --- tensorflow/lite/build_def.bzl | 1 + tensorflow/lite/c/c_api_internal.c | 2 +- tensorflow/lite/c/c_api_internal.h | 2 +- tensorflow/lite/kernels/BUILD | 12 + .../lite/kernels/internal/tensor_ctypes.h | 5 + tensorflow/lite/kernels/mirror_pad.cc | 374 ++++++++++++++++++ tensorflow/lite/kernels/mirror_pad_test.cc | 189 +++++++++ tensorflow/lite/kernels/register.cc | 2 + tensorflow/lite/testing/generate_examples.py | 82 ++++ .../propagate_fixed_sizes.cc | 48 +++ tensorflow/lite/toco/import_tensorflow.cc | 26 ++ tensorflow/lite/toco/model.h | 20 +- tensorflow/lite/toco/tflite/operator.cc | 25 ++ tensorflow/lite/toco/tflite/operator_test.cc | 8 + tensorflow/lite/toco/tooling_util.cc | 1 + 15 files changed, 794 insertions(+), 3 deletions(-) create mode 100644 tensorflow/lite/kernels/mirror_pad.cc create mode 100644 tensorflow/lite/kernels/mirror_pad_test.cc diff --git a/tensorflow/lite/build_def.bzl b/tensorflow/lite/build_def.bzl index 8507521694..5eaf719494 100644 --- a/tensorflow/lite/build_def.bzl +++ b/tensorflow/lite/build_def.bzl @@ -265,6 +265,7 @@ def generated_test_models(): "maximum", "mean", "minimum", + "mirror_pad", "mul", "neg", "not_equal", diff 
--git a/tensorflow/lite/c/c_api_internal.c b/tensorflow/lite/c/c_api_internal.c index 598d74be84..2923dbad4e 100644 --- a/tensorflow/lite/c/c_api_internal.c +++ b/tensorflow/lite/c/c_api_internal.c @@ -59,7 +59,7 @@ void TfLiteIntArrayPrint(const char* s, TfLiteIntArray* a) { printf("]\n"); } -TfLiteIntArray* TfLiteIntArrayCopy(TfLiteIntArray* src) { +TfLiteIntArray* TfLiteIntArrayCopy(const TfLiteIntArray* src) { if (!src) return NULL; TfLiteIntArray* ret = TfLiteIntArrayCreate(src->size); if (ret) { diff --git a/tensorflow/lite/c/c_api_internal.h b/tensorflow/lite/c/c_api_internal.h index 5cfa64cc7b..1cd84eff5c 100644 --- a/tensorflow/lite/c/c_api_internal.h +++ b/tensorflow/lite/c/c_api_internal.h @@ -96,7 +96,7 @@ int TfLiteIntArrayEqualsArray(TfLiteIntArray* a, int b_size, int b_data[]); // Create a copy of an array passed as `src`. // You are expected to free memory with TfLiteIntArrayFree -TfLiteIntArray* TfLiteIntArrayCopy(TfLiteIntArray* src); +TfLiteIntArray* TfLiteIntArrayCopy(const TfLiteIntArray* src); // Free memory of array `v`. 
void TfLiteIntArrayFree(TfLiteIntArray* v); diff --git a/tensorflow/lite/kernels/BUILD b/tensorflow/lite/kernels/BUILD index 8d5c3b406f..00d9d1feae 100644 --- a/tensorflow/lite/kernels/BUILD +++ b/tensorflow/lite/kernels/BUILD @@ -198,6 +198,7 @@ cc_library( "lstm.cc", "maximum_minimum.cc", "mfcc.cc", + "mirror_pad.cc", "mul.cc", "neg.cc", "one_hot.cc", @@ -1437,3 +1438,14 @@ filegroup( ) tflite_portable_test_suite() + +tf_cc_test( + name = "mirror_pad_test", + srcs = ["mirror_pad_test.cc"], + deps = [ + ":builtin_ops", + ":test_util", + "//tensorflow/lite:framework", + "@com_google_googletest//:gtest_main", + ], +) diff --git a/tensorflow/lite/kernels/internal/tensor_ctypes.h b/tensorflow/lite/kernels/internal/tensor_ctypes.h index b4822d5701..4a94b703f8 100644 --- a/tensorflow/lite/kernels/internal/tensor_ctypes.h +++ b/tensorflow/lite/kernels/internal/tensor_ctypes.h @@ -53,6 +53,11 @@ inline bool* GetTensorData(TfLiteTensor* tensor) { return tensor != nullptr ? tensor->data.b : nullptr; } +template <> +inline int8_t* GetTensorData(TfLiteTensor* tensor) { + return tensor != nullptr ? tensor->data.int8 : nullptr; +} + template inline const T* GetTensorData(const TfLiteTensor* tensor); diff --git a/tensorflow/lite/kernels/mirror_pad.cc b/tensorflow/lite/kernels/mirror_pad.cc new file mode 100644 index 0000000000..e74e47f7a3 --- /dev/null +++ b/tensorflow/lite/kernels/mirror_pad.cc @@ -0,0 +1,374 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/c_api_internal.h" +#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" +#include "tensorflow/lite/kernels/internal/quantization_util.h" +#include "tensorflow/lite/kernels/internal/reference/reference_ops.h" +#include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/kernels/op_macros.h" + +namespace tflite { +namespace ops { +namespace builtin { +namespace mirror_pad { +namespace { + +// Simple class that represents a mirror padded tensor - which is the output +// from the Op. +struct PaddedTensor { + // If not null that means this is a scalar value. + // Note: This is not owned by default. It will point to the value + // in the input tensor. + const void* value = nullptr; + // If this tensor is not one value, then this vector will have + // all the tensors that belongs to this tensor. + // Pointers are owned. + std::vector> values; + // Pointers to PaddedTensors that are padded on the left of the current + // tensor. + std::vector left_pad_ptrs; + // Pointers to PaddedTensors that are padded on the right of the current + // tensor. + std::vector right_pad_ptrs; + + // Returns mutable pointer to the tensor identified by 'indices'. + PaddedTensor* GetMutable(const std::vector& indices) { + auto* result = this; + for (int i = 0; i < indices.size(); ++i) { + if (indices[i] >= result->values.size()) { + return nullptr; + } + result = result->values[indices[i]].get(); + if (result == nullptr) break; + } + return result; + } +}; + +// Util method to initialize the memory of the padded tensor. 
+void InitializeTensorMemory(const TfLiteIntArray* const dims, int dim_index, + int dims_size, PaddedTensor* padded_tensor) { + if (dim_index >= dims_size) { + return; + } + padded_tensor->values.reserve(dims->data[dim_index]); + for (int i = 0; i < dims->data[dim_index]; ++i) { + padded_tensor->values.emplace_back(new PaddedTensor()); + InitializeTensorMemory(dims, dim_index + 1, dims_size, + padded_tensor->values.back().get()); + } +} + +// Returns pointer to the value at the specified index in 'data'. +inline const void* GetValuePointerAtIndex(const void* data, int index, + const TfLiteType data_type) { + switch (data_type) { + case kTfLiteFloat32: + return static_cast(data) + index; + case kTfLiteInt32: + return static_cast(data) + index; + case kTfLiteUInt8: + return static_cast(data) + index; + case kTfLiteInt64: + return static_cast(data) + index; + case kTfLiteBool: + return static_cast(data) + index; + case kTfLiteInt16: + return static_cast(data) + index; + case kTfLiteInt8: + return static_cast(data) + index; + // Unsupported types ? + default: + return nullptr; + } + return nullptr; +} + +// Util method that increment index in the N-d array. +void IncrementTensorIndex(const TfLiteIntArray* dims, + std::vector* tensor_index_ptr) { + int dimension_index = dims->size - 1; + auto& tensor_index = *tensor_index_ptr; + tensor_index[dimension_index]++; + while (dimension_index >= 0 && + tensor_index[dimension_index] == dims->data[dimension_index]) { + tensor_index[dimension_index] = 0; + dimension_index--; + if (dimension_index >= 0) tensor_index[dimension_index]++; + } +} + +// Fills the 'padded_tensor' with data from 'input_tensor'. 
+TfLiteStatus InitFromInputTensor(const TfLiteTensor* input_tensor, + PaddedTensor* padded_tensor) { + const auto* dims = input_tensor->dims; + const auto data_type = input_tensor->type; + const void* data = static_cast(input_tensor->data.raw_const); + // Either invalid input or unsupported type.+ + if (data == nullptr) { + return kTfLiteError; + } + // Index of current processing tensor. + std::vector tensor_index(dims->size, 0); + int flat_index = 0; + const int num_elements = NumElements(input_tensor); + while (flat_index < num_elements) { + auto* tensor = padded_tensor->GetMutable(tensor_index); + if (tensor == nullptr) { + return kTfLiteError; + } + tensor->value = GetValuePointerAtIndex(data, flat_index, data_type); + IncrementTensorIndex(dims, &tensor_index); + ++flat_index; + } + + return kTfLiteOk; +} + +template +inline void GetPadding(const T* data, int offset, int64_t* left_pad, + int64_t* right_pad) { + *left_pad = static_cast(*(data + offset * 2)); + *right_pad = static_cast(*(data + offset * 2 + 1)); +} + +inline TfLiteStatus GetPadding(const TfLiteTensor* padding_matrix, + int dimension, int64_t* left_pad, + int64_t* right_pad) { + switch (padding_matrix->type) { + case kTfLiteInt32: + GetPadding(padding_matrix->data.i32, dimension, left_pad, right_pad); + break; + case kTfLiteInt64: + GetPadding(padding_matrix->data.i64, dimension, left_pad, right_pad); + break; + default: + return kTfLiteError; + } + return kTfLiteOk; +} + +TfLiteStatus ValidateTensor(const TfLiteTensor* padding_matrix, int offset, + int dimension_index, PaddedTensor* padded_tensor, + TfLiteContext* context) { + if (dimension_index >= padding_matrix->dims->data[0]) { + return kTfLiteOk; + } + + int64_t left_pad = 0, right_pad = 0; + TF_LITE_ENSURE_STATUS( + GetPadding(padding_matrix, dimension_index, &left_pad, &right_pad)); + // If we are not going to include border we must have enough values + // to use. 
+ if (left_pad + offset > padded_tensor->values.size()) { + context->ReportError( + context, "Not enough values for Mirror Pad, required %d, available %d.", + left_pad + offset, padded_tensor->values.size()); + return kTfLiteError; + } + if (right_pad + offset > padded_tensor->values.size()) { + context->ReportError( + context, "Not enough values for Mirror Pad, required %d, available %d.", + right_pad + offset, padded_tensor->values.size()); + return kTfLiteError; + } + if (!padded_tensor->values.empty()) { + ValidateTensor(padding_matrix, offset, dimension_index + 1, + padded_tensor->values[0].get(), context); + } + return kTfLiteOk; +} + +// Fills 'padded_tensor' with the padding information based on +// 'padding_matrix'. +// 'dimension_index' represents which dimension the function is operating on. +TfLiteStatus PadTensor(const TfLiteTensor* padding_matrix, int offset, + int dimension_index, PaddedTensor* padded_tensor, + TfLiteContext* context) { + if (dimension_index >= padding_matrix->dims->data[0]) return kTfLiteOk; + + int64_t left_pad = 0, right_pad = 0; + TF_LITE_ENSURE_STATUS( + GetPadding(padding_matrix, dimension_index, &left_pad, &right_pad)); + + for (int i = left_pad + offset - 1; i >= offset && left_pad > 0; + --i, --left_pad) { + padded_tensor->left_pad_ptrs.push_back(padded_tensor->values[i].get()); + } + for (int i = padded_tensor->values.size() - (1 + offset); + i >= 0 && right_pad > 0; --i, --right_pad) { + padded_tensor->right_pad_ptrs.push_back(padded_tensor->values[i].get()); + } + + for (auto& tensor : padded_tensor->values) { + TF_LITE_ENSURE_STATUS(PadTensor(padding_matrix, offset, dimension_index + 1, + tensor.get(), context)); + } + return kTfLiteOk; +} + +// Fills 'output_data' with data from 'padded_tensor'. +// The function does this recursively by setting left padding first then +// original data, followed by the right padding. 
+template +int FillOutput(const PaddedTensor* padded_tensor, T* output_data, + int index_in_output) { + if (padded_tensor == nullptr || output_data == nullptr) { + return -1; + } + if (padded_tensor->value != nullptr) { + output_data[index_in_output] = *static_cast(padded_tensor->value); + return index_in_output + 1; + } + for (const auto* tensor : padded_tensor->left_pad_ptrs) { + index_in_output = FillOutput(tensor, output_data, index_in_output); + } + for (const auto& tensor : padded_tensor->values) { + index_in_output = FillOutput(tensor.get(), output_data, index_in_output); + } + for (const auto* tensor : padded_tensor->right_pad_ptrs) { + index_in_output = FillOutput(tensor, output_data, index_in_output); + } + return index_in_output; +} + +// Returns the shape of the final output after padding. +std::unique_ptr GetPaddedOutputShape( + const TfLiteTensor* input, const TfLiteTensor* padding_matrix) { + const int input_dims = NumDimensions(input); + std::unique_ptr shape( + TfLiteIntArrayCreate(input_dims), TfLiteIntArrayFree); + + int64_t left_pad = 0, right_pad = 0; + for (int i = 0; i < input_dims; ++i) { + GetPadding(padding_matrix, i, &left_pad, &right_pad); + shape->data[i] = SizeOfDimension(input, i) + left_pad + right_pad; + } + return shape; +} + +} // namespace + +TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + const TfLiteTensor* input_tensor = GetInput(context, node, 0); + const TfLiteTensor* padding_matrix = GetInput(context, node, 1); + auto* params = + reinterpret_cast(node->builtin_data); + + if (params == nullptr) { + return kTfLiteError; + } + const int input_dims = NumDimensions(input_tensor); + + TfLiteTensor* output_tensor = GetOutput(context, node, 0); + if (IsDynamicTensor(output_tensor)) { + auto output_size = GetPaddedOutputShape(input_tensor, padding_matrix); + if (output_size == nullptr) { + return kTfLiteError; + } + TF_LITE_ENSURE_STATUS( + context->ResizeTensor(context, output_tensor, output_size.release())); + } + 
+ PaddedTensor padded_tensor; + // Initialize memory. + InitializeTensorMemory(input_tensor->dims, 0, input_dims, &padded_tensor); + // Set the values from the input_tensor. + TF_LITE_ENSURE_STATUS(InitFromInputTensor(input_tensor, &padded_tensor)); + + const int offset = + params->mode != TfLiteMirrorPaddingMode::kTfLiteMirrorPaddingReflect ? 0 + : 1; + // Make sure padding values are sufficient and valid to use. + TF_LITE_ENSURE_STATUS( + ValidateTensor(padding_matrix, offset, 0, &padded_tensor, context)); + // Apply padding. + TF_LITE_ENSURE_STATUS( + PadTensor(padding_matrix, offset, 0, &padded_tensor, context)); + + // Fill the output tensor from the padded tensor. + TfLiteStatus status = kTfLiteOk; + +#define TF_LITE_MIRROR_PAD(type) \ + FillOutput(&padded_tensor, GetTensorData(output_tensor), 0); + + switch (output_tensor->type) { + case kTfLiteFloat32: { + TF_LITE_MIRROR_PAD(float); + break; + } + case kTfLiteInt32: { + TF_LITE_MIRROR_PAD(int32_t); + break; + } + case kTfLiteUInt8: { + TF_LITE_MIRROR_PAD(uint8_t); + break; + } + case kTfLiteInt64: { + TF_LITE_MIRROR_PAD(int64_t); + break; + } + default: + status = kTfLiteError; + break; + } +#undef TF_LITE_MIRROR_PAD + return status; +} + +void* Init(TfLiteContext* context, const char* buffer, size_t length) { + return nullptr; +} + +void Free(TfLiteContext* context, void* buffer) {} + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + const TfLiteTensor* input_tensor = GetInput(context, node, 0); + const TfLiteTensor* padding_matrix = GetInput(context, node, 1); + TfLiteTensor* output_tensor = GetOutput(context, node, 0); + + TF_LITE_ENSURE_EQ(context, NumDimensions(padding_matrix), 2); + TF_LITE_ENSURE_EQ(context, SizeOfDimension(padding_matrix, 0), + NumDimensions(input_tensor)); + + if (!IsConstantTensor(padding_matrix)) { + SetTensorToDynamic(output_tensor); + return kTfLiteOk; + } + // We have constant padding, so we can infer output size. 
+ + auto output_size = GetPaddedOutputShape(input_tensor, padding_matrix); + if (output_size == nullptr) { + return kTfLiteError; + } + return context->ResizeTensor(context, output_tensor, output_size.release()); +} + +} // namespace mirror_pad +TfLiteRegistration* Register_MIRROR_PAD() { + static TfLiteRegistration r = {mirror_pad::Init, mirror_pad::Free, + mirror_pad::Prepare, mirror_pad::Eval}; + return &r; +} + +} // namespace builtin +} // namespace ops +} // namespace tflite diff --git a/tensorflow/lite/kernels/mirror_pad_test.cc b/tensorflow/lite/kernels/mirror_pad_test.cc new file mode 100644 index 0000000000..fd09e6e449 --- /dev/null +++ b/tensorflow/lite/kernels/mirror_pad_test.cc @@ -0,0 +1,189 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include +#include "tensorflow/lite/interpreter.h" +#include "tensorflow/lite/kernels/register.h" +#include "tensorflow/lite/kernels/test_util.h" +#include "tensorflow/lite/model.h" + +namespace tflite { +namespace { + +using ::testing::ElementsAreArray; + +template +class BaseMirrorPadOpModel : public SingleOpModel { + public: + BaseMirrorPadOpModel(const TensorData& input, + const TensorData& padding_matrix, + const TensorData& output, + const tflite::MirrorPadMode mode) { + input_id_ = AddInput(input); + padding_matrix_id_ = AddInput(padding_matrix); + output_id_ = AddOutput(output); + SetBuiltinOp(BuiltinOperator_MIRROR_PAD, BuiltinOptions_MirrorPadOptions, + CreateMirrorPadOptions(builder_, mode).Union()); + BuildInterpreter({GetShape(input_id_), GetShape(padding_matrix_id_)}); + } + + int input_tensor_id() { return input_id_; } + int padding_matrix_tensor_id() { return padding_matrix_id_; } + + std::vector GetOutput() { return ExtractVector(output_id_); } + + protected: + int input_id_; + int padding_matrix_id_; + int output_id_; +}; + +TEST(MirrorPadTest, EmptyPad) { + BaseMirrorPadOpModel model( + {TensorType_INT32, {2, 3}}, {TensorType_INT32, {2, 2}}, + {TensorType_INT32, {}}, tflite::MirrorPadMode_REFLECT); + model.PopulateTensor(model.input_tensor_id(), {1, 2, 3, 4, 5, 6}); + model.PopulateTensor(model.padding_matrix_tensor_id(), {0, 0, 0, 0}); + model.Invoke(); + EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6})); +} + +TEST(MirrorPadTest, PadOneSide_right_Reflect) { + BaseMirrorPadOpModel model( + {TensorType_INT32, {2, 3}}, {TensorType_INT32, {2, 2}}, + {TensorType_INT32, {}}, tflite::MirrorPadMode_REFLECT); + model.PopulateTensor(model.input_tensor_id(), {1, 2, 3, 4, 5, 6}); + model.PopulateTensor(model.padding_matrix_tensor_id(), {0, 1, 0, 1}); + model.Invoke(); + EXPECT_THAT(model.GetOutput(), + ElementsAreArray({1, 2, 3, 2, 4, 5, 6, 5, 1, 2, 3, 
2})); +} + +TEST(MirrorPadTest, PadOneSide_left_Reflect) { + BaseMirrorPadOpModel model( + {TensorType_INT32, {2, 3}}, {TensorType_INT32, {2, 2}}, + {TensorType_INT32, {}}, tflite::MirrorPadMode_REFLECT); + model.PopulateTensor(model.input_tensor_id(), {1, 2, 3, 4, 5, 6}); + model.PopulateTensor(model.padding_matrix_tensor_id(), {1, 0, 1, 0}); + model.Invoke(); + EXPECT_THAT(model.GetOutput(), + ElementsAreArray({5, 4, 5, 6, 2, 1, 2, 3, 5, 4, 5, 6})); +} + +TEST(MirrorPadTest, PadOneSide_right_Symmetric) { + BaseMirrorPadOpModel model( + {TensorType_INT32, {2, 3}}, {TensorType_INT32, {2, 2}}, + {TensorType_INT32, {}}, tflite::MirrorPadMode_SYMMETRIC); + model.PopulateTensor(model.input_tensor_id(), {1, 2, 3, 4, 5, 6}); + model.PopulateTensor(model.padding_matrix_tensor_id(), {0, 1, 0, 1}); + model.Invoke(); + EXPECT_THAT(model.GetOutput(), + ElementsAreArray({1, 2, 3, 3, 4, 5, 6, 6, 4, 5, 6, 6})); +} + +TEST(MirrorPadTest, PadOneSide_left_Symmetric) { + BaseMirrorPadOpModel model( + {TensorType_INT32, {2, 3}}, {TensorType_INT32, {2, 2}}, + {TensorType_INT32, {}}, tflite::MirrorPadMode_SYMMETRIC); + model.PopulateTensor(model.input_tensor_id(), {1, 2, 3, 4, 5, 6}); + model.PopulateTensor(model.padding_matrix_tensor_id(), {1, 0, 1, 0}); + model.Invoke(); + EXPECT_THAT(model.GetOutput(), + ElementsAreArray({1, 1, 2, 3, 1, 1, 2, 3, 4, 4, 5, 6})); +} + +TEST(MirrorPadTest, PadBothSides_Symmetric) { + BaseMirrorPadOpModel model( + {TensorType_INT32, {2, 3}}, {TensorType_INT32, {2, 2}}, + {TensorType_INT32, {}}, tflite::MirrorPadMode_SYMMETRIC); + model.PopulateTensor(model.input_tensor_id(), {1, 2, 3, 4, 5, 6}); + model.PopulateTensor(model.padding_matrix_tensor_id(), {1, 1, 1, 1}); + model.Invoke(); + EXPECT_THAT(model.GetOutput(), + ElementsAreArray({1, 1, 2, 3, 3, 1, 1, 2, 3, 3, + 4, 4, 5, 6, 6, 4, 4, 5, 6, 6})); +} + +TEST(MirrorPadTest, PadBothSides_Reflect) { + BaseMirrorPadOpModel model( + {TensorType_INT32, {2, 3}}, {TensorType_INT32, {2, 2}}, + 
{TensorType_INT32, {}}, tflite::MirrorPadMode_REFLECT); + model.PopulateTensor(model.input_tensor_id(), {1, 2, 3, 4, 5, 6}); + model.PopulateTensor(model.padding_matrix_tensor_id(), {1, 1, 1, 1}); + model.Invoke(); + EXPECT_THAT(model.GetOutput(), + ElementsAreArray({5, 4, 5, 6, 5, 2, 1, 2, 3, 2, + 5, 4, 5, 6, 5, 2, 1, 2, 3, 2})); +} + +TEST(MirrorPadTest, PadBothSides_Symmetric_Whole) { + BaseMirrorPadOpModel model( + {TensorType_INT32, {2, 3}}, {TensorType_INT32, {2, 2}}, + {TensorType_INT32, {}}, tflite::MirrorPadMode_SYMMETRIC); + model.PopulateTensor(model.input_tensor_id(), {1, 2, 3, 4, 5, 6}); + model.PopulateTensor(model.padding_matrix_tensor_id(), {2, 2, 3, 3}); + model.Invoke(); + EXPECT_THAT( + model.GetOutput(), + ElementsAreArray({6, 5, 4, 4, 5, 6, 6, 5, 4, 3, 2, 1, 1, 2, 3, 3, 2, 1, + 3, 2, 1, 1, 2, 3, 3, 2, 1, 6, 5, 4, 4, 5, 6, 6, 5, 4, + 6, 5, 4, 4, 5, 6, 6, 5, 4, 3, 2, 1, 1, 2, 3, 3, 2, 1})); +} + +TEST(MirrorPadTest, PadBothSides_Reflect_Whole) { + BaseMirrorPadOpModel model( + {TensorType_INT32, {2, 3}}, {TensorType_INT32, {2, 2}}, + {TensorType_INT32, {}}, tflite::MirrorPadMode_REFLECT); + model.PopulateTensor(model.input_tensor_id(), {1, 2, 3, 4, 5, 6}); + model.PopulateTensor(model.padding_matrix_tensor_id(), {1, 1, 2, 2}); + model.Invoke(); + EXPECT_THAT(model.GetOutput(), + ElementsAreArray({6, 5, 4, 5, 6, 5, 4, 3, 2, 1, 2, 3, 2, 1, + 6, 5, 4, 5, 6, 5, 4, 3, 2, 1, 2, 3, 2, 1})); +} + +TEST(MirrorPadTest, Pad_Symmetric) { + BaseMirrorPadOpModel model( + {TensorType_INT32, {2, 3}}, {TensorType_INT32, {2, 2}}, + {TensorType_INT32, {}}, tflite::MirrorPadMode_SYMMETRIC); + model.PopulateTensor(model.input_tensor_id(), {1, 2, 3, 4, 5, 6}); + model.PopulateTensor(model.padding_matrix_tensor_id(), {1, 1, 2, 2}); + model.Invoke(); + EXPECT_THAT(model.GetOutput(), + ElementsAreArray({2, 1, 1, 2, 3, 3, 2, 2, 1, 1, 2, 3, 3, 2, + 5, 4, 4, 5, 6, 6, 5, 5, 4, 4, 5, 6, 6, 5})); +} + +TEST(MirrorPadTest, Pad_1D_Reflect) { + BaseMirrorPadOpModel model( + 
{TensorType_INT32, {3}}, {TensorType_INT32, {1, 2}}, + {TensorType_INT32, {}}, tflite::MirrorPadMode_REFLECT); + model.PopulateTensor(model.input_tensor_id(), {1, 2, 3}); + model.PopulateTensor(model.padding_matrix_tensor_id(), {0, 2}); + model.Invoke(); + EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 2, 3, 2, 1})); +} + +TEST(MirrorPadTest, Pad_1D_Symmetric) { + BaseMirrorPadOpModel model( + {TensorType_INT32, {3}}, {TensorType_INT32, {1, 2}}, + {TensorType_INT32, {}}, tflite::MirrorPadMode_SYMMETRIC); + model.PopulateTensor(model.input_tensor_id(), {1, 2, 3}); + model.PopulateTensor(model.padding_matrix_tensor_id(), {0, 2}); + model.Invoke(); + EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 2, 3, 3, 2})); +} + +} // namespace +} // namespace tflite diff --git a/tensorflow/lite/kernels/register.cc b/tensorflow/lite/kernels/register.cc index 0ad90a9753..3c60d281b3 100644 --- a/tensorflow/lite/kernels/register.cc +++ b/tensorflow/lite/kernels/register.cc @@ -128,6 +128,7 @@ TfLiteRegistration* Register_RANGE(); TfLiteRegistration* Register_LEAKY_RELU(); TfLiteRegistration* Register_SQUARED_DIFFERENCE(); TfLiteRegistration* Register_FILL(); +TfLiteRegistration* Register_MIRROR_PAD(); TfLiteStatus UnsupportedTensorFlowOp(TfLiteContext* context, TfLiteNode* node) { context->ReportError( @@ -266,6 +267,7 @@ BuiltinOpResolver::BuiltinOpResolver() { AddBuiltin(BuiltinOperator_LEAKY_RELU, Register_LEAKY_RELU()); AddBuiltin(BuiltinOperator_SQUARED_DIFFERENCE, Register_SQUARED_DIFFERENCE()); AddBuiltin(BuiltinOperator_FILL, Register_FILL()); + AddBuiltin(BuiltinOperator_MIRROR_PAD, Register_MIRROR_PAD()); // TODO(andrewharp, ahentz): Move these somewhere more appropriate so that // custom ops aren't always included by default. 
diff --git a/tensorflow/lite/testing/generate_examples.py b/tensorflow/lite/testing/generate_examples.py index 3448c58ed2..5c90410a19 100644 --- a/tensorflow/lite/testing/generate_examples.py +++ b/tensorflow/lite/testing/generate_examples.py @@ -3613,6 +3613,88 @@ def make_logical_xor_tests(zip_path): return _make_logical_tests(tf.logical_xor)(zip_path) +def make_mirror_pad_tests(zip_path): + """Make a set of tests to do mirror_pad.""" + + test_parameters = [ + { + "input_shape": [[2, 3]], + "padding_matrix": [[[1, 1], [2, 1]]], + "mode": ["REFLECT"], + "type": ["const"] + }, + { + "input_shape": [[2, 3]], + "padding_matrix": [[[1, 1], [1, 1]]], + "mode": ["REFLECT"], + "type": ["const"] + }, + { + "input_shape": [[2, 3]], + "padding_matrix": [[[1, 1], [2, 1]]], + "mode": ["SYMMETRIC"], + "type": ["placeholder"] + }, + { + "input_shape": [[2, 3]], + "padding_matrix": [[[1, 1], [2, 1]]], + "mode": ["REFLECT"], + "type": ["placeholder"] + }, + { + "input_shape": [[3]], + "padding_matrix": [[[0, 2]]], + "mode": ["SYMMETRIC"], + "type": ["placeholder"] + }, + { + "input_shape": [[3]], + "padding_matrix": [[[0, 2]]], + "mode": ["SYMMETRIC"], + "type": ["const"] + }, + { + "input_shape": [[3]], + "padding_matrix": [[[0, 2]]], + "mode": ["REFLECT"], + "type": ["const"] + }, + ] + + def build_graph(parameters): + """Build the graph for the test case.""" + + input_tensor = tf.placeholder( + dtype=tf.int32, name="input", shape=parameters["input_shape"]) + if parameters["type"] != "const": + padding_matrix = tf.placeholder( + dtype=tf.int32, + name="padding", + shape=[len(parameters["input_shape"]), 2]) + input_tensors = [input_tensor, padding_matrix] + else: + padding_matrix = tf.constant(np.array(parameters["padding_matrix"])) + input_tensors = [input_tensor] + output = tf.pad( + input_tensor, paddings=padding_matrix, mode=parameters["mode"]) + + return input_tensors, [output] + + def build_inputs(parameters, sess, inputs, outputs): + input_values = 
[create_tensor_data(tf.int32, parameters["input_shape"])] + if parameters["type"] != "const": + input_values.append(np.array(parameters["padding_matrix"])) + return input_values, sess.run( + outputs, feed_dict=dict(zip(inputs, input_values))) + + make_zip_of_tests( + zip_path, + test_parameters, + build_graph, + build_inputs, + expected_tf_success=7) + + def make_unroll_batch_matmul_tests(zip_path): """Make a set of tests to test unroll_batch_matmul.""" diff --git a/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc index fc2ed07aa0..0e653f08a0 100644 --- a/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc +++ b/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc @@ -1783,6 +1783,51 @@ void ProcessUnpackOperator(Model* model, UnpackOperator* op) { } } +void ProcessMirrorPadOperator(Model* model, MirrorPadOperator* op) { + CHECK_EQ(op->inputs.size(), 2); + const auto& input_array = model->GetArray(op->inputs[0]); + const auto& padding_matrix = model->GetArray(op->inputs[1]); + + // Yield until input dims have been resolved. + if (!input_array.has_shape()) { + return; + } + + auto& output_array = model->GetArray(op->outputs[0]); + // If output already computed or padding matrix is non + // const then return. 
+ if (output_array.has_shape() || + !IsConstantParameterArray(*model, op->inputs[1])) { + return; + } + Shape output_shape = input_array.shape(); + std::vector& dims = *output_shape.mutable_dims(); + + std::vector padding; + if (padding_matrix.data_type == ArrayDataType::kInt32) { + const auto& data = padding_matrix.GetBuffer().data; + for (auto elem : data) { + padding.push_back(static_cast(elem)); + } + } else if (padding_matrix.data_type == ArrayDataType::kInt64) { + const auto& data = padding_matrix.GetBuffer().data; + for (auto elem : data) { + padding.push_back(elem); + } + } else { + CHECK(padding_matrix.data_type == ArrayDataType::kInt64 || + padding_matrix.data_type == ArrayDataType::kInt32); + } + CHECK_EQ(padding_matrix.shape().dimensions_count(), 2); + CHECK_EQ(input_array.shape().dimensions_count(), + padding_matrix.shape().dims(0)); + for (int i = 0; i < input_array.shape().dimensions_count(); ++i) { + dims[i] += padding[i * 2] + padding[i * 2 + 1]; + } + + output_array.copy_shape(output_shape); +} + } // namespace ::tensorflow::Status PropagateFixedSizes::Run(Model* model, @@ -2055,6 +2100,9 @@ void ProcessUnpackOperator(Model* model, UnpackOperator* op) { case OperatorType::kUnpack: ProcessUnpackOperator(model, static_cast(op)); break; + case OperatorType::kMirrorPad: + ProcessMirrorPadOperator(model, static_cast(op)); + break; default: // Unimplemented, another graph transformation should drop it. 
LOG(FATAL) << "Unhandled operator type " << OperatorTypeName(op->type); diff --git a/tensorflow/lite/toco/import_tensorflow.cc b/tensorflow/lite/toco/import_tensorflow.cc index aa6b4a3bc5..0b2f810394 100644 --- a/tensorflow/lite/toco/import_tensorflow.cc +++ b/tensorflow/lite/toco/import_tensorflow.cc @@ -1153,6 +1153,31 @@ tensorflow::Status ConvertConcatOperator( return tensorflow::Status::OK(); } +tensorflow::Status ConvertMirrorPadOperator( + const NodeDef& node, const TensorFlowImportFlags& tf_import_flags, + Model* model) { + if (node.op() != "MirrorPad") { + LOG(FATAL) << "Expected MirrorPad."; + } + const int num_inputs = GetInputsCount(node, tf_import_flags); + CHECK_EQ(num_inputs, 2); + auto* op = new MirrorPadOperator; + for (int i = 0; i < num_inputs; ++i) { + op->inputs.push_back(node.input(i)); + } + op->outputs.push_back(node.name()); + const auto mode = GetStringAttr(node, "mode"); + if (mode == "REFLECT") { + op->mode = toco::MirrorPadMode::kReflect; + } else if (mode == "SYMMETRIC") { + op->mode = toco::MirrorPadMode::kSymmetric; + } + + model->operators.emplace_back(op); + + return tensorflow::Status::OK(); +} + static constexpr int kAnyNumInputs = -1; enum FlexSupport { kFlexOk, kFlexNotOk }; @@ -2389,6 +2414,7 @@ ConverterMapType GetTensorFlowNodeConverterMap() { {"Unpack", ConvertUnpackOperator}, {"ZerosLike", ConvertSimpleOperator}, {"UnidirectionalSequenceLstm", ConvertUnidirectionalSequenceLstm}, + {"MirrorPad", ConvertMirrorPadOperator}, }); } diff --git a/tensorflow/lite/toco/model.h b/tensorflow/lite/toco/model.h index d4fe62ac75..f19355968f 100644 --- a/tensorflow/lite/toco/model.h +++ b/tensorflow/lite/toco/model.h @@ -156,7 +156,8 @@ enum class OperatorType : uint8 { kZerosLike, kResizeNearestNeighbor, kLeakyRelu, - kAbs + kAbs, + kMirrorPad }; // Helper to deal with TensorFlow arrays using a different ordering of @@ -1932,6 +1933,23 @@ struct TensorFlowZerosLikeOperator : Operator { TensorFlowZerosLikeOperator() : 
Operator(OperatorType::kZerosLike) {} }; +enum class MirrorPadMode { kNone, kSymmetric, kReflect }; + +// MirrorPad Operator: +// +// Inputs: +// Inputs[0]: required: input tensor to be padded. +// Inputs[1]: required: 2 Column matrix specifying padding sizes. The number of +// rows must be the same as the rank of the input. +// Inputs[2]: required: REFLECT or SYMMETRIC. +// +// TensorFlow equivalent: MirrorPad. +struct MirrorPadOperator : Operator { + MirrorPadOperator() : Operator(OperatorType::kMirrorPad) {} + // mode is either SYMMETRIC or REFLECT. + MirrorPadMode mode; +}; + // Alloc's are used for transient arrays only. An Alloc specifies which interval // of the "transient_data" workspace buffer passed to inference functions, is to // be used for the transient array at hand. The 'start' and 'end' values are diff --git a/tensorflow/lite/toco/tflite/operator.cc b/tensorflow/lite/toco/tflite/operator.cc index c4030a1b1a..e0faed4927 100644 --- a/tensorflow/lite/toco/tflite/operator.cc +++ b/tensorflow/lite/toco/tflite/operator.cc @@ -1275,6 +1275,29 @@ class SquaredDifference int GetVersion(const Operator& op) const override { return 1; } }; +class MirrorPad + : public BuiltinOperator { + public: + using BuiltinOperator::BuiltinOperator; + flatbuffers::Offset WriteOptions( + const TocoOperator& op, + flatbuffers::FlatBufferBuilder* builder) const override { + return ::tflite::CreateMirrorPadOptions( + *builder, op.mode == MirrorPadMode::kReflect + ? ::tflite::MirrorPadMode::MirrorPadMode_REFLECT + : ::tflite::MirrorPadMode::MirrorPadMode_SYMMETRIC); + } + void ReadOptions(const TfLiteOptions& options, + TocoOperator* op) const override { + op->mode = options.mode() == ::tflite::MirrorPadMode::MirrorPadMode_REFLECT + ? 
MirrorPadMode::kReflect + : MirrorPadMode::kSymmetric; + } + + int GetVersion(const Operator& op) const override { return 1; } +}; + std::unique_ptr WriteFlexOpOptions( const string& tensorflow_node_def) { auto fbb = absl::make_unique(); @@ -1581,6 +1604,8 @@ std::vector> BuildOperatorList( ops.push_back(MakeUnique( ::tflite::BuiltinOperator_SQUARED_DIFFERENCE, OperatorType::kSquaredDifference)); + ops.push_back(MakeUnique(::tflite::BuiltinOperator_MIRROR_PAD, + OperatorType::kMirrorPad)); // Custom Operators. ops.push_back( diff --git a/tensorflow/lite/toco/tflite/operator_test.cc b/tensorflow/lite/toco/tflite/operator_test.cc index 6154d94692..14ec89cd73 100644 --- a/tensorflow/lite/toco/tflite/operator_test.cc +++ b/tensorflow/lite/toco/tflite/operator_test.cc @@ -616,6 +616,14 @@ TEST_F(OperatorTest, TestShouldExportAsFlexOp) { EXPECT_FALSE(ShouldExportAsFlexOp(true, "RFFT")); } +TEST_F(OperatorTest, BuiltinMirrorPad) { + MirrorPadOperator op; + op.mode = MirrorPadMode::kReflect; + auto output_toco_op = SerializeAndDeserialize( + GetOperator("MIRROR_PAD", OperatorType::kMirrorPad), op); + EXPECT_EQ(op.mode, output_toco_op->mode); +} + } // namespace } // namespace tflite diff --git a/tensorflow/lite/toco/tooling_util.cc b/tensorflow/lite/toco/tooling_util.cc index d0cc799424..fc9a0d8af5 100644 --- a/tensorflow/lite/toco/tooling_util.cc +++ b/tensorflow/lite/toco/tooling_util.cc @@ -415,6 +415,7 @@ const char* OperatorTypeName(OperatorType type) { HANDLE_OPERATORTYPENAME_CASE(ResizeNearestNeighbor) HANDLE_OPERATORTYPENAME_CASE(LeakyRelu) HANDLE_OPERATORTYPENAME_CASE(SquaredDifference) + HANDLE_OPERATORTYPENAME_CASE(MirrorPad) default: LOG(FATAL) << "Unhandled op type"; #undef HANDLE_OPERATORTYPENAME_CASE -- GitLab From 92854f2426c2b91f2b09831696ffebb5b793933d Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Mon, 3 Dec 2018 22:45:18 -0800 Subject: [PATCH 069/405] Enable depthwise convolution tests on the CPU PiperOrigin-RevId: 223921992 --- tensorflow/compiler/xla/tests/BUILD | 1 - .../compiler/xla/tests/conv_depthwise_test.cc | 20 +++++++++---------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD index 48bbc4e472..f7f090fe4a 100644 --- a/tensorflow/compiler/xla/tests/BUILD +++ b/tensorflow/compiler/xla/tests/BUILD @@ -305,7 +305,6 @@ xla_test( srcs = ["conv_depthwise_test.cc"], blacklisted_backends = [ # disabled because of a break b/119590850. - "cpu", "gpu", ], shard_count = 50, diff --git a/tensorflow/compiler/xla/tests/conv_depthwise_test.cc b/tensorflow/compiler/xla/tests/conv_depthwise_test.cc index 60ce576ceb..bc9bd8a269 100644 --- a/tensorflow/compiler/xla/tests/conv_depthwise_test.cc +++ b/tensorflow/compiler/xla/tests/conv_depthwise_test.cc @@ -136,7 +136,7 @@ string BuildHloTextDepthwiseConvolution2D( if (spec.activation_dims[1] == 1 && spec.kernel_dims[1] == 2) { return absl::StrFormat( R"( - HloModule TensorFlowDepthwiseConv, is_scheduled=true + HloModule TensorFlowDepthwiseConv ENTRY main { activation = %s[%s]{%s} parameter(0) @@ -161,7 +161,7 @@ string BuildHloTextDepthwiseConvolution2D( } else if (spec.stride == -1) { return absl::StrFormat( R"( - HloModule TensorFlowDepthwiseConv, is_scheduled=true + HloModule TensorFlowDepthwiseConv ENTRY main { activation = %s[%s]{%s} parameter(0) @@ -185,7 +185,7 @@ string BuildHloTextDepthwiseConvolution2D( } else { return absl::StrFormat( R"( - HloModule TensorFlowDepthwiseConv, is_scheduled=true + HloModule TensorFlowDepthwiseConv ENTRY main { activation = %s[%s]{%s} parameter(0) @@ -215,13 +215,13 @@ XLA_TEST_P(DepthwiseConvolution2DTest, DoIt) { const string hlo_text = BuildHloTextDepthwiseConvolution2D(spec, use_bfloat16); - EXPECT_TRUE(RunAndCompareNoHloPasses( - hlo_text, ErrorSpec{0.01, 
0.01}, [](HloModule* module) -> Status { - BFloat16MixedPrecisionRemoval remover; - TF_RETURN_IF_ERROR(remover.Run(module).status()); - Despecializer despecializer; - return despecializer.Run(module).status(); - })); + EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{0.01, 0.01}, + [](HloModule* module) -> Status { + BFloat16MixedPrecisionRemoval remover; + TF_RETURN_IF_ERROR(remover.Run(module).status()); + Despecializer despecializer; + return despecializer.Run(module).status(); + })); } INSTANTIATE_TEST_CASE_P( -- GitLab From f5b4fa5b43d238d1d3e93870bcb65ae7b8f38b47 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 4 Dec 2018 01:02:22 -0800 Subject: [PATCH 070/405] compat: Update forward compatibility horizon to 2018-12-04 PiperOrigin-RevId: 223933783 --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index 811bf31f39..0b6ff30488 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -32,7 +32,7 @@ from tensorflow.python.ops import variable_scope from tensorflow.python.util import tf_contextlib from tensorflow.python.util.tf_export import tf_export -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 12, 3) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 12, 4) @tf_export("compat.forward_compatible") -- GitLab From e58a84208a46cf749f92d8170ed81794ecfa3137 Mon Sep 17 00:00:00 2001 From: Ilya Biryukov Date: Tue, 4 Dec 2018 01:29:47 -0800 Subject: [PATCH 071/405] Update downloadable clang to r347933 PiperOrigin-RevId: 223937748 --- third_party/clang_toolchain/download_clang.bzl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/third_party/clang_toolchain/download_clang.bzl b/third_party/clang_toolchain/download_clang.bzl index a941ee1c99..7ced902747 100644 --- a/third_party/clang_toolchain/download_clang.bzl +++ 
b/third_party/clang_toolchain/download_clang.bzl @@ -39,15 +39,15 @@ def download_clang(repo_ctx, out_folder): # Latest CLANG_REVISION and CLANG_SUB_REVISION of the Chromiums's release # can be found in https://chromium.googlesource.com/chromium/src/tools/clang/+/master/scripts/update.py - CLANG_REVISION = "346388" - CLANG_SUB_REVISION = 3 + CLANG_REVISION = "347933" + CLANG_SUB_REVISION = 1 package_version = "%s-%s" % (CLANG_REVISION, CLANG_SUB_REVISION) checksums = { - "Linux_x64": "d47b7ac4756c3f8e3bbfa0e81bf199ec8e9faa3a6b11573f0705e9c04af7ad51", - "Mac": "de2b0c701e19cda633ea02804866dd24d8506afb8cae51fbcce3415b76f4ded3", - "Win": "c7d27f13b41aa9eaaf9760903962e9b2b0f8261058df0d35170711dc60545a7d", + "Linux_x64": "cae3643fdf5d46fc9bc8731212bb37573547148d90b64b083165e090133d11b0", + "Mac": "083a0e91a38c06e568652313ac7372b17a101268f7d65533d721ca30413442b4", + "Win": "43160487cfc7e88076a369a2b6e8e4a0f42e104c28d8903f3aaa62d630aba949", } platform_folder = _get_platform_folder(repo_ctx.os.name) -- GitLab From 566923394a39f16e0a008fc602416a7aec9a30c9 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 4 Dec 2018 02:29:42 -0800 Subject: [PATCH 072/405] Fix exported version of keras's model_to_estimator. 
PiperOrigin-RevId: 223944247 --- tensorflow/python/keras/estimator/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/keras/estimator/__init__.py b/tensorflow/python/keras/estimator/__init__.py index 3c1a63d6df..dcd0600897 100644 --- a/tensorflow/python/keras/estimator/__init__.py +++ b/tensorflow/python/keras/estimator/__init__.py @@ -65,7 +65,7 @@ def model_to_estimator( raise NotImplementedError( 'tf.keras.estimator.model_to_estimator function not available in your ' 'installation.') - keras_lib.model_to_estimator( + return keras_lib.model_to_estimator( keras_model=keras_model, keras_model_path=keras_model_path, custom_objects=custom_objects, -- GitLab From 900762cd4bca45fd8382778bd65e17f2fe13bf2b Mon Sep 17 00:00:00 2001 From: Jason Zaman Date: Wed, 28 Nov 2018 15:35:29 +0800 Subject: [PATCH 073/405] systemlibs: unbundle keras_applications Signed-off-by: Jason Zaman --- third_party/keras_applications_archive/BUILD.system | 13 +++++++++++++ .../keras_applications_archive/workspace.bzl | 1 + third_party/systemlibs/syslibs_configure.bzl | 1 + 3 files changed, 15 insertions(+) create mode 100644 third_party/keras_applications_archive/BUILD.system diff --git a/third_party/keras_applications_archive/BUILD.system b/third_party/keras_applications_archive/BUILD.system new file mode 100644 index 0000000000..a3b58f1503 --- /dev/null +++ b/third_party/keras_applications_archive/BUILD.system @@ -0,0 +1,13 @@ +# Description: Keras Applications: set of pre-trained deep learning models. 
+ +licenses(["notice"]) # MIT + +filegroup( + name = "LICENSE", + visibility = ["//visibility:public"], +) + +py_library( + name = "keras_applications", + visibility = ["//visibility:public"], +) diff --git a/third_party/keras_applications_archive/workspace.bzl b/third_party/keras_applications_archive/workspace.bzl index e90630fa97..cf9d15ca28 100644 --- a/third_party/keras_applications_archive/workspace.bzl +++ b/third_party/keras_applications_archive/workspace.bzl @@ -12,4 +12,5 @@ def repo(): "https://github.com/keras-team/keras-applications/archive/1.0.6.tar.gz", ], build_file = "//third_party/keras_applications_archive:BUILD.bazel", + system_build_file = "//third_party/keras_applications_archive:BUILD.system", ) diff --git a/third_party/systemlibs/syslibs_configure.bzl b/third_party/systemlibs/syslibs_configure.bzl index dbf4fd6e32..85187587c9 100644 --- a/third_party/systemlibs/syslibs_configure.bzl +++ b/third_party/systemlibs/syslibs_configure.bzl @@ -26,6 +26,7 @@ VALID_LIBS = [ "icu", "jpeg", "jsoncpp_git", + "keras_applications_archive", "lmdb", "nasm", "nsync", -- GitLab From 813af36087a44f2a5670625408b076f531ea805b Mon Sep 17 00:00:00 2001 From: Jason Zaman Date: Wed, 28 Nov 2018 15:35:50 +0800 Subject: [PATCH 074/405] systemlibs: icu: update unbundle //third_party/icu/data was added which depends on a new icu target that was missing in the unbundled BUILD file. 
Signed-off-by: Jason Zaman --- third_party/icu/BUILD.system | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/third_party/icu/BUILD.system b/third_party/icu/BUILD.system index 328e412a8c..8a88a6ef7e 100644 --- a/third_party/icu/BUILD.system +++ b/third_party/icu/BUILD.system @@ -1,13 +1,19 @@ +package( + default_visibility = ["//visibility:public"], +) + licenses(["notice"]) # Apache 2.0 filegroup( name = "icu4c/LICENSE", - visibility = ["//visibility:public"], ) filegroup( name = "icu4j/main/shared/licenses/LICENSE", - visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", ) cc_library( @@ -15,7 +21,6 @@ cc_library( deps = [ ":icuuc", ], - visibility = ["//visibility:public"], ) cc_library( -- GitLab From d6decf3b9adabf64d492bb0e2674d8d928637b2f Mon Sep 17 00:00:00 2001 From: Jason Zaman Date: Mon, 13 Aug 2018 00:29:28 +0800 Subject: [PATCH 075/405] systemlibs: Unbundle protobuf Use system_link_files for protobuf.bzl. The protobuf.bzl file is taken verbatim from the protobuf repo. This version of protobuf.bzl will only be used when opting into using the system version of protobuf and is off by default. 
Signed-off-by: Jason Zaman --- tensorflow/workspace.bzl | 12 + third_party/systemlibs/protobuf.BUILD | 103 +++++ third_party/systemlibs/protobuf.bzl | 417 +++++++++++++++++++ third_party/systemlibs/syslibs_configure.bzl | 3 + 4 files changed, 535 insertions(+) create mode 100644 third_party/systemlibs/protobuf.BUILD create mode 100644 third_party/systemlibs/protobuf.bzl diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 7e5f84be16..7063031628 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -357,6 +357,10 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): name = "protobuf_archive", sha256 = PROTOBUF_SHA256, strip_prefix = PROTOBUF_STRIP_PREFIX, + system_build_file = clean_dep("//third_party/systemlibs:protobuf.BUILD"), + system_link_files = { + "//third_party/systemlibs:protobuf.bzl": "protobuf.bzl", + }, urls = PROTOBUF_URLS, ) @@ -367,6 +371,10 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): name = "com_google_protobuf", sha256 = PROTOBUF_SHA256, strip_prefix = PROTOBUF_STRIP_PREFIX, + system_build_file = clean_dep("//third_party/systemlibs:protobuf.BUILD"), + system_link_files = { + "//third_party/systemlibs:protobuf.bzl": "protobuf.bzl", + }, urls = PROTOBUF_URLS, ) @@ -374,6 +382,10 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): name = "com_google_protobuf_cc", sha256 = PROTOBUF_SHA256, strip_prefix = PROTOBUF_STRIP_PREFIX, + system_build_file = clean_dep("//third_party/systemlibs:protobuf.BUILD"), + system_link_files = { + "//third_party/systemlibs:protobuf.bzl": "protobuf.bzl", + }, urls = PROTOBUF_URLS, ) diff --git a/third_party/systemlibs/protobuf.BUILD b/third_party/systemlibs/protobuf.BUILD new file mode 100644 index 0000000000..e9244d172b --- /dev/null +++ b/third_party/systemlibs/protobuf.BUILD @@ -0,0 +1,103 @@ +load("@protobuf_archive//:protobuf.bzl", + "proto_gen", + "py_proto_library", + "cc_proto_library", +) + +licenses(["notice"]) + +filegroup( + name = "LICENSE", + 
visibility = ["//visibility:public"], +) + +HEADERS = [ + "google/protobuf/any.pb.h", + "google/protobuf/any.proto", + "google/protobuf/arena.h", + "google/protobuf/compiler/importer.h", + "google/protobuf/descriptor.h", + "google/protobuf/descriptor.pb.h", + "google/protobuf/descriptor.proto", + "google/protobuf/duration.pb.h", + "google/protobuf/duration.proto", + "google/protobuf/dynamic_message.h", + "google/protobuf/empty.pb.h", + "google/protobuf/empty.proto", + "google/protobuf/field_mask.pb.h", + "google/protobuf/field_mask.proto", + "google/protobuf/io/coded_stream.h", + "google/protobuf/io/zero_copy_stream.h", + "google/protobuf/io/zero_copy_stream_impl_lite.h", + "google/protobuf/map.h", + "google/protobuf/repeated_field.h", + "google/protobuf/text_format.h", + "google/protobuf/timestamp.pb.h", + "google/protobuf/timestamp.proto", + "google/protobuf/util/json_util.h", + "google/protobuf/util/type_resolver_util.h", + "google/protobuf/wrappers.pb.h", + "google/protobuf/wrappers.proto", +] + +genrule( + name = "link_headers", + outs = HEADERS, + cmd = """ + for i in $(OUTS); do + f=$${i#$(@D)/} + mkdir -p $(@D)/$${f%/*} + ln -sf $(INCLUDEDIR)/$$f $(@D)/$$f + done + """, +) + +cc_library( + name = "protobuf", + hdrs = HEADERS, + linkopts = ["-lprotobuf"], + visibility = ["//visibility:public"], +) + +cc_library( + name = "protobuf_headers", + hdrs = HEADERS, + linkopts = ["-lprotobuf"], + visibility = ["//visibility:public"], +) + +cc_library( + name = "protoc_lib", + linkopts = ["-lprotoc"], + visibility = ["//visibility:public"], +) + +genrule( + name = "protoc", + outs = ["protoc.bin"], + cmd = "ln -s $$(which protoc) $@", + executable = 1, + visibility = ["//visibility:public"], +) + +cc_proto_library( + name = "cc_wkt_protos", + hdrs = HEADERS, + protoc = ":protoc", + internal_bootstrap_hack = 1, + visibility = ["//visibility:public"], +) + +proto_gen( + name = "protobuf_python_genproto", + includes = ["."], + protoc = "@protobuf_archive//:protoc", + 
visibility = ["//visibility:public"], +) + +py_library( + name = "protobuf_python", + data = [":link_headers"], + srcs_version = "PY2AND3", + visibility = ["//visibility:public"], +) diff --git a/third_party/systemlibs/protobuf.bzl b/third_party/systemlibs/protobuf.bzl new file mode 100644 index 0000000000..78f19c621a --- /dev/null +++ b/third_party/systemlibs/protobuf.bzl @@ -0,0 +1,417 @@ +def _GetPath(ctx, path): + if ctx.label.workspace_root: + return ctx.label.workspace_root + '/' + path + else: + return path + +def _IsNewExternal(ctx): + # Bazel 0.4.4 and older have genfiles paths that look like: + # bazel-out/local-fastbuild/genfiles/external/repo/foo + # After the exec root rearrangement, they look like: + # ../repo/bazel-out/local-fastbuild/genfiles/foo + return ctx.label.workspace_root.startswith("../") + +def _GenDir(ctx): + if _IsNewExternal(ctx): + # We are using the fact that Bazel 0.4.4+ provides repository-relative paths + # for ctx.genfiles_dir. + return ctx.genfiles_dir.path + ( + "/" + ctx.attr.includes[0] if ctx.attr.includes and ctx.attr.includes[0] else "") + # This means that we're either in the old version OR the new version in the local repo. + # Either way, appending the source path to the genfiles dir works. 
+ return ctx.var["GENDIR"] + "/" + _SourceDir(ctx) + +def _SourceDir(ctx): + if not ctx.attr.includes: + return ctx.label.workspace_root + if not ctx.attr.includes[0]: + return _GetPath(ctx, ctx.label.package) + if not ctx.label.package: + return _GetPath(ctx, ctx.attr.includes[0]) + return _GetPath(ctx, ctx.label.package + '/' + ctx.attr.includes[0]) + +def _CcHdrs(srcs, use_grpc_plugin=False): + ret = [s[:-len(".proto")] + ".pb.h" for s in srcs] + if use_grpc_plugin: + ret += [s[:-len(".proto")] + ".grpc.pb.h" for s in srcs] + return ret + +def _CcSrcs(srcs, use_grpc_plugin=False): + ret = [s[:-len(".proto")] + ".pb.cc" for s in srcs] + if use_grpc_plugin: + ret += [s[:-len(".proto")] + ".grpc.pb.cc" for s in srcs] + return ret + +def _CcOuts(srcs, use_grpc_plugin=False): + return _CcHdrs(srcs, use_grpc_plugin) + _CcSrcs(srcs, use_grpc_plugin) + +def _PyOuts(srcs, use_grpc_plugin=False): + ret = [s[:-len(".proto")] + "_pb2.py" for s in srcs] + if use_grpc_plugin: + ret += [s[:-len(".proto")] + "_pb2_grpc.py" for s in srcs] + return ret + +def _RelativeOutputPath(path, include, dest=""): + if include == None: + return path + + if not path.startswith(include): + fail("Include path %s isn't part of the path %s." 
% (include, path)) + + if include and include[-1] != '/': + include = include + '/' + if dest and dest[-1] != '/': + dest = dest + '/' + + path = path[len(include):] + return dest + path + +def _proto_gen_impl(ctx): + """General implementation for generating protos""" + srcs = ctx.files.srcs + deps = [] + deps += ctx.files.srcs + source_dir = _SourceDir(ctx) + gen_dir = _GenDir(ctx) + if source_dir: + import_flags = ["-I" + source_dir, "-I" + gen_dir] + else: + import_flags = ["-I."] + + for dep in ctx.attr.deps: + import_flags += dep.proto.import_flags + deps += dep.proto.deps + + args = [] + if ctx.attr.gen_cc: + args += ["--cpp_out=" + gen_dir] + if ctx.attr.gen_py: + args += ["--python_out=" + gen_dir] + + inputs = srcs + deps + if ctx.executable.plugin: + plugin = ctx.executable.plugin + lang = ctx.attr.plugin_language + if not lang and plugin.basename.startswith('protoc-gen-'): + lang = plugin.basename[len('protoc-gen-'):] + if not lang: + fail("cannot infer the target language of plugin", "plugin_language") + + outdir = gen_dir + if ctx.attr.plugin_options: + outdir = ",".join(ctx.attr.plugin_options) + ":" + outdir + args += ["--plugin=protoc-gen-%s=%s" % (lang, plugin.path)] + args += ["--%s_out=%s" % (lang, outdir)] + inputs += [plugin] + + if args: + ctx.action( + inputs=inputs, + outputs=ctx.outputs.outs, + arguments=args + import_flags + [s.path for s in srcs], + executable=ctx.executable.protoc, + mnemonic="ProtoCompile", + use_default_shell_env=True, + ) + + return struct( + proto=struct( + srcs=srcs, + import_flags=import_flags, + deps=deps, + ), + ) + +proto_gen = rule( + attrs = { + "srcs": attr.label_list(allow_files = True), + "deps": attr.label_list(providers = ["proto"]), + "includes": attr.string_list(), + "protoc": attr.label( + cfg = "host", + executable = True, + single_file = True, + mandatory = True, + ), + "plugin": attr.label( + cfg = "host", + allow_files = True, + executable = True, + ), + "plugin_language": attr.string(), + 
"plugin_options": attr.string_list(), + "gen_cc": attr.bool(), + "gen_py": attr.bool(), + "outs": attr.output_list(), + }, + output_to_genfiles = True, + implementation = _proto_gen_impl, +) +"""Generates codes from Protocol Buffers definitions. + +This rule helps you to implement Skylark macros specific to the target +language. You should prefer more specific `cc_proto_library `, +`py_proto_library` and others unless you are adding such wrapper macros. + +Args: + srcs: Protocol Buffers definition files (.proto) to run the protocol compiler + against. + deps: a list of dependency labels; must be other proto libraries. + includes: a list of include paths to .proto files. + protoc: the label of the protocol compiler to generate the sources. + plugin: the label of the protocol compiler plugin to be passed to the protocol + compiler. + plugin_language: the language of the generated sources + plugin_options: a list of options to be passed to the plugin + gen_cc: generates C++ sources in addition to the ones from the plugin. + gen_py: generates Python sources in addition to the ones from the plugin. + outs: a list of labels of the expected outputs from the protocol compiler. +""" + +def cc_proto_library( + name, + srcs=[], + deps=[], + cc_libs=[], + include=None, + protoc="@com_google_protobuf//:protoc", + internal_bootstrap_hack=False, + use_grpc_plugin=False, + default_runtime="@com_google_protobuf//:protobuf", + **kargs): + """Bazel rule to create a C++ protobuf library from proto source files + + NOTE: the rule is only an internal workaround to generate protos. The + interface may change and the rule may be removed when bazel has introduced + the native rule. + + Args: + name: the name of the cc_proto_library. + srcs: the .proto files of the cc_proto_library. + deps: a list of dependency labels; must be cc_proto_library. + cc_libs: a list of other cc_library targets depended by the generated + cc_library. 
+ include: a string indicating the include path of the .proto files. + protoc: the label of the protocol compiler to generate the sources. + internal_bootstrap_hack: a flag indicate the cc_proto_library is used only + for bootstraping. When it is set to True, no files will be generated. + The rule will simply be a provider for .proto files, so that other + cc_proto_library can depend on it. + use_grpc_plugin: a flag to indicate whether to call the grpc C++ plugin + when processing the proto files. + default_runtime: the implicitly default runtime which will be depended on by + the generated cc_library target. + **kargs: other keyword arguments that are passed to cc_library. + + """ + + includes = [] + if include != None: + includes = [include] + + if internal_bootstrap_hack: + # For pre-checked-in generated files, we add the internal_bootstrap_hack + # which will skip the codegen action. + proto_gen( + name=name + "_genproto", + srcs=srcs, + deps=[s + "_genproto" for s in deps], + includes=includes, + protoc=protoc, + visibility=["//visibility:public"], + ) + # An empty cc_library to make rule dependency consistent. 
+ native.cc_library( + name=name, + **kargs) + return + + grpc_cpp_plugin = None + if use_grpc_plugin: + grpc_cpp_plugin = "//external:grpc_cpp_plugin" + + gen_srcs = _CcSrcs(srcs, use_grpc_plugin) + gen_hdrs = _CcHdrs(srcs, use_grpc_plugin) + outs = gen_srcs + gen_hdrs + + proto_gen( + name=name + "_genproto", + srcs=srcs, + deps=[s + "_genproto" for s in deps], + includes=includes, + protoc=protoc, + plugin=grpc_cpp_plugin, + plugin_language="grpc", + gen_cc=1, + outs=outs, + visibility=["//visibility:public"], + ) + + if default_runtime and not default_runtime in cc_libs: + cc_libs = cc_libs + [default_runtime] + if use_grpc_plugin: + cc_libs = cc_libs + ["//external:grpc_lib"] + + native.cc_library( + name=name, + srcs=gen_srcs, + hdrs=gen_hdrs, + deps=cc_libs + deps, + includes=includes, + **kargs) + +def internal_gen_well_known_protos_java(srcs): + """Bazel rule to generate the gen_well_known_protos_java genrule + + Args: + srcs: the well known protos + """ + root = Label("%s//protobuf_java" % (REPOSITORY_NAME)).workspace_root + pkg = PACKAGE_NAME + "/" if PACKAGE_NAME else "" + if root == "": + include = " -I%ssrc " % pkg + else: + include = " -I%s/%ssrc " % (root, pkg) + native.genrule( + name = "gen_well_known_protos_java", + srcs = srcs, + outs = [ + "wellknown.srcjar", + ], + cmd = "$(location :protoc) --java_out=$(@D)/wellknown.jar" + + " %s $(SRCS) " % include + + " && mv $(@D)/wellknown.jar $(@D)/wellknown.srcjar", + tools = [":protoc"], + ) + +def internal_copied_filegroup(name, srcs, strip_prefix, dest, **kwargs): + """Macro to copy files to a different directory and then create a filegroup. + + This is used by the //:protobuf_python py_proto_library target to work around + an issue caused by Python source files that are part of the same Python + package being in separate directories. + + Args: + srcs: The source files to copy and add to the filegroup. + strip_prefix: Path to the root of the files to copy. 
+ dest: The directory to copy the source files into. + **kwargs: extra arguments that will be passed to the filegroup. + """ + outs = [_RelativeOutputPath(s, strip_prefix, dest) for s in srcs] + + native.genrule( + name = name + "_genrule", + srcs = srcs, + outs = outs, + cmd = " && ".join( + ["cp $(location %s) $(location %s)" % + (s, _RelativeOutputPath(s, strip_prefix, dest)) for s in srcs]), + ) + + native.filegroup( + name = name, + srcs = outs, + **kwargs) + +def py_proto_library( + name, + srcs=[], + deps=[], + py_libs=[], + py_extra_srcs=[], + include=None, + default_runtime="@com_google_protobuf//:protobuf_python", + protoc="@com_google_protobuf//:protoc", + use_grpc_plugin=False, + **kargs): + """Bazel rule to create a Python protobuf library from proto source files + + NOTE: the rule is only an internal workaround to generate protos. The + interface may change and the rule may be removed when bazel has introduced + the native rule. + + Args: + name: the name of the py_proto_library. + srcs: the .proto files of the py_proto_library. + deps: a list of dependency labels; must be py_proto_library. + py_libs: a list of other py_library targets depended by the generated + py_library. + py_extra_srcs: extra source files that will be added to the output + py_library. This attribute is used for internal bootstrapping. + include: a string indicating the include path of the .proto files. + default_runtime: the implicitly default runtime which will be depended on by + the generated py_library target. + protoc: the label of the protocol compiler to generate the sources. + use_grpc_plugin: a flag to indicate whether to call the grpc Python plugin + when processing the proto files. + **kargs: other keyword arguments that are passed to py_library. 
+ + """ + outs = _PyOuts(srcs, use_grpc_plugin) + + includes = [] + if include != None: + includes = [include] + + grpc_python_plugin = None + if use_grpc_plugin: + grpc_python_plugin = "//external:grpc_python_plugin" + # Note: Generated grpc code depends on Python grpc module. This dependency + # is not explicitly listed in py_libs. Instead, host system is assumed to + # have grpc installed. + + proto_gen( + name=name + "_genproto", + srcs=srcs, + deps=[s + "_genproto" for s in deps], + includes=includes, + protoc=protoc, + gen_py=1, + outs=outs, + visibility=["//visibility:public"], + plugin=grpc_python_plugin, + plugin_language="grpc" + ) + + if default_runtime and not default_runtime in py_libs + deps: + py_libs = py_libs + [default_runtime] + + native.py_library( + name=name, + srcs=outs+py_extra_srcs, + deps=py_libs+deps, + imports=includes, + **kargs) + +def internal_protobuf_py_tests( + name, + modules=[], + **kargs): + """Bazel rules to create batch tests for protobuf internal. + + Args: + name: the name of the rule. + modules: a list of modules for tests. The macro will create a py_test for + each of the parameter with the source "google/protobuf/%s.py" + kargs: extra parameters that will be passed into the py_test. + + """ + for m in modules: + s = "python/google/protobuf/internal/%s.py" % m + native.py_test( + name="py_%s" % m, + srcs=[s], + main=s, + **kargs) + + +def check_protobuf_required_bazel_version(): + """For WORKSPACE files, to check the installed version of bazel. + + This ensures bazel supports our approach to proto_library() depending on a + copied filegroup. 
(Fixed in bazel 0.5.4) + """ + expected = apple_common.dotted_version("0.5.4") + current = apple_common.dotted_version(native.bazel_version) + if current.compare_to(expected) < 0: + fail("Bazel must be newer than 0.5.4") diff --git a/third_party/systemlibs/syslibs_configure.bzl b/third_party/systemlibs/syslibs_configure.bzl index dbf4fd6e32..645d242c96 100644 --- a/third_party/systemlibs/syslibs_configure.bzl +++ b/third_party/systemlibs/syslibs_configure.bzl @@ -15,6 +15,8 @@ VALID_LIBS = [ "boringssl", "com_github_googleapis_googleapis", "com_github_googlecloudplatform_google_cloud_cpp", + "com_google_protobuf", + "com_google_protobuf_cc", "com_googlesource_code_re2", "curl", "cython", @@ -32,6 +34,7 @@ VALID_LIBS = [ "org_sqlite", "pcre", "png_archive", + "protobuf_archive", "six_archive", "snappy", "swig", -- GitLab From 8cc14c541e165528237e2bdb74a4e5ef9981675c Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Tue, 4 Dec 2018 05:39:27 -0800 Subject: [PATCH 076/405] Make sure that we can fuse diamond shaped graphs. We have some logic which does a global analysis of the graph to determine if we want to allow duplication of specific instructions. This logic however does not take into account whether actually duplication is needed because the analysis runs on the original graph and is not recomputed when nodes are fused. For the GPU and CPU backends, we run the fusion pass twice: once where we disallow *all* duplications, and once where we allow *some* duplications. The global analysis is therefore not needed if we disallow *all* duplications, and then we can successfully fuse diamond-shaped graphs because we do a reverse postorder traversal through the graph. 
PiperOrigin-RevId: 223961479 --- .../xla/service/instruction_fusion.cc | 13 +++-- .../xla/service/instruction_fusion_test.cc | 50 +++++++++++++++++++ 2 files changed, 59 insertions(+), 4 deletions(-) diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc index 2297edcbe1..7559ed1bab 100644 --- a/tensorflow/compiler/xla/service/instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/instruction_fusion.cc @@ -457,8 +457,13 @@ StatusOr InstructionFusion::Run(HloModule* module) { computation_ = computation; reachability_ = HloReachabilityMap::Build(computation_); - HloInstructionSet do_not_duplicate = - ComputeGloballyUnfusible(computation_->MakeInstructionPostOrder()); + HloInstructionSet do_not_duplicate; + // If we allow duplications, we need to compute which instructions we do not + // want to duplicate based on a global analysis of the graph. + if (may_duplicate_) { + do_not_duplicate = + ComputeGloballyUnfusible(computation_->MakeInstructionPostOrder()); + } auto fusion_queue = GetFusionQueue(computation_); // Instruction fusion effectively fuses edges in the computation graph @@ -566,8 +571,8 @@ HloInstruction* InstructionFusion::FuseIntoMultiOutput( bool InstructionFusion::MultiOutputFusionCreatesCycle( HloInstruction* producer, HloInstruction* consumer) { auto is_reachable = [&](const HloInstruction* a, const HloInstruction* b) { - // A consumer operand may have been multii-output fused into a parallel - // consumer and thus be missing from the oridinal reachability map. + // A consumer operand may have been multi-output fused into a parallel + // consumer and thus be missing from the original reachability map. 
if (!reachability_->IsPresent(a) || !reachability_->IsPresent(b)) { reachability_ = HloReachabilityMap::Build(consumer->parent()); } diff --git a/tensorflow/compiler/xla/service/instruction_fusion_test.cc b/tensorflow/compiler/xla/service/instruction_fusion_test.cc index 6b48312649..58b7135cea 100644 --- a/tensorflow/compiler/xla/service/instruction_fusion_test.cc +++ b/tensorflow/compiler/xla/service/instruction_fusion_test.cc @@ -394,6 +394,56 @@ TEST_F(InstructionFusionTest, AllowEffectiveUnaryDuplication) { .ValueOrDie()); } +TEST_F(InstructionFusionTest, FuseDiamondGraphsNoDuplication) { + auto module = ParseHloString(R"( + HloModule test_module + ENTRY Test { + p0 = f32[100] parameter(0) + p1 = f32[100] parameter(1) + add = f32[100] add(p0, p1) + slice1 = f32[99] slice(add), slice={[0:99:1]} + slice2 = f32[99] slice(add), slice={[1:100:1]} + ROOT add2 = f32[99] add(slice1, slice2) + })") + .ValueOrDie(); + EXPECT_TRUE( + InstructionFusion(InstructionFusion::IsExpensive, /*may_duplicate=*/false) + .Run(module.get()) + .ValueOrDie()) + << module->ToString(); + + HloInstruction* root = module->entry_computation()->root_instruction(); + // 'add' would originally need to be duplicated if fused. However after its + // two users 'slice1' and 'slice2' are fused into 'add2', 'add' has only one + // user and can now be also fused. 
+ EXPECT_THAT(root, op::Fusion(op::Parameter(), op::Parameter())); +} + +TEST_F(InstructionFusionTest, FuseDiamondGraphsAllowDuplication) { + auto module = ParseHloString(R"( + HloModule test_module + ENTRY Test { + p0 = f32[100] parameter(0) + p1 = f32[100] parameter(1) + add = f32[100] add(p0, p1) + slice1 = f32[99] slice(add), slice={[0:99:1]} + slice2 = f32[99] slice(add), slice={[1:100:1]} + ROOT add2 = f32[99] add(slice1, slice2) + })") + .ValueOrDie(); + EXPECT_TRUE( + InstructionFusion(InstructionFusion::IsExpensive, /*may_duplicate=*/true) + .Run(module.get()) + .ValueOrDie()) + << module->ToString(); + + HloInstruction* root = module->entry_computation()->root_instruction(); + // 'add' would originally need to be duplicated if fused. However after its + // two users 'slice1' and 'slice2' are fused into 'add2', 'add' has only one + // user and can now be also fused. + EXPECT_THAT(root, op::Fusion(op::Parameter(), op::Parameter())); +} + TEST_F(InstructionFusionTest, WideningConvertsAreAlwaysDuplicableIntoConsumers) { auto module = ParseHloString(R"( -- GitLab From 4beea30ec8a231a12b9183f631608a6af64b8172 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Tue, 4 Dec 2018 05:47:46 -0800 Subject: [PATCH 077/405] Include import library (_pywrap_tensorflow_internal.lib) for _pywrap_tensorflow_internal.pyd in pip package To fix #23740 PiperOrigin-RevId: 223962144 --- tensorflow/python/BUILD | 15 ++++++++++++++- tensorflow/tools/pip_package/BUILD | 11 ++++------- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index d990ebb7f1..a558045e4a 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -4181,11 +4181,24 @@ genrule( # Get the import library of _pywrap_tensorflow_internal.dll filegroup( - name = "pywrap_tensorflow_import_lib_file", + name = "get_pywrap_tensorflow_import_lib_file", srcs = [":_pywrap_tensorflow_internal.so"], output_group = "interface_library", ) +# Rename the import library for _pywrap_tensorflow_internal.pyd to _pywrap_tensorflow_internal.lib +# (It was _pywrap_tensorflow_internal.so.if.lib). +genrule( + name = "pywrap_tensorflow_import_lib_file", + srcs = [":get_pywrap_tensorflow_import_lib_file"], + outs = ["_pywrap_tensorflow_internal.lib"], + cmd = select({ + "//tensorflow:windows": "cp -f $< $@", + "//conditions:default": "touch $@", # Just a placeholder for Unix platforms + }), + visibility = ["//visibility:public"], +) + # Create a cc_import rule for the import library of _pywrap_tensorflow_internal.dll # so that custom ops' dynamic libraries can link against it. 
cc_import( diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD index fa372dcd74..82c6bf383f 100644 --- a/tensorflow/tools/pip_package/BUILD +++ b/tensorflow/tools/pip_package/BUILD @@ -88,6 +88,9 @@ COMMON_PIP_DEPS = [ "//tensorflow/contrib/timeseries:timeseries_pip", "//tensorflow/contrib/tpu", "//tensorflow/examples/tutorials/mnist:package", + "//tensorflow/lite/python:interpreter_test_data", + "//tensorflow/lite/python:tflite_convert", + "//tensorflow/lite/toco/python:toco_from_protos", # "//tensorflow/python/autograph/converters:converters", # "//tensorflow/python/autograph/core:core", "//tensorflow/python/autograph/core:test_lib", @@ -124,7 +127,7 @@ COMMON_PIP_DEPS = [ py_binary( name = "simple_console_for_windows", srcs = ["simple_console_for_windows.py"], - data = COMMON_PIP_DEPS, + data = COMMON_PIP_DEPS + ["//tensorflow/python:pywrap_tensorflow_import_lib_file"], srcs_version = "PY2AND3", deps = ["//tensorflow:tensorflow_py"], ) @@ -228,15 +231,9 @@ sh_binary( data = select({ "//tensorflow:windows": [ ":simple_console_for_windows", - "//tensorflow/lite/python:interpreter_test_data", - "//tensorflow/lite/python:tflite_convert", - "//tensorflow/lite/toco/python:toco_from_protos", ], "//conditions:default": COMMON_PIP_DEPS + [ ":simple_console", - "//tensorflow/lite/python:interpreter_test_data", - "//tensorflow/lite/python:tflite_convert", - "//tensorflow/lite/toco/python:toco_from_protos", ], }) + if_mkl_ml(["//third_party/mkl:intel_binary_blob"]), ) -- GitLab From 5828c4084ca025f7943e097cb2ddfd5e17817d64 Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Tue, 4 Dec 2018 06:13:55 -0800 Subject: [PATCH 078/405] Add a test case that shows exponential compilation behavior on the GPU/CPU backends. 
PiperOrigin-RevId: 223964674 --- .../xla/tests/array_elementwise_ops_test.cc | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc index 0615f9425c..f6be27bee2 100644 --- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc +++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc @@ -350,6 +350,44 @@ TEST_P(ArrayElementwiseOpTestParamCount, AddManyValues) { error_spec_); } +// TODO(b/119692968): This test runs OOM on the GPU and CPU backend. +XLA_TEST_F(ArrayElementwiseOpTest, + DISABLED_ON_GPU(DISABLED_ON_CPU(DeeplyNestedAddWithSlices))) { + XlaBuilder builder(TestName()); + std::vector values(30, 0.0); + auto a_literal = LiteralUtil::CreateR1(values); + auto a = Parameter(&builder, 0, a_literal.shape(), "x"); + auto b_literal = LiteralUtil::CreateR1(values); + auto b = Parameter(&builder, 1, b_literal.shape(), "x"); + + // Construct a sequence of diamond-shaped gadgets like this: + // + // add + // / \ + // slice slice + // \ / + // add + // + // Each 'left' slice removes the last element, each 'right' slice removes the + // first element. In this way, we index into the add with different + // multi-dimensional index arrays, which defeats the caching we use to avoid + // exponential compile time. 
+ std::function generate_recursive = + [&](int64 slice_size) -> XlaOp { + if (slice_size == values.size()) { + return Add(a, b); + } + XlaOp param = generate_recursive(slice_size + 1); + auto slice1 = Slice(param, {0}, {slice_size}, {1}); + auto slice2 = Slice(param, {1}, {slice_size + 1}, {1}); + return Add(slice1, slice2); + }; + generate_recursive(1); + auto a_data = client_->TransferToServer(a_literal).ConsumeValueOrDie(); + auto b_data = client_->TransferToServer(b_literal).ConsumeValueOrDie(); + ComputeAndCompareR1(&builder, {0.0}, {a_data.get(), b_data.get()}); +} + XLA_TEST_F(ArrayElementwiseOpTest, SubTwoConstantF32s) { XlaBuilder builder(TestName()); auto a = ConstantR1(&builder, {-2.5f, 3.14f, 2.25f, -10.0f, 6.0f}); -- GitLab From f5dfd16cf97a876d49930d9dd1e59092fc3fd40f Mon Sep 17 00:00:00 2001 From: Peter Hawkins Date: Tue, 4 Dec 2018 07:39:43 -0800 Subject: [PATCH 079/405] [TF:XLA] Remove the transpose_x/y and conjugate_x/y arguments from the TF2XLA BatchDot helper. There's no need for a single large function that does everything. We can instead compose these operations out of the individual pieces, simplifying the BatchDot API. 
PiperOrigin-RevId: 223975055 --- .../tf2xla/kernels/batch_matmul_op.cc | 9 ++++-- tensorflow/compiler/tf2xla/lib/batch_dot.cc | 23 +++++--------- tensorflow/compiler/tf2xla/lib/batch_dot.h | 11 ++----- tensorflow/compiler/tf2xla/lib/cholesky.cc | 14 ++------- tensorflow/compiler/tf2xla/lib/qr.cc | 31 +++++-------------- .../compiler/tf2xla/lib/triangular_solve.cc | 23 +++++++------- tensorflow/compiler/tf2xla/lib/util.cc | 4 +++ tensorflow/compiler/tf2xla/lib/util.h | 4 +++ tensorflow/compiler/tf2xla/lib/util_test.cc | 2 +- 9 files changed, 46 insertions(+), 75 deletions(-) diff --git a/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc b/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc index 4cfe946b2e..7ba91765fd 100644 --- a/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc @@ -14,6 +14,8 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/tf2xla/lib/batch_dot.h" + +#include "tensorflow/compiler/tf2xla/lib/util.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" @@ -28,9 +30,10 @@ class BatchMatMulOp : public XlaOpKernel { } void Compile(XlaOpKernelContext* ctx) override { - auto result = BatchDot(ctx->Input(0), ctx->Input(1), - /*transpose_x=*/adj_x_, /*transpose_y=*/adj_y_, - /*conjugate_x=*/adj_x_, /*conjugate_y=*/adj_y_); + auto result = BatchDot(MaybeTransposeInMinorDims( + MaybeConjugate(ctx->Input(0), adj_x_), adj_x_), + MaybeTransposeInMinorDims( + MaybeConjugate(ctx->Input(1), adj_y_), adj_y_)); ctx->SetOutput(0, result); } diff --git a/tensorflow/compiler/tf2xla/lib/batch_dot.cc b/tensorflow/compiler/tf2xla/lib/batch_dot.cc index 5400e8834c..dfc840698a 100644 --- a/tensorflow/compiler/tf2xla/lib/batch_dot.cc +++ b/tensorflow/compiler/tf2xla/lib/batch_dot.cc @@ -26,8 +26,7 @@ limitations under the License. 
namespace tensorflow { -xla::XlaOp BatchDot(xla::XlaOp x, xla::XlaOp y, bool transpose_x, - bool transpose_y, bool conjugate_x, bool conjugate_y, +xla::XlaOp BatchDot(xla::XlaOp x, xla::XlaOp y, xla::PrecisionConfig::Precision precision) { xla::XlaBuilder* builder = x.builder(); return builder->ReportErrorOrReturn([&]() -> xla::StatusOr { @@ -61,15 +60,14 @@ xla::XlaOp BatchDot(xla::XlaOp x, xla::XlaOp y, bool transpose_x, batch_dimension_numbers.push_back(i); } - int x_inner_dim = transpose_x ? (ndims - 2) : (ndims - 1); - int y_inner_dim = transpose_y ? (ndims - 1) : (ndims - 2); + int x_inner_dim = ndims - 1; + int y_inner_dim = ndims - 2; if (x_shape.dimensions(x_inner_dim) != y_shape.dimensions(y_inner_dim)) { return errors::InvalidArgument( "Dimensions ", x_inner_dim, " and ", y_inner_dim, " of arguments to BatchedDot must be equal: ", - xla::ShapeUtil::HumanString(x_shape), " transpose: ", transpose_x, - " vs. ", xla::ShapeUtil::HumanString(y_shape), - " transpose: ", transpose_y); + xla::ShapeUtil::HumanString(x_shape), " vs. ", + xla::ShapeUtil::HumanString(y_shape)); } // Check for zero lhs/rhs dim size. @@ -79,8 +77,8 @@ xla::XlaOp BatchDot(xla::XlaOp x, xla::XlaOp y, bool transpose_x, for (int i = 0; i < batch_dimension_numbers.size(); ++i) { dimensions[i] = x_shape.dimensions(batch_dimension_numbers[i]); } - int x_outer_dim = transpose_x ? (ndims - 1) : (ndims - 2); - int y_outer_dim = transpose_y ? 
(ndims - 2) : (ndims - 1); + int x_outer_dim = ndims - 2; + int y_outer_dim = ndims - 1; dimensions.push_back(x_shape.dimensions(x_outer_dim)); dimensions.push_back(y_shape.dimensions(y_outer_dim)); return xla::Broadcast( @@ -89,13 +87,6 @@ xla::XlaOp BatchDot(xla::XlaOp x, xla::XlaOp y, bool transpose_x, dimensions); } - if (x_shape.element_type() == xla::C64 && conjugate_x) { - x = xla::Conj(x); - } - if (y_shape.element_type() == xla::C64 && conjugate_y) { - y = xla::Conj(y); - } - xla::PrecisionConfig precision_proto; precision_proto.add_operand_precision(precision); precision_proto.add_operand_precision(precision); diff --git a/tensorflow/compiler/tf2xla/lib/batch_dot.h b/tensorflow/compiler/tf2xla/lib/batch_dot.h index 6edd63a4d3..6cfa698593 100644 --- a/tensorflow/compiler/tf2xla/lib/batch_dot.h +++ b/tensorflow/compiler/tf2xla/lib/batch_dot.h @@ -25,12 +25,7 @@ namespace tensorflow { // Multiplies all slices of `Tensor` `x` and `y` (each slice can be // viewed as an element of a batch), and arranges the individual results -// in a single output tensor of the same batch size. Each of the -// individual slices can optionally be transposed before multiplication by -// setting the `transpose_x` or `transpose_y` flag to `true`. Similarly, each -// can be elementwise-complex-conjugated by setting the `conjugate_x` or -// `conjugate_y` flag to `true`. To apply a Hermitian adjoint to `x`, set both -// `transpose_x` and `conjugate_x` to `true`, and analogously for `y`. +// in a single output tensor of the same batch size. // // The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]` // and `[..., r_y, c_y]`. 
@@ -44,9 +39,7 @@ namespace tensorflow { // // output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :]) xla::XlaOp BatchDot( - xla::XlaOp x, xla::XlaOp y, bool transpose_x = false, - bool transpose_y = false, bool conjugate_x = false, - bool conjugate_y = false, + xla::XlaOp x, xla::XlaOp y, xla::PrecisionConfig::Precision precision = xla::PrecisionConfig::DEFAULT); } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/lib/cholesky.cc b/tensorflow/compiler/tf2xla/lib/cholesky.cc index ab3d0a5668..dfc014395b 100644 --- a/tensorflow/compiler/tf2xla/lib/cholesky.cc +++ b/tensorflow/compiler/tf2xla/lib/cholesky.cc @@ -101,10 +101,7 @@ xla::XlaOp CholeskyUnblocked(xla::XlaOp a, // a[..., i, i] auto a_ii = DynamicSliceInMinorDims(body_a, {i, i}, {1, 1}); // np.dot(row, np.swapaxes(row, -1, -2)) - auto diag_dot = BatchDot(row, row, - /*transpose_x=*/false, - /*transpose_y=*/true, /*conjugate_x=*/false, - /*conjugate_y=*/false, precision); + auto diag_dot = BatchDot(row, TransposeInMinorDims(row), precision); // l[..., i, i] = np.sqrt(a[..., i, i] - np.dot(row, // np.swapaxes(row, -1, -2))) auto l_ii = @@ -122,10 +119,7 @@ xla::XlaOp CholeskyUnblocked(xla::XlaOp a, // The columns in [i, n] are zeroed out in `row`, so we just have to // zero out rows above i+1 after the BatchDot. 
np.dot(l[..., :, :i], // r.T) - auto dot = BatchDot(body_l, row, - /*transpose_x=*/false, - /*transpose_y=*/true, /*conjugate_x=*/false, - /*conjugate_y=*/false, precision); + auto dot = BatchDot(body_l, TransposeInMinorDims(row), precision); // np.dot(l[..., i+1:, :i], r.T) auto dot_ip1 = xla::Select(xla::Le(mask_range_col, i), mask_zeros_col, dot); @@ -185,9 +179,7 @@ xla::XlaOp Cholesky(xla::XlaOp a, int64 block_size, // a[i:, i:i+k] -= np.dot(l[i:, :i], np.transpose(l[i:i+k, :i])) auto lhs = SliceInMinorDims(l, {i, 0}, {n, i}); auto rhs = SliceInMinorDims(l, {i, 0}, {i + k, i}); - auto delta = BatchDot(lhs, rhs, /*transpose_x=*/false, - /*transpose_y=*/true, /*conjugate_x=*/false, - /*conjugate_y=*/false, precision); + auto delta = BatchDot(lhs, TransposeInMinorDims(rhs), precision); auto before = SliceInMinorDims(a, {i, i}, {n, i + k}); a = UpdateSliceInMinorDims(a, before - delta, {i, i}); } diff --git a/tensorflow/compiler/tf2xla/lib/qr.cc b/tensorflow/compiler/tf2xla/lib/qr.cc index aa1c0e4743..440ad13321 100644 --- a/tensorflow/compiler/tf2xla/lib/qr.cc +++ b/tensorflow/compiler/tf2xla/lib/qr.cc @@ -191,12 +191,8 @@ xla::StatusOr QRBlock( auto v_broadcast = xla::Reshape(v, shape); // a[:, :] -= tau * np.dot(v[:, np.newaxis], // np.dot(v[np.newaxis, :], a[:, :])) - auto vva = - BatchDot(v_broadcast, a, /*transpose_x=*/false, /*transpose_y=*/false, - /*conjugate_x=*/false, /*conjugate_y=*/false, precision); - vva = - BatchDot(v_broadcast, vva, /*transpose_x=*/true, /*transpose_y=*/false, - /*conjugate_x=*/false, /*conjugate_y=*/false, precision); + auto vva = BatchDot(v_broadcast, a, precision); + vva = BatchDot(TransposeInMinorDims(v_broadcast), vva, precision); a = a - xla::Mul(tau, vva, /*broadcast_dimensions=*/batch_dim_indices); @@ -278,12 +274,9 @@ xla::StatusOr ComputeWYRepresentation( auto beta = DynamicSliceInMinorDims(taus, {j}, {1}); // yv has shape [..., n, 1] - auto yv = BatchDot(y, v, /*transpose_x=*/true, /*transpose_y=*/false, - 
/*conjugate_x=*/false, /*conjugate_y=*/false, precision); + auto yv = BatchDot(TransposeInMinorDims(y), v, precision); // wyv has shape [..., m, 1] - auto wyv = - BatchDot(w, yv, /*transpose_x=*/false, /*transpose_y=*/false, - /*conjugate_x=*/false, /*conjugate_y=*/false, precision); + auto wyv = BatchDot(w, yv, precision); auto z = xla::Mul( -beta, v + wyv, @@ -375,23 +368,15 @@ xla::StatusOr QRDecomposition( // a[i:, i+k:] += np.dot(Y, np.dot(W.T, a[i:, i+k:])) auto a_panel = SliceInMinorDims(a, {i, i + k}, {m, n}); - auto a_update = - BatchDot(w, a_panel, /*transpose_x=*/true, /*transpose_y=*/false, - /*conjugate_x=*/false, /*conjugate_y=*/false, precision); - a_update = - BatchDot(y, a_update, /*transpose_x=*/false, /*transpose_y=*/false, - /*conjugate_x=*/false, /*conjugate_y=*/false, precision); + auto a_update = BatchDot(TransposeInMinorDims(w), a_panel, precision); + a_update = BatchDot(y, a_update, precision); a_panel = a_panel + a_update; a = UpdateSliceInMinorDims(a, a_panel, {i, i + k}); // q[:, i:] += np.dot(np.dot(q[:, i:], W), Y.T)) auto q_panel = SliceInMinorDims(q, {0, i}, {m, m}); - auto q_update = - BatchDot(q_panel, w, /*transpose_x=*/false, /*transpose_y=*/false, - /*conjugate_x=*/false, /*conjugate_y=*/false, precision); - q_update = BatchDot(q_update, y, /*transpose_x=*/false, - /*transpose_y=*/true, /*conjugate_x=*/false, - /*conjugate_y=*/false, precision); + auto q_update = BatchDot(q_panel, w, precision); + q_update = BatchDot(q_update, TransposeInMinorDims(y), precision); q_panel = q_panel + q_update; q = UpdateSliceInMinorDims(q, q_panel, {0, i}); } diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc b/tensorflow/compiler/tf2xla/lib/triangular_solve.cc index 4bc3796e4f..df8489b62e 100644 --- a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc +++ b/tensorflow/compiler/tf2xla/lib/triangular_solve.cc @@ -311,13 +311,13 @@ xla::XlaOp SolveWithInvertedDiagonalBlocks( auto a_row = MaybeConjugate(SliceInMinorDims(a, start, end), 
conjugate_a); if (left_side) { - remainder = b_row - BatchDot(a_row, x, transpose_a, false, - /*conjugate_x=*/false, - /*conjugate_y=*/false, precision); + remainder = + b_row - BatchDot(MaybeTransposeInMinorDims(a_row, transpose_a), x, + precision); } else { - remainder = b_row - BatchDot(x, a_row, false, transpose_a, - /*conjugate_x=*/false, - /*conjugate_y=*/false, precision); + remainder = + b_row - BatchDot(x, MaybeTransposeInMinorDims(a_row, transpose_a), + precision); } } @@ -327,13 +327,12 @@ xla::XlaOp SolveWithInvertedDiagonalBlocks( xla::ConstantR0WithType(builder, xla::S32, j * block_size); std::vector update_starts = {start_index, zero}; if (left_side) { - x_update = - BatchDot(inv_block, remainder, transpose_a, false, - /*conjugate_x=*/false, /*conjugate_y=*/false, precision); + x_update = BatchDot(MaybeTransposeInMinorDims(inv_block, transpose_a), + remainder, precision); } else { - x_update = - BatchDot(remainder, inv_block, false, transpose_a, - /*conjugate_x=*/false, /*conjugate_y=*/false, precision); + x_update = BatchDot(remainder, + MaybeTransposeInMinorDims(inv_block, transpose_a), + precision); std::swap(update_starts[0], update_starts[1]); } x = DynamicUpdateSliceInMinorDims(x, x_update, /*starts=*/update_starts); diff --git a/tensorflow/compiler/tf2xla/lib/util.cc b/tensorflow/compiler/tf2xla/lib/util.cc index 804671fbc7..610d6882ba 100644 --- a/tensorflow/compiler/tf2xla/lib/util.cc +++ b/tensorflow/compiler/tf2xla/lib/util.cc @@ -239,6 +239,10 @@ xla::XlaOp TransposeInMinorDims(xla::XlaOp x) { }); } +xla::XlaOp MaybeTransposeInMinorDims(xla::XlaOp x, bool transpose) { + return transpose ? 
TransposeInMinorDims(x) : x; +} + xla::XlaOp MaybeConjugate(xla::XlaOp x, bool conjugate) { xla::XlaBuilder* builder = x.builder(); return builder->ReportErrorOrReturn([&]() -> xla::StatusOr { diff --git a/tensorflow/compiler/tf2xla/lib/util.h b/tensorflow/compiler/tf2xla/lib/util.h index 80e9e5b002..d0980717c9 100644 --- a/tensorflow/compiler/tf2xla/lib/util.h +++ b/tensorflow/compiler/tf2xla/lib/util.h @@ -72,6 +72,10 @@ xla::XlaOp DynamicUpdateSliceInMinorDims(xla::XlaOp x, xla::XlaOp update, // Transposes a stack of matrices `x` by swapping the last two dimensions. xla::XlaOp TransposeInMinorDims(xla::XlaOp x); +// Transposes `x` in its minor dimensions if `transpose` is true, otherwise +// returns `x` unchanged. +xla::XlaOp MaybeTransposeInMinorDims(xla::XlaOp x, bool transpose); + // Applies a complex conjugation operation if `a` is complex and `conjugate_a` // is true, otherwise returns its argument. xla::XlaOp MaybeConjugate(xla::XlaOp x, bool conjugate); diff --git a/tensorflow/compiler/tf2xla/lib/util_test.cc b/tensorflow/compiler/tf2xla/lib/util_test.cc index 442fe92c34..820f9a4972 100644 --- a/tensorflow/compiler/tf2xla/lib/util_test.cc +++ b/tensorflow/compiler/tf2xla/lib/util_test.cc @@ -126,7 +126,7 @@ XLA_TEST_F(UtilTest, RowBatchDot) { auto l_index = DynamicSliceInMinorDims( a, {index, xla::ConstantR0(&builder, 0)}, {1, n}); - BatchDot(l_index, row, /*transpose_x=*/false, /*transpose_y=*/true); + BatchDot(l_index, TransposeInMinorDims(row)); ComputeAndCompareR3(&builder, {{{33}}, {{292}}}, {a_data.get(), row_data.get(), index_data.get()}); -- GitLab From e185d1cc064b836867d0f33dd8c864ccf9a23570 Mon Sep 17 00:00:00 2001 From: Derek Murray Date: Tue, 4 Dec 2018 08:02:18 -0800 Subject: [PATCH 080/405] [tf.data] Add upgrade script warning about `Dataset.make_{initializable,one_shot}_iterator(). 
PiperOrigin-RevId: 223978130 --- .../tools/compatibility/tf_upgrade_v2.py | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tensorflow/tools/compatibility/tf_upgrade_v2.py b/tensorflow/tools/compatibility/tf_upgrade_v2.py index 571367d112..c37bab029c 100644 --- a/tensorflow/tools/compatibility/tf_upgrade_v2.py +++ b/tensorflow/tools/compatibility/tf_upgrade_v2.py @@ -786,11 +786,32 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec): "only effects core estimator. If you are using " "tf.contrib.learn.Estimator, please switch to using core estimator.") + make_initializable_iterator_deprecation = ( + "(Manual edit required) The " + "`tf.data.Dataset.make_initializable_iterator()` method has been " + "removed. If you are using the Estimator API, you can return a dataset " + "directly from your input functions without creating an iterator. " + "As a last resort, please replace calls to that method on `dataset` " + "with a call to " + "`tf.compat.v1.data.make_initializable_iterator(dataset)`.") + + make_one_shot_iterator_deprecation = ( + "(Manual edit required) The " + "`tf.data.Dataset.make_one_shot_iterator()` method has been " + "removed. If you are using eager execution, you can iterate over " + "`dataset` using a Python `for` loop. If you are using the Estimator " + "API, you can return a dataset directly from your input functions " + "without creating an iterator. As a last resort, please replace calls " + "to that method on `dataset` with a call to " + "`tf.compat.v1.data.make_one_shot_iterator(dataset)`.") + # Specify warnings for functions that aren't restricted to the tf.x.y.z # format. This should only be used for methods with unique names, e.g. # export_savedmodel, which is only defined in Estimator objects. 
self.unrestricted_function_warnings = { "export_savedmodel": export_saved_model_renamed, + "make_initializable_iterator": make_initializable_iterator_deprecation, + "make_one_shot_iterator": make_one_shot_iterator_deprecation, } @staticmethod -- GitLab From 8f740400a1e41020bc35852a2e611ca237e38fa8 Mon Sep 17 00:00:00 2001 From: Tamara Norman Date: Tue, 4 Dec 2018 08:07:30 -0800 Subject: [PATCH 081/405] Improve documentation of v1 initializers PiperOrigin-RevId: 223979317 --- tensorflow/python/ops/init_ops.py | 78 ++++++++++++++++++++----------- 1 file changed, 51 insertions(+), 27 deletions(-) diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py index 03d2201a9a..c0a4bcd51d 100644 --- a/tensorflow/python/ops/init_ops.py +++ b/tensorflow/python/ops/init_ops.py @@ -55,6 +55,15 @@ class Initializer(object): """ def __call__(self, shape, dtype=None, partition_info=None): + """Returns a tensor object initialized as specified by the initializer. + + Args: + shape: Shape of the tensor. + dtype: Optional dtype of the tensor. If not provided use the initializer + dtype. + partition_info: Optional information about the possible partitioning of a + tensor. + """ raise NotImplementedError def get_config(self): @@ -143,7 +152,8 @@ class Constant(Initializer): value: A Python scalar, list or tuple of values, or a N-dimensional numpy array. All elements of the initialized variable will be set to the corresponding value in the `value` argument. - dtype: The data type. + dtype: Default data type, used if no `dtype` argument is provided when + calling the initializer. verify_shape: Boolean that enables verification of the shape of `value`. If `True`, the initializer will throw an error if the shape of `value` is not compatible with the shape of the initialized tensor. @@ -239,7 +249,8 @@ class RandomUniform(Initializer): seed: A Python integer. Used to create random seeds. See `tf.set_random_seed` for behavior. - dtype: The data type. 
+ dtype: Default data type, used if no `dtype` argument is provided when + calling the initializer. """ def __init__(self, minval=0, maxval=None, seed=None, dtype=dtypes.float32): @@ -275,7 +286,8 @@ class RandomNormal(Initializer): seed: A Python integer. Used to create random seeds. See `tf.set_random_seed` for behavior. - dtype: The data type. Only floating point types are supported. + dtype: Default data type, used if no `dtype` argument is provided when + calling the initializer. Only floating point types are supported. """ def __init__(self, mean=0.0, stddev=1.0, seed=None, dtype=dtypes.float32): @@ -316,7 +328,8 @@ class TruncatedNormal(Initializer): seed: A Python integer. Used to create random seeds. See `tf.set_random_seed` for behavior. - dtype: The data type. Only floating point types are supported. + dtype: Default data type, used if no `dtype` argument is provided when + calling the initializer. Only floating point types are supported. """ def __init__(self, mean=0.0, stddev=1.0, seed=None, dtype=dtypes.float32): @@ -369,8 +382,9 @@ class UniformUnitScaling(Initializer): seed: A Python integer. Used to create random seeds. See `tf.set_random_seed` for behavior. - dtype: The data type. Only floating point types are supported. - + dtype: Default data type, used if no `dtype` argument is provided when + calling the initializer. Only floating point types are supported. + References: [Sussillo et al., 2014](https://arxiv.org/abs/1412.6558) ([pdf](http://arxiv.org/pdf/1412.6558.pdf)) @@ -437,7 +451,8 @@ class VarianceScaling(Initializer): seed: A Python integer. Used to create random seeds. See `tf.set_random_seed` for behavior. - dtype: The data type. Only floating point types are supported. + dtype: Default data type, used if no `dtype` argument is provided when + calling the initializer. Only floating point types are supported. 
Raises: ValueError: In case of an invalid value for the "scale", mode" or @@ -483,7 +498,7 @@ class VarianceScaling(Initializer): else: scale /= max(1., (fan_in + fan_out) / 2.) if self.distribution == "normal" or self.distribution == "truncated_normal": - # constant taken from scipy.stats.truncnorm.std(a=-2, b=2, loc=0., scale=1.) + # constant taken from scipy.stats.truncnorm.std(a=-2, b=2, loc=0., scale=1.) stddev = math.sqrt(scale) / .87962566103423978 return random_ops.truncated_normal( shape, 0.0, stddev, dtype, seed=self.seed) @@ -534,8 +549,9 @@ class Orthogonal(Initializer): seed: A Python integer. Used to create random seeds. See `tf.set_random_seed` for behavior. - dtype: The data type. - + dtype: Default data type, used if no `dtype` argument is provided when + calling the initializer. Only floating point types are supported. + References: [Saxe et al., 2014](https://openreview.net/forum?id=_wzZwKpTDF_9C) ([pdf](https://arxiv.org/pdf/1312.6120.pdf)) @@ -592,8 +608,9 @@ class ConvolutionDeltaOrthogonal(Initializer): `gain` after applying this convolution. seed: A Python integer. Used to create random seeds. See `tf.set_random_seed` for behavior. - dtype: The data type. - + dtype: Default data type, used if no `dtype` argument is provided when + calling the initializer. Only floating point types are supported. + References: [Xiao et al., 2018](http://proceedings.mlr.press/v80/xiao18a.html) ([pdf](http://proceedings.mlr.press/v80/xiao18a/xiao18a.pdf)) @@ -652,8 +669,9 @@ class ConvolutionOrthogonal(Initializer): `gain` after applying this convolution. seed: A Python integer. Used to create random seeds. See `tf.set_random_seed` for behavior. - dtype: The data type. - + dtype: Default data type, used if no `dtype` argument is provided when + calling the initializer. Only floating point types are supported. 
+ References: [Xiao et al., 2018](http://proceedings.mlr.press/v80/xiao18a.html) ([pdf](http://proceedings.mlr.press/v80/xiao18a/xiao18a.pdf)) @@ -721,8 +739,9 @@ class ConvolutionOrthogonal2D(ConvolutionOrthogonal): a factor of `gain`. seed: A Python integer. Used to create random seeds. See `tf.set_random_seed` for behavior. - dtype: The data type. - + dtype: Default data type, used if no `dtype` argument is provided when + calling the initializer. Only floating point types are supported. + References: [Xiao et al., 2018](http://proceedings.mlr.press/v80/xiao18a.html) ([pdf](http://proceedings.mlr.press/v80/xiao18a/xiao18a.pdf)) @@ -862,8 +881,9 @@ class ConvolutionOrthogonal1D(ConvolutionOrthogonal): seed: A Python integer. Used to create random seeds. See `tf.set_random_seed` for behavior. - dtype: The data type. - + dtype: Default data type, used if no `dtype` argument is provided when + calling the initializer. Only floating point types are supported. + References: [Xiao et al., 2018](http://proceedings.mlr.press/v80/xiao18a.html) ([pdf](http://proceedings.mlr.press/v80/xiao18a/xiao18a.pdf)) @@ -982,8 +1002,9 @@ class ConvolutionOrthogonal3D(ConvolutionOrthogonal): `gain` after applying this convolution. seed: A Python integer. Used to create random seeds. See `tf.set_random_seed` for behavior. - dtype: The data type. - + dtype: Default data type, used if no `dtype` argument is provided when + calling the initializer. Only floating point types are supported. + References: [Xiao et al., 2018](http://proceedings.mlr.press/v80/xiao18a.html) ([pdf](http://proceedings.mlr.press/v80/xiao18a/xiao18a.pdf)) @@ -1132,7 +1153,8 @@ class Identity(Initializer): Args: gain: Multiplicative factor to apply to the identity matrix. - dtype: The type of the output. + dtype: Default data type, used if no `dtype` argument is provided when + calling the initializer. Only floating point types are supported. 
""" def __init__(self, gain=1.0, dtype=dtypes.float32): @@ -1170,9 +1192,10 @@ class GlorotUniform(VarianceScaling): seed: A Python integer. Used to create random seeds. See `tf.set_random_seed` for behavior. - dtype: The data type. Only floating point types are supported. + dtype: Default data type, used if no `dtype` argument is provided when + calling the initializer. Only floating point types are supported. - References: + References: [Glorot et al., 2010](http://proceedings.mlr.press/v9/glorot10a.html) ([pdf](http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf)) """ @@ -1208,9 +1231,10 @@ class GlorotNormal(VarianceScaling): Args: seed: A Python integer. Used to create random seeds. See `tf.set_random_seed` for behavior. - dtype: The data type. Only floating point types are supported. + dtype: Default data type, used if no `dtype` argument is provided when + calling the initializer. Only floating point types are supported. - References: + References: [Glorot et al., 2010](http://proceedings.mlr.press/v9/glorot10a.html) ([pdf](http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf)) """ @@ -1264,7 +1288,7 @@ def lecun_normal(seed=None): An initializer. References: - - Self-Normalizing Neural Networks, + - Self-Normalizing Neural Networks, [Klambauer et al., 2017](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks) ([pdf](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks.pdf)) - Efficient Backprop, @@ -1289,7 +1313,7 @@ def lecun_uniform(seed=None): An initializer. References: - - Self-Normalizing Neural Networks, + - Self-Normalizing Neural Networks, [Klambauer et al., 2017](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks) ([pdf](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks.pdf)) - Efficient Backprop, -- GitLab From e68e9432bf224987106123e08cc61b9686f256ce Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Tue, 4 Dec 2018 08:17:35 -0800 Subject: [PATCH 082/405] Internal change. PiperOrigin-RevId: 223980636 --- tensorflow/core/BUILD | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index c268605711..0e4923be70 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -1803,7 +1803,7 @@ cc_library( # registration of ops to prune code size. cc_library( name = "android_tensorflow_lib_selective_registration", - srcs = if_android(["//tensorflow/core:android_srcs"]), + srcs = if_android(["//tensorflow/core:android_srcs_only_runtime"]), copts = tf_copts(android_optimization_level_override = None) + [ "-DSUPPORT_SELECTIVE_REGISTRATION", ], @@ -1814,11 +1814,9 @@ cc_library( ], visibility = ["//visibility:public"], deps = [ + ":android_tensorflow_lib_lite", ":protos_all_cc_impl", - "//third_party/eigen3", "@com_google_absl//absl/container:flat_hash_set", - "@double_conversion//:double-conversion", - "@nsync//:nsync_cpp", "@protobuf_archive//:protobuf", ], alwayslink = 1, @@ -1828,7 +1826,7 @@ cc_library( # no proto_rtti. 
cc_library( name = "android_tensorflow_lib_selective_registration_nortti", - srcs = if_android(["//tensorflow/core:android_srcs"]), + srcs = if_android(["//tensorflow/core:android_srcs_only_runtime"]), copts = tf_copts(android_optimization_level_override = None) + tf_opts_nortti_if_android() + [ "-DSUPPORT_SELECTIVE_REGISTRATION", ], @@ -1839,11 +1837,9 @@ cc_library( ], visibility = ["//visibility:public"], deps = [ + ":android_tensorflow_lib_lite_nortti", ":protos_all_cc_impl", - "//third_party/eigen3", "@com_google_absl//absl/container:flat_hash_set", - "@double_conversion//:double-conversion", - "@nsync//:nsync_cpp", "@protobuf_archive//:protobuf", ], alwayslink = 1, -- GitLab From 7fa9d6a1f2d5916f3873ea38ed59c69af4c54e70 Mon Sep 17 00:00:00 2001 From: Derek Murray Date: Tue, 4 Dec 2018 08:52:57 -0800 Subject: [PATCH 083/405] Make `TensorBuffer::data()` non-virtual and move the pointer into the base class. All existing `TensorBuffer` subclasses already store a pointer to their buffer. Accessing that pointer by calling a virtual method is inefficient. We currently generate the following instruction sequence at the callsite (when compiling for x86_64 with a recent version of Clang): 1dd002: mov (%rdi),%rax tensor.h:655 1dd005: callq *0x10(%rax) tensor.h:655 ...and the following implementation for `Buffer::data()`: 236520: mov 0x10(%rdi),%rax tensor.h:888 236524: retq tensor.h:888 With this change, we generate a single `mov` instruction inline at the call site, and avoid any branching. 
PiperOrigin-RevId: 223985477 --- tensorflow/c/c_api.cc | 57 ++++++++++--------- tensorflow/compiler/jit/xla_launch_util.h | 10 ++-- .../common_runtime/gpu/gpu_event_mgr_test.cc | 4 +- tensorflow/core/framework/tensor.cc | 17 +++--- tensorflow/core/framework/tensor.h | 17 ++++-- tensorflow/core/framework/tensor_test.cc | 21 +++++++ tensorflow/lite/delegates/flex/buffer_map.cc | 57 ++++++++++--------- tensorflow/python/lib/core/py_func.cc | 6 +- 8 files changed, 110 insertions(+), 79 deletions(-) diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc index f13e8777df..94d18eb8b0 100644 --- a/tensorflow/c/c_api.cc +++ b/tensorflow/c/c_api.cc @@ -136,16 +136,22 @@ const char* TF_Message(const TF_Status* s) { namespace { class TF_ManagedBuffer : public TensorBuffer { public: - void* data_; - size_t len_; - void (*deallocator_)(void* data, size_t len, void* arg); - void* deallocator_arg_; + TF_ManagedBuffer(void* data, size_t len, + void (*deallocator)(void* data, size_t len, void* arg), + void* deallocator_arg) + : TensorBuffer(data), + len_(len), + deallocator_(deallocator), + deallocator_arg_(deallocator_arg) {} + + const size_t len_; + void (*const deallocator_)(void* data, size_t len, void* arg); + void* const deallocator_arg_; ~TF_ManagedBuffer() override { - (*deallocator_)(data_, len_, deallocator_arg_); + (*deallocator_)(data(), len_, deallocator_arg_); } - void* data() const override { return data_; } size_t size() const override { return len_; } TensorBuffer* root_buffer() override { return this; } void FillAllocationDescription(AllocationDescription* proto) const override { @@ -199,8 +205,7 @@ TF_Tensor* TF_NewTensor(TF_DataType dtype, const int64_t* dims, int num_dims, dimvec[i] = static_cast(dims[i]); } - TF_ManagedBuffer* buf = new TF_ManagedBuffer; - buf->len_ = len; + TF_ManagedBuffer* buf = nullptr; if (dtype != TF_STRING && dtype != TF_RESOURCE && tensorflow::DataTypeCanUseMemcpy(static_cast(dtype)) && reinterpret_cast(data) % std::max(1, 
EIGEN_MAX_ALIGN_BYTES) != @@ -212,17 +217,15 @@ TF_Tensor* TF_NewTensor(TF_DataType dtype, const int64_t* dims, int num_dims, // // Other types have the same representation, so copy only if it is safe to // do so. - buf->data_ = allocate_tensor("TF_NewTensor", len); - std::memcpy(buf->data_, data, len); - buf->deallocator_ = deallocate_buffer; - buf->deallocator_arg_ = nullptr; + buf = new TF_ManagedBuffer(allocate_tensor("TF_NewTensor", len), len, + deallocate_buffer, nullptr); + std::memcpy(buf->data(), data, len); // Free the original buffer. deallocator(data, len, deallocator_arg); } else { - buf->data_ = data; - buf->deallocator_ = deallocator; - buf->deallocator_arg_ = deallocator_arg; + buf = new TF_ManagedBuffer(data, len, deallocator, deallocator_arg); } + TF_Tensor* ret = new TF_Tensor{dtype, TensorShape(dimvec), buf}; size_t elem_size = TF_DataTypeSize(dtype); if (elem_size > 0 && len < (elem_size * ret->shape.num_elements())) { @@ -477,9 +480,9 @@ static TF_Tensor* EmptyTensor(TF_DataType dtype, const TensorShape& shape) { CHECK_EQ(nelems, 0); static_assert(sizeof(int64_t) == sizeof(tensorflow::int64), "64-bit int types should match in size"); - return TF_NewTensor(dtype, reinterpret_cast(dims.data()), - shape.dims(), reinterpret_cast(&empty), 0, - [](void*, size_t, void*) {}, nullptr); + return TF_NewTensor( + dtype, reinterpret_cast(dims.data()), shape.dims(), + reinterpret_cast(&empty), 0, [](void*, size_t, void*) {}, nullptr); } // Non-static for testing. 
@@ -1592,18 +1595,20 @@ TF_AttrMetadata TF_OperationGetAttrMetadata(TF_Operation* oper, break; \ } - LIST_CASE(s, TF_ATTR_STRING, metadata.total_size = 0; - for (int i = 0; i < attr->list().s_size(); - ++i) { metadata.total_size += attr->list().s(i).size(); }); + LIST_CASE( + s, TF_ATTR_STRING, metadata.total_size = 0; + for (int i = 0; i < attr->list().s_size(); + ++i) { metadata.total_size += attr->list().s(i).size(); }); LIST_CASE(i, TF_ATTR_INT); LIST_CASE(f, TF_ATTR_FLOAT); LIST_CASE(b, TF_ATTR_BOOL); LIST_CASE(type, TF_ATTR_TYPE); - LIST_CASE(shape, TF_ATTR_SHAPE, metadata.total_size = 0; - for (int i = 0; i < attr->list().shape_size(); ++i) { - const auto& s = attr->list().shape(i); - metadata.total_size += s.unknown_rank() ? 0 : s.dim_size(); - }); + LIST_CASE( + shape, TF_ATTR_SHAPE, metadata.total_size = 0; + for (int i = 0; i < attr->list().shape_size(); ++i) { + const auto& s = attr->list().shape(i); + metadata.total_size += s.unknown_rank() ? 0 : s.dim_size(); + }); LIST_CASE(tensor, TF_ATTR_TENSOR); LIST_CASE(tensor, TF_ATTR_FUNC); #undef LIST_CASE diff --git a/tensorflow/compiler/jit/xla_launch_util.h b/tensorflow/compiler/jit/xla_launch_util.h index 437db019a0..706811f55e 100644 --- a/tensorflow/compiler/jit/xla_launch_util.h +++ b/tensorflow/compiler/jit/xla_launch_util.h @@ -199,19 +199,17 @@ class XlaTensorBuffer : public TensorBuffer { public: XlaTensorBuffer(const void* ptr, size_t expected_size, size_t actual_size, Allocator* allocator) - : expected_size_(expected_size), + : TensorBuffer(const_cast(ptr)), + expected_size_(expected_size), actual_size_(actual_size), - allocator_(allocator) { - data_ = const_cast(ptr); - } + allocator_(allocator) {} ~XlaTensorBuffer() override { if (data_) { - allocator_->DeallocateRaw(data_); + allocator_->DeallocateRaw(data()); } } - void* data() const override { return data_; } size_t size() const override { return expected_size_; } TensorBuffer* root_buffer() override { return this; } diff --git 
a/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc b/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc index d2adf699f5..fe32147557 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc @@ -78,7 +78,8 @@ static std::atomic_int_fast64_t live_tensor_bytes(0); // A TensorBuffer that counts live memory usage for testing class TestTensorBuffer : public TensorBuffer { public: - explicit TestTensorBuffer(size_t bytes) : bytes_(bytes) { + explicit TestTensorBuffer(size_t bytes) + : TensorBuffer(nullptr), bytes_(bytes) { live_tensor_bytes += bytes_; } ~TestTensorBuffer() override { live_tensor_bytes -= bytes_; } @@ -86,7 +87,6 @@ class TestTensorBuffer : public TensorBuffer { size_t size() const override { return bytes_; } // Not used in this test - void* data() const override { return nullptr; } TensorBuffer* root_buffer() override { return nullptr; } void FillAllocationDescription(AllocationDescription* arg) const override {} diff --git a/tensorflow/core/framework/tensor.cc b/tensorflow/core/framework/tensor.cc index c7ddc6c21e..7e841489eb 100644 --- a/tensorflow/core/framework/tensor.cc +++ b/tensorflow/core/framework/tensor.cc @@ -68,7 +68,8 @@ namespace { // An un-templated base class for Buffer. 
class BufferBase : public TensorBuffer { public: - explicit BufferBase(Allocator* alloc) : alloc_(alloc) {} + explicit BufferBase(Allocator* alloc, void* data_ptr) + : TensorBuffer(data_ptr), alloc_(alloc) {} TensorBuffer* root_buffer() override { return this; } void FillAllocationDescription(AllocationDescription* proto) const override { @@ -106,7 +107,6 @@ class Buffer : public BufferBase { Buffer(Allocator* a, int64 n); Buffer(Allocator* a, int64 n, const AllocationAttributes& allocation_attr); - void* data() const override { return data_; } size_t size() const override { return sizeof(T) * elem_; } private: @@ -442,20 +442,20 @@ struct ProtoHelper { template Buffer::Buffer(Allocator* a, int64 n) - : BufferBase(a), data_(a->Allocate(n)), elem_(n) {} + : BufferBase(a, a->Allocate(n)), elem_(n) {} template Buffer::Buffer(Allocator* a, int64 n, const AllocationAttributes& allocation_attr) - : BufferBase(a), data_(a->Allocate(n, allocation_attr)), elem_(n) {} + : BufferBase(a, a->Allocate(n, allocation_attr)), elem_(n) {} template Buffer::~Buffer() { - if (data_) { + if (data()) { if (LogMemory::IsEnabled()) { RecordDeallocation(); } - alloc_->Deallocate(data_, elem_); + alloc_->Deallocate(static_cast(data()), elem_); } } @@ -764,7 +764,9 @@ class SubBuffer : public TensorBuffer { public: // This buffer is an alias to buf[delta, delta + n). SubBuffer(TensorBuffer* buf, int64 delta, int64 n) - : root_(buf->root_buffer()), data_(buf->base() + delta), elem_(n) { + : TensorBuffer(buf->base() + delta), + root_(buf->root_buffer()), + elem_(n) { // Sanity check. The caller should ensure the sub buffer is valid. 
CHECK_LE(root_->base(), this->base()); T* root_limit = root_->base() + root_->size() / sizeof(T); @@ -775,7 +777,6 @@ class SubBuffer : public TensorBuffer { root_->Ref(); } - void* data() const override { return data_; } size_t size() const override { return sizeof(T) * elem_; } TensorBuffer* root_buffer() override { return root_; } void FillAllocationDescription(AllocationDescription* proto) const override { diff --git a/tensorflow/core/framework/tensor.h b/tensorflow/core/framework/tensor.h index 3177bbe7e9..6e03cf9f6f 100644 --- a/tensorflow/core/framework/tensor.h +++ b/tensorflow/core/framework/tensor.h @@ -636,10 +636,15 @@ class Tensor { // Interface to access the raw ref-counted data buffer. class TensorBuffer : public core::RefCounted { public: + explicit TensorBuffer(void* data_ptr) : data_(data_ptr) {} ~TensorBuffer() override {} // data() points to a memory region of size() bytes. - virtual void* data() const = 0; + // + // NOTE(mrry): The `data()` method is not virtual for performance reasons. + // It can be called multiple times when the contents of a `Tensor` are + // accessed, and so making it non-virtual allows the body to be inlined. + void* data() const { return data_; } virtual size_t size() const = 0; // If this TensorBuffer is sub-buffer of another TensorBuffer, @@ -657,6 +662,9 @@ class TensorBuffer : public core::RefCounted { // Whether this TensorBuffer owns the underlying memory. 
virtual bool OwnsMemory() const { return true; } + + private: + void* const data_; }; template @@ -874,6 +882,7 @@ inline Tensor::Tensor(Tensor&& other) class Tensor::HostScalarTensorBufferBase : public TensorBuffer { public: + using TensorBuffer::TensorBuffer; void FillAllocationDescription(AllocationDescription* proto) const final; }; @@ -884,8 +893,7 @@ template struct Tensor::ValueAndTensorBuffer { class HostScalarTensorBuffer : public Tensor::HostScalarTensorBufferBase { public: - HostScalarTensorBuffer(void* data) : data_(data) {} - void* data() const final { return const_cast(data_); } + HostScalarTensorBuffer(void* data) : HostScalarTensorBufferBase(data) {} size_t size() const final { return sizeof(T); } TensorBuffer* root_buffer() final { return this; } @@ -904,8 +912,7 @@ struct Tensor::ValueAndTensorBuffer { } private: - ~HostScalarTensorBuffer() override { static_cast(data_)->~T(); } - void* const data_; + ~HostScalarTensorBuffer() override { static_cast(data())->~T(); } }; T value; diff --git a/tensorflow/core/framework/tensor_test.cc b/tensorflow/core/framework/tensor_test.cc index 4fa9d1df67..713f91fe04 100644 --- a/tensorflow/core/framework/tensor_test.cc +++ b/tensorflow/core/framework/tensor_test.cc @@ -1491,5 +1491,26 @@ void BM_CreateAndMoveCtrWithBuf(int iters) { } BENCHMARK(BM_CreateAndMoveCtrWithBuf); +// Benchmark creating and destroying a host-scalar tensor, using the allocator +// interface. +void BM_CreateAndDestroyHostScalarNonOptimized(int iters) { + TensorShape shape({}); + Allocator* allocator = cpu_allocator(); + while (--iters) { + Tensor a(allocator, DT_FLOAT, shape); + a.scalar()() = 37.0; + } +} +BENCHMARK(BM_CreateAndDestroyHostScalarNonOptimized); + +// Benchmark creating and destroying a host-scalar tensor, using the specialized +// constructor.
+void BM_CreateAndDestroyHostScalarOptimized(int iters) { + while (--iters) { + Tensor a(37.0); + } +} +BENCHMARK(BM_CreateAndDestroyHostScalarOptimized); + } // namespace } // namespace tensorflow diff --git a/tensorflow/lite/delegates/flex/buffer_map.cc b/tensorflow/lite/delegates/flex/buffer_map.cc index 262ca9e089..0d0c953636 100644 --- a/tensorflow/lite/delegates/flex/buffer_map.cc +++ b/tensorflow/lite/delegates/flex/buffer_map.cc @@ -26,6 +26,8 @@ namespace flex { namespace { // A tensor buffer that is allocated, deallocated and populated by TF Lite. class BaseTfLiteTensorBuffer : public tensorflow::TensorBuffer { + using tensorflow::TensorBuffer::TensorBuffer; + TensorBuffer* root_buffer() override { return this; } void FillAllocationDescription( tensorflow::AllocationDescription* proto) const override { @@ -60,31 +62,29 @@ class BaseTfLiteTensorBuffer : public tensorflow::TensorBuffer { // representation in TFLITE and TF, so we just need use memcpy(). class TfLiteTensorBuffer : public BaseTfLiteTensorBuffer { public: - explicit TfLiteTensorBuffer(const TfLiteTensor* tensor) { + explicit TfLiteTensorBuffer(const TfLiteTensor* tensor) + : BaseTfLiteTensorBuffer(tensorflow::cpu_allocator()->AllocateRaw( + EIGEN_MAX_ALIGN_BYTES, tensor->bytes)) { // TODO(ahentz): if we can guarantee that TF Lite allocated tensors with // the same alignment as TensorFlow (EIGEN_MAX_ALIGN_BYTES), then we can // potentially eliminate the copy below. 
len_ = tensor->bytes; - data_ = - tensorflow::cpu_allocator()->AllocateRaw(EIGEN_MAX_ALIGN_BYTES, len_); LogAllocation(); - if (data_) { - std::memcpy(data_, tensor->data.raw, tensor->bytes); + if (data()) { + std::memcpy(data(), tensor->data.raw, tensor->bytes); } } ~TfLiteTensorBuffer() override { LogDeallocation(); - tensorflow::cpu_allocator()->DeallocateRaw(data_); + tensorflow::cpu_allocator()->DeallocateRaw(data()); } - void* data() const override { return data_; } size_t size() const override { return len_; } private: - void* data_; size_t len_; }; @@ -92,19 +92,30 @@ class TfLiteTensorBuffer : public BaseTfLiteTensorBuffer { // TF's so we need perform the conversion here. class StringTfLiteTensorBuffer : public BaseTfLiteTensorBuffer { public: - explicit StringTfLiteTensorBuffer(const TfLiteTensor* tensor) { - if (tensor->data.raw == nullptr) { - num_strings_ = 0; - data_ = nullptr; - return; - } - num_strings_ = GetStringCount(tensor->data.raw); - data_ = tensorflow::cpu_allocator()->Allocate(num_strings_); + explicit StringTfLiteTensorBuffer(const TfLiteTensor* tensor) + : StringTfLiteTensorBuffer(tensor, tensor->data.raw != nullptr + ? GetStringCount(tensor->data.raw) + : 0) {} + + ~StringTfLiteTensorBuffer() override { + LogDeallocation(); + tensorflow::cpu_allocator()->Deallocate( + static_cast(data()), num_strings_); + } + + size_t size() const override { return num_strings_ * sizeof(string); } + private: + StringTfLiteTensorBuffer(const TfLiteTensor* tensor, int num_strings) + : BaseTfLiteTensorBuffer( + num_strings != 0 + ? 
tensorflow::cpu_allocator()->Allocate(num_strings) + : nullptr), + num_strings_(num_strings) { LogAllocation(); - if (data_) { - string* p = data_; + if (data()) { + string* p = static_cast(data()); for (size_t i = 0; i < num_strings_; ++p, ++i) { auto ref = GetString(tensor->data.raw, i); p->assign(ref.str, ref.len); @@ -112,16 +123,6 @@ class StringTfLiteTensorBuffer : public BaseTfLiteTensorBuffer { } } - ~StringTfLiteTensorBuffer() override { - LogDeallocation(); - tensorflow::cpu_allocator()->Deallocate(data_, num_strings_); - } - - void* data() const override { return data_; } - size_t size() const override { return num_strings_ * sizeof(string); } - - private: - string* data_; int num_strings_; }; diff --git a/tensorflow/python/lib/core/py_func.cc b/tensorflow/python/lib/core/py_func.cc index 9364aec373..97bebe8617 100644 --- a/tensorflow/python/lib/core/py_func.cc +++ b/tensorflow/python/lib/core/py_func.cc @@ -302,15 +302,14 @@ Status DoCallPyFunc(PyCall* call, bool* out_log_on_error) { class NumpyTensorBuffer : public TensorBuffer { public: NumpyTensorBuffer(PyArrayObject* array, size_t len, void* data) - : array_(array), len_(len), data_(data) {} + : TensorBuffer(data), array_(array), len_(len) {} ~NumpyTensorBuffer() override { // Note: The session::run wrapper is responsible for freeing this while // holding the GIL. 
- DelayedNumpyDecref(data_, len_, array_); + DelayedNumpyDecref(data(), len_, array_); } - void* data() const override { return data_; } size_t size() const override { return len_; } TensorBuffer* root_buffer() override { return this; } void FillAllocationDescription(AllocationDescription* proto) const override { @@ -329,7 +328,6 @@ class NumpyTensorBuffer : public TensorBuffer { private: PyArrayObject* array_; size_t len_; - void* data_; }; Status PyObjectToString(PyObject* obj, string* str) { -- GitLab From 7b1169cd951730e89ec4b019dd1c135e22eed29f Mon Sep 17 00:00:00 2001 From: Chris Antaki Date: Tue, 4 Dec 2018 08:57:57 -0800 Subject: [PATCH 084/405] Removes line from README As @terrytangyuan pointed out in #23647, the line being removed makes a suggestion that can't be followed --- tensorflow/contrib/tfprof/README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/tensorflow/contrib/tfprof/README.md b/tensorflow/contrib/tfprof/README.md index b29d1acacf..f40e76f554 100644 --- a/tensorflow/contrib/tfprof/README.md +++ b/tensorflow/contrib/tfprof/README.md @@ -1,7 +1,5 @@ # tfprof: TensorFlow Profiler and Beyond -

Please use `tf.profiler.xxx` instead of `tf.contrib.tfprof.xxx`

-

Full Document in tensorflow/core/profiler/README.md

-- GitLab From 30d6a001371a9cef5ab085980356365d4861b8ee Mon Sep 17 00:00:00 2001 From: Pooya Davoodi Date: Tue, 4 Dec 2018 09:03:15 -0800 Subject: [PATCH 085/405] Update README.md --- tensorflow/contrib/tensorrt/README.md | 29 ++++++++++++++++----------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/tensorflow/contrib/tensorrt/README.md b/tensorflow/contrib/tensorrt/README.md index 09ef7f459f..dedac2c748 100644 --- a/tensorflow/contrib/tensorrt/README.md +++ b/tensorflow/contrib/tensorrt/README.md @@ -2,8 +2,7 @@ This module provides necessary bindings and introduces `TRTEngineOp` operator that wraps a subgraph in TensorRT. -This is still a work in progress but should be useable -with most common graphs. +This module is under active development. ## Installing TF-TRT @@ -24,6 +23,21 @@ of TensorRT from the Installation instructions for compatibility with TensorFlow are provided on the [TensorFlow GPU support](https://www.tensorflow.org/install/gpu) guide. +## Examples + +You can find example scripts for running inference on deep learning +models in this repository: https://github.com/tensorflow/tensorrt + +We have used these examples to verify the accuracy and +performance of TF-TRT. For more information see +[Verified Models](https://docs.nvidia.com/deeplearning/dgx/integrate-tf-trt/index.html#verified-models). + +## Documentation + +[TF-TRT documentation](https://docs.nvidia.com/deeplearning/dgx/integrate-tf-trt/index.html) +gives an overview of the supported functionalities, provides tutorials +and verified models, explains best practices with troubleshooting guides. + ## Tests TF-TRT includes both Python tests and C++ unit tests. @@ -33,16 +47,6 @@ with the Python command. Most of the C++ unit tests are used to test the conversion functions that convert each TF op to a number of TensorRT layers.
-## Examples - -You can find example scripts for running inference on deep learning models -in this repository: https://github.com/tensorflow/tensorrt - -## Documentation - -You can find documentation for TF-TRT here: -https://docs.nvidia.com/deeplearning/dgx/integrate-tf-trt/index.html - ## Compilation In order to compile the module, you need to have a local TensorRT installation @@ -56,3 +60,4 @@ has to set path to location where the library is installed during configuration. bazel build --config=cuda --config=opt //tensorflow/tools/pip_package:build_pip_package bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/ ``` + -- GitLab From 508331bda52cba5446a5a76c8c3aaac787d455bb Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 4 Dec 2018 09:00:45 -0800 Subject: [PATCH 086/405] Remove dimension switching for depthwise convolutions from the feature group converter. PiperOrigin-RevId: 223986482 --- .../convolution_feature_group_converter.cc | 32 ------------------- 1 file changed, 32 deletions(-) diff --git a/tensorflow/compiler/xla/service/convolution_feature_group_converter.cc b/tensorflow/compiler/xla/service/convolution_feature_group_converter.cc index 07d6680a72..95c7724c3c 100644 --- a/tensorflow/compiler/xla/service/convolution_feature_group_converter.cc +++ b/tensorflow/compiler/xla/service/convolution_feature_group_converter.cc @@ -205,38 +205,6 @@ Status ConvolutionVisitor::HandleConvolution(HloInstruction* convolution) { // If the code generator handles depthwise separable convolutions // inherently, then no filter expansion is needed. if (!filter_expansion_ && depthwise_separable) { - const int64 old_kernel_input_feature_dimension = - dim_numbers.kernel_input_feature_dimension(); - const int64 old_kernel_output_feature_dimension = - dim_numbers.kernel_output_feature_dimension(); - - // For depthwise convolutions, we want the kernel input feature dimension - // to be smaller than the output feature dimension. 
If that's not the - // case, we swap the dimensions. - if (old_kernel_input_feature_dimension > - old_kernel_output_feature_dimension) { - Shape reshaped_filter_shape = filter->shape(); - auto& dimensions = *reshaped_filter_shape.mutable_dimensions(); - std::swap(dimensions[old_kernel_input_feature_dimension], - dimensions[old_kernel_output_feature_dimension]); - - auto reshaped_filter = - add(HloInstruction::CreateReshape(reshaped_filter_shape, filter)); - - dim_numbers.set_kernel_input_feature_dimension( - old_kernel_output_feature_dimension); - - dim_numbers.set_kernel_output_feature_dimension( - old_kernel_input_feature_dimension); - - auto new_convolution = HloInstruction::CreateConvolve( - convolution->shape(), convolution->mutable_operand(0), - reshaped_filter, group_count, convolution->window(), dim_numbers, - convolution->precision_config()); - - TF_RETURN_IF_ERROR(computation_->ReplaceWithNewInstruction( - convolution, std::move(new_convolution))); - } return Status::OK(); } // We want to repeat 'filter' in the 'input_feature_dim' dimension -- GitLab From 3ec030892d2a92e86ad7015b211b31e6007df415 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 4 Dec 2018 09:34:48 -0800 Subject: [PATCH 087/405] Automated rollback of commit e68e9432bf224987106123e08cc61b9686f256ce PiperOrigin-RevId: 223991983 --- tensorflow/core/BUILD | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 0e4923be70..c268605711 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -1803,7 +1803,7 @@ cc_library( # registration of ops to prune code size. 
cc_library( name = "android_tensorflow_lib_selective_registration", - srcs = if_android(["//tensorflow/core:android_srcs_only_runtime"]), + srcs = if_android(["//tensorflow/core:android_srcs"]), copts = tf_copts(android_optimization_level_override = None) + [ "-DSUPPORT_SELECTIVE_REGISTRATION", ], @@ -1814,9 +1814,11 @@ cc_library( ], visibility = ["//visibility:public"], deps = [ - ":android_tensorflow_lib_lite", ":protos_all_cc_impl", + "//third_party/eigen3", "@com_google_absl//absl/container:flat_hash_set", + "@double_conversion//:double-conversion", + "@nsync//:nsync_cpp", "@protobuf_archive//:protobuf", ], alwayslink = 1, @@ -1826,7 +1828,7 @@ cc_library( # no proto_rtti. cc_library( name = "android_tensorflow_lib_selective_registration_nortti", - srcs = if_android(["//tensorflow/core:android_srcs_only_runtime"]), + srcs = if_android(["//tensorflow/core:android_srcs"]), copts = tf_copts(android_optimization_level_override = None) + tf_opts_nortti_if_android() + [ "-DSUPPORT_SELECTIVE_REGISTRATION", ], @@ -1837,9 +1839,11 @@ cc_library( ], visibility = ["//visibility:public"], deps = [ - ":android_tensorflow_lib_lite_nortti", ":protos_all_cc_impl", + "//third_party/eigen3", "@com_google_absl//absl/container:flat_hash_set", + "@double_conversion//:double-conversion", + "@nsync//:nsync_cpp", "@protobuf_archive//:protobuf", ], alwayslink = 1, -- GitLab From b3710b27cc6098c86204b5ca3459280cbae1161c Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 4 Dec 2018 09:48:01 -0800 Subject: [PATCH 088/405] TOCO should be OK with using constant arrays as outputs. 
PiperOrigin-RevId: 223994547 --- tensorflow/lite/testing/generate_examples.py | 13 ++++++++----- tensorflow/lite/toco/toco_tooling.cc | 1 + tensorflow/lite/toco/tooling_util.cc | 3 +++ 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/tensorflow/lite/testing/generate_examples.py b/tensorflow/lite/testing/generate_examples.py index 5c90410a19..4c731a7d18 100644 --- a/tensorflow/lite/testing/generate_examples.py +++ b/tensorflow/lite/testing/generate_examples.py @@ -815,6 +815,7 @@ def make_constant_tests(zip_path): test_parameters = [{ "dtype": [tf.float32, tf.int32], "input_shape": [[], [1], [2], [1, 1, 1, 1], [2, 2, 2, 2]], + "constant_is_also_output": [True, False], }] def build_graph(parameters): @@ -824,17 +825,19 @@ def make_constant_tests(zip_path): shape=parameters["input_shape"]) constant = tf.constant( create_tensor_data(parameters["dtype"], parameters["input_shape"])) - # This maximum node is here to avoid the situation where a graph output is - # a constant, which is an error in toco. 
- out = tf.maximum(dummy_input, constant) - return [dummy_input], [out] + out = [tf.maximum(dummy_input, constant)] + if parameters["constant_is_also_output"]: + out.append(constant) + + return [dummy_input], out def build_inputs(parameters, sess, inputs, outputs): dummy_input = np.zeros( parameters["input_shape"], dtype=_TF_TYPE_INFO[parameters["dtype"]][0]) return [dummy_input], sess.run(outputs, feed_dict={inputs[0]: dummy_input}) - make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs) + make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs, + expected_tf_success=20) def make_binary_op_tests(zip_path, binary_operator): diff --git a/tensorflow/lite/toco/toco_tooling.cc b/tensorflow/lite/toco/toco_tooling.cc index d8b111d037..55a454e66d 100644 --- a/tensorflow/lite/toco/toco_tooling.cc +++ b/tensorflow/lite/toco/toco_tooling.cc @@ -309,6 +309,7 @@ void Transform(const TocoFlags& toco_flags, Model* model) { // Fix any issues with IO edges. This must happen after any transform that // may modify the structure of the edges. FixEdgeArrays(model); + FixOperatorOrdering(model); if (quantize_output) { // If the user specified default min/max ranges we need to set all arrays diff --git a/tensorflow/lite/toco/tooling_util.cc b/tensorflow/lite/toco/tooling_util.cc index fc9a0d8af5..af4cd386a2 100644 --- a/tensorflow/lite/toco/tooling_util.cc +++ b/tensorflow/lite/toco/tooling_util.cc @@ -899,6 +899,9 @@ void CheckNonExistentIOArrays(const Model& model) { << "\" is not consumed by any op in this graph. " << general_comment; } for (const string& output_array : model.flags.output_arrays()) { + if (IsConstantParameterArray(model, output_array)) { + continue; // It is OK to request that a constant be an output. + } QCHECK(GetOpWithOutput(model, output_array)) << "Specified output array \"" << output_array << "\" is not produced by any op in this graph. 
" << general_comment; -- GitLab From a4e5957bc8065e2b8b5322d90d8b846525e83d16 Mon Sep 17 00:00:00 2001 From: Peter Hawkins Date: Tue, 4 Dec 2018 09:48:03 -0800 Subject: [PATCH 089/405] [XLA] [TF:XLA] Move BatchDot into the XLA client library. BatchDot is a helper for performing batched matrix multiplications where all except the minor two dimensions are batch dimensions. It is used by a number of other linear algebra routines. PiperOrigin-RevId: 223994553 --- tensorflow/compiler/tf2xla/kernels/BUILD | 1 - .../tf2xla/kernels/batch_matmul_op.cc | 12 +- tensorflow/compiler/tf2xla/lib/BUILD | 20 +--- tensorflow/compiler/tf2xla/lib/batch_dot.cc | 106 ------------------ tensorflow/compiler/tf2xla/lib/batch_dot.h | 47 -------- tensorflow/compiler/tf2xla/lib/cholesky.cc | 2 +- tensorflow/compiler/tf2xla/lib/qr.cc | 1 - .../compiler/tf2xla/lib/triangular_solve.cc | 1 - tensorflow/compiler/tf2xla/lib/util_test.cc | 2 +- tensorflow/compiler/xla/client/lib/BUILD | 4 + tensorflow/compiler/xla/client/lib/matrix.cc | 81 ++++++++++++- tensorflow/compiler/xla/client/lib/matrix.h | 20 ++++ 12 files changed, 114 insertions(+), 183 deletions(-) delete mode 100644 tensorflow/compiler/tf2xla/lib/batch_dot.cc delete mode 100644 tensorflow/compiler/tf2xla/lib/batch_dot.h diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD index dcd38764be..fa51a72aea 100644 --- a/tensorflow/compiler/tf2xla/kernels/BUILD +++ b/tensorflow/compiler/tf2xla/kernels/BUILD @@ -121,7 +121,6 @@ tf_kernel_library( ":while_op", "//tensorflow/compiler/tf2xla:common", "//tensorflow/compiler/tf2xla:xla_compiler", - "//tensorflow/compiler/tf2xla/lib:batch_dot", "//tensorflow/compiler/tf2xla/lib:broadcast", "//tensorflow/compiler/tf2xla/lib:cholesky", "//tensorflow/compiler/tf2xla/lib:qr", diff --git a/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc b/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc index 7ba91765fd..0a68988de1 100644 --- 
a/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc @@ -13,11 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/tf2xla/lib/batch_dot.h" - #include "tensorflow/compiler/tf2xla/lib/util.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/lib/matrix.h" namespace tensorflow { namespace { @@ -30,10 +29,11 @@ class BatchMatMulOp : public XlaOpKernel { } void Compile(XlaOpKernelContext* ctx) override { - auto result = BatchDot(MaybeTransposeInMinorDims( - MaybeConjugate(ctx->Input(0), adj_x_), adj_x_), - MaybeTransposeInMinorDims( - MaybeConjugate(ctx->Input(1), adj_y_), adj_y_)); + auto result = + xla::BatchDot(MaybeTransposeInMinorDims( + MaybeConjugate(ctx->Input(0), adj_x_), adj_x_), + MaybeTransposeInMinorDims( + MaybeConjugate(ctx->Input(1), adj_y_), adj_y_)); ctx->SetOutput(0, result); } diff --git a/tensorflow/compiler/tf2xla/lib/BUILD b/tensorflow/compiler/tf2xla/lib/BUILD index 3575e480c1..34b95a112e 100644 --- a/tensorflow/compiler/tf2xla/lib/BUILD +++ b/tensorflow/compiler/tf2xla/lib/BUILD @@ -17,20 +17,6 @@ filegroup( load("//tensorflow/compiler/xla/tests:build_defs.bzl", "xla_test") -cc_library( - name = "batch_dot", - srcs = ["batch_dot.cc"], - hdrs = ["batch_dot.h"], - deps = [ - "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla:status_macros", - "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client:xla_builder", - "//tensorflow/core:lib", - ], -) - cc_library( name = "broadcast", srcs = ["broadcast.cc"], @@ -52,7 +38,6 @@ cc_library( srcs = ["cholesky.cc"], hdrs = ["cholesky.h"], deps = [ - ":batch_dot", ":triangular_solve", ":util", 
":while_loop", @@ -63,6 +48,7 @@ cc_library( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/client/lib:constants", + "//tensorflow/compiler/xla/client/lib:matrix", "//tensorflow/core:lib", ], ) @@ -87,7 +73,6 @@ cc_library( srcs = ["qr.cc"], hdrs = ["qr.h"], deps = [ - ":batch_dot", ":util", ":while_loop", "//tensorflow/compiler/xla:literal_util", @@ -129,7 +114,6 @@ cc_library( srcs = ["triangular_solve.cc"], hdrs = ["triangular_solve.h"], deps = [ - ":batch_dot", ":util", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", @@ -191,7 +175,6 @@ xla_test( name = "util_test", srcs = ["util_test.cc"], deps = [ - ":batch_dot", ":util", "//tensorflow/compiler/xla:array2d", "//tensorflow/compiler/xla:literal", @@ -202,6 +185,7 @@ xla_test( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/lib:matrix", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", diff --git a/tensorflow/compiler/tf2xla/lib/batch_dot.cc b/tensorflow/compiler/tf2xla/lib/batch_dot.cc deleted file mode 100644 index dfc840698a..0000000000 --- a/tensorflow/compiler/tf2xla/lib/batch_dot.cc +++ /dev/null @@ -1,106 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/tf2xla/lib/batch_dot.h" - -#include -#include - -#include "tensorflow/compiler/xla/client/xla_builder.h" -#include "tensorflow/compiler/xla/shape_util.h" -#include "tensorflow/compiler/xla/status_macros.h" -#include "tensorflow/compiler/xla/statusor.h" -#include "tensorflow/core/lib/core/errors.h" - -namespace tensorflow { - -xla::XlaOp BatchDot(xla::XlaOp x, xla::XlaOp y, - xla::PrecisionConfig::Precision precision) { - xla::XlaBuilder* builder = x.builder(); - return builder->ReportErrorOrReturn([&]() -> xla::StatusOr { - TF_ASSIGN_OR_RETURN(xla::Shape x_shape, builder->GetShape(x)); - TF_ASSIGN_OR_RETURN(xla::Shape y_shape, builder->GetShape(y)); - - // Check that both tensors have the same number of dimensions. There must be - // at least two (the batch dimensions can be empty). - if (xla::ShapeUtil::Rank(x_shape) != xla::ShapeUtil::Rank(y_shape)) { - return errors::InvalidArgument( - "Arguments to BatchedDot have different ranks: ", - xla::ShapeUtil::HumanString(x_shape), " vs. ", - xla::ShapeUtil::HumanString(y_shape)); - } - const int ndims = xla::ShapeUtil::Rank(x_shape); - if (ndims < 2) { - return errors::InvalidArgument( - "Arguments to BatchedDot must have rank >= 2: ", ndims); - } - - // The batch dimensions must be equal and the matrix dimensions must be - // valid. 
- std::vector batch_dimension_numbers; - for (int i = 0; i < ndims - 2; ++i) { - if (x_shape.dimensions(i) != y_shape.dimensions(i)) { - return errors::InvalidArgument( - "Dimension ", i, " of inputs to BatchedDot must be equal: ", - xla::ShapeUtil::HumanString(x_shape), " vs ", - xla::ShapeUtil::HumanString(y_shape)); - } - batch_dimension_numbers.push_back(i); - } - - int x_inner_dim = ndims - 1; - int y_inner_dim = ndims - 2; - if (x_shape.dimensions(x_inner_dim) != y_shape.dimensions(y_inner_dim)) { - return errors::InvalidArgument( - "Dimensions ", x_inner_dim, " and ", y_inner_dim, - " of arguments to BatchedDot must be equal: ", - xla::ShapeUtil::HumanString(x_shape), " vs. ", - xla::ShapeUtil::HumanString(y_shape)); - } - - // Check for zero lhs/rhs dim size. - if (xla::ShapeUtil::IsZeroElementArray(x_shape) || - xla::ShapeUtil::IsZeroElementArray(y_shape)) { - std::vector dimensions(batch_dimension_numbers.size()); - for (int i = 0; i < batch_dimension_numbers.size(); ++i) { - dimensions[i] = x_shape.dimensions(batch_dimension_numbers[i]); - } - int x_outer_dim = ndims - 2; - int y_outer_dim = ndims - 1; - dimensions.push_back(x_shape.dimensions(x_outer_dim)); - dimensions.push_back(y_shape.dimensions(y_outer_dim)); - return xla::Broadcast( - xla::ConstantLiteral(builder, - xla::LiteralUtil::Zero(x_shape.element_type())), - dimensions); - } - - xla::PrecisionConfig precision_proto; - precision_proto.add_operand_precision(precision); - precision_proto.add_operand_precision(precision); - - xla::DotDimensionNumbers dot_dnums; - dot_dnums.add_lhs_contracting_dimensions(x_inner_dim); - dot_dnums.add_rhs_contracting_dimensions(y_inner_dim); - for (auto batch_dimension_number : batch_dimension_numbers) { - dot_dnums.add_lhs_batch_dimensions(batch_dimension_number); - dot_dnums.add_rhs_batch_dimensions(batch_dimension_number); - } - - return xla::DotGeneral(x, y, dot_dnums, &precision_proto); - }); -} - -} // namespace tensorflow diff --git 
a/tensorflow/compiler/tf2xla/lib/batch_dot.h b/tensorflow/compiler/tf2xla/lib/batch_dot.h deleted file mode 100644 index 6cfa698593..0000000000 --- a/tensorflow/compiler/tf2xla/lib/batch_dot.h +++ /dev/null @@ -1,47 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_BATCH_DOT_H_ -#define TENSORFLOW_COMPILER_TF2XLA_LIB_BATCH_DOT_H_ - -#include "tensorflow/compiler/xla/client/xla_builder.h" -#include "tensorflow/compiler/xla/xla_data.pb.h" - -namespace tensorflow { - -// Multiplies slices of two tensors in batches. - -// Multiplies all slices of `Tensor` `x` and `y` (each slice can be -// viewed as an element of a batch), and arranges the individual results -// in a single output tensor of the same batch size. -// -// The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]` -// and `[..., r_y, c_y]`. 
-// -// The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where: -// -// r_o = c_x if transpose_x else r_x -// c_o = r_y if transpose_y else c_y -// -// It is computed as: -// -// output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :]) -xla::XlaOp BatchDot( - xla::XlaOp x, xla::XlaOp y, - xla::PrecisionConfig::Precision precision = xla::PrecisionConfig::DEFAULT); - -} // namespace tensorflow - -#endif // TENSORFLOW_COMPILER_TF2XLA_LIB_BATCH_DOT_H_ diff --git a/tensorflow/compiler/tf2xla/lib/cholesky.cc b/tensorflow/compiler/tf2xla/lib/cholesky.cc index dfc014395b..d580e8b330 100644 --- a/tensorflow/compiler/tf2xla/lib/cholesky.cc +++ b/tensorflow/compiler/tf2xla/lib/cholesky.cc @@ -18,11 +18,11 @@ limitations under the License. #include #include -#include "tensorflow/compiler/tf2xla/lib/batch_dot.h" #include "tensorflow/compiler/tf2xla/lib/triangular_solve.h" #include "tensorflow/compiler/tf2xla/lib/util.h" #include "tensorflow/compiler/tf2xla/lib/while_loop.h" #include "tensorflow/compiler/xla/client/lib/constants.h" +#include "tensorflow/compiler/xla/client/lib/matrix.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/shape_util.h" diff --git a/tensorflow/compiler/tf2xla/lib/qr.cc b/tensorflow/compiler/tf2xla/lib/qr.cc index 440ad13321..1b374cf58c 100644 --- a/tensorflow/compiler/tf2xla/lib/qr.cc +++ b/tensorflow/compiler/tf2xla/lib/qr.cc @@ -18,7 +18,6 @@ limitations under the License. 
#include #include -#include "tensorflow/compiler/tf2xla/lib/batch_dot.h" #include "tensorflow/compiler/tf2xla/lib/util.h" #include "tensorflow/compiler/tf2xla/lib/while_loop.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc b/tensorflow/compiler/tf2xla/lib/triangular_solve.cc index df8489b62e..d2d4076401 100644 --- a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc +++ b/tensorflow/compiler/tf2xla/lib/triangular_solve.cc @@ -18,7 +18,6 @@ limitations under the License. #include #include -#include "tensorflow/compiler/tf2xla/lib/batch_dot.h" #include "tensorflow/compiler/tf2xla/lib/util.h" #include "tensorflow/compiler/xla/client/lib/constants.h" #include "tensorflow/compiler/xla/client/lib/matrix.h" diff --git a/tensorflow/compiler/tf2xla/lib/util_test.cc b/tensorflow/compiler/tf2xla/lib/util_test.cc index 820f9a4972..d86b01a2d5 100644 --- a/tensorflow/compiler/tf2xla/lib/util_test.cc +++ b/tensorflow/compiler/tf2xla/lib/util_test.cc @@ -19,8 +19,8 @@ limitations under the License. 
#include #include -#include "tensorflow/compiler/tf2xla/lib/batch_dot.h" #include "tensorflow/compiler/xla/array2d.h" +#include "tensorflow/compiler/xla/client/lib/matrix.h" #include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/test.h" diff --git a/tensorflow/compiler/xla/client/lib/BUILD b/tensorflow/compiler/xla/client/lib/BUILD index dc188b67a6..cc33293a63 100644 --- a/tensorflow/compiler/xla/client/lib/BUILD +++ b/tensorflow/compiler/xla/client/lib/BUILD @@ -110,7 +110,11 @@ cc_library( deps = [ ":arithmetic", ":constants", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:xla_builder", "@com_google_absl//absl/types:span", diff --git a/tensorflow/compiler/xla/client/lib/matrix.cc b/tensorflow/compiler/xla/client/lib/matrix.cc index f78d8509d8..77d9ec5ece 100644 --- a/tensorflow/compiler/xla/client/lib/matrix.cc +++ b/tensorflow/compiler/xla/client/lib/matrix.cc @@ -21,6 +21,11 @@ limitations under the License. 
#include "absl/types/span.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/lib/constants.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/util.h" namespace xla { @@ -71,7 +76,7 @@ XlaOp Triangle(XlaOp x, bool lower) { AsInt64Slice(shape.dimensions()).subspan(/*pos=*/0, /*len=*/n_dims - 2); auto a = Iota(builder, U32, n); auto b = Iota(builder, U32, m); - xla::XlaOp indicator; + XlaOp indicator; if (lower) { indicator = Ge(b, Broadcast(a, {m}), /*broadcast_dimensions=*/{0}); } else { @@ -87,4 +92,78 @@ XlaOp UpperTriangle(XlaOp x) { return Triangle(x, false); } XlaOp LowerTriangle(XlaOp x) { return Triangle(x, true); } +XlaOp BatchDot(XlaOp x, XlaOp y, PrecisionConfig::Precision precision) { + XlaBuilder* builder = x.builder(); + return builder->ReportErrorOrReturn([&]() -> StatusOr { + TF_ASSIGN_OR_RETURN(Shape x_shape, builder->GetShape(x)); + TF_ASSIGN_OR_RETURN(Shape y_shape, builder->GetShape(y)); + + // Check that both tensors have the same number of dimensions. There must be + // at least two (the batch dimensions can be empty). + if (ShapeUtil::Rank(x_shape) != ShapeUtil::Rank(y_shape)) { + return InvalidArgument( + "Arguments to BatchDot have different ranks: %s vs. %s", + ShapeUtil::HumanString(x_shape), ShapeUtil::HumanString(y_shape)); + } + const int ndims = ShapeUtil::Rank(x_shape); + if (ndims < 2) { + return InvalidArgument( + "Arguments to BatchDot must have rank >= 2: got %d", ndims); + } + + // The batch dimensions must be equal and the matrix dimensions must be + // valid. 
+ std::vector batch_dimension_numbers; + for (int i = 0; i < ndims - 2; ++i) { + if (x_shape.dimensions(i) != y_shape.dimensions(i)) { + return InvalidArgument( + "Dimension %d of inputs to BatchDot must be equal: shapes %s vs %s", + i, ShapeUtil::HumanString(x_shape), + ShapeUtil::HumanString(y_shape)); + } + batch_dimension_numbers.push_back(i); + } + + int x_inner_dim = ndims - 1; + int y_inner_dim = ndims - 2; + if (x_shape.dimensions(x_inner_dim) != y_shape.dimensions(y_inner_dim)) { + return InvalidArgument( + "Dimensions %d and %d of arguments to BatchDot must be equal: " + "shapes %s vs %s", + x_inner_dim, y_inner_dim, ShapeUtil::HumanString(x_shape), + ShapeUtil::HumanString(y_shape)); + } + + // Check for zero lhs/rhs dim size. + if (ShapeUtil::IsZeroElementArray(x_shape) || + ShapeUtil::IsZeroElementArray(y_shape)) { + std::vector dimensions(batch_dimension_numbers.size()); + for (int i = 0; i < batch_dimension_numbers.size(); ++i) { + dimensions[i] = x_shape.dimensions(batch_dimension_numbers[i]); + } + int x_outer_dim = ndims - 2; + int y_outer_dim = ndims - 1; + dimensions.push_back(x_shape.dimensions(x_outer_dim)); + dimensions.push_back(y_shape.dimensions(y_outer_dim)); + return Broadcast( + ConstantLiteral(builder, LiteralUtil::Zero(x_shape.element_type())), + dimensions); + } + + PrecisionConfig precision_proto; + precision_proto.add_operand_precision(precision); + precision_proto.add_operand_precision(precision); + + DotDimensionNumbers dot_dnums; + dot_dnums.add_lhs_contracting_dimensions(x_inner_dim); + dot_dnums.add_rhs_contracting_dimensions(y_inner_dim); + for (auto batch_dimension_number : batch_dimension_numbers) { + dot_dnums.add_lhs_batch_dimensions(batch_dimension_number); + dot_dnums.add_rhs_batch_dimensions(batch_dimension_number); + } + + return DotGeneral(x, y, dot_dnums, &precision_proto); + }); +} + } // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/matrix.h b/tensorflow/compiler/xla/client/lib/matrix.h index 
7af3e68f20..82e8514411 100644 --- a/tensorflow/compiler/xla/client/lib/matrix.h +++ b/tensorflow/compiler/xla/client/lib/matrix.h @@ -40,6 +40,26 @@ XlaOp UpperTriangle(XlaOp x); // Get the lower triangle part of the last two dimensions XlaOp LowerTriangle(XlaOp x); +// Multiplies slices of two tensors in batches. + +// Multiplies all slices of `Tensor` `x` and `y` (each slice can be +// viewed as an element of a batch), and arranges the individual results +// in a single output tensor of the same batch size. +// +// The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]` +// and `[..., r_y, c_y]`. +// +// The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where: +// +// r_o = c_x if transpose_x else r_x +// c_o = r_y if transpose_y else c_y +// +// It is computed as: +// +// output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :]) +xla::XlaOp BatchDot( + xla::XlaOp x, xla::XlaOp y, + xla::PrecisionConfig::Precision precision = xla::PrecisionConfig::DEFAULT); } // namespace xla #endif // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_MATRIX_H_ -- GitLab From 48f1d6fa940e34365e22874479d751efa706504b Mon Sep 17 00:00:00 2001 From: Sanjoy Das Date: Tue, 4 Dec 2018 09:48:05 -0800 Subject: [PATCH 090/405] Accept const buffers as args; NFC Makes it explicit that we do not modify arg buffers in place. 
PiperOrigin-RevId: 223994560 --- tensorflow/compiler/aot/codegen.cc | 2 +- tensorflow/compiler/aot/codegen_test_h.golden | 6 +++--- tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h | 10 ++++++++-- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/tensorflow/compiler/aot/codegen.cc b/tensorflow/compiler/aot/codegen.cc index e0ac7130a6..ab1c1be344 100644 --- a/tensorflow/compiler/aot/codegen.cc +++ b/tensorflow/compiler/aot/codegen.cc @@ -178,7 +178,7 @@ Status GenArgMethods(const tf2xla::Config& config, TF_RETURN_IF_ERROR( AddRewritesForShape(i, xla::Shape(ps.parameters(i)), &rewrites)); const string code = R"( - void set_arg{{NAME}}_data(void* data) { + void set_arg{{NAME}}_data(const void* data) { set_arg_data({{I}}, data); } {{TYPE}}* arg{{NAME}}_data() { diff --git a/tensorflow/compiler/aot/codegen_test_h.golden b/tensorflow/compiler/aot/codegen_test_h.golden index a2cdab5d1a..968afad65e 100644 --- a/tensorflow/compiler/aot/codegen_test_h.golden +++ b/tensorflow/compiler/aot/codegen_test_h.golden @@ -114,7 +114,7 @@ class MyClass : public tensorflow::XlaCompiledCpuFunction { // with dim indices specifying which value. No bounds checking is performed // on dim indices. 
- void set_arg0_data(void* data) { + void set_arg0_data(const void* data) { set_arg_data(0, data); } float* arg0_data() { @@ -132,7 +132,7 @@ class MyClass : public tensorflow::XlaCompiledCpuFunction { arg_data(0)))[dim0][dim1]; } - void set_arg_myfeed_data(void* data) { + void set_arg_myfeed_data(const void* data) { set_arg_data(0, data); } float* arg_myfeed_data() { @@ -150,7 +150,7 @@ class MyClass : public tensorflow::XlaCompiledCpuFunction { arg_data(0)))[dim0][dim1]; } - void set_arg1_data(void* data) { + void set_arg1_data(const void* data) { set_arg_data(1, data); } tensorflow::int64* arg1_data() { diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h index a1d359e97c..c7341cf8b9 100644 --- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h +++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h @@ -206,8 +206,14 @@ class XlaCompiledCpuFunction { // // Aliasing of argument and result buffers is not allowed, and results in // undefined behavior. - void set_arg_data(size_t index, void* data) { - buffer_table_[arg_index_table_[index]] = data; + void set_arg_data(size_t index, const void* data) { + // The const_cast is safe because the generated code does not write to arg + // buffers. + // + // buffer_table_ contains pointers to buffers that _will_ be written to by + // generated code so it would be misleading to make buffer_table_ a `const + // void**`. + buffer_table_[arg_index_table_[index]] = const_cast(data); } // ------------------------------ -- GitLab From dfbbc2d4de667e0f9fe07035d0c413d3e4bd8364 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 4 Dec 2018 09:48:13 -0800 Subject: [PATCH 091/405] Introduce a new XRTExecute flag which allows an exploded (in terms of its handles) tuple to be returned. This prevents clients which are interested in the tuple element handles to do extra RPCs to get them. 
PiperOrigin-RevId: 223994592 --- .../compiler/xrt/kernels/xrt_execute_op.cc | 37 +++++++-- tensorflow/compiler/xrt/tests/raw_api_test.cc | 76 +++++++++++++++++++ tensorflow/compiler/xrt/xrt.proto | 4 + 3 files changed, 109 insertions(+), 8 deletions(-) diff --git a/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc b/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc index 8c6191ddc0..751329eefc 100644 --- a/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc +++ b/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc @@ -228,14 +228,35 @@ Status XRTExecuteOp::DoWork(OpKernelContext* context) { TF_RETURN_IF_ERROR(XRTTupleAllocation::CreateFromBuffer( shaped_buffer, device_ref.backend(), device_ref.device_ordinal(), &output_tuple)); - - Tensor* output_tensor; - TF_RETURN_IF_ERROR( - context->allocate_output(0, TensorShape({}), &output_tensor)); - int64 key; - TF_RETURN_IF_ERROR(output_tuple->Intern(rm, &key)); - output_tensor->scalar()() = key; - + if (config_proto.return_exploded_tuple() && + xla::ShapeUtil::IsTuple(output_tuple->on_device_shape())) { + int64 tuple_element_count = + xla::ShapeUtil::TupleElementCount(output_tuple->on_device_shape()); + Tensor* output_tensor; + TF_RETURN_IF_ERROR(context->allocate_output( + 0, TensorShape({tuple_element_count}), &output_tensor)); + + for (int64 i = 0; i < tuple_element_count; ++i) { + xla::ShapeIndex shape_index; + shape_index.push_back(i); + + XRTTupleAllocation* suballocation; + TF_RETURN_IF_ERROR(XRTTupleAllocation::MakeSubBuffer( + output_tuple, shape_index, &suballocation, + /*alias_parent_allocation=*/false)); + int64 key; + TF_RETURN_IF_ERROR(suballocation->Intern(rm, &key)); + output_tensor->vec()(i) = key; + } + output_tuple->Unref(); + } else { + Tensor* output_tensor; + TF_RETURN_IF_ERROR( + context->allocate_output(0, TensorShape({}), &output_tensor)); + int64 key; + TF_RETURN_IF_ERROR(output_tuple->Intern(rm, &key)); + output_tensor->scalar()() = key; + } return Status::OK(); } diff --git 
a/tensorflow/compiler/xrt/tests/raw_api_test.cc b/tensorflow/compiler/xrt/tests/raw_api_test.cc index b9262c1843..aa0eabfce7 100644 --- a/tensorflow/compiler/xrt/tests/raw_api_test.cc +++ b/tensorflow/compiler/xrt/tests/raw_api_test.cc @@ -175,6 +175,18 @@ xla::XlaComputation AddAndTuple() { return builder.Build().ValueOrDie(); } +xla::XlaComputation AddAndSubTuple() { + xla::XlaBuilder builder("AddAndSubTuple"); + auto p0 = xla::Parameter(&builder, 0, xla::ShapeUtil::MakeShape(xla::F32, {}), + "P0"); + auto p1 = xla::Parameter(&builder, 1, xla::ShapeUtil::MakeShape(xla::F32, {}), + "P1"); + auto sum = xla::Add(p0, p1); + auto sub = xla::Sub(p0, p1); + xla::Tuple(&builder, {sum, sub}); + return builder.Build().ValueOrDie(); +} + void StoreComputationSnapshot(const xla::XlaComputation& computation, xla::HloSnapshot* dst) { auto snapshot = computation.Snapshot().ValueOrDie(); @@ -681,6 +693,70 @@ TEST(RawApiTest, CompileAndExecuteReturnTuple) { EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); } +TEST(RawApiTest, CompileAndExecuteReturnExplodedTuple) { + xrt::XLAAllocation p0; + p0.set_device_ordinal(0); + *p0.mutable_value() = xla::LiteralUtil::CreateR0(12.0f).ToProto(); + + xrt::XLAAllocation p1; + p1.set_device_ordinal(0); + *p1.mutable_value() = xla::LiteralUtil::CreateR0(3.0f).ToProto(); + + xrt::XLAComputation c; + auto config = c.mutable_config(); + auto shapes = config->mutable_program_shape(); + *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::F32, {}).ToProto(); + *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::F32, {}).ToProto(); + *shapes->mutable_result() = + xla::ShapeUtil::MakeTupleShape({xla::ShapeUtil::MakeShape(xla::F32, {}), + xla::ShapeUtil::MakeShape(xla::F32, {})}) + .ToProto(); + StoreComputationSnapshot(AddAndSubTuple(), c.mutable_hlo_snapshot()); + + xrt::XRTExecutionConfig e; + e.set_release_input_handles(true); + e.set_release_compilation_handle(true); + e.set_return_exploded_tuple(true); + + Scope root = 
Scope::NewRootScope().WithDevice(DeviceFromFlag()); + auto e_config = + ops::Const(root.WithDevice("/device:CPU:0"), e.SerializeAsString()); + auto computation = + ops::Const(root.WithDevice("/device:CPU:0"), c.SerializeAsString()); + auto c_handle = ops::XRTCompile(root, computation); + auto p0_value = + ops::Const(root.WithDevice("/device:CPU:0"), p0.SerializeAsString()); + auto p0_handle = ops::XRTAllocate(root, p0_value); + auto p1_value = + ops::Const(root.WithDevice("/device:CPU:0"), p1.SerializeAsString()); + auto p1_handle = ops::XRTAllocate(root, p1_value); + auto result = ops::XRTExecute(root, c_handle.handle, e_config, + {Output(p0_handle), Output(p1_handle)}); + TF_ASSERT_OK(root.status()); + + ClientSession session(root); + std::vector outputs; + TF_EXPECT_OK(session.Run({result}, &outputs)); + EXPECT_EQ(outputs.size(), 1); + + auto handles_vec = outputs.front().vec(); + EXPECT_EQ(handles_vec.size(), 2); + + const float kResults[2] = {15.0f, 9.0f}; + for (int64 i = 0; i < handles_vec.size(); ++i) { + auto read_back = ops::XRTReadLiteralAndRelease(root, Input(handles_vec(i))); + std::vector voutputs; + TF_EXPECT_OK(session.Run({read_back}, &voutputs)); + EXPECT_EQ(voutputs.size(), 1); + + xla::LiteralProto response; + EXPECT_TRUE(response.ParseFromString(voutputs[0].scalar()())); + + auto expected = xla::LiteralUtil::CreateR0(kResults[i]); + EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); + } +} + TEST(RawApiTest, LeakCompilationReference) { xrt::XLAComputation c; auto config = c.mutable_config(); diff --git a/tensorflow/compiler/xrt/xrt.proto b/tensorflow/compiler/xrt/xrt.proto index e149f2f435..378bb9246f 100644 --- a/tensorflow/compiler/xrt/xrt.proto +++ b/tensorflow/compiler/xrt/xrt.proto @@ -101,4 +101,8 @@ message XRTExecutionConfig { bool release_input_handles = 5; // If true, release the handle to the computation after running. 
bool release_compilation_handle = 6; + // If set to true, and the result shape is a tuple, then instead of returning + // a single tuple allocation the execution will return a vector of + // allocations, one for each of the first-level elements of the result tuple. + bool return_exploded_tuple = 7; } -- GitLab From 551248d4db31779d144c9c40cacbc2e5ad09f2a3 Mon Sep 17 00:00:00 2001 From: Nupur Garg Date: Tue, 4 Dec 2018 09:49:51 -0800 Subject: [PATCH 092/405] Added the functionality of input_data_types flag to the Python API. PiperOrigin-RevId: 223994946 --- tensorflow/lite/python/convert.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tensorflow/lite/python/convert.py b/tensorflow/lite/python/convert.py index 198612f6fe..0974bdf3d0 100644 --- a/tensorflow/lite/python/convert.py +++ b/tensorflow/lite/python/convert.py @@ -335,9 +335,11 @@ def build_toco_convert_protos(input_tensors, model.change_concat_input_ranges = change_concat_input_ranges for idx, input_tensor in enumerate(input_tensors): input_array = model.input_arrays.add() + input_array.name = tensor_name(input_tensor) + input_array.data_type = convert_dtype_to_tflite_type(input_tensor.dtype) + if toco.inference_input_type == _types_pb2.QUANTIZED_UINT8: input_array.mean_value, input_array.std_value = quantized_input_stats[idx] - input_array.name = tensor_name(input_tensor) if input_shapes is None: shape = input_tensor.get_shape() else: -- GitLab From 1db31998360be29d233b66ac78fb16c3ebcdae65 Mon Sep 17 00:00:00 2001 From: Sanjoy Das Date: Tue, 4 Dec 2018 09:52:20 -0800 Subject: [PATCH 093/405] [TF:XLA] Don't ignore data_format in XLA's {Avg|Max}Pool3d{Grad} lowering I didn't add a dedicated test for this because soon we'll be exercising this via python/kernel_tests/pooling_ops_3d_test.py (that's how I found this problem). 
PiperOrigin-RevId: 223995407 --- .../compiler/tf2xla/kernels/pooling_ops.cc | 31 ++++++++++--------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc index a259da6383..06c6cc37ec 100644 --- a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc @@ -152,7 +152,12 @@ class MaxPoolOp : public PoolingOp { public: MaxPoolOp(OpKernelConstruction* ctx, int num_spatial_dims) : PoolingOp(ctx, /*num_spatial_dims=*/num_spatial_dims, - /*reduction_type=*/ctx->input_type(0)) {} + /*reduction_type=*/ctx->input_type(0)) { + string data_format_str; + OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format_str)); + OP_REQUIRES(ctx, FormatFromString(data_format_str, &data_format_), + errors::InvalidArgument("Invalid data format")); + } void Compile(XlaOpKernelContext* ctx) override { auto ksize_or_error = GetKernelSize(ctx); @@ -180,10 +185,6 @@ class MaxPool2DOp : public MaxPoolOp { public: explicit MaxPool2DOp(OpKernelConstruction* ctx) : MaxPoolOp(ctx, /*num_spatial_dims=*/2) { - string data_format_str; - OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format_str)); - OP_REQUIRES(ctx, FormatFromString(data_format_str, &data_format_), - errors::InvalidArgument("Invalid data format")); } }; REGISTER_XLA_OP(Name("MaxPool"), MaxPool2DOp); @@ -204,7 +205,12 @@ class AvgPoolOp : public PoolingOp { AvgPoolOp(OpKernelConstruction* ctx, int num_spatial_dims) : PoolingOp(ctx, /*num_spatial_dims=*/num_spatial_dims, /*reduction_type=*/ - XlaHelpers::SumAccumulationType(ctx->input_type(0))) {} + XlaHelpers::SumAccumulationType(ctx->input_type(0))) { + string data_format_str; + OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format_str)); + OP_REQUIRES(ctx, FormatFromString(data_format_str, &data_format_), + errors::InvalidArgument("Invalid data format")); + } void Compile(XlaOpKernelContext* ctx) override { 
auto ksize_or_error = GetKernelSize(ctx); @@ -241,10 +247,6 @@ class AvgPool2DOp : public AvgPoolOp { public: explicit AvgPool2DOp(OpKernelConstruction* ctx) : AvgPoolOp(ctx, /*num_spatial_dims=*/2) { - string data_format_str; - OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format_str)); - OP_REQUIRES(ctx, FormatFromString(data_format_str, &data_format_), - errors::InvalidArgument("Invalid data format")); } }; REGISTER_XLA_OP(Name("AvgPool"), AvgPool2DOp); @@ -390,6 +392,11 @@ class AvgPoolGradOp : public XlaOpKernel { OP_REQUIRES(ctx, ksize_[0] == 1 && stride_[0] == 1, errors::Unimplemented( "Pooling is not yet supported on the batch dimension.")); + + string data_format; + OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format)); + OP_REQUIRES(ctx, FormatFromString(data_format, &data_format_), + errors::InvalidArgument("Invalid data format")); } int num_dims() const { return num_spatial_dims_ + 2; } @@ -449,10 +456,6 @@ class AvgPool2DGradOp : public AvgPoolGradOp { public: explicit AvgPool2DGradOp(OpKernelConstruction* ctx) : AvgPoolGradOp(ctx, /*num_spatial_dims=*/2) { - string data_format; - OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format)); - OP_REQUIRES(ctx, FormatFromString(data_format, &data_format_), - errors::InvalidArgument("Invalid data format")); } }; REGISTER_XLA_OP( -- GitLab From d8b33e9e4a9a3a02c3856279a18715efc45cbb8c Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Tue, 4 Dec 2018 09:55:22 -0800 Subject: [PATCH 094/405] Split conv_ops_fused PiperOrigin-RevId: 223995825 --- tensorflow/core/kernels/conv_ops_fused.cc | 885 +---------------- .../kernels/conv_ops_fused_image_transform.cc | 902 ++++++++++++++++++ 2 files changed, 911 insertions(+), 876 deletions(-) create mode 100644 tensorflow/core/kernels/conv_ops_fused_image_transform.cc diff --git a/tensorflow/core/kernels/conv_ops_fused.cc b/tensorflow/core/kernels/conv_ops_fused.cc index a0484e9235..798a7325cd 100644 --- a/tensorflow/core/kernels/conv_ops_fused.cc 
+++ b/tensorflow/core/kernels/conv_ops_fused.cc @@ -14,897 +14,30 @@ limitations under the License. ==============================================================================*/ // Implements convolution operations with other kernels baked into the -// processing, to optimize latency and memory usage. +// processing, to optimize latency and memory usage: +// - Conv2D + BiasAdd + +// - Conv2D + FusedBatchNorm + +// +// Activation: Relu, Relu6, Elu, etc... +// +// Kernels for convolutions fused with image transformations (resize and mirror +// padding) defined in `conv_ops_fused_image_transform.cc`. #define EIGEN_USE_THREADS -#include -#include +#include #include -#include "tensorflow/core/framework/common_shape_fns.h" -#include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" -#include "tensorflow/core/framework/resource_mgr.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" -#include "tensorflow/core/framework/tensor_slice.h" #include "tensorflow/core/kernels/bounds_check.h" #include "tensorflow/core/kernels/conv_2d.h" #include "tensorflow/core/kernels/conv_ops.h" -#include "tensorflow/core/kernels/gemm_functors.h" -#include "tensorflow/core/kernels/image_resizer_state.h" #include "tensorflow/core/kernels/ops_util.h" -#include "tensorflow/core/lib/core/threadpool.h" -#include "tensorflow/core/util/mirror_pad_mode.h" -#include "tensorflow/core/util/padding.h" #include "tensorflow/core/util/tensor_format.h" namespace tensorflow { - -namespace { - -// We don't want to allocate a buffer to hold all the patches if the size is -// going to be extremely large, so break it into chunks if it's bigger than -// a limit. Each chunk will be processed serially, so we can refill the -// buffer for the next chunk and reuse it, keeping maximum memory size down. 
-// In this case, we've picked 16 megabytes as a reasonable limit for Android and -// other platforms using Eigen, and 1MB for iOS devices, from experimentation. -#if defined(__APPLE__) && defined(IS_MOBILE_PLATFORM) -const size_t kMaxChunkSize = (1 * 1024 * 1024); -#else -const size_t kMaxChunkSize = (16 * 1024 * 1024); -#endif -const size_t kResizeCacheSize = (8 * 1024 * 1024); - -// Lookup method used when resizing. -enum SamplingMode { - BILINEAR = 0, - NEAREST = 1, -}; - -// Simple utility function used by FusedConv to multithread basic workloads. To -// use it, pass begin and end values for the full workload and a std::function -// that receives a subset of that through the begin and end values for each -// worker's task. The division of the full workload into worker tasks is handled -// by the multithreading logic. Here's an example of how to use it: -// std::vector my_vector(100); -// ... -// FusedConvParallelFor(context, 0, 100, -// [&my_vector](int64 task_begin, int64 task_end) { -// for (int64 current = task_begin; current != task_end; ++current) { -// my_vector[current] *= 10.0f; -// } -// }); -void FusedConvParallelFor( - OpKernelContext* context, int64 begin, int64 end, - const std::function& task_function) { -// On iOS, the thread management imposes a very big performance penalty, so -// just call the function directly with no multithreading. -#if defined(__APPLE__) && defined(IS_MOBILE_PLATFORM) - task_function(begin, end); -#else - auto& worker_threads = *(context->device()->tensorflow_cpu_worker_threads()); - thread::ThreadPool* thread_pool = worker_threads.workers; - const int64 total_elements = end - begin; - // This is a bit of an arbitrary number, but was found to work well for - // typical models we've been profiling on various devices. 
- const int64 element_cost = 10000000; - thread_pool->ParallelFor( - total_elements, element_cost, - [begin, task_function](int64 begin_offset, int64 end_offset) { - const int64 task_begin = begin + begin_offset; - const int64 task_end = begin + end_offset; - task_function(task_begin, task_end); - }); -#endif -} - -// Holds the state needed for the resizing subtasks. -template -struct ResizeTaskParameters { - ResizeTaskParameters() : st(false) {} - - int cache_height; - T1* resize_cache; - int cache_line_width; - int input_width; - int input_depth; - int top_padding; - int pad_offset; - int64 resized_height; - ImageResizerState st; - const T1* input_batch_start; - int64 cache_start_x; - int64 cache_end_x; - int left_padding; - int64 resized_width; - int64 padded_width; - int64 padded_height; -}; - -template -struct PerCacheLineParameters { - PerCacheLineParameters() {} - PerCacheLineParameters(const PerCacheLineParameters& other) - : cache_line_start(other.cache_line_start), - input_top_row_start(other.input_top_row_start), - input_bottom_row_start(other.input_bottom_row_start), - y_lerp(other.y_lerp) {} - - T1* cache_line_start; - const T1* input_top_row_start; - const T1* input_bottom_row_start; - T1 y_lerp; -}; - -// Helper class to simplify bilinear filtering -template -struct SampleRect { - EIGEN_ALWAYS_INLINE SampleRect(const T1* in_top_left, const T1* in_top_right, - const T1* in_bottom_left, - const T1* in_bottom_right) - : top_left(in_top_left), - top_right(in_top_right), - bottom_left(in_bottom_left), - bottom_right(in_bottom_right) {} - - EIGEN_ALWAYS_INLINE T1 BilinearSample(int channel, T1 x_lerp, - T1 y_lerp) const { - const T1 top = - top_left[channel] + (top_right[channel] - top_left[channel]) * x_lerp; - const T1 bottom = bottom_left[channel] + - (bottom_right[channel] - bottom_left[channel]) * x_lerp; - return top + (bottom - top) * y_lerp; - } - - const T1* top_left; - const T1* top_right; - const T1* bottom_left; - const T1* bottom_right; -}; - 
-// Calculates parameters which remain constant through a resize cache row. -template -EIGEN_ALWAYS_INLINE PerCacheLineParameters CalculatePerCacheLineParameters( - int64 cache_height, int64 cache_y, T1* resize_cache, int64 cache_line_width, - int64 input_width, int64 input_depth, int64 top_padding, int64 pad_offset, - int64 resized_height, const ImageResizerState& st, - const T1* input_batch_start) { - PerCacheLineParameters result; - // The cache is organized so that the real y values of the resized image map - // onto the actual cache values through a modulo scheme. This means that as we - // progress downwards through the image, we keep reusing a small cache and so - // keep memory usage down. - int64 cache_index_y; - if (cache_y < 0) { - cache_index_y = cache_height + (cache_y % cache_height); - } else { - cache_index_y = cache_y % cache_height; - } - result.cache_line_start = - resize_cache + (cache_index_y * cache_line_width * input_depth); - // This part is implementing the mirror padding that happens before resizing. - float in_y = (cache_y - top_padding); - if (in_y < 0) { - in_y = -(in_y + 1.0f - pad_offset); - } else if (in_y >= resized_height) { - in_y = (resized_height * 2.0f) - (in_y + 1.0f + pad_offset); - } - // Here's where do do the actual resize. - in_y *= st.height_scale; - const int64 top_y_index = static_cast(std::floor(in_y)); - const int64 bottom_y_index = - std::min(static_cast(std::ceil(in_y)), (st.in_height - 1)); - // Lerp is used for bilinear filtering when that's needed. - result.y_lerp = static_cast(in_y - top_y_index); - // Which rows of the original input image to pull the values from. 
- result.input_top_row_start = - input_batch_start + (top_y_index * input_width * input_depth); - result.input_bottom_row_start = - input_batch_start + (bottom_y_index * input_width * input_depth); - return result; -} - -template -struct PerCachePixelParameters { - PerCachePixelParameters() {} - PerCachePixelParameters(const PerCachePixelParameters& other) - : cache_line_pixel(other.cache_line_pixel), - left_x_index(other.left_x_index), - right_x_index(other.right_x_index), - x_lerp(other.x_lerp) {} - - T1* cache_line_pixel; - int64 left_x_index; - int64 right_x_index; - T1 x_lerp; -}; - -// Pulls out common parameters used for every resized pixel. -template -EIGEN_ALWAYS_INLINE PerCachePixelParameters -CalculatePerCachePixelParameters(int64 cache_x, int64 cache_start_x, - T1* cache_line_start, int64 input_depth, - int64 left_padding, int64 pad_offset, - int64 resized_width, - const ImageResizerState& st) { - PerCachePixelParameters result; - // Figure out where we're going to store the results of our transform. - const int cache_index_x = cache_x - cache_start_x; - result.cache_line_pixel = cache_line_start + (cache_index_x * input_depth); - // Implement mirror padding by flipping in_x if it's off the edge. - float in_x = (cache_x - left_padding); - if (in_x < 0) { - in_x = -(in_x + 1.0f - pad_offset); - } else if (in_x >= resized_width) { - in_x = (resized_width * 2.0f) - (in_x + 1.0f + pad_offset); - } - // Resize the x parameters. - in_x *= st.width_scale; - // Get the x coordinates for the left and right pixels to pull from. - result.left_x_index = static_cast(std::floor(in_x)); - result.right_x_index = - std::min(static_cast(std::ceil(in_x)), (st.in_width - 1)); - // This x_lerp is used to blend pixels in bilinear filtering. - result.x_lerp = static_cast(in_x - result.left_x_index); - return result; -} - -// Combines bilinear resizing and mirror padding into the im2col transformation -// stage of convolution. 
-template -class FusedResizeAndPadConvFunctor { - public: - void operator()(OpKernelContext* context, const Tensor& input, - int input_batches, int resized_height, int resized_width, - int padded_height, int padded_width, int input_depth, - const T2* filter_data, int filter_height, int filter_width, - int filter_count, int stride_rows, int stride_cols, - Padding padding, T3* output_data, int output_height, - int output_width, const ImageResizerState& st, - int top_padding, int bottom_padding, int left_padding, - int right_padding, int pad_offset) { - if ((input_batches <= 0) || (padded_width <= 0) || (padded_height <= 0) || - (input_depth <= 0)) { - LOG(WARNING) << "Conv2D was called with bad input dimensions: " - << input_batches << ", " << padded_height << ", " - << padded_width << ", " << input_depth; - return; - } - if ((filter_width <= 0) || (filter_height <= 0) || (filter_count <= 0)) { - LOG(WARNING) << "Conv2D was called with bad filter dimensions: " - << filter_width << ", " << filter_height << ", " - << filter_count; - return; - } - if ((output_width <= 0) || (output_height <= 0)) { - LOG(WARNING) << "Conv2D was called with bad output width or height: " - << output_width << ", " << output_height; - return; - } - OP_REQUIRES( - context, ((SampleMode == NEAREST) || (SampleMode == BILINEAR)), - errors::InvalidArgument("Bad sample mode passed in", SampleMode)); - - // These calculations define how the patches will be positioned within the - // input image. The actual definitions are quite complex, and rely on the - // previously-calculated output size. 
- int filter_left_offset; - int filter_top_offset; - if (padding == VALID) { - filter_left_offset = - ((output_width - 1) * stride_cols + filter_width - padded_width + 1) / - 2; - filter_top_offset = ((output_height - 1) * stride_rows + filter_height - - padded_height + 1) / - 2; - } else { - filter_left_offset = - ((output_width - 1) * stride_cols + filter_width - padded_width) / 2; - filter_top_offset = - ((output_height - 1) * stride_rows + filter_height - padded_height) / - 2; - } - - ResizeTaskParameters task_params; - task_params.input_depth = input_depth; - task_params.top_padding = top_padding; - task_params.pad_offset = pad_offset; - task_params.resized_height = resized_height; - task_params.st = st; - task_params.left_padding = left_padding; - task_params.resized_width = resized_width; - task_params.padded_width = padded_width; - task_params.padded_height = padded_height; - - // The im2col buffer has # of patches rows, and # of filters cols. - // It's laid out like this, in row major order in memory: - // < filter value count > - // ^ +---------------------+ - // patch | | - // count | | - // v +---------------------+ - // Each patch row contains a filter_width x filter_height patch of the - // input, with the depth channel as the most contiguous in memory, followed - // by the width, then the height. This is the standard memory order in the - // image world if it helps to visualize it. - const int filter_value_count = filter_width * filter_height * input_depth; - - OP_REQUIRES(context, (filter_value_count * sizeof(T1)) <= kMaxChunkSize, - errors::InvalidArgument("Im2Col patch too large for buffer")); - const size_t patches_per_chunk = - kMaxChunkSize / (filter_value_count * sizeof(T1)); - // Because memory allocation is very expensive on mobile platforms, try to - // allocate a persistent buffer that will be kept around between calls. We - // use TensorFlow's resource management to ensure that the memory will be - // released when the session is over. 
- Im2ColBufferResource* im2col_buffer_resource; - std::function**)> creator = - [](Im2ColBufferResource** resource) { - *resource = new Im2ColBufferResource(); - return Status::OK(); - }; - OP_REQUIRES_OK(context, context->resource_manager()->LookupOrCreate( - "Conv2d", "im2col_buffer", - &im2col_buffer_resource, creator)); - - // Create a resize cache memory buffer that will hold the rows of - // transformed and mirror padded input pixels, ready to be copied - // into filter patches by im2col. - // It's laid out like this, in row major order in memory: - // < cache line width > - // ^ +--------------------+ - // cache | | - // height | | - // v +--------------------+ - // Each cache row contains a cache_line_width number of resized pixels, - // each with input_depth channels. The cache height is typically less than - // the full height the resized image would be, so it's filled up - // incrementally as we progress downwards through the input creating im2col - // patches. - task_params.cache_start_x = -filter_left_offset; - task_params.cache_end_x = - (((output_width - 1) * stride_cols) - filter_left_offset) + - filter_width; - task_params.cache_line_width = - task_params.cache_end_x - task_params.cache_start_x; - task_params.cache_height = - kResizeCacheSize / (task_params.cache_line_width * input_depth); - const int needed_resize_cache_count = - filter_height * task_params.cache_line_width * input_depth; - OP_REQUIRES(context, - (needed_resize_cache_count * sizeof(T1)) <= kResizeCacheSize, - errors::InvalidArgument("Input too large for resize cache")); - Im2ColBufferResource* resize_cache_resource; - std::function**)> - resize_creator = - [](Im2ColBufferResource** resource) { - *resource = new Im2ColBufferResource(); - return Status::OK(); - }; - OP_REQUIRES_OK(context, context->resource_manager()->LookupOrCreate( - "Conv2d", "resize_cache", - &resize_cache_resource, resize_creator)); - - // This means that multiple ops can't be run simultaneously on different - 
// threads, because we have a single shared resource. The platforms this is - // aimed at have intra-op parallelism as their focus though, so it shouldn't - // be an issue. - mutex_lock lock_buffer(im2col_buffer_resource->mu); - core::ScopedUnref unref_buffer(im2col_buffer_resource); - T1* im2col_buffer = im2col_buffer_resource->data; - - // This buffer is used as a fairly heavy-weight cache for the resized and - // mirrored inputs to the im2col operation. The problem is that we want to - // keep the memory usage down by not rendering the fully resized and padded - // input tensor to the convolution into an entire buffer. The first approach - // to avoid this was to fold the bilinear filtering and padding spatial - // transformations into the im2col lookup itself. This successfully reduced - // memory usage, but because im2col can access an individual pixel for many - // different patches, the extra overhead of doing the same bilinear lookups - // repeatedly became too expensive. - // The resize cache is designed to avoid this problem by keeping a - // horizontal slice of the resized and padded input to the im2col - // precalculated, so that repeated accesses to the same pixel from different - // filter patches can just be copied from this cache. It's organized as a - // horizontal slice stretching across the whole virtual image, and as high - // as the filter window, so that as the patch processing moves across all - // the pixels are present, and before a new row of patches is started any - // previously calculated rows that are needed are maintained, with new rows - // calculated as required. 
- mutex_lock resize_lock_buffer(resize_cache_resource->mu); - core::ScopedUnref unref_resized_cache(resize_cache_resource); - task_params.resize_cache = resize_cache_resource->data; - - const T1* input_data = input.flat().data(); - const int64 input_height = input.shape().dim_sizes()[1]; - task_params.input_width = input.shape().dim_sizes()[2]; - - int end_cached_lines = std::numeric_limits::min(); - - for (int batch = 0; batch < input_batches; ++batch) { - task_params.input_batch_start = - input_data + - (batch * input_height * task_params.input_width * input_depth); - const int in_y_end = - ((output_height * stride_rows) - filter_top_offset) + filter_height; - for (int out_y = 0; out_y < output_height; ++out_y) { - const int in_y_origin = (out_y * stride_rows) - filter_top_offset; - const int cache_start_y = std::max(in_y_origin, end_cached_lines); - const int cache_end_y = std::min( - in_y_end, std::max((in_y_origin + task_params.cache_height), - end_cached_lines)); - if (end_cached_lines < (in_y_origin + filter_height)) { - // This call breaks up the work required for calculating the mirror - // padding and resizing across multiple threads. - FusedConvParallelFor( - context, cache_start_y, cache_end_y, - [task_params](int64 task_cache_start_y, int64 task_cache_end_y) { - // This is a long and confusing function, but it's been laid out - // this way to help with performance on some intensive models. - // What it's doing is populating a cache of the original input - // image, after it's been bilinear resized and had its edges - // mirrored. This allows the following im2col code to access the - // transformed pixels from this cache, without having to - // repeatedly apply the expensive bilinear calculations as the - // same pixels are accessed by different patches. - // This is most effective when the stride is small and the - // filter size is large, since that's when pixels are reused - // most frequently as patches overlap. 
- for (int cache_y = task_cache_start_y; - cache_y < task_cache_end_y; ++cache_y) { - // We organize the cache as a series of rows, each containing - // all the transformed pixels for a given line in the image. - // This cache is big enough to hold at least a filter's height - // worth of rows, but typically more, limited by the size of - // the cache buffer. - // We don't allocate an entire image's worth of rows though, - // because we're trying to keep memory usage down, so as we - // progress downwards through the im2col we periodically - // refresh the cache so that the next lines that are needed - // for that operation are always present. - // Work out the parameters that remain constant across the - // row we're calculating. - PerCacheLineParameters line_params( - CalculatePerCacheLineParameters( - task_params.cache_height, cache_y, - task_params.resize_cache, - task_params.cache_line_width, task_params.input_width, - task_params.input_depth, task_params.top_padding, - task_params.pad_offset, task_params.resized_height, - task_params.st, task_params.input_batch_start)); - // Iterate through the resize cache row we're filling in. - for (int cache_x = task_params.cache_start_x; - cache_x < task_params.cache_end_x; ++cache_x) { - // Figure out what we need for the cache pixel we're - // populating. - PerCachePixelParameters pixel_params( - CalculatePerCachePixelParameters( - cache_x, task_params.cache_start_x, - line_params.cache_line_start, - task_params.input_depth, task_params.left_padding, - task_params.pad_offset, task_params.resized_width, - task_params.st)); - // If the access is off the left, right, top, or bottom of - // the resized image, the conv padding means we should set - // it to zero. 
- if ((cache_x < 0) || - (cache_x >= task_params.padded_width) || - (cache_y < 0) || - (cache_y >= task_params.padded_height)) { - std::fill_n(pixel_params.cache_line_pixel, - task_params.input_depth, T1(0)); - } else { - // There are two different sampling strategies for - // resizing. When using nearest, we can just do a - // straight copy of the pixel closest to our sample point, - // but bilinear requires a more complex calculation. - if (SampleMode == NEAREST) { - const T1* input_top_left_pixel = - line_params.input_top_row_start + - (pixel_params.left_x_index * - task_params.input_depth); - - std::copy_n(input_top_left_pixel, - task_params.input_depth, - pixel_params.cache_line_pixel); - } else { - const SampleRect rect( - line_params.input_top_row_start + - (pixel_params.left_x_index * - task_params.input_depth), - line_params.input_top_row_start + - (pixel_params.right_x_index * - task_params.input_depth), - line_params.input_bottom_row_start + - (pixel_params.left_x_index * - task_params.input_depth), - line_params.input_bottom_row_start + - (pixel_params.right_x_index * - task_params.input_depth)); - for (int in_channel = 0; - in_channel < task_params.input_depth; - ++in_channel) { - pixel_params.cache_line_pixel[in_channel] = - rect.BilinearSample(in_channel, - pixel_params.x_lerp, - line_params.y_lerp); - } - } - } - } - } - }); - end_cached_lines = cache_end_y; - } - for (int out_x = 0; out_x < output_width; ++out_x) { - const int in_x_origin = (out_x * stride_cols) - filter_left_offset; - const int patch_index = (batch * output_width * output_height) + - (out_y * output_width) + out_x; - const int patch_index_within_chunk = patch_index % patches_per_chunk; - T1* im2col_patch_start = - im2col_buffer + (patch_index_within_chunk * filter_value_count); - for (int filter_y = 0; filter_y < filter_height; ++filter_y) { - T1* im2col_row_start = - im2col_patch_start + - (filter_y * filter_width * task_params.input_depth); - const int conv_in_y = in_y_origin + 
filter_y; - int cache_index_y; - if (conv_in_y < 0) { - cache_index_y = task_params.cache_height + - (conv_in_y % task_params.cache_height); - } else { - cache_index_y = conv_in_y % task_params.cache_height; - } - T1* cache_line_start = - task_params.resize_cache + - (cache_index_y * task_params.cache_line_width * - task_params.input_depth); - T1* cache_filter_row_start = - cache_line_start + ((in_x_origin - task_params.cache_start_x) * - task_params.input_depth); - std::copy_n(cache_filter_row_start, - (filter_width * task_params.input_depth), - im2col_row_start); - } - const bool is_last_in_chunk = - (patch_index_within_chunk == (patches_per_chunk - 1)); - const bool is_last_overall = - ((batch == (input_batches - 1)) && - (out_y == (output_height - 1)) && (out_x == (output_width - 1))); - if (is_last_in_chunk || is_last_overall) { - // Now we've assembled a set of image patches into a matrix, apply - // a GEMM matrix multiply of the patches as rows, times the filter - // weights in columns, to get partial results in the output - // matrix. - const int how_many_patches = patch_index_within_chunk + 1; - const int m = how_many_patches; - const int n = filter_count; - const int k = filter_value_count; - const int lda = filter_value_count; - const int ldb = filter_count; - const int ldc = filter_count; - const size_t start_patch_index = - patch_index - (how_many_patches - 1); - T3* chunk_output_data = - output_data + (start_patch_index * filter_count); - TGemmFunctor gemm_functor; - gemm_functor(context, m, n, k, im2col_buffer, lda, filter_data, ldb, - chunk_output_data, ldc); - } - } - } - } - } -}; - -} // namespace - -// Implements a version of convolution with bilinear resizing and mirror padding -// included. 
-template -class FusedResizeConv2DUsingGemmOp : public OpKernel { - public: - explicit FusedResizeConv2DUsingGemmOp(OpKernelConstruction* context) - : OpKernel(context) { - if (DoResize) { - OP_REQUIRES_OK(context, - context->GetAttr("resize_align_corners", &align_corners_)); - } - MirrorPadMode mode; - OP_REQUIRES_OK(context, context->GetAttr("mode", &mode)); - - switch (mode) { - case MirrorPadMode::SYMMETRIC: { - offset_ = 0; - break; - } - case MirrorPadMode::REFLECT: { - offset_ = 1; - break; - } - default: - OP_REQUIRES(context, false, - errors::InvalidArgument( - "mode must be either REFLECT or SYMMETRIC.")); - } - OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_)); - OP_REQUIRES(context, strides_.size() == 4, - errors::InvalidArgument("Sliding window strides field must " - "specify 4 dimensions")); - const int64 stride_n = GetTensorDim(strides_, FORMAT_NHWC, 'N'); - const int64 stride_c = GetTensorDim(strides_, FORMAT_NHWC, 'C'); - OP_REQUIRES( - context, stride_n == 1 && stride_c == 1, - errors::InvalidArgument("Current implementation does not yet support " - "strides in the batch and depth dimensions.")); - OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); - } - - void Compute(OpKernelContext* context) override { - // Input tensor is of the following dimensions: - // [ batch, in_rows, in_cols, in_depth ] - const Tensor& input = context->input(0); - OP_REQUIRES(context, (input.shape().num_elements() > 0), - errors::InvalidArgument("Input tensor can't be empty")); - - ImageResizerState st(false); - if (DoResize) { - st = ImageResizerState(align_corners_); - st.ValidateAndCalculateOutputSize(context, input); - if (!context->status().ok()) return; - } else { - // Set up the resize parameters to do no scaling at all. 
- st.batch_size = input.dim_size(0); - st.out_height = input.dim_size(1); - st.out_width = input.dim_size(2); - st.in_height = input.dim_size(1); - st.in_width = input.dim_size(2); - st.channels = input.dim_size(3); - st.height_scale = 1.0f; - st.width_scale = 1.0f; - } - TensorShape resized_shape( - {input.dim_size(0), st.out_height, st.out_width, input.dim_size(3)}); - int paddings_index; - int filter_index; - if (DoResize) { - paddings_index = 2; - filter_index = 3; - } else { - paddings_index = 1; - filter_index = 2; - } - const Tensor& paddings = context->input(paddings_index); - - const int dims = resized_shape.dims(); - OP_REQUIRES( - context, - TensorShapeUtils::IsMatrix(paddings.shape()) && - paddings.dim_size(1) == 2, - errors::InvalidArgument("paddings must be a matrix with 2 columns: ", - paddings.shape().DebugString())); - const int fixed_dims = - (allow_legacy_scalars() && dims == 0 && paddings.dim_size(0) == 1) - ? 1 - : dims; - OP_REQUIRES( - context, fixed_dims == paddings.dim_size(0), - errors::InvalidArgument( - "The first dimension of paddings must be the rank of inputs: ", - fixed_dims, " ", paddings.shape().DebugString(), " ", - resized_shape.DebugString())); - OP_REQUIRES( - context, dims == paddings.dim_size(0), - errors::InvalidArgument( - "The first dimension of paddings must be the rank of inputs: ", - dims, " ", paddings.shape().DebugString(), " ", - resized_shape.DebugString())); - - OP_REQUIRES( - context, dims == 4, - errors::InvalidArgument( - "Fused mirror padding only supports four-dimensional inputs, but ", - dims, " requested")); - - // Compute the shape of the output tensor, and allocate it. - TensorShape padded_shape; - TTypes::ConstMatrix paddings_matrix = paddings.matrix(); - for (int d = 0; d < dims; ++d) { - const int32 before = - paddings_matrix(d, 0); // Pad before existing elements. - const int32 after = - paddings_matrix(d, 1); // Pad after existing elements. 
- OP_REQUIRES(context, before >= 0 && after >= 0, - errors::InvalidArgument( - "paddings must be non-negative: ", before, " ", after)); - if (offset_ == 0) { // SYMMETRIC mode. - OP_REQUIRES( - context, - before <= resized_shape.dim_size(d) && - after <= resized_shape.dim_size(d), - errors::InvalidArgument("paddings must be no greater " - "than the dimension size: ", - before, ", ", after, " greater than ", - resized_shape.dim_size(d))); - } else if (offset_ == 1) { // REFLECT mode. - OP_REQUIRES( - context, - before < resized_shape.dim_size(d) && - after < resized_shape.dim_size(d), - errors::InvalidArgument("paddings must be less than" - " the dimension size: ", - before, ", ", after, " not less than ", - resized_shape.dim_size(d))); - } - padded_shape.AddDim(before + resized_shape.dim_size(d) + after); - } - - OP_REQUIRES( - context, ((paddings_matrix(0, 0) == 0) && (paddings_matrix(0, 1) == 0)), - errors::InvalidArgument( - "Fused mirror padding only support spatial padding, not batches: ", - paddings.DebugString())); - OP_REQUIRES( - context, ((paddings_matrix(3, 0) == 0) && (paddings_matrix(3, 1) == 0)), - errors::InvalidArgument( - "Fused mirror padding only support spatial padding, not channels: ", - paddings.DebugString())); - const int32 top_padding = paddings_matrix(1, 0); - const int32 bottom_padding = paddings_matrix(1, 1); - const int32 left_padding = paddings_matrix(2, 0); - const int32 right_padding = paddings_matrix(2, 1); - - // Input filter is of the following dimensions: - // [ filter_rows, filter_cols, in_depth, out_depth] - const Tensor& filter = context->input(filter_index); - - // For 2D convolution, there should be 4 dimensions. 
- OP_REQUIRES(context, padded_shape.dims() == 4, - errors::InvalidArgument("input must be 4-dimensional", - padded_shape.DebugString())); - OP_REQUIRES(context, filter.dims() == 4, - errors::InvalidArgument("filter must be 4-dimensional: ", - filter.shape().DebugString())); - - // We only check the first three dims, since the depth is accessed as an - // int64 below. - for (int i = 0; i < 3; i++) { - OP_REQUIRES( - context, - FastBoundsCheck(filter.dim_size(i), std::numeric_limits::max()), - errors::InvalidArgument("filter too large")); - } - - // The last dimension for input is in_depth. It must be the same as the - // filter's in_depth. - const int64 in_depth = padded_shape.dim_size(3); - OP_REQUIRES(context, in_depth == filter.dim_size(2), - errors::InvalidArgument( - "input and filter must have the same depth: ", in_depth, - " vs ", filter.dim_size(2))); - - // The last dimension for filter is out_depth. - const int out_depth = static_cast(filter.dim_size(3)); - - // The second dimension for input is rows/height. - // The first dimension for filter is rows/height. - const int64 padded_rows_raw = padded_shape.dim_size(1); - OP_REQUIRES( - context, - FastBoundsCheck(padded_rows_raw, std::numeric_limits::max()), - errors::InvalidArgument("Input rows too large")); - const int padded_rows = static_cast(padded_rows_raw); - const int filter_rows = static_cast(filter.dim_size(0)); - const int resized_rows = static_cast(resized_shape.dim_size(1)); - - // The third dimension for input is columns/width. - // The second dimension for filter is columns/width. 
- const int64 padded_cols_raw = padded_shape.dim_size(2); - OP_REQUIRES( - context, - FastBoundsCheck(padded_cols_raw, std::numeric_limits::max()), - errors::InvalidArgument("Input cols too large")); - const int padded_cols = static_cast(padded_cols_raw); - const int filter_cols = static_cast(filter.dim_size(1)); - const int resized_cols = static_cast(resized_shape.dim_size(2)); - - // The first dimension for input is batch. - const int64 batch_raw = padded_shape.dim_size(0); - OP_REQUIRES(context, - FastBoundsCheck(batch_raw, std::numeric_limits::max()), - errors::InvalidArgument("batch is too large")); - const int batch = static_cast(batch_raw); - - // For now we take the stride from the second and third dimensions only (we - // do not support striding on the batch or depth dimension). - const int stride_rows = GetTensorDim(strides_, FORMAT_NHWC, 'H'); - const int stride_cols = GetTensorDim(strides_, FORMAT_NHWC, 'W'); - - int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0; - OP_REQUIRES_OK(context, - GetWindowedOutputSize(padded_rows, filter_rows, stride_rows, - padding_, &out_rows, &pad_rows)); - OP_REQUIRES_OK(context, - GetWindowedOutputSize(padded_cols, filter_cols, stride_cols, - padding_, &out_cols, &pad_cols)); - TensorShape out_shape = - ShapeFromFormat(FORMAT_NHWC, batch, out_rows, out_cols, out_depth); - OP_REQUIRES(context, (out_shape.num_elements() > 0), - errors::InvalidArgument("Output tensor can't be empty")); - - // Output tensor is of the following dimensions: - // [ in_batch, out_rows, out_cols, out_depth ] - Tensor* output = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output)); - - VLOG(2) << "FusedConv2D: " << name() << ", in_depth = " << in_depth - << ", padded_cols = " << padded_cols - << ", resized_cols = " << resized_cols - << ", filter_cols = " << filter_cols - << ", padded_rows = " << padded_rows - << ", resized_rows = " << resized_rows - << ", filter_rows = " << filter_rows - << ", stride_rows = 
" << stride_rows - << ", stride_cols = " << stride_cols - << ", out_depth = " << out_depth << ", DoResize=" << DoResize; - - // If there is nothing to compute, return. - if (out_shape.num_elements() == 0) { - return; - } - TConvFunctor conv_functor; - conv_functor(context, input, batch, resized_rows, resized_cols, padded_rows, - padded_cols, in_depth, filter.flat().data(), filter_rows, - filter_cols, out_depth, stride_rows, stride_cols, padding_, - output->flat().data(), out_rows, out_cols, st, top_padding, - bottom_padding, left_padding, right_padding, offset_); - } - - private: - std::vector strides_; - Padding padding_; - bool align_corners_; - int offset_; - - TF_DISALLOW_COPY_AND_ASSIGN(FusedResizeConv2DUsingGemmOp); -}; - -#define REGISTER_FUSED(T) \ - REGISTER_KERNEL_BUILDER( \ - Name("FusedResizeAndPadConv2D") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T"), \ - FusedResizeConv2DUsingGemmOp< \ - T, \ - FusedResizeAndPadConvFunctor, \ - BILINEAR>, \ - true>); - -TF_CALL_half(REGISTER_FUSED); -TF_CALL_float(REGISTER_FUSED); -TF_CALL_double(REGISTER_FUSED); - -#define REGISTER_PAD_ONLY_FUSED(T) \ - REGISTER_KERNEL_BUILDER( \ - Name("FusedPadConv2D").Device(DEVICE_CPU).TypeConstraint("T"), \ - FusedResizeConv2DUsingGemmOp< \ - T, \ - FusedResizeAndPadConvFunctor, \ - NEAREST>, \ - false>); - -TF_CALL_half(REGISTER_PAD_ONLY_FUSED); -TF_CALL_float(REGISTER_PAD_ONLY_FUSED); -TF_CALL_double(REGISTER_PAD_ONLY_FUSED); - -// Support for fusing computationally cheap, but memory bandwidth expensive -// computations into the output of convolution to reduce the overall latency. -// -// Example: Fuse Conv2D+BiasAdd+Relu. 
- namespace { typedef Eigen::ThreadPoolDevice CPUDevice; diff --git a/tensorflow/core/kernels/conv_ops_fused_image_transform.cc b/tensorflow/core/kernels/conv_ops_fused_image_transform.cc new file mode 100644 index 0000000000..7be1de29c9 --- /dev/null +++ b/tensorflow/core/kernels/conv_ops_fused_image_transform.cc @@ -0,0 +1,902 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Implements convolution operations with image transformations (resize and +// mirror padding) baked into the processing, to optimize latency and memory +// usage. 
+ +#define EIGEN_USE_THREADS + +#include +#include +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/numeric_op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_slice.h" +#include "tensorflow/core/kernels/bounds_check.h" +#include "tensorflow/core/kernels/conv_2d.h" +#include "tensorflow/core/kernels/conv_ops.h" +#include "tensorflow/core/kernels/gemm_functors.h" +#include "tensorflow/core/kernels/image_resizer_state.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/util/mirror_pad_mode.h" +#include "tensorflow/core/util/padding.h" +#include "tensorflow/core/util/tensor_format.h" + +namespace tensorflow { +namespace { + +// We don't want to allocate a buffer to hold all the patches if the size is +// going to be extremely large, so break it into chunks if it's bigger than +// a limit. Each chunk will be processed serially, so we can refill the +// buffer for the next chunk and reuse it, keeping maximum memory size down. +// In this case, we've picked 16 megabytes as a reasonable limit for Android and +// other platforms using Eigen, and 1MB for iOS devices, from experimentation. +#if defined(__APPLE__) && defined(IS_MOBILE_PLATFORM) +const size_t kMaxChunkSize = (1 * 1024 * 1024); +#else +const size_t kMaxChunkSize = (16 * 1024 * 1024); +#endif +const size_t kResizeCacheSize = (8 * 1024 * 1024); + +// Lookup method used when resizing. +enum SamplingMode { + BILINEAR = 0, + NEAREST = 1, +}; + +// Simple utility function used by FusedConv to multithread basic workloads. 
To +// use it, pass begin and end values for the full workload and a std::function +// that receives a subset of that through the begin and end values for each +// worker's task. The division of the full workload into worker tasks is handled +// by the multithreading logic. Here's an example of how to use it: +// std::vector my_vector(100); +// ... +// FusedConvParallelFor(context, 0, 100, +// [&my_vector](int64 task_begin, int64 task_end) { +// for (int64 current = task_begin; current != task_end; ++current) { +// my_vector[current] *= 10.0f; +// } +// }); +void FusedConvParallelFor( + OpKernelContext* context, int64 begin, int64 end, + const std::function& task_function) { +// On iOS, the thread management imposes a very big performance penalty, so +// just call the function directly with no multithreading. +#if defined(__APPLE__) && defined(IS_MOBILE_PLATFORM) + task_function(begin, end); +#else + auto& worker_threads = *(context->device()->tensorflow_cpu_worker_threads()); + thread::ThreadPool* thread_pool = worker_threads.workers; + const int64 total_elements = end - begin; + // This is a bit of an arbitrary number, but was found to work well for + // typical models we've been profiling on various devices. + const int64 element_cost = 10000000; + thread_pool->ParallelFor( + total_elements, element_cost, + [begin, task_function](int64 begin_offset, int64 end_offset) { + const int64 task_begin = begin + begin_offset; + const int64 task_end = begin + end_offset; + task_function(task_begin, task_end); + }); +#endif +} + +// Holds the state needed for the resizing subtasks. 
+template +struct ResizeTaskParameters { + ResizeTaskParameters() : st(false) {} + + int cache_height; + T1* resize_cache; + int cache_line_width; + int input_width; + int input_depth; + int top_padding; + int pad_offset; + int64 resized_height; + ImageResizerState st; + const T1* input_batch_start; + int64 cache_start_x; + int64 cache_end_x; + int left_padding; + int64 resized_width; + int64 padded_width; + int64 padded_height; +}; + +template +struct PerCacheLineParameters { + PerCacheLineParameters() {} + PerCacheLineParameters(const PerCacheLineParameters& other) + : cache_line_start(other.cache_line_start), + input_top_row_start(other.input_top_row_start), + input_bottom_row_start(other.input_bottom_row_start), + y_lerp(other.y_lerp) {} + + T1* cache_line_start; + const T1* input_top_row_start; + const T1* input_bottom_row_start; + T1 y_lerp; +}; + +// Helper class to simplify bilinear filtering +template +struct SampleRect { + EIGEN_ALWAYS_INLINE SampleRect(const T1* in_top_left, const T1* in_top_right, + const T1* in_bottom_left, + const T1* in_bottom_right) + : top_left(in_top_left), + top_right(in_top_right), + bottom_left(in_bottom_left), + bottom_right(in_bottom_right) {} + + EIGEN_ALWAYS_INLINE T1 BilinearSample(int channel, T1 x_lerp, + T1 y_lerp) const { + const T1 top = + top_left[channel] + (top_right[channel] - top_left[channel]) * x_lerp; + const T1 bottom = bottom_left[channel] + + (bottom_right[channel] - bottom_left[channel]) * x_lerp; + return top + (bottom - top) * y_lerp; + } + + const T1* top_left; + const T1* top_right; + const T1* bottom_left; + const T1* bottom_right; +}; + +// Calculates parameters which remain constant through a resize cache row. 
+template +EIGEN_ALWAYS_INLINE PerCacheLineParameters CalculatePerCacheLineParameters( + int64 cache_height, int64 cache_y, T1* resize_cache, int64 cache_line_width, + int64 input_width, int64 input_depth, int64 top_padding, int64 pad_offset, + int64 resized_height, const ImageResizerState& st, + const T1* input_batch_start) { + PerCacheLineParameters result; + // The cache is organized so that the real y values of the resized image map + // onto the actual cache values through a modulo scheme. This means that as we + // progress downwards through the image, we keep reusing a small cache and so + // keep memory usage down. + int64 cache_index_y; + if (cache_y < 0) { + cache_index_y = cache_height + (cache_y % cache_height); + } else { + cache_index_y = cache_y % cache_height; + } + result.cache_line_start = + resize_cache + (cache_index_y * cache_line_width * input_depth); + // This part is implementing the mirror padding that happens before resizing. + float in_y = (cache_y - top_padding); + if (in_y < 0) { + in_y = -(in_y + 1.0f - pad_offset); + } else if (in_y >= resized_height) { + in_y = (resized_height * 2.0f) - (in_y + 1.0f + pad_offset); + } + // Here's where do do the actual resize. + in_y *= st.height_scale; + const int64 top_y_index = static_cast(std::floor(in_y)); + const int64 bottom_y_index = + std::min(static_cast(std::ceil(in_y)), (st.in_height - 1)); + // Lerp is used for bilinear filtering when that's needed. + result.y_lerp = static_cast(in_y - top_y_index); + // Which rows of the original input image to pull the values from. 
+ result.input_top_row_start = + input_batch_start + (top_y_index * input_width * input_depth); + result.input_bottom_row_start = + input_batch_start + (bottom_y_index * input_width * input_depth); + return result; +} + +template +struct PerCachePixelParameters { + PerCachePixelParameters() {} + PerCachePixelParameters(const PerCachePixelParameters& other) + : cache_line_pixel(other.cache_line_pixel), + left_x_index(other.left_x_index), + right_x_index(other.right_x_index), + x_lerp(other.x_lerp) {} + + T1* cache_line_pixel; + int64 left_x_index; + int64 right_x_index; + T1 x_lerp; +}; + +// Pulls out common parameters used for every resized pixel. +template +EIGEN_ALWAYS_INLINE PerCachePixelParameters +CalculatePerCachePixelParameters(int64 cache_x, int64 cache_start_x, + T1* cache_line_start, int64 input_depth, + int64 left_padding, int64 pad_offset, + int64 resized_width, + const ImageResizerState& st) { + PerCachePixelParameters result; + // Figure out where we're going to store the results of our transform. + const int cache_index_x = cache_x - cache_start_x; + result.cache_line_pixel = cache_line_start + (cache_index_x * input_depth); + // Implement mirror padding by flipping in_x if it's off the edge. + float in_x = (cache_x - left_padding); + if (in_x < 0) { + in_x = -(in_x + 1.0f - pad_offset); + } else if (in_x >= resized_width) { + in_x = (resized_width * 2.0f) - (in_x + 1.0f + pad_offset); + } + // Resize the x parameters. + in_x *= st.width_scale; + // Get the x coordinates for the left and right pixels to pull from. + result.left_x_index = static_cast(std::floor(in_x)); + result.right_x_index = + std::min(static_cast(std::ceil(in_x)), (st.in_width - 1)); + // This x_lerp is used to blend pixels in bilinear filtering. + result.x_lerp = static_cast(in_x - result.left_x_index); + return result; +} + +// Combines bilinear resizing and mirror padding into the im2col transformation +// stage of convolution. 
+template +class FusedResizeAndPadConvFunctor { + public: + void operator()(OpKernelContext* context, const Tensor& input, + int input_batches, int resized_height, int resized_width, + int padded_height, int padded_width, int input_depth, + const T2* filter_data, int filter_height, int filter_width, + int filter_count, int stride_rows, int stride_cols, + Padding padding, T3* output_data, int output_height, + int output_width, const ImageResizerState& st, + int top_padding, int bottom_padding, int left_padding, + int right_padding, int pad_offset) { + if ((input_batches <= 0) || (padded_width <= 0) || (padded_height <= 0) || + (input_depth <= 0)) { + LOG(WARNING) << "Conv2D was called with bad input dimensions: " + << input_batches << ", " << padded_height << ", " + << padded_width << ", " << input_depth; + return; + } + if ((filter_width <= 0) || (filter_height <= 0) || (filter_count <= 0)) { + LOG(WARNING) << "Conv2D was called with bad filter dimensions: " + << filter_width << ", " << filter_height << ", " + << filter_count; + return; + } + if ((output_width <= 0) || (output_height <= 0)) { + LOG(WARNING) << "Conv2D was called with bad output width or height: " + << output_width << ", " << output_height; + return; + } + OP_REQUIRES( + context, ((SampleMode == NEAREST) || (SampleMode == BILINEAR)), + errors::InvalidArgument("Bad sample mode passed in", SampleMode)); + + // These calculations define how the patches will be positioned within the + // input image. The actual definitions are quite complex, and rely on the + // previously-calculated output size. 
+ int filter_left_offset; + int filter_top_offset; + if (padding == VALID) { + filter_left_offset = + ((output_width - 1) * stride_cols + filter_width - padded_width + 1) / + 2; + filter_top_offset = ((output_height - 1) * stride_rows + filter_height - + padded_height + 1) / + 2; + } else { + filter_left_offset = + ((output_width - 1) * stride_cols + filter_width - padded_width) / 2; + filter_top_offset = + ((output_height - 1) * stride_rows + filter_height - padded_height) / + 2; + } + + ResizeTaskParameters task_params; + task_params.input_depth = input_depth; + task_params.top_padding = top_padding; + task_params.pad_offset = pad_offset; + task_params.resized_height = resized_height; + task_params.st = st; + task_params.left_padding = left_padding; + task_params.resized_width = resized_width; + task_params.padded_width = padded_width; + task_params.padded_height = padded_height; + + // The im2col buffer has # of patches rows, and # of filters cols. + // It's laid out like this, in row major order in memory: + // < filter value count > + // ^ +---------------------+ + // patch | | + // count | | + // v +---------------------+ + // Each patch row contains a filter_width x filter_height patch of the + // input, with the depth channel as the most contiguous in memory, followed + // by the width, then the height. This is the standard memory order in the + // image world if it helps to visualize it. + const int filter_value_count = filter_width * filter_height * input_depth; + + OP_REQUIRES(context, (filter_value_count * sizeof(T1)) <= kMaxChunkSize, + errors::InvalidArgument("Im2Col patch too large for buffer")); + const size_t patches_per_chunk = + kMaxChunkSize / (filter_value_count * sizeof(T1)); + // Because memory allocation is very expensive on mobile platforms, try to + // allocate a persistent buffer that will be kept around between calls. We + // use TensorFlow's resource management to ensure that the memory will be + // released when the session is over. 
+ Im2ColBufferResource* im2col_buffer_resource; + std::function**)> creator = + [](Im2ColBufferResource** resource) { + *resource = new Im2ColBufferResource(); + return Status::OK(); + }; + OP_REQUIRES_OK(context, context->resource_manager()->LookupOrCreate( + "Conv2d", "im2col_buffer", + &im2col_buffer_resource, creator)); + + // Create a resize cache memory buffer that will hold the rows of + // transformed and mirror padded input pixels, ready to be copied + // into filter patches by im2col. + // It's laid out like this, in row major order in memory: + // < cache line width > + // ^ +--------------------+ + // cache | | + // height | | + // v +--------------------+ + // Each cache row contains a cache_line_width number of resized pixels, + // each with input_depth channels. The cache height is typically less than + // the full height the resized image would be, so it's filled up + // incrementally as we progress downwards through the input creating im2col + // patches. + task_params.cache_start_x = -filter_left_offset; + task_params.cache_end_x = + (((output_width - 1) * stride_cols) - filter_left_offset) + + filter_width; + task_params.cache_line_width = + task_params.cache_end_x - task_params.cache_start_x; + task_params.cache_height = + kResizeCacheSize / (task_params.cache_line_width * input_depth); + const int needed_resize_cache_count = + filter_height * task_params.cache_line_width * input_depth; + OP_REQUIRES(context, + (needed_resize_cache_count * sizeof(T1)) <= kResizeCacheSize, + errors::InvalidArgument("Input too large for resize cache")); + Im2ColBufferResource* resize_cache_resource; + std::function**)> + resize_creator = + [](Im2ColBufferResource** resource) { + *resource = new Im2ColBufferResource(); + return Status::OK(); + }; + OP_REQUIRES_OK(context, context->resource_manager()->LookupOrCreate( + "Conv2d", "resize_cache", + &resize_cache_resource, resize_creator)); + + // This means that multiple ops can't be run simultaneously on different + 
// threads, because we have a single shared resource. The platforms this is + // aimed at have intra-op parallelism as their focus though, so it shouldn't + // be an issue. + mutex_lock lock_buffer(im2col_buffer_resource->mu); + core::ScopedUnref unref_buffer(im2col_buffer_resource); + T1* im2col_buffer = im2col_buffer_resource->data; + + // This buffer is used as a fairly heavy-weight cache for the resized and + // mirrored inputs to the im2col operation. The problem is that we want to + // keep the memory usage down by not rendering the fully resized and padded + // input tensor to the convolution into an entire buffer. The first approach + // to avoid this was to fold the bilinear filtering and padding spatial + // transformations into the im2col lookup itself. This successfully reduced + // memory usage, but because im2col can access an individual pixel for many + // different patches, the extra overhead of doing the same bilinear lookups + // repeatedly became too expensive. + // The resize cache is designed to avoid this problem by keeping a + // horizontal slice of the resized and padded input to the im2col + // precalculated, so that repeated accesses to the same pixel from different + // filter patches can just be copied from this cache. It's organized as a + // horizontal slice stretching across the whole virtual image, and as high + // as the filter window, so that as the patch processing moves across all + // the pixels are present, and before a new row of patches is started any + // previously calculated rows that are needed are maintained, with new rows + // calculated as required. 
+ mutex_lock resize_lock_buffer(resize_cache_resource->mu); + core::ScopedUnref unref_resized_cache(resize_cache_resource); + task_params.resize_cache = resize_cache_resource->data; + + const T1* input_data = input.flat().data(); + const int64 input_height = input.shape().dim_sizes()[1]; + task_params.input_width = input.shape().dim_sizes()[2]; + + int end_cached_lines = std::numeric_limits::min(); + + for (int batch = 0; batch < input_batches; ++batch) { + task_params.input_batch_start = + input_data + + (batch * input_height * task_params.input_width * input_depth); + const int in_y_end = + ((output_height * stride_rows) - filter_top_offset) + filter_height; + for (int out_y = 0; out_y < output_height; ++out_y) { + const int in_y_origin = (out_y * stride_rows) - filter_top_offset; + const int cache_start_y = std::max(in_y_origin, end_cached_lines); + const int cache_end_y = std::min( + in_y_end, std::max((in_y_origin + task_params.cache_height), + end_cached_lines)); + if (end_cached_lines < (in_y_origin + filter_height)) { + // This call breaks up the work required for calculating the mirror + // padding and resizing across multiple threads. + FusedConvParallelFor( + context, cache_start_y, cache_end_y, + [task_params](int64 task_cache_start_y, int64 task_cache_end_y) { + // This is a long and confusing function, but it's been laid out + // this way to help with performance on some intensive models. + // What it's doing is populating a cache of the original input + // image, after it's been bilinear resized and had its edges + // mirrored. This allows the following im2col code to access the + // transformed pixels from this cache, without having to + // repeatedly apply the expensive bilinear calculations as the + // same pixels are accessed by different patches. + // This is most effective when the stride is small and the + // filter size is large, since that's when pixels are reused + // most frequently as patches overlap. 
+ for (int cache_y = task_cache_start_y; + cache_y < task_cache_end_y; ++cache_y) { + // We organize the cache as a series of rows, each containing + // all the transformed pixels for a given line in the image. + // This cache is big enough to hold at least a filter's height + // worth of rows, but typically more, limited by the size of + // the cache buffer. + // We don't allocate an entire image's worth of rows though, + // because we're trying to keep memory usage down, so as we + // progress downwards through the im2col we periodically + // refresh the cache so that the next lines that are needed + // for that operation are always present. + // Work out the parameters that remain constant across the + // row we're calculating. + PerCacheLineParameters line_params( + CalculatePerCacheLineParameters( + task_params.cache_height, cache_y, + task_params.resize_cache, + task_params.cache_line_width, task_params.input_width, + task_params.input_depth, task_params.top_padding, + task_params.pad_offset, task_params.resized_height, + task_params.st, task_params.input_batch_start)); + // Iterate through the resize cache row we're filling in. + for (int cache_x = task_params.cache_start_x; + cache_x < task_params.cache_end_x; ++cache_x) { + // Figure out what we need for the cache pixel we're + // populating. + PerCachePixelParameters pixel_params( + CalculatePerCachePixelParameters( + cache_x, task_params.cache_start_x, + line_params.cache_line_start, + task_params.input_depth, task_params.left_padding, + task_params.pad_offset, task_params.resized_width, + task_params.st)); + // If the access is off the left, right, top, or bottom of + // the resized image, the conv padding means we should set + // it to zero. 
+ if ((cache_x < 0) || + (cache_x >= task_params.padded_width) || + (cache_y < 0) || + (cache_y >= task_params.padded_height)) { + std::fill_n(pixel_params.cache_line_pixel, + task_params.input_depth, T1(0)); + } else { + // There are two different sampling strategies for + // resizing. When using nearest, we can just do a + // straight copy of the pixel closest to our sample point, + // but bilinear requires a more complex calculation. + if (SampleMode == NEAREST) { + const T1* input_top_left_pixel = + line_params.input_top_row_start + + (pixel_params.left_x_index * + task_params.input_depth); + + std::copy_n(input_top_left_pixel, + task_params.input_depth, + pixel_params.cache_line_pixel); + } else { + const SampleRect rect( + line_params.input_top_row_start + + (pixel_params.left_x_index * + task_params.input_depth), + line_params.input_top_row_start + + (pixel_params.right_x_index * + task_params.input_depth), + line_params.input_bottom_row_start + + (pixel_params.left_x_index * + task_params.input_depth), + line_params.input_bottom_row_start + + (pixel_params.right_x_index * + task_params.input_depth)); + for (int in_channel = 0; + in_channel < task_params.input_depth; + ++in_channel) { + pixel_params.cache_line_pixel[in_channel] = + rect.BilinearSample(in_channel, + pixel_params.x_lerp, + line_params.y_lerp); + } + } + } + } + } + }); + end_cached_lines = cache_end_y; + } + for (int out_x = 0; out_x < output_width; ++out_x) { + const int in_x_origin = (out_x * stride_cols) - filter_left_offset; + const int patch_index = (batch * output_width * output_height) + + (out_y * output_width) + out_x; + const int patch_index_within_chunk = patch_index % patches_per_chunk; + T1* im2col_patch_start = + im2col_buffer + (patch_index_within_chunk * filter_value_count); + for (int filter_y = 0; filter_y < filter_height; ++filter_y) { + T1* im2col_row_start = + im2col_patch_start + + (filter_y * filter_width * task_params.input_depth); + const int conv_in_y = in_y_origin + 
filter_y; + int cache_index_y; + if (conv_in_y < 0) { + cache_index_y = task_params.cache_height + + (conv_in_y % task_params.cache_height); + } else { + cache_index_y = conv_in_y % task_params.cache_height; + } + T1* cache_line_start = + task_params.resize_cache + + (cache_index_y * task_params.cache_line_width * + task_params.input_depth); + T1* cache_filter_row_start = + cache_line_start + ((in_x_origin - task_params.cache_start_x) * + task_params.input_depth); + std::copy_n(cache_filter_row_start, + (filter_width * task_params.input_depth), + im2col_row_start); + } + const bool is_last_in_chunk = + (patch_index_within_chunk == (patches_per_chunk - 1)); + const bool is_last_overall = + ((batch == (input_batches - 1)) && + (out_y == (output_height - 1)) && (out_x == (output_width - 1))); + if (is_last_in_chunk || is_last_overall) { + // Now we've assembled a set of image patches into a matrix, apply + // a GEMM matrix multiply of the patches as rows, times the filter + // weights in columns, to get partial results in the output + // matrix. + const int how_many_patches = patch_index_within_chunk + 1; + const int m = how_many_patches; + const int n = filter_count; + const int k = filter_value_count; + const int lda = filter_value_count; + const int ldb = filter_count; + const int ldc = filter_count; + const size_t start_patch_index = + patch_index - (how_many_patches - 1); + T3* chunk_output_data = + output_data + (start_patch_index * filter_count); + TGemmFunctor gemm_functor; + gemm_functor(context, m, n, k, im2col_buffer, lda, filter_data, ldb, + chunk_output_data, ldc); + } + } + } + } + } +}; + +} // namespace + +// Implements a version of convolution with bilinear resizing and mirror padding +// included. 
+template +class FusedResizeConv2DUsingGemmOp : public OpKernel { + public: + explicit FusedResizeConv2DUsingGemmOp(OpKernelConstruction* context) + : OpKernel(context) { + if (DoResize) { + OP_REQUIRES_OK(context, + context->GetAttr("resize_align_corners", &align_corners_)); + } + MirrorPadMode mode; + OP_REQUIRES_OK(context, context->GetAttr("mode", &mode)); + + switch (mode) { + case MirrorPadMode::SYMMETRIC: { + offset_ = 0; + break; + } + case MirrorPadMode::REFLECT: { + offset_ = 1; + break; + } + default: + OP_REQUIRES(context, false, + errors::InvalidArgument( + "mode must be either REFLECT or SYMMETRIC.")); + } + OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_)); + OP_REQUIRES(context, strides_.size() == 4, + errors::InvalidArgument("Sliding window strides field must " + "specify 4 dimensions")); + const int64 stride_n = GetTensorDim(strides_, FORMAT_NHWC, 'N'); + const int64 stride_c = GetTensorDim(strides_, FORMAT_NHWC, 'C'); + OP_REQUIRES( + context, stride_n == 1 && stride_c == 1, + errors::InvalidArgument("Current implementation does not yet support " + "strides in the batch and depth dimensions.")); + OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + } + + void Compute(OpKernelContext* context) override { + // Input tensor is of the following dimensions: + // [ batch, in_rows, in_cols, in_depth ] + const Tensor& input = context->input(0); + OP_REQUIRES(context, (input.shape().num_elements() > 0), + errors::InvalidArgument("Input tensor can't be empty")); + + ImageResizerState st(false); + if (DoResize) { + st = ImageResizerState(align_corners_); + st.ValidateAndCalculateOutputSize(context, input); + if (!context->status().ok()) return; + } else { + // Set up the resize parameters to do no scaling at all. 
+ st.batch_size = input.dim_size(0); + st.out_height = input.dim_size(1); + st.out_width = input.dim_size(2); + st.in_height = input.dim_size(1); + st.in_width = input.dim_size(2); + st.channels = input.dim_size(3); + st.height_scale = 1.0f; + st.width_scale = 1.0f; + } + TensorShape resized_shape( + {input.dim_size(0), st.out_height, st.out_width, input.dim_size(3)}); + int paddings_index; + int filter_index; + if (DoResize) { + paddings_index = 2; + filter_index = 3; + } else { + paddings_index = 1; + filter_index = 2; + } + const Tensor& paddings = context->input(paddings_index); + + const int dims = resized_shape.dims(); + OP_REQUIRES( + context, + TensorShapeUtils::IsMatrix(paddings.shape()) && + paddings.dim_size(1) == 2, + errors::InvalidArgument("paddings must be a matrix with 2 columns: ", + paddings.shape().DebugString())); + const int fixed_dims = + (allow_legacy_scalars() && dims == 0 && paddings.dim_size(0) == 1) + ? 1 + : dims; + OP_REQUIRES( + context, fixed_dims == paddings.dim_size(0), + errors::InvalidArgument( + "The first dimension of paddings must be the rank of inputs: ", + fixed_dims, " ", paddings.shape().DebugString(), " ", + resized_shape.DebugString())); + OP_REQUIRES( + context, dims == paddings.dim_size(0), + errors::InvalidArgument( + "The first dimension of paddings must be the rank of inputs: ", + dims, " ", paddings.shape().DebugString(), " ", + resized_shape.DebugString())); + + OP_REQUIRES( + context, dims == 4, + errors::InvalidArgument( + "Fused mirror padding only supports four-dimensional inputs, but ", + dims, " requested")); + + // Compute the shape of the output tensor, and allocate it. + TensorShape padded_shape; + TTypes::ConstMatrix paddings_matrix = paddings.matrix(); + for (int d = 0; d < dims; ++d) { + const int32 before = + paddings_matrix(d, 0); // Pad before existing elements. + const int32 after = + paddings_matrix(d, 1); // Pad after existing elements. 
+ OP_REQUIRES(context, before >= 0 && after >= 0, + errors::InvalidArgument( + "paddings must be non-negative: ", before, " ", after)); + if (offset_ == 0) { // SYMMETRIC mode. + OP_REQUIRES( + context, + before <= resized_shape.dim_size(d) && + after <= resized_shape.dim_size(d), + errors::InvalidArgument("paddings must be no greater " + "than the dimension size: ", + before, ", ", after, " greater than ", + resized_shape.dim_size(d))); + } else if (offset_ == 1) { // REFLECT mode. + OP_REQUIRES( + context, + before < resized_shape.dim_size(d) && + after < resized_shape.dim_size(d), + errors::InvalidArgument("paddings must be less than" + " the dimension size: ", + before, ", ", after, " not less than ", + resized_shape.dim_size(d))); + } + padded_shape.AddDim(before + resized_shape.dim_size(d) + after); + } + + OP_REQUIRES( + context, ((paddings_matrix(0, 0) == 0) && (paddings_matrix(0, 1) == 0)), + errors::InvalidArgument( + "Fused mirror padding only support spatial padding, not batches: ", + paddings.DebugString())); + OP_REQUIRES( + context, ((paddings_matrix(3, 0) == 0) && (paddings_matrix(3, 1) == 0)), + errors::InvalidArgument( + "Fused mirror padding only support spatial padding, not channels: ", + paddings.DebugString())); + const int32 top_padding = paddings_matrix(1, 0); + const int32 bottom_padding = paddings_matrix(1, 1); + const int32 left_padding = paddings_matrix(2, 0); + const int32 right_padding = paddings_matrix(2, 1); + + // Input filter is of the following dimensions: + // [ filter_rows, filter_cols, in_depth, out_depth] + const Tensor& filter = context->input(filter_index); + + // For 2D convolution, there should be 4 dimensions. 
+ OP_REQUIRES(context, padded_shape.dims() == 4, + errors::InvalidArgument("input must be 4-dimensional", + padded_shape.DebugString())); + OP_REQUIRES(context, filter.dims() == 4, + errors::InvalidArgument("filter must be 4-dimensional: ", + filter.shape().DebugString())); + + // We only check the first three dims, since the depth is accessed as an + // int64 below. + for (int i = 0; i < 3; i++) { + OP_REQUIRES( + context, + FastBoundsCheck(filter.dim_size(i), std::numeric_limits::max()), + errors::InvalidArgument("filter too large")); + } + + // The last dimension for input is in_depth. It must be the same as the + // filter's in_depth. + const int64 in_depth = padded_shape.dim_size(3); + OP_REQUIRES(context, in_depth == filter.dim_size(2), + errors::InvalidArgument( + "input and filter must have the same depth: ", in_depth, + " vs ", filter.dim_size(2))); + + // The last dimension for filter is out_depth. + const int out_depth = static_cast(filter.dim_size(3)); + + // The second dimension for input is rows/height. + // The first dimension for filter is rows/height. + const int64 padded_rows_raw = padded_shape.dim_size(1); + OP_REQUIRES( + context, + FastBoundsCheck(padded_rows_raw, std::numeric_limits::max()), + errors::InvalidArgument("Input rows too large")); + const int padded_rows = static_cast(padded_rows_raw); + const int filter_rows = static_cast(filter.dim_size(0)); + const int resized_rows = static_cast(resized_shape.dim_size(1)); + + // The third dimension for input is columns/width. + // The second dimension for filter is columns/width. 
+ const int64 padded_cols_raw = padded_shape.dim_size(2); + OP_REQUIRES( + context, + FastBoundsCheck(padded_cols_raw, std::numeric_limits::max()), + errors::InvalidArgument("Input cols too large")); + const int padded_cols = static_cast(padded_cols_raw); + const int filter_cols = static_cast(filter.dim_size(1)); + const int resized_cols = static_cast(resized_shape.dim_size(2)); + + // The first dimension for input is batch. + const int64 batch_raw = padded_shape.dim_size(0); + OP_REQUIRES(context, + FastBoundsCheck(batch_raw, std::numeric_limits::max()), + errors::InvalidArgument("batch is too large")); + const int batch = static_cast(batch_raw); + + // For now we take the stride from the second and third dimensions only (we + // do not support striding on the batch or depth dimension). + const int stride_rows = GetTensorDim(strides_, FORMAT_NHWC, 'H'); + const int stride_cols = GetTensorDim(strides_, FORMAT_NHWC, 'W'); + + int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0; + OP_REQUIRES_OK(context, + GetWindowedOutputSize(padded_rows, filter_rows, stride_rows, + padding_, &out_rows, &pad_rows)); + OP_REQUIRES_OK(context, + GetWindowedOutputSize(padded_cols, filter_cols, stride_cols, + padding_, &out_cols, &pad_cols)); + TensorShape out_shape = + ShapeFromFormat(FORMAT_NHWC, batch, out_rows, out_cols, out_depth); + OP_REQUIRES(context, (out_shape.num_elements() > 0), + errors::InvalidArgument("Output tensor can't be empty")); + + // Output tensor is of the following dimensions: + // [ in_batch, out_rows, out_cols, out_depth ] + Tensor* output = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output)); + + VLOG(2) << "FusedConv2D: " << name() << ", in_depth = " << in_depth + << ", padded_cols = " << padded_cols + << ", resized_cols = " << resized_cols + << ", filter_cols = " << filter_cols + << ", padded_rows = " << padded_rows + << ", resized_rows = " << resized_rows + << ", filter_rows = " << filter_rows + << ", stride_rows = 
" << stride_rows + << ", stride_cols = " << stride_cols + << ", out_depth = " << out_depth << ", DoResize=" << DoResize; + + // If there is nothing to compute, return. + if (out_shape.num_elements() == 0) { + return; + } + TConvFunctor conv_functor; + conv_functor(context, input, batch, resized_rows, resized_cols, padded_rows, + padded_cols, in_depth, filter.flat().data(), filter_rows, + filter_cols, out_depth, stride_rows, stride_cols, padding_, + output->flat().data(), out_rows, out_cols, st, top_padding, + bottom_padding, left_padding, right_padding, offset_); + } + + private: + std::vector strides_; + Padding padding_; + bool align_corners_; + int offset_; + + TF_DISALLOW_COPY_AND_ASSIGN(FusedResizeConv2DUsingGemmOp); +}; + +#define REGISTER_FUSED(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("FusedResizeAndPadConv2D") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T"), \ + FusedResizeConv2DUsingGemmOp< \ + T, \ + FusedResizeAndPadConvFunctor, \ + BILINEAR>, \ + true>); + +TF_CALL_half(REGISTER_FUSED); +TF_CALL_float(REGISTER_FUSED); +TF_CALL_double(REGISTER_FUSED); + +#define REGISTER_PAD_ONLY_FUSED(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("FusedPadConv2D").Device(DEVICE_CPU).TypeConstraint("T"), \ + FusedResizeConv2DUsingGemmOp< \ + T, \ + FusedResizeAndPadConvFunctor, \ + NEAREST>, \ + false>); + +TF_CALL_half(REGISTER_PAD_ONLY_FUSED); +TF_CALL_float(REGISTER_PAD_ONLY_FUSED); +TF_CALL_double(REGISTER_PAD_ONLY_FUSED); + +} // namespace tensorflow -- GitLab From d194c00783646820d8a2d5e9be2edf42518dafbf Mon Sep 17 00:00:00 2001 From: AG Ramesh Date: Tue, 4 Dec 2018 10:21:14 -0800 Subject: [PATCH 095/405] Clang format error fixes --- tensorflow/core/graph/mkl_layout_pass.cc | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc index a143892e08..52b4660094 100644 --- a/tensorflow/core/graph/mkl_layout_pass.cc +++ b/tensorflow/core/graph/mkl_layout_pass.cc @@ 
-1340,8 +1340,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass { static void CopyAttrsPadWithConv2D(const Node* orig_node, NodeBuilder* nb, bool change_format = false); static void CopyAttrsFromPadAndConv2D(const Node* orig_node1, - const Node* orig_node2, - NodeBuilder* nb, + const Node* orig_node2, NodeBuilder* nb, bool change_format = false); static void CopyAttrsPooling(const Node* orig_node, NodeBuilder* nb, bool change_format = false); @@ -2435,9 +2434,9 @@ Status MklLayoutRewritePass::MergeConv2DWithBiasAdd(std::unique_ptr* g, Status MklLayoutRewritePass::MergePadWithConv2D(std::unique_ptr* g, Node* m, Node* n) { DCHECK(((m->type_string() == csinfo_.pad && - n->type_string() == csinfo_.conv2d)) || - ((n->type_string() == csinfo_.pad && - m->type_string() == csinfo_.conv2d))); + n->type_string() == csinfo_.conv2d)) || + ((n->type_string() == csinfo_.pad && + m->type_string() == csinfo_.conv2d))); // Conv2D is successor node, and Pad predecessor node. Node* pred = m->type_string() == csinfo_.pad ? m : n; @@ -2572,7 +2571,7 @@ Status MklLayoutRewritePass::MergePadWithConv2D(std::unique_ptr* g, // output (at slot 0). 
const int kPadWithConv2DOutputSlot = 0; (*g)->AddEdge(new_node, kPadWithConv2DOutputSlot, e->dst(), - e->dst_input()); + e->dst_input()); } } -- GitLab From be4a5d634fa1dea81a13ddbd034c241be8dc5e55 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Tue, 4 Dec 2018 09:58:49 -0800 Subject: [PATCH 096/405] Turn on PartitionedCallOp inlining by default PiperOrigin-RevId: 223996298 --- tensorflow/core/grappler/grappler_item.h | 7 ++ .../grappler/optimizers/function_optimizer.cc | 90 +++++++++++++------ .../grappler/optimizers/meta_optimizer.cc | 4 + .../core/kernels/partitioned_function_ops.cc | 6 ++ 4 files changed, 78 insertions(+), 29 deletions(-) diff --git a/tensorflow/core/grappler/grappler_item.h b/tensorflow/core/grappler/grappler_item.h index aea7d27792..6ef4f14247 100644 --- a/tensorflow/core/grappler/grappler_item.h +++ b/tensorflow/core/grappler/grappler_item.h @@ -83,6 +83,13 @@ struct GrapplerItem { // Is it allowed to add nodes to the graph that do not have registered // gradient function. bool non_differentiable_rewrites = true; + // By default we are not allowed to inline ops with side effects into the + // main graph, because we can't guarantee that after pruning these ops will + // be executed. However if we are optimizing a function library (see + // meta_optimizer.cc) and a graph was instantiated by a function definition, + // we can do that, because functions guarantee that all side effects will be + // executed (see function_optimizer.cc for details). 
+ bool inline_ops_with_side_effects = false; }; const std::unordered_set& devices() const; diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.cc b/tensorflow/core/grappler/optimizers/function_optimizer.cc index 2dd4ff10e4..69685409a3 100644 --- a/tensorflow/core/grappler/optimizers/function_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/function_optimizer.cc @@ -258,6 +258,7 @@ class FunctionOptimizerContext { : grappler_item_id_(item.id), graph_version_(item.graph.versions().producer()), opt_level_(opt_level), + allowed_optimizations_(item.allowed_optimizations()), function_library_(OpRegistry::Global(), item.graph.library()), available_device_names_(item.devices().begin(), item.devices().end()), graph_view_(&item.graph) { @@ -267,6 +268,10 @@ class FunctionOptimizerContext { const RewriterConfig::Toggle opt_level() const { return opt_level_; } + const GrapplerItem::AllowedOptimizations& allowed_optimizations() const { + return allowed_optimizations_; + } + const FunctionLibraryDefinition& function_library() const { return function_library_; } @@ -397,6 +402,7 @@ class FunctionOptimizerContext { const string grappler_item_id_; const int graph_version_; const RewriterConfig::Toggle opt_level_; + const GrapplerItem::AllowedOptimizations allowed_optimizations_; FunctionLibraryDefinition function_library_; // These fields initialized lazily only if needed. @@ -1228,12 +1234,6 @@ Status IsInlinableIndirectFunctionCall(const FunctionOptimizerContext& ctx, SummarizeNodeDef(func_node)); } - // TODO(ezhulenev): Enable it by default. 
- if (ctx.opt_level() != RewriterConfig::AGGRESSIVE) { - return errors::FailedPrecondition( - "Indirect function inlining supported only in aggressive mode"); - } - if (MarkedNoInline(func)) { return errors::FailedPrecondition( "Can't inline function marked with '_noinline': ", @@ -1256,6 +1256,20 @@ Status IsInlinableIndirectFunctionCall(const FunctionOptimizerContext& ctx, SummarizeNodeDef(func_node)); } + // We can't inline functions with `Switch` nodes in the function body, because + // they might have dead tensors as a function output argument (we need all + // intermediate tensors to compute the function gradient). `PartitionedCallOp` + // invokes functions with `allow_dead_tensors = true` to reset dead flag, + // and return default initialized tensors instead of a dead tensors. + // TODO(ezhulenev): Do the liveness analysis and add + // `IdentitytWithResurrection` nodes after all potentially dead output + // tensors? + if (absl::c_any_of(func.node_def(), IsSwitch)) { + return errors::FailedPrecondition( + "Can't inline function with `Switch` nodes in the function body: ", + SummarizeNodeDef(func_node)); + } + return Status::OK(); } @@ -1339,11 +1353,6 @@ Status InlineIndirectFunctionCall(const NodeDef& func_node, const string prefix = strings::StrCat(func_node.name(), "/"); - // Keep track of side-effectful ops inside function body. Each outgoing - // control edge from the function call node, must be replaced with control - // edges from inlined side-effectful ops. - std::vector side_effectful_nodes; - // ------------------------------------------------------------------------ // // First we need to assign device placements to all function body nodes. 
@@ -1427,31 +1436,54 @@ Status InlineIndirectFunctionCall(const NodeDef& func_node, TF_RETURN_IF_ERROR( AddPrefixAndSuffixToNode(prefix, /*suffix=*/"", &func_body_node)); - // If the function body has a side-effectful op, we double check that the - // function call node has an output control edge, otherwise we can't safely - // do inlining and guarantee that node will be executed. - // TODO(ezhulenev): If we don't have `happens_after` dependencies does it - // mean that no one is interested in observing side effects and we can - // safely inline it? + // After inlining into the optimized graph, NodeDef must have all attributes + // defined, which is not required for a node in a FunctionDef. + const OpDef* op_def; + TF_RETURN_IF_ERROR( + ctx->function_library().LookUpOpDef(func_body_node.op(), &op_def)); + AddDefaultsToNodeDef(*op_def, &func_body_node); + } + + // Construct a graph view for the preprocessed function body graph. + GraphView placed_graph_view(&placed_graph_def); + + // Keep track of side-effectful ops inside function body. Each outgoing + // control edge from the function call node, must be replaced with control + // edges from inlined side-effectful ops. + std::vector side_effectful_nodes; + + // We have to make sure that all side-effectful nodes inside a function body + // will be executed after function inlining. + for (NodeDef& func_body_node : *placed_graph_def.mutable_node()) { if (!IsFreeOfSideEffect(func_body_node, &ctx->function_library())) { - DCHECK(!happens_after.empty()); - if (happens_after.empty()) { - // NOTE: If this happens file a bug to TF.Eager team. + int num_fanouts = placed_graph_view.NumFanouts( + func_body_node, /*include_controlling_nodes=*/true); + + // If the node doesn't have any outgoing edges and we do not have any + // nodes in the `happens_after` set, we can't inline a function and + // guarantee that side-effects will be executed. 
The only exception if we + // do function library optimization, and the GrapplerItem was constructed + // for the function body, because functions have strict semantics. + + if (num_fanouts == 0 && happens_after.empty() && + !ctx->allowed_optimizations().inline_ops_with_side_effects) { return errors::Internal( - "Can't inline a function with a stateful op and empty output " - "control edge set. Function body node: ", + "Can't inline a function with a side-effectful op with empty " + "fanouts and empty output control edge set. Function body node: ", SummarizeNodeDef(func_body_node)); } side_effectful_nodes.push_back(func_body_node.name()); } + } - // TODO(ezhulenev): Inline nested indirect function calls. - - // Move the node to the optimized graph. + // Move all the nodes to the optimized graph after successful preprocessing. + for (NodeDef& func_body_node : *placed_graph_def.mutable_node()) { optimized_graph->add_node()->Swap(&func_body_node); } + // TODO(ezhulenev): Inline nested indirect function calls. + // Indirect function call is fully inlined into the optimized graph, and we do // not copy the original function call node, so we have to setup tensor // mapping from old output tensors, to the outputs of inlined nodes. @@ -1469,6 +1501,8 @@ Status InlineIndirectFunctionCall(const NodeDef& func_node, // call node to all side-effectful ops inside function body. 
ctx->AddControlOverrides(func_node, side_effectful_nodes); + VLOG(3) << "Successfully inlined indirect function call: " + << SummarizeNodeDef(func_node); return Status::OK(); } @@ -1552,8 +1586,7 @@ Status FunctionOptimizer::Optimize(Cluster*, const GrapplerItem& item, node, *func, graph_def_version, ctx, optimized_graph)); continue; } else { - VLOG(2) << "Can't inline direct function call: " - << inlinable.error_message(); + VLOG(2) << inlinable.error_message(); } } @@ -1565,8 +1598,7 @@ Status FunctionOptimizer::Optimize(Cluster*, const GrapplerItem& item, node, *func, graph_def_version, &ctx, optimized_graph)); continue; } else { - VLOG(2) << "Can't inline indirect function call: " - << inlinable.error_message(); + VLOG(2) << inlinable.error_message(); } } diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc index 5560e8d55f..7c83036341 100644 --- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc @@ -533,6 +533,10 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item, VLOG(3) << added_devices.error_message(); } + // We can safely inline nested function calls with side-effectful ops into + // the function body (see function_optimizer.cc for details). + func_item.allowed_optimizations().inline_ops_with_side_effects = true; + // Optimize function body graph. GraphDef optimized_func_graph; TF_RETURN_IF_ERROR( diff --git a/tensorflow/core/kernels/partitioned_function_ops.cc b/tensorflow/core/kernels/partitioned_function_ops.cc index 6c90ffd75e..30a8be141c 100644 --- a/tensorflow/core/kernels/partitioned_function_ops.cc +++ b/tensorflow/core/kernels/partitioned_function_ops.cc @@ -531,6 +531,12 @@ class PartitionedCallOp : public AsyncOpKernel { tensorflow::grappler::GrapplerItem item; + // Add all available devices so that inlined function can be placed. 
+ for (const Device* d : device_set.devices()) { + Status added_device = item.AddDevice(d->name()); + if (!added_device.ok()) VLOG(3) << added_device.error_message(); + } + // Add fetches so that the graph can be pruned. for (Node* node : ret_nodes) { item.fetch.push_back(node->name()); -- GitLab From 2fa9a541a96131a83cd9c8f2898c43e2d48ea00c Mon Sep 17 00:00:00 2001 From: Dan Moldovan Date: Tue, 4 Dec 2018 10:13:55 -0800 Subject: [PATCH 097/405] Avoid aliasing variables that are not live upon entry of conditional block. This avoids incorrectly aliasing variables that may be defined because of loop effects in the CFG. PiperOrigin-RevId: 223999403 --- .../autograph/converters/control_flow.py | 42 ++++++++++++++++--- .../pyct/static_analysis/liveness.py | 14 +++++++ 2 files changed, 51 insertions(+), 5 deletions(-) diff --git a/tensorflow/python/autograph/converters/control_flow.py b/tensorflow/python/autograph/converters/control_flow.py index 5853e044c5..bef6cae1bb 100644 --- a/tensorflow/python/autograph/converters/control_flow.py +++ b/tensorflow/python/autograph/converters/control_flow.py @@ -106,14 +106,49 @@ class ControlFlowTransformer(converter.Base): return 'no variables' return ', '.join(map(str, symbol_set)) - def visit_If(self, node): - node = self.generic_visit(node) + def _determine_aliased_symbols(self, scope, node_defined_in, block): + if block: + block_live_in = set(anno.getanno(block[0], anno.Static.LIVE_VARS_IN)) + else: + block_live_in = set() + # For the purpose of aliasing, composite symbols with live owners are live + # as well. Otherwise this would leak tensors from the conditional's body. + # + # For example: + # + # obj = some_obj + # if cond: + # obj.a = val + # + # Thanslating to the code below would be incorrect: + # + # def true_fn(): + # obj.a = val() # Wrong! 
leaks ops owned by true_fn + # return obj.a + for s in scope.modified: + if s.is_composite(): + live_parents = block_live_in & s.owner_set + if live_parents: + block_live_in.add(s) + return scope.modified & node_defined_in & block_live_in + + def visit_If(self, node): body_scope = anno.getanno(node, annos.NodeAnno.BODY_SCOPE) orelse_scope = anno.getanno(node, annos.NodeAnno.ORELSE_SCOPE) defined_in = anno.getanno(node, anno.Static.DEFINED_VARS_IN) live_out = anno.getanno(node, anno.Static.LIVE_VARS_OUT) + # Note: this information needs to be extracted before the body conversion + # that happens in the call to generic_visit below, because the conversion + # generates nodes that lack static analysis annotations. + need_alias_in_body = self._determine_aliased_symbols( + body_scope, defined_in, node.body) + need_alias_in_orelse = self._determine_aliased_symbols( + orelse_scope, defined_in, node.orelse) + + node = self.generic_visit(node) + modified_in_cond = body_scope.modified | orelse_scope.modified returned_from_cond = set() for s in modified_in_cond: @@ -125,9 +160,6 @@ class ControlFlowTransformer(converter.Base): if live_out & s.owner_set: returned_from_cond.add(s) - need_alias_in_body = body_scope.modified & defined_in - need_alias_in_orelse = orelse_scope.modified & defined_in - created_in_body = body_scope.modified & returned_from_cond - defined_in created_in_orelse = orelse_scope.modified & returned_from_cond - defined_in diff --git a/tensorflow/python/autograph/pyct/static_analysis/liveness.py b/tensorflow/python/autograph/pyct/static_analysis/liveness.py index 451398f1b7..f8b8d7fa77 100644 --- a/tensorflow/python/autograph/pyct/static_analysis/liveness.py +++ b/tensorflow/python/autograph/pyct/static_analysis/liveness.py @@ -161,6 +161,16 @@ class Annotator(transformer.Base): self.cross_function_analyzer = cross_function_analyzer self.current_analyzer = None + def visit(self, node): + node = super(Annotator, self).visit(node) + if (self.current_analyzer is 
not None and + isinstance(node, gast.stmt) and + node in self.current_analyzer.graph.index): + cfg_node = self.current_analyzer.graph.index[node] + anno.setanno(node, anno.Static.LIVE_VARS_IN, + frozenset(self.current_analyzer.in_[cfg_node])) + return node + def visit_FunctionDef(self, node): parent_analyzer = self.current_analyzer self.current_analyzer = self.cross_function_analyzer.analyzers[node] @@ -198,6 +208,10 @@ class Annotator(transformer.Base): node = self._block_statement_live_out(node) return self._block_statement_live_in(node, node.test) + def visit_With(self, node): + node = self.generic_visit(node) + return self._block_statement_live_in(node, node.items[0]) + def visit_Expr(self, node): node = self.generic_visit(node) cfg_node = self.current_analyzer.graph.index[node] -- GitLab From 93b27b079e46056056885c7890ac857b193f4921 Mon Sep 17 00:00:00 2001 From: Zhenyu Tan Date: Tue, 4 Dec 2018 10:35:50 -0800 Subject: [PATCH 098/405] Implement the Keras version of Momentum optimization algorithm Op. 
PiperOrigin-RevId: 224003676 --- .../api_def_ResourceApplyKerasMomentum.pbtxt | 56 +++++ ...def_ResourceSparseApplyKerasMomentum.pbtxt | 64 +++++ .../api_def_ResourceApplyKerasMomentum.pbtxt | 4 + ...def_ResourceSparseApplyKerasMomentum.pbtxt | 4 + tensorflow/core/kernels/training_ops.cc | 227 ++++++++++++++++++ tensorflow/core/kernels/training_ops.h | 9 + .../core/kernels/training_ops_gpu.cu.cc | 25 ++ tensorflow/core/kernels/training_ops_test.cc | 34 +++ tensorflow/core/ops/training_ops.cc | 28 +++ .../keras/optimizer_v2/gradient_descent.py | 4 +- .../optimizer_v2/gradient_descent_test.py | 45 ++-- .../keras/optimizer_v2/optimizer_v2_test.py | 45 +++- 12 files changed, 519 insertions(+), 26 deletions(-) create mode 100644 tensorflow/core/api_def/base_api/api_def_ResourceApplyKerasMomentum.pbtxt create mode 100644 tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyKerasMomentum.pbtxt create mode 100644 tensorflow/core/api_def/python_api/api_def_ResourceApplyKerasMomentum.pbtxt create mode 100644 tensorflow/core/api_def/python_api/api_def_ResourceSparseApplyKerasMomentum.pbtxt diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceApplyKerasMomentum.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceApplyKerasMomentum.pbtxt new file mode 100644 index 0000000000..830391a32b --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_ResourceApplyKerasMomentum.pbtxt @@ -0,0 +1,56 @@ +op { + graph_op_name: "ResourceApplyKerasMomentum" + in_arg { + name: "var" + description: <