这是indexloc提供的服务,不要输入任何密码
Skip to content

[determinism] Add softmax/cross-entropy op exceptions for GPU determinism #47925

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions tensorflow/core/kernels/sparse_xent_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ limitations under the License.
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/util/env_var.h"


namespace tensorflow {

Expand All @@ -47,6 +49,33 @@ Status CheckInvalidLabelIndex(const Tensor& labels, int64 max_index) {
return Status::OK();
}

namespace {

// Reports whether deterministic ops have been requested through the
// TF_DETERMINISTIC_OPS environment variable (false is passed as the default
// value). The variable is read only on the first call; the result is cached
// in a function-local static and returned on every subsequent call.
//
// TODO(duncanriach): Factor this into a shared utility library
bool RequireDeterminism() {
  static const bool kDeterminismRequired = []() -> bool {
    bool env_value = false;
    TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar(
        "TF_DETERMINISTIC_OPS", /*default_val=*/false, &env_value));
    return env_value;
  }();
  return kDeterminismRequired;
}

// Reports whether the determinism exception for this op has been switched off
// through the
// TF_DISABLE_SPARSE_SOFTMAX_XENT_WITH_LOGITS_OP_DETERMINISM_EXCEPTIONS
// environment variable (false is passed as the default value). The variable
// is read only on the first call and the result is cached afterwards.
bool DisableSparseSoftmaxXentWithLogitsOpDeterminismExceptions() {
  static const bool kExceptionsDisabled = []() -> bool {
    bool env_value = false;
    TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar(
        "TF_DISABLE_SPARSE_SOFTMAX_XENT_WITH_LOGITS_OP_DETERMINISM_EXCEPTIONS",
        /*default_val=*/false, &env_value));
    return env_value;
  }();
  return kExceptionsDisabled;
}

} // namespace

template <typename Device, typename T, typename Index>
class SparseSoftmaxXentWithLogitsOp : public OpKernel {
public:
Expand All @@ -73,6 +102,17 @@ class SparseSoftmaxXentWithLogitsOp : public OpKernel {
"Must have at least one class, but got logits shape ",
logits.shape().DebugString()));

if (std::is_same<Device, GPUDevice>::value) {
OP_REQUIRES(
context,
!RequireDeterminism() ||
DisableSparseSoftmaxXentWithLogitsOpDeterminismExceptions(),
errors::Unimplemented(
"Deterministic GPU implementation of"
" SparseSoftmaxCrossEntropyWithLogits not available."
));
}

Tensor scratch;
OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<T>::value,
labels.shape(), &scratch));
Expand Down
39 changes: 39 additions & 0 deletions tensorflow/core/kernels/xent_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,40 @@ limitations under the License.
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/kernels/xent_op.h"
#include "tensorflow/core/util/bcast.h"
#include "tensorflow/core/util/env_var.h"

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

namespace {

// Reports whether deterministic ops have been requested through the
// TF_DETERMINISTIC_OPS environment variable (false is passed as the default
// value). The variable is read only on the first call; the result is cached
// in a function-local static and returned on every subsequent call.
//
// TODO(duncanriach): Factor this into a shared utility library
bool RequireDeterminism() {
  static const bool kDeterminismRequired = []() -> bool {
    bool env_value = false;
    TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar(
        "TF_DETERMINISTIC_OPS", /*default_val=*/false, &env_value));
    return env_value;
  }();
  return kDeterminismRequired;
}

// Reports whether the determinism exception for this op has been switched off
// through the TF_DISABLE_SOFTMAX_XENT_WITH_LOGITS_OP_DETERMINISM_EXCEPTIONS
// environment variable (false is passed as the default value). The variable
// is read only on the first call and the result is cached afterwards.
bool DisableSoftmaxXentWithLogitsOpDeterminismExceptions() {
  static const bool kExceptionsDisabled = []() -> bool {
    bool env_value = false;
    TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar(
        "TF_DISABLE_SOFTMAX_XENT_WITH_LOGITS_OP_DETERMINISM_EXCEPTIONS",
        /*default_val=*/false, &env_value));
    return env_value;
  }();
  return kExceptionsDisabled;
}

} // namespace

template <typename Device, typename T>
class SoftmaxXentWithLogitsOp : public OpKernel {
public:
Expand Down Expand Up @@ -58,6 +86,17 @@ class SoftmaxXentWithLogitsOp : public OpKernel {
"2-dimensional, or broadcasted to be "
"2-dimensional"));

if (std::is_same<Device, GPUDevice>::value) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the CPU implementation deterministic?

Copy link
Contributor Author

@duncanriach duncanriach Mar 23, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We just confirmed that the CPU implementation is deterministic (thanks @wenscarl). I'm considering adding tests to prove/confirm/ensure that in a future PR.

OP_REQUIRES(
context,
!RequireDeterminism() ||
DisableSoftmaxXentWithLogitsOpDeterminismExceptions(),
errors::Unimplemented(
"Deterministic GPU implementation of"
" SoftmaxCrossEntropyWithLogits not available."
));
}

// loss is 1-D (one per example), and size is batch_size.

Tensor scratch;
Expand Down
28 changes: 28 additions & 0 deletions tensorflow/python/kernel_tests/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -2695,6 +2695,20 @@ cuda_py_test(
],
)

# Test target for sparse_xent_op_deterministic_test.py, declared with the
# cuda_py_test rule and with xla_enable_strict_auto_jit turned off.
# NOTE(review): the test source also imports test_util from
# tensorflow.python.framework; confirm that one of the deps below provides it.
cuda_py_test(
name = "sparse_xent_op_deterministic_test",
size = "small",
srcs = ["sparse_xent_op_deterministic_test.py"],
xla_enable_strict_auto_jit = False,
deps = [
"//tensorflow/python:client_testlib",
"//tensorflow/python:constant_op",
"//tensorflow/python:dtypes",
"//tensorflow/python:errors",
"//tensorflow/python:nn_ops",
],
)

cuda_py_test(
name = "sparse_xent_op_test",
size = "small",
Expand Down Expand Up @@ -2882,6 +2896,20 @@ cuda_py_test(
],
)

# Test target for xent_op_deterministic_test.py, declared with the
# cuda_py_test rule and with xla_enable_strict_auto_jit turned off.
# NOTE(review): the test source also imports test_util from
# tensorflow.python.framework; confirm that one of the deps below provides it.
cuda_py_test(
name = "xent_op_deterministic_test",
size = "small",
srcs = ["xent_op_deterministic_test.py"],
xla_enable_strict_auto_jit = False,
deps = [
"//tensorflow/python:client_testlib",
"//tensorflow/python:constant_op",
"//tensorflow/python:dtypes",
"//tensorflow/python:errors",
"//tensorflow/python:nn_ops",
],
)

cuda_py_test(
name = "xent_op_test",
size = "small",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,11 @@


class SegmentReductionDeterminismExceptionsTest(test.TestCase):
"""Test that tf.errors.UnimplementedError is thrown or not thrown, as appropriate, by the GPU code-paths for the segment reduction ops when determinsitic ops are enabled.
"""Test d9m-unimplemented exceptions from the segment reduction ops.

Test that tf.errors.UnimplementedError is thrown or not thrown, as
appropriate, by the GPU code-paths for segment reduction ops when
deterministic ops are enabled.

This test assumes that the base op test runs all the same test cases when
deterministic ops are not enabled and will therefore detect erroneous
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for deterministic functionality of SparseSoftmaxCrossEntropyWithLogits op."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os

from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import errors_impl
from tensorflow.python.framework import test_util
from tensorflow.python.ops import nn_ops
from tensorflow.python.platform import test


class SparseSoftmaxCrossEntropyWithLogitsDeterminismExceptionsTest(
    test.TestCase):
  """Checks the determinism-unimplemented error of the sparse xent op.

  With deterministic ops enabled, running
  SparseSoftmaxCrossEntropyWithLogits on the GPU is expected to raise
  tf.errors.UnimplementedError.

  The complementary case (no error when deterministic ops are not enabled) is
  not exercised here; this test assumes that the base op test runs the same
  test cases without deterministic ops enabled and would therefore catch an
  erroneously raised exception.
  """

  @test_util.run_cuda_only
  @test_util.run_in_graph_and_eager_modes
  def testExceptionThrowing(self):
    """Every logits/labels dtype pairing must raise UnimplementedError."""
    expected_message = ("Deterministic GPU implementation of "
                        "SparseSoftmaxCrossEntropyWithLogits not available.")
    # Logits dtype varies slowest, labels dtype fastest.
    dtype_combinations = [
        (logits_dtype, labels_dtype)
        for logits_dtype in (dtypes.float16, dtypes.float32)
        for labels_dtype in (dtypes.int32, dtypes.int64)
    ]
    with self.session(force_gpu=True):
      for logits_dtype, labels_dtype in dtype_combinations:
        labels = constant_op.constant([1, 0], dtype=labels_dtype)
        logits = constant_op.constant(
            [[0.3, 0.5], [0.2, 0.6]], dtype=logits_dtype)
        with self.assertRaisesRegex(errors_impl.UnimplementedError,
                                    expected_message):
          loss = nn_ops.sparse_softmax_cross_entropy_with_logits(
              labels=labels, logits=logits)
          self.evaluate(loss)


if __name__ == "__main__":
  # Request deterministic ops for the whole process before the tests start.
  #
  # Only the value "1" is exercised here; the effect of setting this
  # environment variable to 'true' is not tested. Unless a simpler pattern for
  # testing these environment variables is found, covering that would require
  # turning this file into a base and creating two more test files.
  os.environ["TF_DETERMINISTIC_OPS"] = "1"
  test.main()
65 changes: 65 additions & 0 deletions tensorflow/python/kernel_tests/xent_op_deterministic_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for deterministic functionality of SoftmaxCrossEntropyWithLogits op."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os

from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import errors_impl
from tensorflow.python.framework import test_util
from tensorflow.python.ops import nn_ops
from tensorflow.python.platform import test


class SoftmaxCrossEntropyWithLogitsDeterminismExceptionsTest(test.TestCase):
  """Checks the determinism-unimplemented error of the softmax xent op.

  With deterministic ops enabled, running SoftmaxCrossEntropyWithLogits on the
  GPU is expected to raise tf.errors.UnimplementedError.

  The complementary case (no error when deterministic ops are not enabled) is
  not exercised here; this test assumes that the base op test runs the same
  test cases without deterministic ops enabled and would therefore catch an
  erroneously raised exception.
  """

  @test_util.run_cuda_only
  @test_util.run_in_graph_and_eager_modes
  def testExceptionThrowing(self):
    """Every tested floating-point dtype must raise UnimplementedError."""
    expected_message = ("Deterministic GPU implementation of "
                        "SoftmaxCrossEntropyWithLogits not available.")
    float_dtypes = (dtypes.float16, dtypes.float32, dtypes.float64)
    with self.session(force_gpu=True):
      for dtype in float_dtypes:
        labels = constant_op.constant([[0.2, 0.4], [0.1, 0.2]], dtype=dtype)
        logits = constant_op.constant([[0.3, 0.5], [0.5, 0.6]], dtype=dtype)
        with self.assertRaisesRegex(errors_impl.UnimplementedError,
                                    expected_message):
          loss = nn_ops.softmax_cross_entropy_with_logits(
              labels=labels, logits=logits)
          self.evaluate(loss)


if __name__ == "__main__":
  # Request deterministic ops for the whole process before the tests start.
  #
  # Only the value "1" is exercised here; the effect of setting this
  # environment variable to 'true' is not tested. Unless a simpler pattern for
  # testing these environment variables is found, covering that would require
  # turning this file into a base and creating two more test files.
  os.environ["TF_DETERMINISTIC_OPS"] = "1"
  test.main()