diff --git a/include/layers/BinaryOpLayer.hpp b/include/layers/BinaryOpLayer.hpp
index 06bed60c..629373a6 100644
--- a/include/layers/BinaryOpLayer.hpp
+++ b/include/layers/BinaryOpLayer.hpp
@@ -19,6 +19,8 @@ class BinaryOpLayer : public Layer {
 
   void run(const std::vector<Tensor>& input,
            std::vector<Tensor>& output) override;
+  void run(const std::vector<Tensor>& input, std::vector<Tensor>& output,
+           const RuntimeOptions& options) override;
 
   static bool is_scalar_tensor(const Tensor& t);
 #ifdef ENABLE_STATISTIC_WEIGHTS
@@ -30,6 +32,7 @@ class BinaryOpLayer : public Layer {
 
 private:
   Operation op_;
+  ParBackend parallel_backend_ = ParBackend::kSeq;
 
   template <typename ValueType>
   void run_with_scalar_impl(const Tensor& input, ValueType scalar,
diff --git a/src/layers/BinaryOpLayer.cpp b/src/layers/BinaryOpLayer.cpp
index b4ad6d62..822bc334 100644
--- a/src/layers/BinaryOpLayer.cpp
+++ b/src/layers/BinaryOpLayer.cpp
@@ -80,6 +80,13 @@ void BinaryOpLayer::run(const std::vector<Tensor>& input,
   }
 }
 
+void BinaryOpLayer::run(const std::vector<Tensor>& input,
+                        std::vector<Tensor>& output,
+                        const RuntimeOptions& options) {
+  parallel_backend_ = options.par_backend;
+  run(input, output);
+}
+
 void BinaryOpLayer::run_with_scalar(const Tensor& input, float scalar,
                                     Tensor& output) const {
   switch (input.get_type()) {
@@ -101,12 +108,17 @@ template <typename ValueType>
 void BinaryOpLayer::run_with_scalar_impl(const Tensor& input, ValueType scalar,
                                          Tensor& output) const {
   const auto& input_data = *input.as<ValueType>();
-  std::vector<ValueType> result;
-  result.reserve(input_data.size());
+  std::vector<ValueType> result(input_data.size());
 
-  for (const auto& val : input_data) {
-    result.push_back(apply_binary_op(val, scalar, op_));
-  }
+  parallel::Options options;
+  options.backend = parallel_backend_;
+
+  parallel::parallel_for(
+      input_data.size(),
+      [&](size_t i) {
+        result[i] = apply_binary_op(input_data[i], scalar, op_);
+      },
+      options);
 
   output = make_tensor(result, input.get_shape());
 }
@@ -122,13 +134,19 @@ void BinaryOpLayer::run_broadcast_impl(const Tensor& A, const Tensor& B,
   const auto strides_b = get_strides(B.get_shape());
   const auto strides_output = get_strides(output_shape);
 
-  for (size_t i = 0; i < result.size(); ++i) {
-    size_t a_idx = get_broadcasted_index(i, A.get_shape(), output_shape,
-                                         strides_a, strides_output);
-    size_t b_idx = get_broadcasted_index(i, B.get_shape(), output_shape,
-                                         strides_b, strides_output);
-    result[i] = apply_binary_op(a_data[a_idx], b_data[b_idx], op_);
-  }
+  parallel::Options options;
+  options.backend = parallel_backend_;
+
+  parallel::parallel_for(
+      result.size(),
+      [&](size_t i) {
+        size_t a_idx = get_broadcasted_index(i, A.get_shape(), output_shape,
+                                             strides_a, strides_output);
+        size_t b_idx = get_broadcasted_index(i, B.get_shape(), output_shape,
+                                             strides_b, strides_output);
+        result[i] = apply_binary_op(a_data[a_idx], b_data[b_idx], op_);
+      },
+      options);
 
   output = make_tensor(result, output_shape);
 }
diff --git a/test/single_layer_parall_version/test_binaryoplayer_parall.cpp b/test/single_layer_parall_version/test_binaryoplayer_parall.cpp
new file mode 100644
index 00000000..a4632d2d
--- /dev/null
+++ b/test/single_layer_parall_version/test_binaryoplayer_parall.cpp
@@ -0,0 +1,177 @@
+#include <chrono>
+#include <cstddef>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "layers/BinaryOpLayer.hpp"
+
+#define ENABLE_TIMING_OUTPUT 1
+
+#if ENABLE_TIMING_OUTPUT
+#define PRINT_TIMING(msg) std::cout << msg << std::endl
+#else
+#define PRINT_TIMING(msg) ((void)0)
+#endif
+
+using namespace it_lab_ai;
+
+static void ExpectTensorsNear(const Tensor& a, const Tensor& b,
+                              float tolerance = 1e-5f) {
+  ASSERT_EQ(a.get_shape(), b.get_shape());
+  ASSERT_EQ(a.get_type(), b.get_type());
+
+  if (a.get_type() == Type::kFloat) {
+    auto data_a = *a.as<float>();
+    auto data_b = *b.as<float>();
+    ASSERT_EQ(data_a.size(), data_b.size());
+    for (size_t i = 0; i < data_a.size(); ++i) {
+      EXPECT_NEAR(data_a[i], data_b[i], tolerance) << "Mismatch at index " << i;
+    }
+  } else if (a.get_type() == Type::kInt) {
+    auto data_a = *a.as<int>();
+    auto data_b = *b.as<int>();
+    ASSERT_EQ(data_a.size(), data_b.size());
+    for (size_t i = 0; i < data_a.size(); ++i) {
+      EXPECT_EQ(data_a[i], data_b[i]) << "Mismatch at index " << i;
+    }
+  }
+}
+
+static Tensor RunBinary(BinaryOpLayer& layer, const Tensor& a, const Tensor& b,
+                        ParBackend backend, long long* duration_ms = nullptr) {
+  RuntimeOptions options;
+  options.par_backend = backend;
+
+  Tensor output;
+  std::vector<Tensor> in{a, b};
+  std::vector<Tensor> out{output};
+
+  auto start = std::chrono::high_resolution_clock::now();
+  layer.run(in, out, options);
+  auto end = std::chrono::high_resolution_clock::now();
+
+  if (duration_ms != nullptr) {
+    *duration_ms =
+        std::chrono::duration_cast<std::chrono::milliseconds>(end - start)
+            .count();
+  }
+
+  return out[0];
+}
+
+static void RunAllBackendsAndCompare(BinaryOpLayer& layer, const Tensor& a,
+                                     const Tensor& b, const std::string& label,
+                                     float tolerance = 1e-5f) {
+  std::vector<ParBackend> backends = {ParBackend::kSeq, ParBackend::kThreads,
+                                      ParBackend::kTbb, ParBackend::kOmp,
+                                      ParBackend::kKokkos};
+
+  Tensor baseline = RunBinary(layer, a, b, ParBackend::kSeq);
+
+  for (auto backend : backends) {
+    long long ms = 0;
+    Tensor result = RunBinary(layer, a, b, backend, &ms);
+
+    PRINT_TIMING("BinaryOp " << label << " Backend "
+                             << static_cast<int>(backend) << " time: " << ms
+                             << " ms");
+
+    ExpectTensorsNear(baseline, result, tolerance);
+  }
+}
+
+TEST(binaryoplayer_parall, parallel_add_basic_float) {
+  Tensor a = make_tensor<float>({1.f, 2.f, 3.f, 4.f}, {2, 2});
+  Tensor b = make_tensor<float>({5.f, 6.f, 7.f, 8.f}, {2, 2});
+
+  BinaryOpLayer layer(BinaryOpLayer::Operation::kAdd);
+  RunAllBackendsAndCompare(layer, a, b, "add_basic_float");
+}
+
+TEST(binaryoplayer_parall, parallel_mul_basic_int) {
+  Tensor a = make_tensor<int>({1, 2, 3, 4}, {2, 2});
+  Tensor b = make_tensor<int>({2, 3, 4, 5}, {2, 2});
+
+  BinaryOpLayer layer(BinaryOpLayer::Operation::kMul);
+  RunAllBackendsAndCompare(layer, a, b, "mul_basic_int", 0.0f);
+}
+
+TEST(binaryoplayer_parall, parallel_sub_scalar_float) {
+  Shape shape({1024, 1024});
+  Tensor a = make_tensor(std::vector<float>(shape.count(), 5.0f), shape);
+  Tensor scalar = make_tensor<float>({2.0f});
+
+  BinaryOpLayer layer(BinaryOpLayer::Operation::kSub);
+  RunAllBackendsAndCompare(layer, a, scalar, "sub_scalar_float");
+}
+
+TEST(binaryoplayer_parall, parallel_div_scalar_float) {
+  Shape shape({1024, 1024});
+  Tensor a = make_tensor(std::vector<float>(shape.count(), 8.0f), shape);
+  Tensor scalar = make_tensor<float>({2.0f});
+
+  BinaryOpLayer layer(BinaryOpLayer::Operation::kDiv);
+  RunAllBackendsAndCompare(layer, a, scalar, "div_scalar_float");
+}
+
+TEST(binaryoplayer_parall, parallel_broadcast_2d_add) {
+  Tensor a = make_tensor(std::vector<float>(1024 * 1, 3.0f), {1024, 1});
+  Tensor b = make_tensor(std::vector<float>(1 * 1024, 4.0f), {1, 1024});
+
+  BinaryOpLayer layer(BinaryOpLayer::Operation::kAdd);
+  RunAllBackendsAndCompare(layer, a, b, "broadcast_2d_add");
+}
+
+TEST(binaryoplayer_parall, parallel_broadcast_3d_mul) {
+  Tensor a =
+      make_tensor(std::vector<float>(64 * 1 * 512, 1.5f), {64, 1, 512});
+  Tensor b =
+      make_tensor(std::vector<float>(64 * 512 * 1, 2.0f), {64, 512, 1});
+
+  BinaryOpLayer layer(BinaryOpLayer::Operation::kMul);
+  RunAllBackendsAndCompare(layer, a, b, "broadcast_3d_mul");
+}
+
+TEST(binaryoplayer_parall, parallel_large_add_same_shape) {
+  Shape shape({2048, 2048});
+  Tensor a = make_tensor(std::vector<float>(shape.count(), 1.0f), shape);
+  Tensor b = make_tensor(std::vector<float>(shape.count(), 2.0f), shape);
+
+  BinaryOpLayer layer(BinaryOpLayer::Operation::kAdd);
+  RunAllBackendsAndCompare(layer, a, b, "large_add_same_shape");
+}
+
+TEST(binaryoplayer_parall, parallel_large_mul_same_shape) {
+  Shape shape({1024, 1024, 4});
+  Tensor a =
+      make_tensor(std::vector<float>(shape.count(), 1.25f), shape);
+  Tensor b = make_tensor(std::vector<float>(shape.count(), 2.0f), shape);
+
+  BinaryOpLayer layer(BinaryOpLayer::Operation::kMul);
+  RunAllBackendsAndCompare(layer, a, b, "large_mul_same_shape");
+}
+
+TEST(binaryoplayer_parall, parallel_large_broadcast_4d_add) {
+  Shape a_shape({16, 32, 128, 1});
+  Shape b_shape({1, 32, 1, 128});
+
+  Tensor a =
+      make_tensor(std::vector<float>(a_shape.count(), 1.0f), a_shape);
+  Tensor b =
+      make_tensor(std::vector<float>(b_shape.count(), 2.0f), b_shape);
+
+  BinaryOpLayer layer(BinaryOpLayer::Operation::kAdd);
+  RunAllBackendsAndCompare(layer, a, b, "large_broadcast_4d_add");
+}
+
+// TEST(binaryoplayer_parall, parallel_huge_timing_add) {
+//   Shape shape({128, 512, 512});
+//   Tensor a = make_tensor(std::vector<float>(shape.count(), 1.0f),
+//   shape); Tensor b =
+//       make_tensor(std::vector<float>(shape.count(), 2.0f), shape);
+
+//   BinaryOpLayer layer(BinaryOpLayer::Operation::kAdd);
+//   RunAllBackendsAndCompare(layer, a, b, "huge_timing_add");
+// }