Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dev/asolovev add comm profiling #2957

Draft
wants to merge 9 commits into
base: main
Choose a base branch
from
80 changes: 61 additions & 19 deletions cpp/daal/src/externals/service_profiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,50 +16,92 @@
*******************************************************************************/

#include "src/externals/service_profiler.h"
#include <iostream>

namespace daal
{
namespace internal
{
#ifdef ONEDAL_KERNEL_PROFILER

ProfilerTask::ProfilerTask(const char * taskName) : _taskName(taskName)
bool is_verbose_enabled()
{
_handle = __itt_string_handle_create(taskName);

__itt_task_begin(Profiler::getDomain(), __itt_null, __itt_null, _handle);
const char * env_var = std::getenv("ONEDAL_VERBOSE");
return env_var && std::string(env_var) == "1";
}

ProfilerTask::~ProfilerTask()
profiler::profiler()
{
Profiler::endTask(_taskName);
start_time = get_time();
}

ProfilerTask Profiler::startTask(const char * taskName)
profiler::~profiler()
{
return ProfilerTask(taskName);
auto end_time = get_time();
auto total_time = end_time - start_time;
if (is_verbose_enabled())
{
std::cerr << "DAAL_KERNEL_PROFILER: total time " << total_time / 1e6 << std::endl;
}
}

void Profiler::endTask(const char * taskName)
std::uint64_t profiler::get_time()
{
__itt_task_end(Profiler::getDomain());
struct timespec t;
clock_gettime(CLOCK_MONOTONIC, &t);
return t.tv_sec * 1000000000 + t.tv_nsec;
}

#else
ProfilerTask Profiler::startTask(const char * taskName)
profiler * profiler::get_instance()
{
return ProfilerTask(taskName);
static profiler instance;
return &instance;
}

void Profiler::endTask(const char * taskName) {}

ProfilerTask::ProfilerTask(const char * taskName) : _taskName(taskName) {}
task & profiler::get_task()
{
return task_;
}

ProfilerTask::~ProfilerTask()
profiler_task profiler::start_task(const char * task_name)
{
Profiler::endTask(_taskName);
auto ns_start = get_time();
auto & tasks_info = get_instance()->get_task();
tasks_info.time_kernels[tasks_info.current_kernel] = ns_start;
tasks_info.current_kernel++;
return profiler_task(task_name);
}

void profiler::end_task(const char * task_name)
{
const std::uint64_t ns_end = get_time();
auto & tasks_info = get_instance()->get_task();
#ifdef ONEDAL_DATA_PARALLEL
auto & queue = get_instance()->get_queue();
queue.wait_and_throw();
#endif
tasks_info.current_kernel--;
const std::uint64_t times = ns_end - tasks_info.time_kernels[tasks_info.current_kernel];

auto it = tasks_info.kernels.find(task_name);
if (it == tasks_info.kernels.end())
{
tasks_info.kernels.insert({ task_name, times });
}
else
{
it->second += times;
}
if (is_verbose_enabled())
{
std::cerr << "DAAL_KERNEL_PROFILER: " << std::string(task_name) << " " << times / 1e6 << std::endl;
}
}

profiler_task::profiler_task(const char * task_name) : task_name_(task_name) {}

profiler_task::~profiler_task()
{
profiler::end_task(task_name_);
}
} // namespace internal
} // namespace daal
114 changes: 38 additions & 76 deletions cpp/daal/src/externals/service_profiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,102 +21,64 @@
//--
*/

#ifndef __SERVICE_PROFILER_H__
#define __SERVICE_PROFILER_H__
#include <sys/time.h>
#include <time.h>
#include <cstdint>
#include <cstring>
#include <map>
#include <vector>

#ifdef ONEDAL_KERNEL_PROFILER
/* Here if oneDAL kernel profiling is enabled in the build */
#include <ittnotify.h>
#endif
#ifndef __SERVICE_PROFILER_H__
#define __SERVICE_PROFILER_H__

#define DAAL_ITTNOTIFY_CONCAT2(x, y) x##y
#define DAAL_ITTNOTIFY_CONCAT(x, y) DAAL_ITTNOTIFY_CONCAT2(x, y)
#define DAAL_ITTNOTIFY_CONCAT2(x, y) x##y
#define DAAL_ITTNOTIFY_CONCAT(x, y) DAAL_ITTNOTIFY_CONCAT2(x, y)

#define DAAL_ITTNOTIFY_UNIQUE_ID __LINE__
#define DAAL_ITTNOTIFY_UNIQUE_ID __LINE__

#define DAAL_ITTNOTIFY_SCOPED_TASK(name) \
daal::internal::ProfilerTask DAAL_ITTNOTIFY_CONCAT(__profiler_task__, DAAL_ITTNOTIFY_UNIQUE_ID) = daal::internal::Profiler::startTask(#name);
#define DAAL_ITTNOTIFY_SCOPED_TASK(name) \
daal::internal::profiler_task DAAL_ITTNOTIFY_CONCAT(__profiler_taks__, DAAL_ITTNOTIFY_UNIQUE_ID) = \
daal::internal::profiler::start_task(#name);

namespace daal
{
namespace internal
{
/**
* Defines a logical unit of work to be tracked by performance profilier.
*/
class ProfilerTask

struct task
{
static const std::uint64_t MAX_KERNELS = 256;
std::map<const char *, std::uint64_t> kernels;
std::uint64_t current_kernel = 0;
std::uint64_t time_kernels[MAX_KERNELS];
void clear();
};

class profiler_task
{
public:
/**
* Constructs a task with a given name.
* \param[in] taskName Name of the task.
*/
ProfilerTask(const char * taskName);
~ProfilerTask();
profiler_task(const char * task_name);
~profiler_task();

private:
const char * _taskName;
#ifdef ONEDAL_KERNEL_PROFILER
/* Here if oneDAL kernel profiling is enabled */
__itt_string_handle * _handle; /* The task string handle */
__itt_domain * _domain; /* Pointer to the domain of the task */
#endif
const char * task_name_;
};

/**
* Global performance profiler.
*
* By default this class is a stub in the library and its redefinition will be in C++ Bechmarks.
* If oneDAL kernel profiling is enabled, the profiler uses Task API from <ittnotify.h>
*/
class Profiler
class profiler
{
public:
/**
* Start the task to be profiled.
* \param[in] taskName Name of the task.
*/
static ProfilerTask startTask(const char * taskName);

/**
* Start the task to profile.
* \param[in] taskName Name of the task.
*/
static void endTask(const char * taskName);

#ifdef ONEDAL_KERNEL_PROFILER
/* Here if oneDAL kernel profiling is enabled */

/**
* Get pointer to a global profiler state.
* \return Pointer to a global profiler state.
*/
static Profiler * getInstance()
{
static Profiler instance;
return &instance;
}
profiler();
~profiler();
static profiler_task start_task(const char * task_name);
static std::uint64_t get_time();
static profiler * get_instance();
task & get_task();

/**
* Get pointer to the ITT domain associated with the profiler.
* \return Pointer to the ITT domain.
*/
static __itt_domain * getDomain()
{
return (getInstance())->_domain;
}
static void end_task(const char * task_name);

private:
/**
* Construct the profiler.
*/
Profiler()
{
_domain = __itt_domain_create("oneDAL");
}
~Profiler() {}
__itt_domain * _domain; /* Pointer to the ITT domain */
#endif
std::uint64_t start_time;
task task_;
};

} // namespace internal
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,13 @@
#include "oneapi/dal/backend/primitives/reduction.hpp"
#include "oneapi/dal/backend/primitives/stat.hpp"
#include "oneapi/dal/backend/primitives/blas.hpp"
#include <iostream>
#include "oneapi/dal/backend/common.hpp"
#include "oneapi/dal/detail/cpu_info_impl.hpp"
#include "oneapi/dal/detail/error_messages.hpp"
#include "oneapi/dal/detail/parameters/system_parameters_impl.hpp"
#include <daal/src/services/service_defines.h>
#include <daal/include/services/internal/daal_kernel_defines.h>

#ifdef ONEDAL_DATA_PARALLEL

Expand All @@ -48,6 +55,11 @@ template <typename Float>
result_t compute_kernel_dense_impl<Float>::operator()(const descriptor_t& desc,
const parameters_t& params,
const input_t& input) {
using daal::services::Environment;
Environment* env = Environment::getInstance();
std::cerr << "number of threads = " << static_cast<std::uint32_t>(env->getNumberOfThreads())
<< std::endl;
ONEDAL_PROFILER_TASK(compute_covariance_kernel_dense);
ONEDAL_ASSERT(input.get_data().has_data());

const auto data = input.get_data();
Expand All @@ -62,9 +74,13 @@ result_t compute_kernel_dense_impl<Float>::operator()(const descriptor_t& desc,
auto assume_centered = desc.get_assume_centered();

auto result = compute_result<task_t>{}.set_result_options(desc.get_result_options());

const auto data_nd = pr::table2ndarray<Float>(q_, data, alloc::device);

{
ONEDAL_PROFILER_TASK(allreduce_rows_count_global);
comm_.allreduce(rows_count_global, spmd::reduce_op::sum).wait();
}

auto [sums, sums_event] = compute_sums(q_, data_nd, assume_centered, {});

{
Expand All @@ -85,11 +101,6 @@ result_t compute_kernel_dense_impl<Float>::operator()(const descriptor_t& desc,
comm_.allreduce(xtx.flatten(q_, { gemm_event }), spmd::reduce_op::sum).wait();
}

{
ONEDAL_PROFILER_TASK(allreduce_rows_count_global);
comm_.allreduce(rows_count_global, spmd::reduce_op::sum).wait();
}

if (desc.get_result_options().test(result_options::cov_matrix)) {
auto [cov, cov_event] = compute_covariance(q_,
rows_count_global,
Expand All @@ -98,28 +109,41 @@ result_t compute_kernel_dense_impl<Float>::operator()(const descriptor_t& desc,
bias,
assume_centered,
{ gemm_event });
result.set_cov_matrix(
(homogen_table::wrap(cov.flatten(q_, { cov_event }), column_count, column_count)));
{
ONEDAL_PROFILER_TASK(cov_flatten, q_);
result.set_cov_matrix(
(homogen_table::wrap(cov.flatten(q_, { cov_event }), column_count, column_count)));
}
}
if (desc.get_result_options().test(result_options::cor_matrix)) {
auto [corr, corr_event] =
compute_correlation(q_, rows_count_global, xtx, sums, { gemm_event });
result.set_cor_matrix(
(homogen_table::wrap(corr.flatten(q_, { corr_event }), column_count, column_count)));
{
ONEDAL_PROFILER_TASK(corr_flatten, q_);
result.set_cor_matrix((
homogen_table::wrap(corr.flatten(q_, { corr_event }), column_count, column_count)));
}
}
if (desc.get_result_options().test(result_options::means)) {
if (!assume_centered) {
auto [means, means_event] = compute_means(q_, sums, rows_count_global, { gemm_event });
result.set_means(
homogen_table::wrap(means.flatten(q_, { means_event }), 1, column_count));
{
ONEDAL_PROFILER_TASK(means_flatten, q_);
result.set_means(
homogen_table::wrap(means.flatten(q_, { means_event }), 1, column_count));
}
}
else {
auto [zero_means, zeros_event] =
pr::ndarray<Float, 1>::zeros(q_, { column_count }, sycl::usm::alloc::device);
result.set_means(
homogen_table::wrap(zero_means.flatten(q_, { zeros_event }), 1, column_count));
{
ONEDAL_PROFILER_TASK(zero_means_flatten, q_);
result.set_means(
homogen_table::wrap(zero_means.flatten(q_, { zeros_event }), 1, column_count));
}
}
}

return result;
}

Expand Down
4 changes: 4 additions & 0 deletions cpp/oneapi/dal/backend/communicator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,8 @@ class fake_spmd_communicator_host_impl : public spmd::communicator_iface_base {

void barrier() override {}

// void is_init(int * flag) override {}

request_t* bcast(byte_t* send_buf,
std::int64_t count,
const data_type& dtype,
Expand Down Expand Up @@ -167,6 +169,8 @@ class fake_spmd_communicator_device_impl : public spmd::communicator_iface {

void barrier() override {}

// void is_init(int * flag) override {}

request_t* bcast(byte_t* send_buf,
std::int64_t count,
const data_type& dtype,
Expand Down
4 changes: 4 additions & 0 deletions cpp/oneapi/dal/backend/communicator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,10 @@ class communicator {
public_comm_.barrier();
}

// void is_init(int * flag) const {
// public_comm_.is_init(flag);
// }

template <typename... Args>
communicator_event bcast(Args&&... args) const {
return public_comm_.bcast(std::forward<Args>(args)...);
Expand Down
Loading
Loading