
Commit a9fcf49

isaevil, pavelkumbrasev, aleksei-fedotov, and akukanov authored
Add reference implementation for parallel_phase feature (#1570)
Implement the parallel_phase API proposed in the RFC, add new entry points to the library, and move the RFC to the experimental stage.

Signed-off-by: pavelkumbrasev <[email protected]>
Signed-off-by: Isaev, Ilya <[email protected]>
Co-authored-by: pavelkumbrasev <[email protected]>
Co-authored-by: Aleksei Fedotov <[email protected]>
Co-authored-by: Alexey Kukanov <[email protected]>
1 parent 0f287a7 commit a9fcf49

20 files changed: +727 additions, -45 deletions
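Before diving into the diff, here is a minimal usage sketch of the user-facing part of this change, the `this_task_arena` free functions declared below; it assumes the preview macro is enabled (see the `_config.h` hunk that follows), and the loop bodies are placeholders:

```cpp
// Sketch only: names follow the declarations added in this commit.
#define TBB_PREVIEW_PARALLEL_PHASE 1
#include <oneapi/tbb/task_arena.h>
#include <oneapi/tbb/parallel_for.h>

void run_phases() {
    // Hint that a series of parallel algorithms is about to run in the current arena,
    // so that worker threads are worth retaining between them.
    tbb::this_task_arena::start_parallel_phase();
    tbb::parallel_for(0, 1000, [](int) { /* work */ });
    tbb::parallel_for(0, 1000, [](int) { /* more work */ });
    // End the phase; 'true' requests a one-time fast leave for the workers.
    tbb::this_task_arena::end_parallel_phase(/*with_fast_leave=*/true);
}
```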

include/oneapi/tbb/detail/_config.h

Lines changed: 5 additions & 1 deletion
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2005-2024 Intel Corporation
+    Copyright (c) 2005-2025 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -534,4 +534,8 @@
 #define __TBB_PREVIEW_TASK_GROUP_EXTENSIONS 1
 #endif
 
+#if TBB_PREVIEW_PARALLEL_PHASE || __TBB_BUILD
+#define __TBB_PREVIEW_PARALLEL_PHASE 1
+#endif
+
 #endif // __TBB_detail__config_H
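The gate above keeps the feature disabled by default. A minimal sketch of opting in from user code (equivalently, pass `-DTBB_PREVIEW_PARALLEL_PHASE=1` on the compiler command line):

```cpp
// Define the preview macro before any oneTBB include so that _config.h
// turns on __TBB_PREVIEW_PARALLEL_PHASE internally.
#define TBB_PREVIEW_PARALLEL_PHASE 1
#include <oneapi/tbb/task_arena.h>
```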

include/oneapi/tbb/task_arena.h

Lines changed: 152 additions & 21 deletions
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2005-2023 Intel Corporation
+    Copyright (c) 2005-2025 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -95,6 +95,11 @@ TBB_EXPORT void __TBB_EXPORTED_FUNC isolate_within_arena(d1::delegate_base& d, s
 TBB_EXPORT void __TBB_EXPORTED_FUNC enqueue(d1::task&, d1::task_arena_base*);
 TBB_EXPORT void __TBB_EXPORTED_FUNC enqueue(d1::task&, d1::task_group_context&, d1::task_arena_base*);
 TBB_EXPORT void __TBB_EXPORTED_FUNC submit(d1::task&, d1::task_group_context&, arena*, std::uintptr_t);
+
+#if __TBB_PREVIEW_PARALLEL_PHASE
+TBB_EXPORT void __TBB_EXPORTED_FUNC enter_parallel_phase(d1::task_arena_base*, std::uintptr_t);
+TBB_EXPORT void __TBB_EXPORTED_FUNC exit_parallel_phase(d1::task_arena_base*, std::uintptr_t);
+#endif
 } // namespace r1
 
 namespace d2 {
@@ -122,6 +127,14 @@ class task_arena_base {
         normal = 2 * priority_stride,
         high = 3 * priority_stride
     };
+
+#if __TBB_PREVIEW_PARALLEL_PHASE
+    enum class leave_policy : int {
+        automatic = 0,
+        fast = 1
+    };
+#endif
+
 #if __TBB_ARENA_BINDING
     using constraints = tbb::detail::d1::constraints;
 #endif /*__TBB_ARENA_BINDING*/
@@ -162,13 +175,36 @@ class task_arena_base {
         return (my_version_and_traits & core_type_support_flag) == core_type_support_flag ? my_max_threads_per_core : automatic;
     }
 
+#if __TBB_PREVIEW_PARALLEL_PHASE
+    leave_policy get_leave_policy() const {
+        return (my_version_and_traits & fast_leave_policy_flag) ? leave_policy::fast : leave_policy::automatic;
+    }
+
+    int leave_policy_trait(leave_policy lp) const {
+        return lp == leave_policy::fast ? fast_leave_policy_flag : 0;
+    }
+
+    void set_leave_policy(leave_policy lp) {
+        my_version_and_traits |= leave_policy_trait(lp);
+    }
+#endif
+
     enum {
-        default_flags = 0
-        , core_type_support_flag = 1
+        default_flags = 0,
+        core_type_support_flag = 1,
+        fast_leave_policy_flag = 1 << 1
     };
 
-    task_arena_base(int max_concurrency, unsigned reserved_for_masters, priority a_priority)
-        : my_version_and_traits(default_flags | core_type_support_flag)
+    task_arena_base(int max_concurrency, unsigned reserved_for_masters, priority a_priority
+#if __TBB_PREVIEW_PARALLEL_PHASE
+                    , leave_policy lp
+#endif
+    )
+        : my_version_and_traits(default_flags | core_type_support_flag
+#if __TBB_PREVIEW_PARALLEL_PHASE
+                                | leave_policy_trait(lp)
+#endif
+        )
         , my_initialization_state(do_once_state::uninitialized)
         , my_arena(nullptr)
        , my_max_concurrency(max_concurrency)
@@ -180,8 +216,16 @@ class task_arena_base {
     {}
 
 #if __TBB_ARENA_BINDING
-    task_arena_base(const constraints& constraints_, unsigned reserved_for_masters, priority a_priority)
-        : my_version_and_traits(default_flags | core_type_support_flag)
+    task_arena_base(const constraints& constraints_, unsigned reserved_for_masters, priority a_priority
+#if __TBB_PREVIEW_PARALLEL_PHASE
+                    , leave_policy lp
+#endif
+    )
+        : my_version_and_traits(default_flags | core_type_support_flag
+#if __TBB_PREVIEW_PARALLEL_PHASE
+                                | leave_policy_trait(lp)
+#endif
+        )
         , my_initialization_state(do_once_state::uninitialized)
         , my_arena(nullptr)
         , my_max_concurrency(constraints_.max_concurrency)
@@ -259,31 +303,58 @@ class task_arena : public task_arena_base {
      *       Value of 1 is default and reflects behavior of implicit arenas.
      **/
     task_arena(int max_concurrency_ = automatic, unsigned reserved_for_masters = 1,
-               priority a_priority = priority::normal)
-        : task_arena_base(max_concurrency_, reserved_for_masters, a_priority)
+               priority a_priority = priority::normal
+#if __TBB_PREVIEW_PARALLEL_PHASE
+               , leave_policy lp = leave_policy::automatic
+#endif
+    )
+        : task_arena_base(max_concurrency_, reserved_for_masters, a_priority
+#if __TBB_PREVIEW_PARALLEL_PHASE
+                          , lp
+#endif
+    )
     {}
 
 #if __TBB_ARENA_BINDING
     //! Creates task arena pinned to certain NUMA node
     task_arena(const constraints& constraints_, unsigned reserved_for_masters = 1,
-               priority a_priority = priority::normal)
-        : task_arena_base(constraints_, reserved_for_masters, a_priority)
+               priority a_priority = priority::normal
+#if __TBB_PREVIEW_PARALLEL_PHASE
+               , leave_policy lp = leave_policy::automatic
+#endif
+    )
+        : task_arena_base(constraints_, reserved_for_masters, a_priority
+#if __TBB_PREVIEW_PARALLEL_PHASE
+                          , lp
+#endif
+    )
     {}
 
     //! Copies settings from another task_arena
-    task_arena(const task_arena &s) // copy settings but not the reference or instance
+    task_arena(const task_arena &a) // copy settings but not the reference or instance
         : task_arena_base(
             constraints{}
-            .set_numa_id(s.my_numa_id)
-            .set_max_concurrency(s.my_max_concurrency)
-            .set_core_type(s.my_core_type)
-            .set_max_threads_per_core(s.my_max_threads_per_core)
-            , s.my_num_reserved_slots, s.my_priority)
+            .set_numa_id(a.my_numa_id)
+            .set_max_concurrency(a.my_max_concurrency)
+            .set_core_type(a.my_core_type)
+            .set_max_threads_per_core(a.my_max_threads_per_core)
+            , a.my_num_reserved_slots, a.my_priority
+#if __TBB_PREVIEW_PARALLEL_PHASE
+            , a.get_leave_policy()
+#endif
+        )
+
     {}
 #else
     //! Copies settings from another task_arena
     task_arena(const task_arena& a) // copy settings but not the reference or instance
-        : task_arena_base(a.my_max_concurrency, a.my_num_reserved_slots, a.my_priority)
+        : task_arena_base(a.my_max_concurrency,
+                          a.my_num_reserved_slots,
+                          a.my_priority,
+#if __TBB_PREVIEW_PARALLEL_PHASE
+                          a.get_leave_policy()
+#endif
+        )
     {}
 #endif /*__TBB_ARENA_BINDING*/
 
@@ -292,7 +363,11 @@ class task_arena : public task_arena_base {
 
     //! Creates an instance of task_arena attached to the current arena of the thread
     explicit task_arena( attach )
-        : task_arena_base(automatic, 1, priority::normal) // use default settings if attach fails
+        : task_arena_base(automatic, 1, priority::normal
+#if __TBB_PREVIEW_PARALLEL_PHASE
+                          , leave_policy::automatic
+#endif
+        ) // use default settings if attach fails
     {
         if (r1::attach(*this)) {
             mark_initialized();
@@ -311,21 +386,32 @@ class task_arena : public task_arena_base {
 
     //! Overrides concurrency level and forces initialization of internal representation
     void initialize(int max_concurrency_, unsigned reserved_for_masters = 1,
-                    priority a_priority = priority::normal)
+                    priority a_priority = priority::normal
+#if __TBB_PREVIEW_PARALLEL_PHASE
+                    , leave_policy lp = leave_policy::automatic
+#endif
+    )
     {
         __TBB_ASSERT(!my_arena.load(std::memory_order_relaxed), "Impossible to modify settings of an already initialized task_arena");
         if( !is_active() ) {
             my_max_concurrency = max_concurrency_;
             my_num_reserved_slots = reserved_for_masters;
             my_priority = a_priority;
+#if __TBB_PREVIEW_PARALLEL_PHASE
+            set_leave_policy(lp);
+#endif
             r1::initialize(*this);
             mark_initialized();
         }
     }
 
 #if __TBB_ARENA_BINDING
     void initialize(constraints constraints_, unsigned reserved_for_masters = 1,
-                    priority a_priority = priority::normal)
+                    priority a_priority = priority::normal
+#if __TBB_PREVIEW_PARALLEL_PHASE
+                    , leave_policy lp = leave_policy::automatic
+#endif
+    )
     {
         __TBB_ASSERT(!my_arena.load(std::memory_order_relaxed), "Impossible to modify settings of an already initialized task_arena");
         if( !is_active() ) {
@@ -335,6 +421,9 @@ class task_arena : public task_arena_base {
             my_max_threads_per_core = constraints_.max_threads_per_core;
             my_num_reserved_slots = reserved_for_masters;
             my_priority = a_priority;
+#if __TBB_PREVIEW_PARALLEL_PHASE
+            set_leave_policy(lp);
+#endif
             r1::initialize(*this);
             mark_initialized();
         }
@@ -404,6 +493,32 @@ class task_arena : public task_arena_base {
         return execute_impl<decltype(f())>(f);
     }
 
+#if __TBB_PREVIEW_PARALLEL_PHASE
+    void start_parallel_phase() {
+        initialize();
+        r1::enter_parallel_phase(this, /*reserved*/0);
+    }
+    void end_parallel_phase(bool with_fast_leave = false) {
+        __TBB_ASSERT(my_initialization_state.load(std::memory_order_relaxed) == do_once_state::initialized, nullptr);
+        // It is guaranteed by the standard that conversion of boolean to integral type will result in either 0 or 1
+        r1::exit_parallel_phase(this, static_cast<std::uintptr_t>(with_fast_leave));
+    }
+
+    class scoped_parallel_phase {
+        task_arena& arena;
+        bool one_time_fast_leave;
+    public:
+        scoped_parallel_phase(task_arena& ta, bool with_fast_leave = false)
+            : arena(ta), one_time_fast_leave(with_fast_leave)
+        {
+            arena.start_parallel_phase();
+        }
+        ~scoped_parallel_phase() {
+            arena.end_parallel_phase(one_time_fast_leave);
+        }
+    };
+#endif
+
 #if __TBB_EXTRA_DEBUG
     //! Returns my_num_reserved_slots
     int debug_reserved_slots() const {
@@ -472,6 +587,17 @@ inline void enqueue(F&& f) {
     enqueue_impl(std::forward<F>(f), nullptr);
 }
 
+#if __TBB_PREVIEW_PARALLEL_PHASE
+inline void start_parallel_phase() {
+    r1::enter_parallel_phase(nullptr, /*reserved*/0);
+}
+
+inline void end_parallel_phase(bool with_fast_leave) {
+    // It is guaranteed by the standard that conversion of boolean to integral type will result in either 0 or 1
+    r1::exit_parallel_phase(nullptr, static_cast<std::uintptr_t>(with_fast_leave));
+}
+#endif
+
 using r1::submit;
 
 } // namespace d1
@@ -491,6 +617,11 @@ using detail::d1::max_concurrency;
 using detail::d1::isolate;
 
 using detail::d1::enqueue;
+
+#if __TBB_PREVIEW_PARALLEL_PHASE
+using detail::d1::start_parallel_phase;
+using detail::d1::end_parallel_phase;
+#endif
 } // namespace this_task_arena
 
 } // inline namespace v1
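A sketch of the arena-level API added above, combining the new `leave_policy` constructor argument with the RAII `scoped_parallel_phase` helper. The signatures follow the declarations in this diff; the arena parameters and the work inside are illustrative:

```cpp
#define TBB_PREVIEW_PARALLEL_PHASE 1
#include <oneapi/tbb/task_arena.h>
#include <oneapi/tbb/parallel_for.h>

void arena_phase_example() {
    // Ask workers to leave this arena promptly once it runs out of work.
    tbb::task_arena arena(tbb::task_arena::automatic,
                          /*reserved_for_masters=*/1,
                          tbb::task_arena::priority::normal,
                          tbb::task_arena::leave_policy::fast);
    {
        // Enters the parallel phase on construction and ends it on destruction;
        // the second argument requests a one-time fast leave when the phase ends.
        tbb::task_arena::scoped_parallel_phase phase(arena, /*with_fast_leave=*/false);
        arena.execute([] {
            tbb::parallel_for(0, 1000, [](int) { /* work */ });
        });
    } // phase ends here
}
```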

rfcs/proposed/parallel_block_for_task_arena/README.md renamed to rfcs/experimental/parallel_phase_for_task_arena/README.md

Lines changed: 33 additions & 1 deletion
@@ -235,7 +235,6 @@ void scoped_parallel_phase_example() {
         // Computation
     }
 }
-
 ```
 
 ## Considerations
@@ -256,6 +255,32 @@ it might introduce performance problems if:
 Heavier involvement of less performant core types might result in artificial work
 imbalance in the arena.
 
+## Technical Details
+
+To implement the proposed feature, the following changes were made:
+* Added a new entity, `thread_leave_manager`, to `r1::arena`, which is responsible
+  for managing the state of workers' arena-leaving behavior.
+* Introduced two new entry points to the library:
+  * `r1::enter_parallel_phase(d1::task_arena_base*, std::uintptr_t)` - used to communicate
+    the start of a parallel phase to the library.
+  * `r1::exit_parallel_phase(d1::task_arena_base*, std::uintptr_t)` - used to communicate
+    the end of a parallel phase to the library.
+
+### Thread Leave Manager
+
+The `thread_leave_manager` class implements the state machine described in the proposal.
+Specifically, it controls when worker threads are allowed to be retained in the arena.
+`thread_leave_manager` is initialized with a state that determines the default
+behavior for workers leaving the arena.
+
+To support the `start/end_parallel_phase` API, it provides functionality to override the default
+state with a "Parallel Phase" state. It also keeps track of the number of active parallel phases.
+
+The following sequence diagram illustrates the interaction between the user and
+the `thread_leave_manager` during the execution of parallel phases. It shows how the
+`thread_leave_manager` manages the state transitions when using `start/end_parallel_phase`.
+
+<img src="parallel_phase_sequence_diagram.png" width=1000>
 
 ## Open Questions in Design
 
@@ -272,3 +297,10 @@ Some open questions that remain:
 * Do we see any value if an arena could potentially transition from one state to another?
 * What if different types of workloads are mixed in one application?
 * What if there are concurrent calls to this API?
+
+## Conditions to become fully supported
+
+The following conditions need to be met for the feature to move from experimental to fully supported:
+* Open questions regarding the API should be resolved.
+* The feature should demonstrate performance improvements in the scenarios mentioned above.
+* The oneTBB specification needs to be updated to reflect the new feature.
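The Thread Leave Manager itself lives in the library internals (`r1::arena`) and is not part of the public headers changed here. Purely to illustrate the state machine the README describes (a default leave state plus a reference-counted "Parallel Phase" override), here is a simplified, hypothetical sketch; the names and structure are invented for explanation and do not reflect the actual oneTBB implementation:

```cpp
#include <atomic>
#include <cstdint>

// Hypothetical, simplified illustration of the behavior described in the README;
// not the actual r1::arena thread_leave_manager.
class leave_manager_sketch {
    enum class state : std::uint8_t { delayed_leave, fast_leave };
    state my_default;                         // fixed at arena creation from leave_policy
    std::atomic<std::uint32_t> my_phases{0};  // number of active parallel phases

public:
    explicit leave_manager_sketch(bool fast_by_default)
        : my_default(fast_by_default ? state::fast_leave : state::delayed_leave) {}

    // Corresponds to start_parallel_phase(): overrides the default behavior.
    void phase_started() { my_phases.fetch_add(1, std::memory_order_relaxed); }

    // Corresponds to end_parallel_phase(): reverts to the default once no phase is active.
    void phase_ended() { my_phases.fetch_sub(1, std::memory_order_relaxed); }

    // Queried by a worker with no work left: should it linger in the arena for a while?
    bool workers_should_linger() const {
        if (my_phases.load(std::memory_order_relaxed) > 0)
            return true;                      // an active phase keeps workers around
        return my_default == state::delayed_leave;
    }
};
```

The real library also honors the one-time fast-leave request passed to `exit_parallel_phase`, which this sketch omits.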