Skip to content

Commit 40106b1

Browse files
committed
Added records sorting. (2 columns)
1 parent bf3d45a commit 40106b1

File tree

11 files changed

+156
-17
lines changed

11 files changed

+156
-17
lines changed

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ include_directories(third_party/googletest/googlemock/include)
1313
include_directories(third_party/ips4o)
1414
include_directories(third_party/pdqsort)
1515

16-
set(CMAKE_CXX_FLAGS "-g -O3 -flto -Wall -march=native")
16+
set(CMAKE_CXX_FLAGS "-g -O3 -flto -Wall -march=native -fopenmp")
1717

1818
file(GLOB_RECURSE SOURCE_FILES
1919
"src/*.cpp" "test/*.cpp")

src/avx256/merge_util.cpp

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -59,12 +59,13 @@ template void MaskedMergeRuns4<double, __m256d>(double *&arr, size_t N);
5959
template<typename InType, typename RegType>
6060
void MergePass8(InType *&arr, InType *buffer, size_t N, unsigned int run_size) {
6161
int UNIT_RUN_SIZE = 8;
62-
RegType ra, rb;
63-
int buffer_offset = 0;
62+
#pragma omp parallel for
6463
for (int i = 0; i < N; i += 2 * run_size) {
6564
int start = i;
6665
int mid = i + run_size;
6766
int end = i + 2 * run_size;
67+
int buffer_offset = start;
68+
RegType ra, rb;
6869
int p1_ptr = start;
6970
int p2_ptr = mid;
7071
LoadReg(ra, &arr[p1_ptr]);
@@ -119,12 +120,13 @@ template void MergePass8<float, __m256>(float *&arr, float *buffer, size_t N, un
119120
template<typename InType, typename RegType>
120121
void MaskedMergePass8(InType *&arr, InType *buffer, size_t N, unsigned int run_size) {
121122
int UNIT_RUN_SIZE = 8;
122-
RegType ra, rb;
123-
int buffer_offset = 0;
123+
#pragma omp parallel for
124124
for (int i = 0; i < N; i += 2 * run_size) {
125125
int start = i;
126126
int mid = i + run_size;
127127
int end = i + 2 * run_size;
128+
int buffer_offset = start;
129+
RegType ra, rb;
128130
int p1_ptr = start;
129131
int p2_ptr = mid;
130132
LoadReg(ra, &arr[p1_ptr]);
@@ -179,12 +181,13 @@ template void MaskedMergePass8<float, __m256>(float *&arr, float *buffer, size_t
179181
template<typename InType, typename RegType>
180182
void MergePass4(InType *&arr, InType *buffer, size_t N, unsigned int run_size) {
181183
int UNIT_RUN_SIZE = 4;
182-
RegType ra, rb;
183-
int buffer_offset = 0;
184+
#pragma omp parallel for
184185
for (int i = 0; i < N; i += 2 * run_size) {
185186
int start = i;
186187
int mid = i + run_size;
187188
int end = i + 2 * run_size;
189+
int buffer_offset = start;
190+
RegType ra, rb;
188191
int p1_ptr = start;
189192
int p2_ptr = mid;
190193
LoadReg(ra, &arr[p1_ptr]);
@@ -238,12 +241,13 @@ template void MergePass4<double, __m256d>(double *&arr, double *buffer, size_t N
238241
template<typename InType, typename RegType>
239242
void MaskedMergePass4(InType *&arr, InType *buffer, size_t N, unsigned int run_size) {
240243
int UNIT_RUN_SIZE = 4;
241-
RegType ra, rb;
242-
int buffer_offset = 0;
244+
#pragma omp parallel for
243245
for (int i = 0; i < N; i += 2 * run_size) {
244246
int start = i;
245247
int mid = i + run_size;
246248
int end = i + 2 * run_size;
249+
int buffer_offset = start;
250+
RegType ra, rb;
247251
int p1_ptr = start;
248252
int p2_ptr = mid;
249253
LoadReg(ra, &arr[p1_ptr]);

src/avx256/simd_sort.cpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,21 @@ void SIMDSort(size_t N, std::pair<int, int> *&arr) {
6969
}
7070
}
7171

72+
/**
 * Builds a copy of `arr` ordered by one of its two int columns.
 *
 * Each record is encoded as a 64-bit sort key: the selected column occupies the
 * upper 32 bits and the record's position in `arr` the lower 32 bits. After the
 * keys are sorted with SIMDSort, the lower half of every key names the input
 * record to copy into the corresponding output slot.
 *
 * @param result_arr  Output; overwritten with a buffer of N records obtained
 *                    from aligned_init. Any previous value is discarded.
 * @param N           Number of records in `arr`. Positions are stored in
 *                    32 bits, so N must not exceed 2^32.
 * @param arr         Input records; only read.
 * @param order_by    0 orders by `.first`; any other value orders by `.second`.
 */
void SIMDOrderBy(std::pair<int, int> *&result_arr, size_t N, std::pair<int, int> *arr, int order_by) {
    int64_t *kv_arr;
    aligned_init<int64_t>(kv_arr, N);
    aligned_init<std::pair<int, int>>(result_arr, N);

    for (size_t i = 0; i < N; ++i) {
        const int column = (order_by == 0) ? arr[i].first : arr[i].second;
        // Sign-extend first, then shift as unsigned: left-shifting a negative
        // signed value is undefined behaviour before C++20.
        const uint64_t key_bits = static_cast<uint64_t>(static_cast<int64_t>(column)) << 32;
        const uint64_t index_bits = static_cast<uint64_t>(i) & 0x00000000ffffffffULL;
        kv_arr[i] = static_cast<int64_t>(key_bits | index_bits);
    }

    SIMDSort(N, kv_arr);

    for (size_t j = 0; j < N; ++j) {
        // The low 32 bits carry the original position of the record.
        const size_t index = static_cast<size_t>(static_cast<uint64_t>(kv_arr[j]) & 0x00000000ffffffffULL);
        result_arr[j] = arr[index];
    }
    // NOTE(review): kv_arr is never released. The deallocation routine that
    // pairs with aligned_init is not visible here -- free it once confirmed.
}
86+
7287
void SIMDSort(size_t N, std::pair<float, float> *&arr) {
7388
float *kv_arr;
7489
size_t Nkv = N * 2;

src/avx512/merge_util.cpp

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -59,12 +59,13 @@ template void MaskedMergeRuns8<double, __m512d>(double *&arr, size_t N);
5959
template<typename InType, typename RegType>
6060
void MergePass16(InType *&arr, InType *buffer, size_t N, int run_size) {
6161
int UNIT_RUN_SIZE = 16;
62-
RegType ra, rb;
63-
int buffer_offset = 0;
62+
#pragma omp parallel for
6463
for (int i = 0; i < N; i += 2 * run_size) {
6564
int start = i;
6665
int mid = i + run_size;
6766
int end = i + 2 * run_size;
67+
int buffer_offset = start;
68+
RegType ra, rb;
6869
int p1_ptr = start;
6970
int p2_ptr = mid;
7071
LoadReg(ra, &arr[p1_ptr]);
@@ -119,12 +120,13 @@ template void MergePass16<float, __m512>(float *&arr, float *buffer, size_t N, i
119120
template<typename InType, typename RegType>
120121
void MaskedMergePass16(InType *&arr, InType *buffer, size_t N, int run_size) {
121122
int UNIT_RUN_SIZE = 16;
122-
RegType ra, rb;
123-
int buffer_offset = 0;
123+
#pragma omp parallel for
124124
for (int i = 0; i < N; i += 2 * run_size) {
125125
int start = i;
126126
int mid = i + run_size;
127127
int end = i + 2 * run_size;
128+
int buffer_offset = start;
129+
RegType ra, rb;
128130
int p1_ptr = start;
129131
int p2_ptr = mid;
130132
LoadReg(ra, &arr[p1_ptr]);
@@ -179,12 +181,13 @@ template void MaskedMergePass16<float, __m512>(float *&arr, float *buffer, size_
179181
template<typename InType, typename RegType>
180182
void MergePass8(InType *&arr, InType *buffer, size_t N, int run_size) {
181183
int UNIT_RUN_SIZE = 8;
182-
RegType ra, rb;
183-
int buffer_offset = 0;
184+
#pragma omp parallel for
184185
for (int i = 0; i < N; i += 2 * run_size) {
185186
int start = i;
186187
int mid = i + run_size;
187188
int end = i + 2 * run_size;
189+
int buffer_offset = start;
190+
RegType ra, rb;
188191
int p1_ptr = start;
189192
int p2_ptr = mid;
190193
LoadReg(ra, &arr[p1_ptr]);
@@ -239,12 +242,13 @@ template void MergePass8<double, __m512d>(double *&arr, double *buffer, size_t N
239242
template<typename InType, typename RegType>
240243
void MaskedMergePass8(InType *&arr, InType *buffer, size_t N, int run_size) {
241244
int UNIT_RUN_SIZE = 8;
242-
RegType ra, rb;
243-
int buffer_offset = 0;
245+
#pragma omp parallel for
244246
for (int i = 0; i < N; i += 2 * run_size) {
245247
int start = i;
246248
int mid = i + run_size;
247249
int end = i + 2 * run_size;
250+
int buffer_offset = start;
251+
RegType ra, rb;
248252
int p1_ptr = start;
249253
int p2_ptr = mid;
250254
LoadReg(ra, &arr[p1_ptr]);

src/avx512/simd_sort.cpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,21 @@ void SIMDSort(size_t N, std::pair<int, int> *&arr) {
6161
}
6262
}
6363

64+
/**
 * Builds a copy of `arr` ordered by one of its two int columns.
 *
 * Each record is encoded as a 64-bit sort key: the selected column occupies the
 * upper 32 bits and the record's position in `arr` the lower 32 bits. After the
 * keys are sorted with SIMDSort, the lower half of every key names the input
 * record to copy into the corresponding output slot.
 *
 * @param result_arr  Output; overwritten with a buffer of N records obtained
 *                    from aligned_init. Any previous value is discarded.
 * @param N           Number of records in `arr`. Positions are stored in
 *                    32 bits, so N must not exceed 2^32.
 * @param arr         Input records; only read.
 * @param order_by    0 orders by `.first`; any other value orders by `.second`.
 */
void SIMDOrderBy(std::pair<int, int> *&result_arr, size_t N, std::pair<int, int> *arr, int order_by) {
    int64_t *kv_arr;
    aligned_init<int64_t>(kv_arr, N);
    aligned_init<std::pair<int, int>>(result_arr, N);

    for (size_t i = 0; i < N; ++i) {
        const int column = (order_by == 0) ? arr[i].first : arr[i].second;
        // Sign-extend first, then shift as unsigned: left-shifting a negative
        // signed value is undefined behaviour before C++20.
        const uint64_t key_bits = static_cast<uint64_t>(static_cast<int64_t>(column)) << 32;
        const uint64_t index_bits = static_cast<uint64_t>(i) & 0x00000000ffffffffULL;
        kv_arr[i] = static_cast<int64_t>(key_bits | index_bits);
    }

    SIMDSort(N, kv_arr);

    for (size_t j = 0; j < N; ++j) {
        // The low 32 bits carry the original position of the record.
        const size_t index = static_cast<size_t>(static_cast<uint64_t>(kv_arr[j]) & 0x00000000ffffffffULL);
        result_arr[j] = arr[index];
    }
    // NOTE(review): kv_arr is never released. The deallocation routine that
    // pairs with aligned_init is not visible here -- free it once confirmed.
}
78+
6479
void SIMDSort(size_t N, std::pair<float, float> *&arr) {
6580
float *kv_arr;
6681
size_t Nkv = N * 2;

src/include/avx256/simd_sort.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ namespace avx2{
1212
void SIMDSort(size_t N, float *&arr);
1313
void SIMDSort(size_t N, double *&arr);
1414
void SIMDSort(size_t N, std::pair<int,int> *&arr);
15+
// Allocates result_arr (N records, via aligned_init) and fills it with the
// records of arr ordered by .first when order_by == 0, otherwise by .second.
// arr itself is only read. Record positions are packed into 32 bits.
void SIMDOrderBy(std::pair<int, int> *&result_arr, size_t N, std::pair<int, int> *arr, int order_by=0);
1516
void SIMDSort(size_t N, std::pair<float, float> *&arr);
1617
void SIMDSort(size_t N, std::pair<int64_t ,int64_t> *&arr);
1718
void SIMDSort(size_t N, std::pair<double, double> *&arr);

src/include/avx512/simd_sort.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ namespace avx512{
1212
void SIMDSort(size_t N, float *&arr);
1313
void SIMDSort(size_t N, double *&arr);
1414
void SIMDSort(size_t N, std::pair<int,int> *&arr);
15+
// Allocates result_arr (N records, via aligned_init) and fills it with the
// records of arr ordered by .first when order_by == 0, otherwise by .second.
// arr itself is only read. Record positions are packed into 32 bits.
void SIMDOrderBy(std::pair<int, int> *&result_arr, size_t N, std::pair<int, int> *arr, int order_by=0);
1516
void SIMDSort(size_t N, std::pair<float, float> *&arr);
1617
void SIMDSort(size_t N, std::pair<int64_t ,int64_t> *&arr);
1718
void SIMDSort(size_t N, std::pair<double, double> *&arr);

src/include/common.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include <string>
66
#include <cstdint>
77
#include <cassert>
8+
#include <omp.h>
89

910
/**
1011
* Common definitions

test/avx256/simd_sort_test.cpp

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,49 @@ TEST(SIMDSortTests, AVX256SIMDSort32BitKeyValueIntTest) {
345345
delete soln_arr;
346346
}
347347

348+
// Checks SIMDOrderBy on random (int, int) records: once ordered by the first
// column (timed) and once by the second. Only the ordering column is compared
// against a std::sort reference, since records with equal keys may legitimately
// appear in a different relative order.
TEST(SIMDSortTests, AVX256SIMDOrderBy32BitIntTest) {
    using T = int;
    using Entry = std::pair<T, T>;
    size_t N = NNUM;
    T lo = LO;
    T hi = HI;
    Entry *rand_arr;
    // SIMDOrderBy allocates its result buffer itself; pre-allocating here
    // would only leak the first allocation.
    Entry *soln_arr1 = nullptr;
    Entry *soln_arr2 = nullptr;
    Entry *input_arr1;
    Entry *input_arr2;
    double start, end;

    // Initialization
    TestUtil::RandGenIntEntries(rand_arr, N, lo, hi);

    // Order by the first column (default order_by) and time the call.
    aligned_init<Entry>(input_arr1, N);
    std::copy(rand_arr, rand_arr + N, input_arr1);
    std::vector<Entry> check_arr1(rand_arr, rand_arr + N);
    start = currentSeconds();
    SIMDOrderBy(soln_arr1, N, input_arr1);
    end = currentSeconds();
    std::sort(check_arr1.begin(), check_arr1.end(), [](const Entry &left, const Entry &right) {
        return left.first < right.first;
    });
    for (size_t i = 0; i < N; i++) {
        EXPECT_EQ(check_arr1[i].first, soln_arr1[i].first);
    }

    // Order by the second column.
    aligned_init<Entry>(input_arr2, N);
    std::copy(rand_arr, rand_arr + N, input_arr2);
    std::vector<Entry> check_arr2(rand_arr, rand_arr + N);
    SIMDOrderBy(soln_arr2, N, input_arr2, 1);
    std::sort(check_arr2.begin(), check_arr2.end(), [](const Entry &left, const Entry &right) {
        return left.second < right.second;
    });
    for (size_t i = 0; i < N; i++) {
        EXPECT_EQ(check_arr2[i].second, soln_arr2[i].second);
    }

    // %zu is the portable conversion for size_t.
    printf("[avx256::orderby] %zu elements: %.8f seconds\n", N, end - start);

    // NOTE(review): these buffers come from aligned_init; `delete` follows the
    // neighbouring tests, but the matching release routine should be confirmed.
    // input_arr1 / input_arr2 are still not released for the same reason.
    delete rand_arr;
    delete soln_arr1;
    delete soln_arr2;
}
390+
348391
TEST(SIMDSortTests, AVX256SIMDSort64BitKeyValueIntTest) {
349392
using T = int64_t;
350393
size_t N = NNUM;

test/avx512/simd_sort_test.cpp

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,49 @@ TEST(SIMDSortTests, AVX512SIMDSort32BitKeyValueIntTest) {
347347
delete soln_arr;
348348
}
349349

350+
// Checks SIMDOrderBy on random (int, int) records: once ordered by the first
// column (timed) and once by the second. Only the ordering column is compared
// against a std::sort reference, since records with equal keys may legitimately
// appear in a different relative order.
//
// Named AVX512... to match the other tests in this file; the previous AVX256
// name and log label were copied from the AVX256 test file.
TEST(SIMDSortTests, AVX512SIMDOrderBy32BitIntTest) {
    using T = int;
    using Entry = std::pair<T, T>;
    size_t N = NNUM;
    T lo = LO;
    T hi = HI;
    Entry *rand_arr;
    // SIMDOrderBy allocates its result buffer itself; pre-allocating here
    // would only leak the first allocation.
    Entry *soln_arr1 = nullptr;
    Entry *soln_arr2 = nullptr;
    Entry *input_arr1;
    Entry *input_arr2;
    double start, end;

    // Initialization
    TestUtil::RandGenIntEntries(rand_arr, N, lo, hi);

    // Order by the first column (default order_by) and time the call.
    aligned_init<Entry>(input_arr1, N);
    std::copy(rand_arr, rand_arr + N, input_arr1);
    std::vector<Entry> check_arr1(rand_arr, rand_arr + N);
    start = currentSeconds();
    SIMDOrderBy(soln_arr1, N, input_arr1);
    end = currentSeconds();
    std::sort(check_arr1.begin(), check_arr1.end(), [](const Entry &left, const Entry &right) {
        return left.first < right.first;
    });
    for (size_t i = 0; i < N; i++) {
        EXPECT_EQ(check_arr1[i].first, soln_arr1[i].first);
    }

    // Order by the second column.
    aligned_init<Entry>(input_arr2, N);
    std::copy(rand_arr, rand_arr + N, input_arr2);
    std::vector<Entry> check_arr2(rand_arr, rand_arr + N);
    SIMDOrderBy(soln_arr2, N, input_arr2, 1);
    std::sort(check_arr2.begin(), check_arr2.end(), [](const Entry &left, const Entry &right) {
        return left.second < right.second;
    });
    for (size_t i = 0; i < N; i++) {
        EXPECT_EQ(check_arr2[i].second, soln_arr2[i].second);
    }

    // %zu is the portable conversion for size_t.
    printf("[avx512::orderby] %zu elements: %.8f seconds\n", N, end - start);

    // NOTE(review): these buffers come from aligned_init; `delete` follows the
    // neighbouring tests, but the matching release routine should be confirmed.
    // input_arr1 / input_arr2 are still not released for the same reason.
    delete rand_arr;
    delete soln_arr1;
    delete soln_arr2;
}
392+
350393
TEST(SIMDSortTests, AVX512SIMDSort64BitKeyValueIntTest) {
351394
using T = int64_t;
352395
size_t N = NNUM;

test/include/test_util.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,18 @@ struct TestUtil{
5858
}
5959
}
6060

61+
template <typename T>
62+
static void RandGenIntEntries(std::pair<T, T>* &arr, size_t N, T lo, T hi, unsigned int offset_start=0) {
63+
aligned_init<std::pair<T, T>>(arr, N);
64+
std::random_device rd; //Will be used to obtain a seed for the random number engine
65+
std::mt19937 gen(rd()); //Standard mersenne_twister_engine seeded with rd()
66+
std::uniform_int_distribution<T> dis(lo, hi);
67+
for(size_t i = 0; i < N; i++) {
68+
arr[i].first = dis(gen);
69+
arr[i].second = dis(gen);
70+
}
71+
}
72+
6173
template <typename T>
6274
static void RandGenFloatRecords(std::pair<T, T>* &arr, size_t N, T lo, T hi, unsigned int offset_start=0) {
6375
aligned_init<std::pair<T, T>>(arr, N);

0 commit comments

Comments
 (0)