diff --git a/.vscode/settings.json b/.vscode/settings.json index 6c419f3..f4a9bbe 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -83,6 +83,7 @@ "stack": "cpp", "cfenv": "cpp", "typeindex": "cpp", - "valarray": "cpp" + "valarray": "cpp", + "csignal": "cpp" } } \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 8d77660..22286f0 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,8 @@ find_package(Threads REQUIRED) add_executable(Sorter framework/runner.cpp src/container.cpp src/container.hpp - src/sorter.cpp src/sorter.hpp) + src/sorter.cpp src/sorter.hpp + src/thread_pool.cpp src/thread_pool.hpp) target_link_libraries(Sorter PUBLIC Threads::Threads) target_compile_features(Sorter PRIVATE cxx_std_20) @@ -50,6 +51,8 @@ add_custom_command( ${CMAKE_CURRENT_LIST_DIR}/src/container.hpp ${CMAKE_CURRENT_LIST_DIR}/src/sorter.cpp ${CMAKE_CURRENT_LIST_DIR}/src/sorter.hpp + ${CMAKE_CURRENT_LIST_DIR}/src/thread_pool.cpp + ${CMAKE_CURRENT_LIST_DIR}/src/thread_pool.hpp ${plot_file} ${CMAKE_CURRENT_LIST_DIR}/description.md COMMENT "Creating submission" @@ -71,6 +74,8 @@ set(dist_file_list src/container.hpp src/sorter.cpp src/sorter.hpp + src/thread_pool.cpp + src/thread_pool.hpp ) set(framework_dist_file "ae-sorting.zip") diff --git a/eval.py b/eval.py index a6e01e7..cd705cd 100755 --- a/eval.py +++ b/eval.py @@ -7,7 +7,7 @@ from pathlib import Path def run_experiment(output_file, build_dir): # The number of threads is not currently used, it's just here in case you want to parallelize your code. - for threads in [1]: + for threads in [1, 2, 4, 8, 12, 16]: # for size in [1e2, 1e3, 1e4 + 1, 1e5, 1e6 - 1, 1e7]: for size in [1e2, 1e3, 1e4 + 1, 1e5, 1e7]: print("Measuring p=" + str(threads) + " n=" + str(size)) diff --git a/framework/runner.cpp b/framework/runner.cpp index d509ba4..131550c 100755 --- a/framework/runner.cpp +++ b/framework/runner.cpp @@ -62,7 +62,7 @@ void runExperiment(std::string_view name, std::chrono::steady_clock::time_point ctor = std::chrono::steady_clock::now(); auto to_sort = container_factory(input); std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now(); - sort_func(to_sort); + sort_func(to_sort, num_threads); std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); totalNanoseconds += std::chrono::duration_cast(end - begin) @@ -95,8 +95,8 @@ int main(int argc, char **argv) { [](const auto& data) { return ae::container(data); }, - [](ae::container& data) { - ae::sorter{}.sort(data); + [](ae::container& data, auto num_threads) { + ae::sorter(num_threads).sort(data); }, argc, argv); return 0; diff --git a/result.txt b/result.txt index e881d50..e7256c5 100644 --- a/result.txt +++ b/result.txt @@ -1,5 +1,30 @@ -RESULT name=sort n=100 t=1 iterations=301 durationNanoseconds=3324 totalDurationNanoseconds=1000561 constructorNanoseconds=274 totalConstructorNanoseconds=82660 -RESULT name=sort n=1000 t=1 iterations=13 durationNanoseconds=77557 totalDurationNanoseconds=1008241 constructorNanoseconds=2057 totalConstructorNanoseconds=26750 -RESULT name=sort n=10001 t=1 iterations=1 durationNanoseconds=1509911 totalDurationNanoseconds=1509911 constructorNanoseconds=108831 totalConstructorNanoseconds=108831 -RESULT name=sort n=100000 t=1 iterations=1 durationNanoseconds=8070546 totalDurationNanoseconds=8070546 constructorNanoseconds=488620 totalConstructorNanoseconds=488620 -RESULT name=sort n=10000000 t=1 iterations=1 durationNanoseconds=723407878 totalDurationNanoseconds=723407878 constructorNanoseconds=51148616 totalConstructorNanoseconds=51148616 +RESULT name=sort n=100 t=1 iterations=18 durationNanoseconds=57856 totalDurationNanoseconds=1041410 constructorNanoseconds=513 totalConstructorNanoseconds=9250 +RESULT name=sort n=1000 t=1 iterations=8 durationNanoseconds=126883 totalDurationNanoseconds=1015070 constructorNanoseconds=1475 totalConstructorNanoseconds=11800 +RESULT name=sort n=10001 t=1 iterations=2 durationNanoseconds=615565 totalDurationNanoseconds=1231131 constructorNanoseconds=30890 totalConstructorNanoseconds=61780 +RESULT name=sort n=100000 t=1 iterations=1 durationNanoseconds=9488107 totalDurationNanoseconds=9488107 constructorNanoseconds=524961 totalConstructorNanoseconds=524961 +RESULT name=sort n=10000000 t=1 iterations=1 durationNanoseconds=1416091993 totalDurationNanoseconds=1416091993 constructorNanoseconds=50440746 totalConstructorNanoseconds=50440746 +RESULT name=sort n=100 t=2 iterations=16 durationNanoseconds=63258 totalDurationNanoseconds=1012141 constructorNanoseconds=375 totalConstructorNanoseconds=6000 +RESULT name=sort n=1000 t=2 iterations=7 durationNanoseconds=154110 totalDurationNanoseconds=1078770 constructorNanoseconds=2008 totalConstructorNanoseconds=14060 +RESULT name=sort n=10001 t=2 iterations=3 durationNanoseconds=451387 totalDurationNanoseconds=1354161 constructorNanoseconds=19620 totalConstructorNanoseconds=58860 +RESULT name=sort n=100000 t=2 iterations=1 durationNanoseconds=6236655 totalDurationNanoseconds=6236655 constructorNanoseconds=514650 totalConstructorNanoseconds=514650 +RESULT name=sort n=10000000 t=2 iterations=1 durationNanoseconds=1380325518 totalDurationNanoseconds=1380325518 constructorNanoseconds=50373886 totalConstructorNanoseconds=50373886 +RESULT name=sort n=100 t=4 iterations=9 durationNanoseconds=118743 totalDurationNanoseconds=1068691 constructorNanoseconds=436 totalConstructorNanoseconds=3930 +RESULT name=sort n=1000 t=4 iterations=4 durationNanoseconds=272115 totalDurationNanoseconds=1088461 constructorNanoseconds=2415 totalConstructorNanoseconds=9660 +RESULT name=sort n=10001 t=4 iterations=2 durationNanoseconds=569255 totalDurationNanoseconds=1138510 constructorNanoseconds=29920 totalConstructorNanoseconds=59840 +RESULT name=sort n=100000 t=4 iterations=1 durationNanoseconds=6598125 totalDurationNanoseconds=6598125 constructorNanoseconds=507180 totalConstructorNanoseconds=507180 +RESULT name=sort n=10000000 t=4 iterations=1 durationNanoseconds=1300242690 totalDurationNanoseconds=1300242690 constructorNanoseconds=50475097 totalConstructorNanoseconds=50475097 +RESULT name=sort n=100 t=8 iterations=3 durationNanoseconds=347863 totalDurationNanoseconds=1043591 constructorNanoseconds=2706 totalConstructorNanoseconds=8120 +RESULT name=sort n=1000 t=8 iterations=2 durationNanoseconds=610620 totalDurationNanoseconds=1221241 constructorNanoseconds=10400 totalConstructorNanoseconds=20800 +RESULT name=sort n=10001 t=8 iterations=2 durationNanoseconds=706495 totalDurationNanoseconds=1412991 constructorNanoseconds=29600 totalConstructorNanoseconds=59200 +RESULT name=sort n=100000 t=8 iterations=1 durationNanoseconds=7387085 totalDurationNanoseconds=7387085 constructorNanoseconds=557391 totalConstructorNanoseconds=557391 +RESULT name=sort n=10000000 t=8 iterations=1 durationNanoseconds=1261560682 totalDurationNanoseconds=1261560682 constructorNanoseconds=49470756 totalConstructorNanoseconds=49470756 +RESULT name=sort n=100 t=12 iterations=3 durationNanoseconds=432037 totalDurationNanoseconds=1296111 constructorNanoseconds=1170 totalConstructorNanoseconds=3510 +RESULT name=sort n=1000 t=12 iterations=1 durationNanoseconds=1092461 totalDurationNanoseconds=1092461 constructorNanoseconds=12880 totalConstructorNanoseconds=12880 +RESULT name=sort n=10001 t=12 iterations=1 durationNanoseconds=1019941 totalDurationNanoseconds=1019941 constructorNanoseconds=54540 totalConstructorNanoseconds=54540 +RESULT name=sort n=100000 t=12 iterations=1 durationNanoseconds=7159465 totalDurationNanoseconds=7159465 constructorNanoseconds=536730 totalConstructorNanoseconds=536730 +RESULT name=sort n=10000000 t=12 iterations=1 durationNanoseconds=1503813105 totalDurationNanoseconds=1503813105 constructorNanoseconds=50150056 totalConstructorNanoseconds=50150056 +RESULT name=sort n=100 t=16 iterations=3 durationNanoseconds=432706 totalDurationNanoseconds=1298120 constructorNanoseconds=3833 totalConstructorNanoseconds=11500 +RESULT name=sort n=1000 t=16 iterations=2 durationNanoseconds=784875 totalDurationNanoseconds=1569751 constructorNanoseconds=5285 totalConstructorNanoseconds=10570 +RESULT name=sort n=10001 t=16 iterations=1 durationNanoseconds=1953311 totalDurationNanoseconds=1953311 constructorNanoseconds=59420 totalConstructorNanoseconds=59420 +RESULT name=sort n=100000 t=16 iterations=1 durationNanoseconds=6820104 totalDurationNanoseconds=6820104 constructorNanoseconds=524961 totalConstructorNanoseconds=524961 +RESULT name=sort n=10000000 t=16 iterations=1 durationNanoseconds=1352546567 totalDurationNanoseconds=1352546567 constructorNanoseconds=52707158 totalConstructorNanoseconds=52707158 diff --git a/src/sorter.cpp b/src/sorter.cpp index 6ea00ac..27de298 100755 --- a/src/sorter.cpp +++ b/src/sorter.cpp @@ -11,6 +11,11 @@ namespace ae { +sorter::sorter(uint32_t num = 1) { + sorter::num_threads = num; + sorter::pool = new ThreadPool(num); +} + void sorter::sort(container& data) { for (auto i = 1uz; i < data.placeholder_.size(); ++i) { std::ranges::copy(data.placeholder_[i], std::back_inserter(data.placeholder_[0])); @@ -18,7 +23,7 @@ void sorter::sort(container& data) { } #if DEBUG for (int i = 0; i < data.placeholder_[0].size(); i++) { - // if (copy[i] != data.placeholder_[0][i]) + if (copy[i] != data.placeholder_[0][i]) std::cerr << i << " before:" << data.placeholder_[0][i] << std::endl; } @@ -27,6 +32,7 @@ void sorter::sort(container& data) { std::sort(copy.begin(), copy.end()); #endif sorter::msd_inplace_radix_sort(data.placeholder_[0], 0, [&](auto span) {sorter::robin_hood_sort(span);}); + while (sorter::pool->size() > 0 || sorter::pool->isWorking()) {}; #if DEBUG for (int i = 0; i < copy.size(); i++) { if (copy[i] != data.placeholder_[0][i]) @@ -169,9 +175,27 @@ void sorter::msd_inplace_radix_sort( // sort each bucket recursively for (auto i = 0; i < sorter::RADIX_BUCKETS; i++) { - sorter::msd_inplace_radix_sort(std::span (buckets_start[i], buckets_end[i]), passes + 1, bucket_sort); + if (sorter::pool != nullptr) { + #if DEBUG + std::cerr << "Putting in task with depth " << passes << " of bucket " << i << std::endl; + #endif + auto start = buckets_start[i]; + auto end = buckets_end[i]; + sorter::pool->add([start, end, &bucket_sort, passes, this, i](){ + #if DEBUG + std::cerr << "Starting task with depth " << passes << " of bucket " << i << std::endl; + #endif + sorter::msd_inplace_radix_sort(std::span (start, end), passes + 1, bucket_sort); + #if DEBUG + std::cerr << "Finishing task with depth " << passes << " of bucket " << i << std::endl; + #endif + }); + } else { + sorter::msd_inplace_radix_sort(std::span (buckets_start[i], buckets_end[i]), passes + 1, bucket_sort); + } } } + void sorter::robin_hood_sort(std::span bucket) { const auto size = bucket.size() + sorter::OVERHEAD_SIZE; const auto mask = ((1L) << (sizeof(container::element_type) * CHAR_BIT - sorter::RADIX_ITERATIONS)) - 1; diff --git a/src/sorter.hpp b/src/sorter.hpp index 2dec15d..2747def 100755 --- a/src/sorter.hpp +++ b/src/sorter.hpp @@ -1,8 +1,10 @@ #pragma once #include "container.hpp" +#include "thread_pool.hpp" #include "functional" #include "math.h" +#include namespace ae { @@ -11,12 +13,21 @@ class sorter { void sort(container& data); // TODO You may add additional functions or data members to the sorter. + + sorter(uint32_t num_threads); + void msd_inplace_radix_sort( std::span range, size_t passes, const std::function bucket)>& bucket_sort ); + void parallel_msd_inplace_radix_sort( + std::span range, + size_t passes, + const std::function bucket)>& bucket_sort + ); + void msd_inplace_radix_sort_binary( std::span range, size_t passes, @@ -24,10 +35,12 @@ class sorter { ); const uint32_t OVERHEAD_SIZE = 100L; - const uint32_t SMALL_SORT_THRESHHOLD = 32; + const uint32_t SMALL_SORT_THRESHHOLD = 100; const uint32_t RADIX_SIZE = 4; const uint32_t RADIX_BUCKETS = std::pow(2, 4); const uint32_t RADIX_ITERATIONS = 8; + uint32_t num_threads; + ThreadPool* pool = nullptr; void robin_hood_sort(std::span range); }; diff --git a/src/thread_pool.cpp b/src/thread_pool.cpp new file mode 100644 index 0000000..af777bc --- /dev/null +++ b/src/thread_pool.cpp @@ -0,0 +1,58 @@ +#include "thread_pool.hpp" + +ThreadPool::ThreadPool(size_t num_threads) { + states = new bool[num_threads]({ false }); + for (auto i = 0; i < num_threads; ++i) { + threads.emplace_back([this, i] { + while (true) { + std::function task; + + std::unique_lock lock(mutex); + + cv.wait(lock, [this] { return !tasks.empty() || stop; }); + + if (tasks.empty() || stop) { + return; + } + + states[i] = true; + + task = std::move(tasks.front()); + tasks.pop(); + + lock.unlock(); + task(); + + states[i] = false; + } + }); + } +} + +ThreadPool::~ThreadPool() { + std::unique_lock lock(mutex); + stop = true; + lock.unlock(); + + cv.notify_all(); + for (auto& thread : threads) { + thread.join(); + } +} + +void ThreadPool::add(std::function task) { + std::unique_lock lock(mutex); + tasks.emplace(std::move(task)); + cv.notify_one(); + lock.unlock(); +} + +uint32_t ThreadPool::size() { return tasks.size(); } +bool ThreadPool::isWorking() { + for (auto i = 0; i < threads.size(); i++) { + if (states[i]) { + return true; + } + } + return false; +} \ No newline at end of file diff --git a/src/thread_pool.hpp b/src/thread_pool.hpp new file mode 100644 index 0000000..b8a823a --- /dev/null +++ b/src/thread_pool.hpp @@ -0,0 +1,29 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +// This class was inspired by https://www.geeksforgeeks.org/cpp/thread-pool-in-cpp/ (access: 26/09/2025) +// to more efficiently handle threads + +class ThreadPool { +private: + std::vector threads; + std::queue> tasks; + std::mutex mutex; + std::condition_variable cv; + bool stop = false; + bool* states; + +public: + ThreadPool(size_t num_threads); + ~ThreadPool(); + void add(std::function task); + uint32_t size(); + bool isWorking(); +};