OpenMP within a single machine

2025-12-02 12:55:16 +00:00
parent 73c9e580e4
commit 143e01b2dd
3 changed files with 110 additions and 18 deletions

View File

@@ -1,5 +1,5 @@
 CXX = mpic++
-CXXFLAGS = -std=c++17 -O2 -Wall -Wextra -Wno-cast-function-type
+CXXFLAGS = -std=c++17 -O2 -Wall -Wextra -Wno-cast-function-type -fopenmp
 NVCC = nvcc
 NVCCFLAGS = -O3 -std=c++17 -arch=sm_86 -Xcompiler -fPIC
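
The only functional change here is appending -fopenmp to CXXFLAGS; without it the compiler ignores the #pragma omp directives as unknown pragmas and the omp_* calls in main.cpp do not link. A minimal standalone sketch (not part of the project) to confirm the flag is in effect:

// check_openmp.cpp - build with: mpic++ -std=c++17 -fopenmp check_openmp.cpp
#include <omp.h>
#include <cstdio>

int main() {
#ifdef _OPENMP
    // _OPENMP is only defined when the compiler runs in OpenMP mode
    std::printf("OpenMP %d, max threads = %d\n", _OPENMP, omp_get_max_threads());
#else
    std::printf("OpenMP is disabled\n");
#endif
    return 0;
}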

View File

@@ -2,9 +2,11 @@
 #SBATCH --job-name=btc
 #SBATCH --nodes=4
 #SBATCH --ntasks=4
+#SBATCH --cpus-per-task=2
 #SBATCH --output=out.txt
 
-# mpirun -np $SLURM_NTASKS ./build/bitcoin_app
+# Number of CPU threads per node (must match cpus-per-task)
+export NUM_CPU_THREADS=2
 cd /mnt/shared/supercomputers/build
 mpirun -np $SLURM_NTASKS ./bitcoin_app

View File

@@ -1,7 +1,10 @@
 #include <mpi.h>
+#include <omp.h>
 #include <iostream>
 #include <vector>
 #include <map>
+#include <iomanip>
+#include <cstdlib>
 #include "csv_loader.hpp"
 #include "utils.hpp"
@@ -27,6 +30,26 @@ std::vector<Record> select_records_for_rank(
     return out;
 }
 
+// Split the records into N parts (grouped by day)
+std::vector<std::vector<Record>> split_records(const std::vector<Record>& records, int n_parts) {
+    // Group the records by day
+    std::map<DayIndex, std::vector<Record>> by_day;
+    for (const auto& r : records) {
+        DayIndex day = static_cast<DayIndex>(r.timestamp) / 86400;
+        by_day[day].push_back(r);
+    }
+
+    // Distribute the days across the parts (round-robin)
+    std::vector<std::vector<Record>> parts(n_parts);
+    int i = 0;
+    for (auto& [day, recs] : by_day) {
+        parts[i % n_parts].insert(parts[i % n_parts].end(), recs.begin(), recs.end());
+        i++;
+    }
+    return parts;
+}
+
 int main(int argc, char** argv) {
     MPI_Init(&argc, &argv);
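
split_records groups the incoming records by UNIX day (timestamp / 86400) and then deals whole days out to the n_parts workers round-robin, so a single day's records never end up split across two workers and each worker can aggregate its days independently. A standalone sketch of the same logic with simplified stand-in types (Record and DayIndex here are assumptions for illustration, not the project's real definitions):

// split_sketch.cpp - shows which days land in which part
#include <cstdint>
#include <cstdio>
#include <map>
#include <vector>

struct Record { double timestamp; };   // assumed: seconds since the UNIX epoch
using DayIndex = std::int64_t;         // assumed alias

int main() {
    // Three days' worth of records: day 0 (1 record), day 1 (2), day 2 (2)
    std::vector<Record> records = {{0.0}, {86400.0}, {86401.0}, {172800.0}, {172900.0}};
    int n_parts = 2;

    std::map<DayIndex, std::vector<Record>> by_day;
    for (const auto& r : records)
        by_day[static_cast<DayIndex>(r.timestamp) / 86400].push_back(r);

    std::vector<std::vector<Record>> parts(n_parts);
    int i = 0;
    for (auto& [day, recs] : by_day) {
        parts[i % n_parts].insert(parts[i % n_parts].end(), recs.begin(), recs.end());
        i++;
    }

    // Days 0 and 2 land in part 0, day 1 in part 1 -> "part 0: 3, part 1: 2"
    for (int p = 0; p < n_parts; p++)
        std::printf("part %d: %zu records\n", p, parts[p].size());
    return 0;
}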
@@ -34,6 +57,15 @@ int main(int argc, char** argv) {
     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
     MPI_Comm_size(MPI_COMM_WORLD, &size);
 
+    // Read the CPU thread count from the environment (default: 2)
+    int num_cpu_threads = 2;
+    const char* env_threads = std::getenv("NUM_CPU_THREADS");
+    if (env_threads) {
+        num_cpu_threads = std::atoi(env_threads);
+        if (num_cpu_threads < 1) num_cpu_threads = 1;
+    }
+    omp_set_num_threads(num_cpu_threads + 1); // +1 for the GPU thread, if present
+
     // ====== LOAD GPU FUNCTIONS ======
     auto gpu_is_available = load_gpu_is_available();
     auto gpu_aggregate = load_gpu_aggregate_days();
@@ -41,9 +73,9 @@ int main(int argc, char** argv) {
     bool have_gpu = false;
     if (gpu_is_available && gpu_is_available()) {
         have_gpu = true;
-        std::cout << "Rank " << rank << ": GPU available" << std::endl;
+        std::cout << "Rank " << rank << ": GPU available + " << num_cpu_threads << " CPU threads" << std::endl;
     } else {
-        std::cout << "Rank " << rank << ": GPU not available, using CPU" << std::endl;
+        std::cout << "Rank " << rank << ": " << num_cpu_threads << " CPU threads only" << std::endl;
     }
 
     std::vector<Record> local_records;
@@ -87,26 +119,84 @@ int main(int argc, char** argv) {
std::cout << "Rank " << rank << " received " std::cout << "Rank " << rank << " received "
<< local_records.size() << " records" << std::endl; << local_records.size() << " records" << std::endl;
// ====== АГРЕГАЦИЯ НА КАЖДОМ УЗЛЕ (GPU или CPU) ====== // ====== АГРЕГАЦИЯ НА КАЖДОМ УЗЛЕ ======
std::vector<DayStats> local_stats; std::vector<DayStats> local_stats;
double time_start = omp_get_wtime();
// Время работы: [0] = GPU (если есть), [1..n] = CPU потоки
std::vector<double> worker_times(num_cpu_threads + 1, 0.0);
if (have_gpu && gpu_aggregate) { if (have_gpu && gpu_aggregate) {
bool gpu_success = aggregate_days_gpu(local_records, local_stats, gpu_aggregate); // GPU узел: делим на (1 + num_cpu_threads) частей
if (gpu_success) { int n_workers = 1 + num_cpu_threads;
std::cout << "Rank " << rank << " aggregated " auto parts = split_records(local_records, n_workers);
<< local_stats.size() << " days (GPU)" << std::endl;
std::vector<std::vector<DayStats>> results(n_workers);
std::vector<bool> success(n_workers, true);
#pragma omp parallel
{
int tid = omp_get_thread_num();
if (tid < n_workers) {
double t0 = omp_get_wtime();
if (tid == 0) {
// GPU поток
success[0] = aggregate_days_gpu(parts[0], results[0], gpu_aggregate);
} else { } else {
// Fallback на CPU при ошибке GPU // CPU потоки
std::cout << "Rank " << rank << ": GPU aggregation failed, falling back to CPU" << std::endl; results[tid] = aggregate_days(parts[tid]);
local_stats = aggregate_days(local_records);
std::cout << "Rank " << rank << " aggregated "
<< local_stats.size() << " days (CPU)" << std::endl;
} }
worker_times[tid] = omp_get_wtime() - t0;
}
}
// Объединяем результаты
for (int i = 0; i < n_workers; i++) {
if (i == 0 && !success[0]) {
// GPU failed - обработаем на CPU
std::cout << "Rank " << rank << ": GPU failed, processing on CPU" << std::endl;
double t0 = omp_get_wtime();
results[0] = aggregate_days(parts[0]);
worker_times[0] = omp_get_wtime() - t0;
}
local_stats.insert(local_stats.end(), results[i].begin(), results[i].end());
}
} else { } else {
local_stats = aggregate_days(local_records); // CPU-only узел
std::cout << "Rank " << rank << " aggregated " auto parts = split_records(local_records, num_cpu_threads);
<< local_stats.size() << " days (CPU)" << std::endl; std::vector<std::vector<DayStats>> results(num_cpu_threads);
#pragma omp parallel
{
int tid = omp_get_thread_num();
if (tid < num_cpu_threads) {
double t0 = omp_get_wtime();
results[tid] = aggregate_days(parts[tid]);
worker_times[tid + 1] = omp_get_wtime() - t0; // +1 т.к. [0] для GPU
} }
}
for (int i = 0; i < num_cpu_threads; i++) {
local_stats.insert(local_stats.end(), results[i].begin(), results[i].end());
}
}
double time_total = omp_get_wtime() - time_start;
// Вывод времени
std::cout << std::fixed << std::setprecision(3);
std::cout << "Rank " << rank << " aggregated " << local_stats.size() << " days in "
<< time_total << "s (";
if (have_gpu) {
std::cout << "GPU: " << worker_times[0] << "s, ";
}
for (int i = 0; i < num_cpu_threads; i++) {
int idx = have_gpu ? (i + 1) : (i + 1);
std::cout << "CPU" << i << ": " << worker_times[idx] << "s";
if (i < num_cpu_threads - 1) std::cout << ", ";
}
std::cout << ")" << std::endl;
// ====== СБОР АГРЕГИРОВАННЫХ ДАННЫХ НА RANK 0 ====== // ====== СБОР АГРЕГИРОВАННЫХ ДАННЫХ НА RANK 0 ======
std::vector<DayStats> all_stats; std::vector<DayStats> all_stats;
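
Taken together, the node-level pattern of this commit is one OpenMP team of num_cpu_threads + 1 threads: thread 0 submits its partition to the GPU while the remaining threads aggregate their partitions on the CPU, and the team is one thread larger than the reserved cores on the assumption that the GPU-driving thread does little CPU work itself. A minimal standalone sketch of that dispatch pattern, with the project's aggregate_days / aggregate_days_gpu calls replaced by placeholders:

// worker_pattern.cpp - build with: g++ -std=c++17 -fopenmp worker_pattern.cpp
// do_gpu_work / do_cpu_work are placeholders, not the project's functions.
#include <omp.h>
#include <cstdio>
#include <vector>

static double do_gpu_work(int part) { return part * 1.0; }  // stands in for the GPU path
static double do_cpu_work(int part) { return part * 2.0; }  // stands in for aggregate_days

int main() {
    int num_cpu_threads = 2;
    int n_workers = 1 + num_cpu_threads;        // worker 0 = GPU, 1..n = CPU
    omp_set_num_threads(n_workers);

    std::vector<double> results(n_workers);
    std::vector<double> worker_times(n_workers, 0.0);

    #pragma omp parallel
    {
        int tid = omp_get_thread_num();
        if (tid < n_workers) {                  // guard in case more threads were spawned
            double t0 = omp_get_wtime();
            results[tid] = (tid == 0) ? do_gpu_work(tid) : do_cpu_work(tid);
            worker_times[tid] = omp_get_wtime() - t0;
        }
    }

    for (int i = 0; i < n_workers; i++)
        std::printf("worker %d: result=%.1f, time=%.6fs\n", i, results[i], worker_times[i]);
    return 0;
}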