I wrote a simple piece of Halide code that computes the squares of the numbers from 0 to n, but on the GPU it takes 22 times longer than on the CPU.
#include"stdafx.h"
#include "Halide.h"
#include <stdio.h>
using namespace Halide;
#include "HalideRuntimeOpenCL.h"
#define GPU_TILE 16
#define COMPUTE_SIZE 1024
Target find_gpu_target();
// Define some Vars to use.
Halide::Var x, y, xo, yo, xi, yi;
// We're going to want to schedule a pipeline in several ways, so we
// define the pipeline in a class so that we can recreate it several
// times with different schedules.
class MyPipeline {
public:
    Halide::Func f;

    MyPipeline() {
        f(x) = x * x;
    }

    // Now we define methods that give our pipeline several different
    // schedules.
    void schedule_for_cpu() {
        // JIT-compile the pipeline for the CPU.
        Target target = get_host_target();
        f.compile_jit(target);
    }

    // Now a schedule that uses CUDA or OpenCL.
    bool schedule_for_gpu() {
        Target target = find_gpu_target();
        if (!target.has_gpu_feature()) {
            return false;
        }

        // Schedule f on the GPU in blocks of GPU_TILE threads.
        f.gpu_tile(x, xo, xi, GPU_TILE);
        f.compile_jit(target);
        return true;
    }
    void test_performance() {
        // Test the performance of the scheduled MyPipeline.
        // Run the filter once to initialize any GPU runtime state.
        Halide::Buffer<int> result = f.realize(COMPUTE_SIZE);

        // Now take the best of 3 timing runs.
        double best_time = 0.0;
        for (int i = 0; i < 3; i++) {
            clock_t t1 = clock();

            // Run the filter 100 times.
            for (int j = 0; j < 100; j++) {
                Halide::Buffer<int> result = f.realize(COMPUTE_SIZE);
                // Force any GPU code to finish by copying the buffer back to the CPU.
                result.copy_to_host();
            }

            clock_t t2 = clock();

            // Total time for the 100 runs, in milliseconds.
            double elapsed = (t2 - t1) * 1000.0 / CLOCKS_PER_SEC;
            if (i == 0 || elapsed < best_time) {
                best_time = elapsed;
            }
        }

        printf("%1.4f milliseconds\n", best_time);
    }
    bool test_correctness() {
        Halide::Buffer<int> result = f.realize(COMPUTE_SIZE);
        for (int i = 0; i < COMPUTE_SIZE; i++) {
            if (result(i) != i * i) {
                return false;
            }
        }
        return true;
    }
};
int main(int argc, char **argv) {
    MyPipeline p1;
    p1.schedule_for_cpu();
    printf("Running pipeline on CPU:\n");
    printf("Test Correctness of cpu scheduler: %d\n", p1.test_correctness());

    MyPipeline p2;
    bool has_gpu_target = p2.schedule_for_gpu();
    printf("Running pipeline on GPU:\n");
    printf("Test Correctness of gpu scheduler: %d\n", p2.test_correctness());

    printf("Testing performance on CPU:\n");
    p1.test_performance();

    if (has_gpu_target) {
        printf("Testing performance on GPU:\n");
        p2.test_performance();
    }

    return 0;
}
Target find_gpu_target() {
    // Start with a target suitable for the machine you're running this on.
    Target target = get_host_target();

    // Request the CUDA feature for the GPU schedule.
    target.set_feature(Target::CUDA);

    // Enable debugging so that you can see what GPU API calls we do.
    // target.set_feature(Halide::Target::Debug);

    return target;
}
Output:
Running pipeline on CPU:
Test Correctness of cpu scheduler: 1
Running pipeline on GPU:
Test Correctness of gpu scheduler: 1
Testing performance on CPU:
1.0000 milliseconds
Testing performance on GPU:
22.0000 milliseconds
I tried running the GPU schedule with the debug flag enabled; the recorded times are listed below.
CUDA: halide_cuda_initialize_kernels: 1.303033e+00 ms
CUDA: halide_cuda_device_malloc: 1.070443e+00 ms
CUDA: halide_cuda_run: 5.184570e+00 ms
CUDA: halide_cuda_buffer_copy: 7.340180e-01 ms
CUDA: halide_cuda_device_free: 1.317381e+00 ms
Edit 1: Is it possible in Halide to initialize the GPU kernel and do the malloc/free only once, and then reuse the compiled kernel for different input data?
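What I have in mind is something along these lines (a rough sketch, not working code; I am assuming that realizing into a pre-allocated output buffer lets Halide keep the device allocation and the compiled kernel alive between runs):

// Compile once, allocate the output buffer once, then reuse both.
MyPipeline p;
p.schedule_for_gpu();                   // JIT-compiles the GPU kernel a single time

Halide::Buffer<int> out(COMPUTE_SIZE);  // allocated once, reused for every run
for (int run = 0; run < 100; run++) {
    p.f.realize(out);                   // re-run the already compiled kernel
}
out.copy_to_host();                     // copy back only when the results are needed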
tools/RunGenMain.cpp
- Alex Reinking, 21.11.2019