#include <cuda.h>
|
#include <cstdio>
|
#include "utils.hpp"
|
|
__global__ void helloKernel()
|
{
|
printf("hello world at index:%d\n", threadIdx.x);
|
}
|
|
void printHello()
|
{
|
helloKernel<<<1, 4>>>();
|
cudaDeviceSynchronize(); // 没有这一行将会无法打印出结果,原因待查
|
}
|
|
typedef unsigned char uint8_t;
|
|
__device__ void affine_project(float* matrix, int x, int y, float* proj_x, float* proj_y) {
|
// m0, m1, m2
|
// m3, m4, m5
|
*proj_x = matrix[0] * x + matrix[1] * y + matrix[2];
|
*proj_y = matrix[3] * x + matrix[4] * y + matrix[5];
|
}
|
|
__global__ void warp_affine_bilinear_kernel(
|
uint8_t* src, int src_line_size, int src_width, int src_height,
|
float* dst, int dst_width, int dst_height, uint8_t fill_value, AffineMatrix matrix
|
)
|
{
|
int dx = blockDim.x * blockIdx.x + threadIdx.x;
|
int dy = blockDim.y * blockIdx.y + threadIdx.y;
|
if (dx >= dst_width || dy >= dst_height) return;
|
|
float c0 = fill_value, c1 = fill_value, c2 = fill_value;
|
float src_x = 0; float src_y = 0;
|
affine_project(matrix.d2i, dx, dy, &src_x, &src_y);
|
|
if (src_x < -1 || src_x >= src_width || src_y < -1 || src_y >= src_height) {
|
//src_x<-1ʱ�����λhigh_x<0��������Χ
|
//src_x>=-1ʱ�����λhigh_x>=0������ȡֵ
|
}
|
else {
|
int x_low = std::floor(src_x);
|
int y_low = std::floor(src_y);
|
int x_high = x_low + 1;
|
int y_high = y_low + 1;
|
|
uint8_t const_values[] = { fill_value,fill_value,fill_value };
|
float lx = src_x - x_low;
|
float ly = src_y - y_low;
|
float hx = 1 - lx;
|
float hy = 1 - ly;
|
float w1 = hx * hy, w2 = lx * hy, w3 = hx * ly, w4 = lx * ly;
|
uint8_t* v1 = const_values;
|
uint8_t* v2 = const_values;
|
uint8_t* v3 = const_values;
|
uint8_t* v4 = const_values;
|
if (y_low >= 0) {
|
if (x_low >= 0)
|
v1 = src + y_low * src_line_size + x_low * 3;
|
|
if (x_high < src_width)
|
v2 = src + y_low * src_line_size + x_high * 3;
|
}
|
|
if (y_high < src_height) {
|
if (x_low >= 0)
|
v3 = src + y_high * src_line_size + x_low * 3;
|
|
if (x_high < src_width)
|
v4 = src + y_high * src_line_size + x_high * 3;
|
}
|
|
c0 = std::floor(w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0] + 0.5f);
|
c1 = std::floor(w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1] + 0.5f);
|
c2 = std::floor(w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2] + 0.5f);
|
|
}
|
|
//uint8_t* pdst = dst + dy * dst_line_size + dx * 3;
|
//pdst[0] = c0; pdst[1] = c1; pdst[2] = c2;
|
|
float* pdst = dst + dy * dst_width + dx;
|
pdst[0] = c2 / 255.0f;
|
pdst[dst_width * dst_height] = c1 / 255.0f;
|
pdst[2 * dst_width * dst_height] = c0 / 255.0f;
|
|
}
|
|
void warp_affine_bilinear(uint8_t* src, int src_line_size, int src_width, int src_height,
|
float* dst, int dst_width, int dst_height, uint8_t fill_value, AffineMatrix matrix, cudaStream_t stream)
|
{
|
dim3 block_size(32, 32);//blocksize���1024
|
dim3 grid_size((dst_width + 31) / 32, (dst_height + 31) / 32);
|
|
warp_affine_bilinear_kernel <<<grid_size, block_size, 0, stream >>> (
|
src, src_line_size, src_width, src_height,
|
dst, dst_width, dst_height,
|
fill_value, matrix);
|
}
|