many tries with __builtin_prefetch to optimize neon

This commit is contained in:
Vincent Buso 2022-12-22 01:18:28 +01:00
parent 78150ef41d
commit c78ba94025

View File

@ -158,6 +158,12 @@ void fbtft_transpose_neon(uint16_t* src, uint16_t* dst, int w, int h){
for (y=0; y<h; y+=4){ for (y=0; y<h; y+=4){
for (x=0; x<w; x+=4){ for (x=0; x<w; x+=4){
/* 1% CPU gain */
__builtin_prefetch(src + (y+0)*w + x + 4);
__builtin_prefetch(src + (y+1)*w + x + 4);
__builtin_prefetch(src + (y+2)*w + x + 4);
__builtin_prefetch(src + (y+3)*w + x + 4);
/* Neon Load */ /* Neon Load */
v_tmp.val[0] = vld1_u16(src + (y+0)*w + x ); v_tmp.val[0] = vld1_u16(src + (y+0)*w + x );
v_tmp.val[1] = vld1_u16(src + (y+1)*w + x ); v_tmp.val[1] = vld1_u16(src + (y+1)*w + x );
@ -187,6 +193,12 @@ void fbtft_transpose_inv_neon(uint16_t* src, uint16_t* dst, int w, int h){
for (y=0; y<h; y+=4){ for (y=0; y<h; y+=4){
for (x=0; x<w; x+=4){ for (x=0; x<w; x+=4){
/* 1% CPU gain */
__builtin_prefetch(src + (y+0)*w + x + 4);
__builtin_prefetch(src + (y+1)*w + x + 4);
__builtin_prefetch(src + (y+2)*w + x + 4);
__builtin_prefetch(src + (y+3)*w + x + 4);
/* Neon Load */ /* Neon Load */
v_tmp.val[0] = vld1_u16(src + (y+3)*w + x ); v_tmp.val[0] = vld1_u16(src + (y+3)*w + x );
v_tmp.val[1] = vld1_u16(src + (y+2)*w + x ); v_tmp.val[1] = vld1_u16(src + (y+2)*w + x );
@ -216,6 +228,12 @@ void fbtft_rotate_90cw_neon(uint16_t* src, uint16_t* dst, int w, int h){
for (y=0; y<h; y+=4){ for (y=0; y<h; y+=4){
for (x=0; x<w; x+=4){ for (x=0; x<w; x+=4){
/* 1% CPU gain */
__builtin_prefetch(src + (y+0)*w + x + 4);
__builtin_prefetch(src + (y+1)*w + x + 4);
__builtin_prefetch(src + (y+2)*w + x + 4);
__builtin_prefetch(src + (y+3)*w + x + 4);
/* Neon Load */ /* Neon Load */
v_tmp.val[0] = vld1_u16(src + (y+3)*w + x ); v_tmp.val[0] = vld1_u16(src + (y+3)*w + x );
v_tmp.val[1] = vld1_u16(src + (y+2)*w + x ); v_tmp.val[1] = vld1_u16(src + (y+2)*w + x );
@ -259,6 +277,231 @@ void fbtft_rotate_270cw_neon(uint16_t* src, uint16_t* dst, int w, int h){
} }
} }
} }
/*
	NEON optimized matrix rotate 270° CW, with software prefetch of src
	(dimensions multiple of 4, 16bits pixels)

	src is read as h rows of w pixels, dst is written as w rows of h pixels:
		dst[(w-1-c)*h + r] = src[r*w + c]

	PREFETCH_ORDER_X: how many pixels ahead of the current column the four
	                  source rows are prefetched inside the inner loop.
	PREFETCH_ORDER_Y: how many rows ahead the next group of four source rows
	                  is prefetched before each pass of the inner loop.
*/
#define PREFETCH_ORDER_X 32
#define PREFETCH_ORDER_Y 4
void fbtft_rotate_270cw_neon_prefetch(uint16_t* src, uint16_t* dst, int w, int h){
	/* Vars */
	uint16x4x4_t v_tmp;
	int y, x;

	/* Main loop: one group of 4 source rows per iteration */
	for (y = 0; y < h; y += 4) {
		/* Prefetch the start (column 0) of the next group of rows.
		   The previous code indexed with x here, which is uninitialized on
		   the first pass and equal to w on the following ones.
		   Skipped when those rows do not exist. */
		if (y + PREFETCH_ORDER_Y + 3 < h) {
			__builtin_prefetch(src + (y + PREFETCH_ORDER_Y + 0) * w);
			__builtin_prefetch(src + (y + PREFETCH_ORDER_Y + 1) * w);
			__builtin_prefetch(src + (y + PREFETCH_ORDER_Y + 2) * w);
			__builtin_prefetch(src + (y + PREFETCH_ORDER_Y + 3) * w);
		}

		for (x = 0; x < w; x += 4) {
			/* Prefetch src PREFETCH_ORDER_X pixels ahead on each of the 4 rows.
			   Near the end of a row this points into the following row (or past
			   the buffer on the last rows); it is only a hint, nothing is
			   dereferenced by the program. */
			__builtin_prefetch(src + (y + 0) * w + x + PREFETCH_ORDER_X);
			__builtin_prefetch(src + (y + 1) * w + x + PREFETCH_ORDER_X);
			__builtin_prefetch(src + (y + 2) * w + x + PREFETCH_ORDER_X);
			__builtin_prefetch(src + (y + 3) * w + x + PREFETCH_ORDER_X);

			/* Neon Load: 4 pixels from each of the 4 rows (one 4x4 tile) */
			v_tmp.val[0] = vld1_u16(src + (y + 0) * w + x);
			v_tmp.val[1] = vld1_u16(src + (y + 1) * w + x);
			v_tmp.val[2] = vld1_u16(src + (y + 2) * w + x);
			v_tmp.val[3] = vld1_u16(src + (y + 3) * w + x);

			/* Neon store (4 interleaved): lane k holds source column x+k of
			   the 4 rows, it becomes 4 consecutive pixels of dst row (w-1)-(x+k) */
			vst4_lane_u16(dst + ((w - 1) - x - 3) * h + y, v_tmp, 3);
			vst4_lane_u16(dst + ((w - 1) - x - 2) * h + y, v_tmp, 2);
			vst4_lane_u16(dst + ((w - 1) - x - 1) * h + y, v_tmp, 1);
			vst4_lane_u16(dst + ((w - 1) - x - 0) * h + y, v_tmp, 0);
		}
	}
}
/*
	NEON optimized matrix rotate 270° CW, one src prefetch per 16 pixels
	(dimensions multiple of 4, 16bits pixels)

	Same result as the other rotate 270° CW routines:
		dst[(w-1-c)*h + r] = src[r*w + c]
	but the source prefetch is only issued once every 16 columns, and up to
	four 4x4 tiles are processed per prefetch.
*/
#define PREFETCH_ORDER_X 32
#define PREFETCH_ORDER_Y 4
void fbtft_rotate_270cw_neon_prefetch2(uint16_t* src, uint16_t* dst, int w, int h){
	/* Vars */
	uint16x4x4_t v_tmp;
	int y, x, tx;

	/* Main loop: one group of 4 source rows per iteration */
	for (y = 0; y < h; y += 4) {
		/* Prefetch the start (column 0) of the next group of rows.
		   The previous code indexed with x here, which is uninitialized on
		   the first pass. Skipped when those rows do not exist. */
		if (y + PREFETCH_ORDER_Y + 3 < h) {
			__builtin_prefetch(src + (y + PREFETCH_ORDER_Y + 0) * w);
			__builtin_prefetch(src + (y + PREFETCH_ORDER_Y + 1) * w);
			__builtin_prefetch(src + (y + PREFETCH_ORDER_Y + 2) * w);
			__builtin_prefetch(src + (y + PREFETCH_ORDER_Y + 3) * w);
		}

		for (x = 0; x < w; x += 16) {
			/* Prefetch src PREFETCH_ORDER_X pixels ahead (hint only, the
			   address may run past the row or the buffer) */
			__builtin_prefetch(src + (y + 0) * w + x + PREFETCH_ORDER_X);
			__builtin_prefetch(src + (y + 1) * w + x + PREFETCH_ORDER_X);
			__builtin_prefetch(src + (y + 2) * w + x + PREFETCH_ORDER_X);
			__builtin_prefetch(src + (y + 3) * w + x + PREFETCH_ORDER_X);

			/* Up to four 4x4 tiles per prefetch. The "tx < w" bound keeps the
			   last chunk inside the row when w is a multiple of 4 but not
			   of 16. */
			for (tx = x; tx < x + 16 && tx < w; tx += 4) {
				/* Neon Load */
				v_tmp.val[0] = vld1_u16(src + (y + 0) * w + tx);
				v_tmp.val[1] = vld1_u16(src + (y + 1) * w + tx);
				v_tmp.val[2] = vld1_u16(src + (y + 2) * w + tx);
				v_tmp.val[3] = vld1_u16(src + (y + 3) * w + tx);

				/* Neon store (4 interleaved).
				   The tile column offset must be SUBTRACTED from (w-1): the
				   former "(w-1) - x+4 - k" parsed as "(w-1) - x + 4 - k" and
				   wrote to the wrong rows, past the end of dst for small x. */
				vst4_lane_u16(dst + ((w - 1) - tx - 3) * h + y, v_tmp, 3);
				vst4_lane_u16(dst + ((w - 1) - tx - 2) * h + y, v_tmp, 2);
				vst4_lane_u16(dst + ((w - 1) - tx - 1) * h + y, v_tmp, 1);
				vst4_lane_u16(dst + ((w - 1) - tx - 0) * h + y, v_tmp, 0);
			}
		}
	}
}
#define prefetch(x) __builtin_prefetch(x)
/* Distance in bytes between two prefetch hints issued by prefetch_range().
   Can be overridden before this point.
   NOTE(review): presumably meant to match the target cache line size - confirm. */
#ifndef PREFETCH_STRIDE
#define PREFETCH_STRIDE (2*32)
#endif
/*
	Issue a prefetch hint every PREFETCH_STRIDE bytes over [addr, addr+len).

	addr: start of the range
	len:  size of the range in BYTES (not in pixels); 0 issues no prefetch
*/
static inline void prefetch_range(void *addr, size_t len)
{
	/* Byte pointer: arithmetic on void* is a GNU extension, not ISO C */
	char *base = (char *)addr;
	size_t off;

	/* Walk by offset so no pointer beyond the range is ever formed */
	for (off = 0; off < len; off += PREFETCH_STRIDE)
		prefetch(base + off);
}
/*
	NEON optimized matrix rotate 270° CW, src prefetch through prefetch_range()
	(dimensions multiple of 4, 16bits pixels)

	src is read as h rows of w pixels, dst is written as w rows of h pixels:
		dst[(w-1-c)*h + r] = src[r*w + c]

	PREFETCH_ORDER_X: pixels ahead of the current column to prefetch
	PREFETCH_ORDER_Y: rows ahead to prefetch before each pass of the inner loop
	PREFETCH_RANGE:   length in bytes handed to prefetch_range() for each row
*/
/* These names are already defined above with other values: undefine first,
   redefining a macro with a different value is not valid C. */
#undef PREFETCH_ORDER_X
#undef PREFETCH_ORDER_Y
#undef PREFETCH_RANGE
#define PREFETCH_ORDER_X 20
#define PREFETCH_ORDER_Y 4
#define PREFETCH_RANGE 1
void fbtft_rotate_270cw_neon_prefetch3(uint16_t* src, uint16_t* dst, int w, int h){
	/* Vars */
	uint16x4x4_t v_tmp;
	int y, x;

	/* Main loop: one group of 4 source rows per iteration */
	for (y = 0; y < h; y += 4) {
		/* Prefetch the start (column 0) of the next group of rows.
		   The previous code indexed with x here, which is uninitialized on
		   the first pass and equal to w on the following ones.
		   Skipped when those rows do not exist. */
		if (y + PREFETCH_ORDER_Y + 3 < h) {
			prefetch_range(src + (y + PREFETCH_ORDER_Y + 0) * w, PREFETCH_RANGE);
			prefetch_range(src + (y + PREFETCH_ORDER_Y + 1) * w, PREFETCH_RANGE);
			prefetch_range(src + (y + PREFETCH_ORDER_Y + 2) * w, PREFETCH_RANGE);
			prefetch_range(src + (y + PREFETCH_ORDER_Y + 3) * w, PREFETCH_RANGE);
		}

		for (x = 0; x < w; x += 4) {
			/* Prefetch src PREFETCH_ORDER_X pixels ahead on each of the 4 rows
			   (hint only, the address may run past the row or the buffer) */
			prefetch_range(src + (y + 0) * w + x + PREFETCH_ORDER_X, PREFETCH_RANGE);
			prefetch_range(src + (y + 1) * w + x + PREFETCH_ORDER_X, PREFETCH_RANGE);
			prefetch_range(src + (y + 2) * w + x + PREFETCH_ORDER_X, PREFETCH_RANGE);
			prefetch_range(src + (y + 3) * w + x + PREFETCH_ORDER_X, PREFETCH_RANGE);

			/* Neon Load: 4 pixels from each of the 4 rows (one 4x4 tile) */
			v_tmp.val[0] = vld1_u16(&src[(y + 0) * w + x]);
			v_tmp.val[1] = vld1_u16(&src[(y + 1) * w + x]);
			v_tmp.val[2] = vld1_u16(&src[(y + 2) * w + x]);
			v_tmp.val[3] = vld1_u16(&src[(y + 3) * w + x]);

			/* Neon store (4 interleaved): lane k holds source column x+k of
			   the 4 rows, it becomes 4 consecutive pixels of dst row (w-1)-(x+k) */
			vst4_lane_u16(&dst[((w - 1) - x - 3) * h + y], v_tmp, 3);
			vst4_lane_u16(&dst[((w - 1) - x - 2) * h + y], v_tmp, 2);
			vst4_lane_u16(&dst[((w - 1) - x - 1) * h + y], v_tmp, 1);
			vst4_lane_u16(&dst[((w - 1) - x - 0) * h + y], v_tmp, 0);
		}
	}
}
#endif //__ARM_FP #endif //__ARM_FP
@ -891,7 +1134,7 @@ int launch_prod_screen_tests(int argc, char *argv[]){
/****************************************** 3 bis (non squared) ********************************/ /****************************************** 3 bis (non squared) ********************************/
#if 1 #if 0
/* Vars */ /* Vars */
int w = image_rgb_16b->w/2, h = image_rgb_16b->h; int w = image_rgb_16b->w/2, h = image_rgb_16b->h;
SDL_Surface *image_rgb_16b_notsquare = SDL_CreateRGBSurface(SDL_SWSURFACE, w, h, 16, 0,0,0,0); SDL_Surface *image_rgb_16b_notsquare = SDL_CreateRGBSurface(SDL_SWSURFACE, w, h, 16, 0,0,0,0);
@ -1008,6 +1251,7 @@ int h = image_rgb_16b->h, w = image_rgb_16b->w;
uint16_t * p = (uint16_t *)image_rgb_16b->pixels; uint16_t * p = (uint16_t *)image_rgb_16b->pixels;
SDL_Surface *image_rgb_16b_transposed = SDL_CreateRGBSurface(SDL_SWSURFACE, h, w, 16, 0,0,0,0); SDL_Surface *image_rgb_16b_transposed = SDL_CreateRGBSurface(SDL_SWSURFACE, h, w, 16, 0,0,0,0);
uint16_t * p2 = (uint16_t *)image_rgb_16b_transposed->pixels; uint16_t * p2 = (uint16_t *)image_rgb_16b_transposed->pixels;
int i; int i;
uint32_t now = SDL_GetTicks(); uint32_t now = SDL_GetTicks();
@ -1068,6 +1312,65 @@ now = SDL_GetTicks();
/****************************************** 4 bis (optims neon) ********************************/
#if 1
/* Vars */
int h = image_rgb_16b->h, w = image_rgb_16b->w;
uint16_t * p = (uint16_t *)image_rgb_16b->pixels;
SDL_Surface *image_rgb_16b_transposed = SDL_CreateRGBSurface(SDL_SWSURFACE, h, w, 16, 0,0,0,0);
uint16_t * p2 = (uint16_t *)image_rgb_16b_transposed->pixels;
int i;
uint32_t now = SDL_GetTicks();
/* Saved perfs for 10000 iterations: */
/* Rotate square optimized with memcpy: 2381ms
* Rotate square optimized exported: 9847ms
* Translate soft: 8645ms
* Translate soft a la mano 4x4: 11110ms
* Translate neon: 7800ms
*/
#define ITERATIONS 3000
printf("\n");
#ifdef __ARM_FP
/* Rotate 270 with prefetch neon */
for (i=0; i<ITERATIONS; i++) fbtft_rotate_270cw_neon_prefetch2(p, p2, w, h);
printf("Rotate 270 cw neon with prefetch2: %dms\n", SDL_GetTicks()-now);
now = SDL_GetTicks();
/* Rotate 270 neon */
for (i=0; i<ITERATIONS; i++) fbtft_rotate_270cw_neon(p, p2, w, h);
printf("Rotate 270 cw neon: %dms\n", SDL_GetTicks()-now);
now = SDL_GetTicks();
/* Rotate 270 with prefetch neon */
for (i=0; i<ITERATIONS; i++) fbtft_rotate_270cw_neon_prefetch(p, p2, w, h);
printf("Rotate 270 cw neon with prefetch: %dms\n", SDL_GetTicks()-now);
now = SDL_GetTicks();
#endif //__ARM_FP
#endif
/*************************************************************************************/
/****************************************** 4 (perfs tests 320x240) ********************************/ /****************************************** 4 (perfs tests 320x240) ********************************/
#if 0 #if 0