From d41292e7d8a4d3ef54fd7521c45ea80e5d777c62 Mon Sep 17 00:00:00 2001 From: root Date: Sat, 16 Oct 2021 12:13:03 +0200 Subject: [PATCH] sms, vdp rendering optimisation core, fix rendering copy buffer overlap reported from asan --- pico/draw.c | 9 ++- pico/draw_arm.S | 34 +++++++++ pico/mode4.c | 187 ++++++++++++++++-------------------------------- 3 files changed, 102 insertions(+), 128 deletions(-) diff --git a/pico/draw.c b/pico/draw.c index 2b349228..236e347e 100644 --- a/pico/draw.c +++ b/pico/draw.c @@ -132,8 +132,12 @@ void blockcpy_or(void *dst, void *src, size_t n, int pat); void blockcpy_or(void *dst, void *src, size_t n, int pat) { unsigned char *pd = dst, *ps = src; - for (; n; n--) - *pd++ = (unsigned char) (*ps++ | pat); + if (dst > src) { + for (pd += n, ps += n; n; n--) + *--pd = (unsigned char) (*--ps | pat); + } else + for (; n; n--) + *pd++ = (unsigned char) (*ps++ | pat); } #define blockcpy memmove #endif @@ -2019,7 +2023,6 @@ void PicoDrawSetOutFormat(pdso_t which, int use_32x_line_mode) { case PDF_8BIT: FinalizeLine = FinalizeLine8bit; - PicoDrawSetInternalBuf(Pico.est.Draw2FB, 328); break; case PDF_RGB555: diff --git a/pico/draw_arm.S b/pico/draw_arm.S index 6a3641bd..a9915635 100644 --- a/pico/draw_arm.S +++ b/pico/draw_arm.S @@ -2205,6 +2205,9 @@ FinalizeLine555: blockcpy: stmfd sp!, {r4,r5} + cmp r0, r1 + bhs blockcpyhi + mov r2, r2, lsr #4 blockcpy_loop: ldmia r1!, {r3-r5,r12} @@ -2214,6 +2217,18 @@ blockcpy_loop: ldmfd sp!, {r4,r5} bx lr +blockcpyhi: + add r0, r0, r2 + add r1, r1, r2 + mov r2, r2, lsr #4 +blockcpyhi_loop: + ldmdb r1!, {r3-r5,r12} + subs r2, r2, #1 + stmdb r0!, {r3-r5,r12} + bne blockcpyhi_loop + ldmfd sp!, {r4,r5} + bx lr + .global blockcpy_or @ void *dst, void *src, size_t n, int pat @@ -2221,6 +2236,9 @@ blockcpy_or: stmfd sp!, {r4-r6} orr r3, r3, r3, lsl #8 orr r3, r3, r3, lsl #16 + cmp r0, r1 + bhs blockcpyhi_or + mov r2, r2, lsr #4 blockcpy_loop_or: ldmia r1!, {r4-r6,r12} @@ -2234,4 +2252,20 @@ blockcpy_loop_or: ldmfd sp!, {r4-r6} bx lr +blockcpyhi_or: + add r0, r0, r2 + add r1, r1, r2 + mov r2, r2, lsr #4 +blockcpyhi_loop_or: + ldmdb r1!, {r4-r6,r12} + subs r2, r2, #1 + orr r4, r4, r3 + orr r5, r5, r3 + orr r6, r6, r3 + orr r12,r12,r3 + stmdb r0!, {r4-r6,r12} + bne blockcpyhi_loop_or + ldmfd sp!, {r4-r6} + bx lr + @ vim:filetype=armasm diff --git a/pico/mode4.c b/pico/mode4.c index 6afe1e4a..0525b935 100644 --- a/pico/mode4.c +++ b/pico/mode4.c @@ -26,104 +26,90 @@ static int screen_offset, line_offset; static void TileBGM4(u16 sx, int pal) { u32 *pd = (u32 *)(Pico.est.HighCol + sx); - pd[0] = pd[1] = pal ? 0x10101010 : 0; + pd[0] = pd[1] = pal * 0x01010101; } // 8 pixels are arranged to have 1 bit in each byte of a 32 bit word. To pull // the 4 bitplanes together multiply with each bit distance (multiples of 1<<7) -#define PLANAR_PIXELL(x,p) \ +#define PLANAR_PIXELBG(x,p) \ t = (pack>>(7-p)) & 0x01010101; \ t = (t*0x10204080) >> 28; \ pd[x] = pal|t; -static void TileNormLowM4(u16 sx, unsigned int pack, int pal) +static void TileNormBGM4(u16 sx, unsigned int pack, int pal) { u8 *pd = Pico.est.HighCol + sx; u32 t; - PLANAR_PIXELL(0, 0) - PLANAR_PIXELL(1, 1) - PLANAR_PIXELL(2, 2) - PLANAR_PIXELL(3, 3) - PLANAR_PIXELL(4, 4) - PLANAR_PIXELL(5, 5) - PLANAR_PIXELL(6, 6) - PLANAR_PIXELL(7, 7) + PLANAR_PIXELBG(0, 0) + PLANAR_PIXELBG(1, 1) + PLANAR_PIXELBG(2, 2) + PLANAR_PIXELBG(3, 3) + PLANAR_PIXELBG(4, 4) + PLANAR_PIXELBG(5, 5) + PLANAR_PIXELBG(6, 6) + PLANAR_PIXELBG(7, 7) } -static void TileFlipLowM4(u16 sx, unsigned int pack, int pal) +static void TileFlipBGM4(u16 sx, unsigned int pack, int pal) { u8 *pd = Pico.est.HighCol + sx; u32 t; - PLANAR_PIXELL(0, 7) - PLANAR_PIXELL(1, 6) - PLANAR_PIXELL(2, 5) - PLANAR_PIXELL(3, 4) - PLANAR_PIXELL(4, 3) - PLANAR_PIXELL(5, 2) - PLANAR_PIXELL(6, 1) - PLANAR_PIXELL(7, 0) + PLANAR_PIXELBG(0, 7) + PLANAR_PIXELBG(1, 6) + PLANAR_PIXELBG(2, 5) + PLANAR_PIXELBG(3, 4) + PLANAR_PIXELBG(4, 3) + PLANAR_PIXELBG(5, 2) + PLANAR_PIXELBG(6, 1) + PLANAR_PIXELBG(7, 0) } -#define PLANAR_PIXEL(x,p) \ +// non-transparent sprite pixels apply if no higher prio pixel is already there +#define PLANAR_PIXELSP(x,p) \ t = (pack>>(7-p)) & 0x01010101; \ - if (t) { \ + if (t && (pd[x] & 0x2f) <= 0x20) { \ t = (t*0x10204080) >> 28; \ pd[x] = pal|t; \ } -static void TileNormM4(u16 sx, unsigned int pack, int pal) +static void TileNormSprM4(u16 sx, unsigned int pack, int pal) { u8 *pd = Pico.est.HighCol + sx; u32 t; - PLANAR_PIXEL(0, 0) - PLANAR_PIXEL(1, 1) - PLANAR_PIXEL(2, 2) - PLANAR_PIXEL(3, 3) - PLANAR_PIXEL(4, 4) - PLANAR_PIXEL(5, 5) - PLANAR_PIXEL(6, 6) - PLANAR_PIXEL(7, 7) + PLANAR_PIXELSP(0, 0) + PLANAR_PIXELSP(1, 1) + PLANAR_PIXELSP(2, 2) + PLANAR_PIXELSP(3, 3) + PLANAR_PIXELSP(4, 4) + PLANAR_PIXELSP(5, 5) + PLANAR_PIXELSP(6, 6) + PLANAR_PIXELSP(7, 7) } -static void TileFlipM4(u16 sx, unsigned int pack, int pal) +static void TileDoubleSprM4(int sx, unsigned int pack, int pal) { u8 *pd = Pico.est.HighCol + sx; u32 t; - PLANAR_PIXEL(0, 7) - PLANAR_PIXEL(1, 6) - PLANAR_PIXEL(2, 5) - PLANAR_PIXEL(3, 4) - PLANAR_PIXEL(4, 3) - PLANAR_PIXEL(5, 2) - PLANAR_PIXEL(6, 1) - PLANAR_PIXEL(7, 0) -} - -static void TileDoubleM4(int sx, unsigned int pack, int pal) -{ - u8 *pd = Pico.est.HighCol + sx; - u32 t; - - PLANAR_PIXEL(0, 0) - PLANAR_PIXEL(1, 0) - PLANAR_PIXEL(2, 1) - PLANAR_PIXEL(3, 1) - PLANAR_PIXEL(4, 2) - PLANAR_PIXEL(5, 2) - PLANAR_PIXEL(6, 3) - PLANAR_PIXEL(7, 3) - PLANAR_PIXEL(8, 4) - PLANAR_PIXEL(9, 4) - PLANAR_PIXEL(10, 5) - PLANAR_PIXEL(11, 5) - PLANAR_PIXEL(12, 6) - PLANAR_PIXEL(13, 6) - PLANAR_PIXEL(14, 7) - PLANAR_PIXEL(15, 7) + PLANAR_PIXELSP(0, 0) + PLANAR_PIXELSP(1, 0) + PLANAR_PIXELSP(2, 1) + PLANAR_PIXELSP(3, 1) + PLANAR_PIXELSP(4, 2) + PLANAR_PIXELSP(5, 2) + PLANAR_PIXELSP(6, 3) + PLANAR_PIXELSP(7, 3) + PLANAR_PIXELSP(8, 4) + PLANAR_PIXELSP(9, 4) + PLANAR_PIXELSP(10, 5) + PLANAR_PIXELSP(11, 5) + PLANAR_PIXELSP(12, 6) + PLANAR_PIXELSP(13, 6) + PLANAR_PIXELSP(14, 7) + PLANAR_PIXELSP(15, 7) } static void DrawSpritesM4(int scanline) @@ -181,13 +167,13 @@ static void DrawSpritesM4(int scanline) // now draw all sprites backwards for (--s; s >= 0; s--) { pack = CPU_LE2(*(u32 *)(PicoMem.vram + sprites_addr[s])); - if (zoomed) TileDoubleM4(sprites_x[s], pack, 0x10); - else TileNormM4(sprites_x[s], pack, 0x10); + if (zoomed) TileDoubleSprM4(sprites_x[s], pack, 0x10); + else TileNormSprM4(sprites_x[s], pack, 0x10); } } // cells_dx, tilex_ty merged to reduce register pressure -static void DrawStripLowM4(const u16 *nametab, int cells_dx, int tilex_ty) +static void DrawStripM4(const u16 *nametab, int cells_dx, int tilex_ty) { int oldcode = -1; int addr = 0, pal = 0; @@ -208,51 +194,13 @@ static void DrawStripLowM4(const u16 *nametab, int cells_dx, int tilex_ty) if (code & 0x0400) addr ^= 0xe; // Y-flip - pal = (code>>7) & 0x10; + pal = (code>>7) & 0x30; // prio | palette select } pack = CPU_LE2(*(u32 *)(PicoMem.vram + addr)); /* Get 4 bitplanes / 8 pixels */ if (pack == 0) TileBGM4(cells_dx, pal); - else if (code & 0x0200) TileFlipLowM4(cells_dx, pack, pal); - else TileNormLowM4(cells_dx, pack, pal); - } -} - -static void DrawStripHighM4(const u16 *nametab, int cells_dx, int tilex_ty) -{ - int oldcode = -1, blank = -1; // The tile we know is blank - int addr = 0, pal = 0; - - // Draw tiles across screen: - for (; cells_dx > 0; cells_dx += 8, tilex_ty++, cells_dx -= 0x10000) - { - unsigned int pack; - unsigned code; - - code = nametab[tilex_ty& 0x1f]; - if (code == blank) - continue; - if (!(code & 0x1000)) // priority low? - continue; - - if (code != oldcode) { - oldcode = code; - // Get tile address/2: - addr = (code & 0x1ff) << 4; - addr += tilex_ty>> 16; - if (code & 0x0400) - addr ^= 0xe; // Y-flip - - pal = (code>>7) & 0x10; - } - - pack = CPU_LE2(*(u32 *)(PicoMem.vram + addr)); /* Get 4 bitplanes / 8 pixels */ - if (pack == 0) { - blank = code; - continue; - } - if (code & 0x0200) TileFlipM4(cells_dx, pack, pal); - else TileNormM4(cells_dx, pack, pal); + else if (code & 0x0200) TileFlipBGM4(cells_dx, pack, pal); + else TileNormBGM4(cells_dx, pack, pal); } } @@ -296,37 +244,25 @@ static void DrawDisplayM4(int scanline) dx += cellskip << 3; dx += line_offset; - // low priority tiles + // tiles if (!(pv->debug_p & PVD_KILL_B)) { if ((Pico.m.hardware & 0x3) == 0x3) { // on GG render only the center 160 px - DrawStripLowM4(nametab , dx | ((cells-12)<< 16),(tilex+6) | (ty << 16)); + DrawStripM4(nametab , dx | ((cells-12)<< 16),(tilex+6) | (ty << 16)); } else if (pv->reg[0] & 0x80) { // vscroll disabled for rightmost 8 columns (e.g. Gauntlet) int dx2 = dx + (cells-8)*8, tilex2 = tilex + (cells-8), ty2 = scanline&7; - DrawStripLowM4(nametab, dx | ((cells-8) << 16), tilex | (ty << 16)); - DrawStripLowM4(nametab2, dx2 | (8 << 16), tilex2 | (ty2 << 17)); + DrawStripM4(nametab, dx | ((cells-8) << 16), tilex | (ty << 16)); + DrawStripM4(nametab2, dx2 | (8 << 16), tilex2 | (ty2 << 17)); } else - DrawStripLowM4(nametab , dx | ( cells << 16), tilex | (ty << 16)); + DrawStripM4(nametab , dx | ( cells << 16), tilex | (ty << 16)); } // sprites if (!(pv->debug_p & PVD_KILL_S_LO)) DrawSpritesM4(scanline); - // high priority tiles (use virtual layer switch just for fun) - if (!(pv->debug_p & PVD_KILL_A)) { - if ((Pico.m.hardware & 0x3) == 0x3) { - DrawStripHighM4(nametab , dx | ((cells-12)<< 16),(tilex+6) | (ty << 16)); - } else if (pv->reg[0] & 0x80) { - int dx2 = dx + (cells-8)*8, tilex2 = tilex + (cells-8), ty2 = scanline&7; - DrawStripHighM4(nametab, dx | ((cells-8) << 16), tilex | (ty << 16)); - DrawStripHighM4(nametab2, dx2 | (8 << 16), tilex2 | (ty2 << 17)); - } else - DrawStripHighM4(nametab , dx | ( cells << 16), tilex | (ty << 16)); - } - - if ((pv->reg[0] & 0x20) && (Pico.m.hardware & 0x3) != 3) { + if ((pv->reg[0] & 0x20) && (Pico.m.hardware & 0x3) != 0x3) { // first column masked with background, caculate offset to start of line dx = (dx&~0x1f) / 4; ty = 0xe0e0e0e0; // really (pv->reg[7]&0x3f) * 0x01010101, but the looks... @@ -646,6 +582,7 @@ void PicoDoHighPal555SMS(void) #endif *dpal = t; } + memcpy(&Pico.est.HighPal[0x20], Pico.est.HighPal, 0x20*2); // for prio bit Pico.est.HighPal[0xe0] = 0; }