diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index a6534f5..92a62e2 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -24,6 +24,8 @@ jobs:
         sudo rm -rf /var/lib/apt/lists/*
         sudo sed -i 's/# \(en_US.UTF-8\)/\1/' /etc/locale.gen
         sudo locale-gen --purge --lang en_US.UTF-8
+    - name: Build SDK
+      run: make sdk
     - name: Build FunKey-OS
       run: make
     - uses: actions/upload-artifact@v2
diff --git a/.gitmodules b/.gitmodules
index 4f2c72a..4bf7ba1 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,3 @@
 [submodule "buildroot"]
 	path = buildroot
-	url = git://git.buildroot.net/buildroot
+	url = https://github.com/FunKey-Project/buildroot.git
diff --git a/FunKey/Config.in b/FunKey/Config.in
index 61b75af..8b1004a 100644
--- a/FunKey/Config.in
+++ b/FunKey/Config.in
@@ -9,3 +9,11 @@ source "$BR2_EXTERNAL_FUNKEY_PATH/package/PocketSNES/Config.in"
 source "$BR2_EXTERNAL_FUNKEY_PATH/package/gpsp/Config.in"
 source "$BR2_EXTERNAL_FUNKEY_PATH/package/dmtx-utils/Config.in"
 source "$BR2_EXTERNAL_FUNKEY_PATH/package/ProdScreens/Config.in"
+source "$BR2_EXTERNAL_FUNKEY_PATH/package/fonts-droid/Config.in"
+source "$BR2_EXTERNAL_FUNKEY_PATH/package/libini/Config.in"
+source "$BR2_EXTERNAL_FUNKEY_PATH/package/libopk/Config.in"
+source "$BR2_EXTERNAL_FUNKEY_PATH/package/libxdgmime/Config.in"
+source "$BR2_EXTERNAL_FUNKEY_PATH/package/gmenu2x/Config.in"
+source "$BR2_EXTERNAL_FUNKEY_PATH/package/agg/Config.in"
+source "$BR2_EXTERNAL_FUNKEY_PATH/package/fluidlite/Config.in"
+source "$BR2_EXTERNAL_FUNKEY_PATH/package/libmikmod/Config.in"
diff --git a/FunKey/board/funkey/busybox.config b/FunKey/board/funkey/busybox.config
index 35e3f35..70b5598 100644
--- a/FunKey/board/funkey/busybox.config
+++ b/FunKey/board/funkey/busybox.config
@@ -1,7 +1,7 @@
 #
 # Automatically generated make config: don't edit
-# Busybox version: 1.31.1
-# Mon Nov  9 00:34:05 2020
+# Busybox version: 1.32.0
+# Fri Jan 15 10:58:20 2021
 #
 CONFIG_HAVE_DOT_CONFIG=y
 
@@ -76,6 +76,7 @@ CONFIG_PREFIX="./_install"
 # CONFIG_DEBUG_SANITIZE is not set
 # CONFIG_UNIT_TEST is not set
 # CONFIG_WERROR is not set
+# CONFIG_WARN_SIMPLE_MSG is not set
 CONFIG_NO_DEBUG_LIB=y
 # CONFIG_DMALLOC is not set
 # CONFIG_EFENCE is not set
@@ -462,6 +463,7 @@ CONFIG_FEATURE_FIND_SIZE=y
 CONFIG_FEATURE_FIND_PRUNE=y
 CONFIG_FEATURE_FIND_QUIT=y
 # CONFIG_FEATURE_FIND_DELETE is not set
+CONFIG_FEATURE_FIND_EMPTY=y
 CONFIG_FEATURE_FIND_PATH=y
 CONFIG_FEATURE_FIND_REGEX=y
 # CONFIG_FEATURE_FIND_CONTEXT is not set
@@ -692,6 +694,7 @@ CONFIG_FEATURE_SETPRIV_CAPABILITY_NAMES=y
 CONFIG_SWITCH_ROOT=y
 # CONFIG_TASKSET is not set
 # CONFIG_FEATURE_TASKSET_FANCY is not set
+# CONFIG_FEATURE_TASKSET_CPULIST is not set
 CONFIG_UEVENT=y
 CONFIG_UMOUNT=y
 CONFIG_FEATURE_UMOUNT_ALL=y
@@ -805,6 +808,7 @@ CONFIG_MAKEDEVS=y
 CONFIG_FEATURE_MAKEDEVS_TABLE=y
 # CONFIG_MAN is not set
 CONFIG_MICROCOM=y
+CONFIG_MIM=y
 CONFIG_MT=y
 # CONFIG_NANDWRITE is not set
 # CONFIG_NANDDUMP is not set
@@ -1095,6 +1099,7 @@ CONFIG_SH_IS_ASH=y
 # CONFIG_BASH_IS_ASH is not set
 # CONFIG_BASH_IS_HUSH is not set
 CONFIG_BASH_IS_NONE=y
+CONFIG_SHELL_ASH=y
 CONFIG_ASH=y
 CONFIG_ASH_OPTIMIZE_FOR_SIZE=y
 CONFIG_ASH_INTERNAL_GLOB=y
@@ -1115,6 +1120,7 @@ CONFIG_ASH_GETOPTS=y
 CONFIG_ASH_CMDCMD=y
 # CONFIG_CTTYHACK is not set
 # CONFIG_HUSH is not set
+# CONFIG_SHELL_HUSH is not set
 # CONFIG_HUSH_BASH_COMPAT is not set
 # CONFIG_HUSH_BRACE_EXPANSION is not set
 # CONFIG_HUSH_LINENO_VAR is not set
@@ -1177,6 +1183,7 @@ CONFIG_FEATURE_ROTATE_LOGFILE=y
 CONFIG_FEATURE_REMOTE_LOG=y
 # CONFIG_FEATURE_SYSLOGD_DUP is not set
 # CONFIG_FEATURE_SYSLOGD_CFG is not set
+# CONFIG_FEATURE_SYSLOGD_PRECISE_TIMESTAMPS is not set
 CONFIG_FEATURE_SYSLOGD_READ_BUFFER_SIZE=256
 # CONFIG_FEATURE_IPC_SYSLOG is not set
 CONFIG_FEATURE_IPC_SYSLOG_BUFFER_SIZE=0
diff --git a/FunKey/board/funkey/linux.config b/FunKey/board/funkey/linux.config
index 8ff92a7..5cb4c06 100644
--- a/FunKey/board/funkey/linux.config
+++ b/FunKey/board/funkey/linux.config
@@ -25,6 +25,7 @@ CONFIG_CPU_FREQ=y
 CONFIG_CPUFREQ_DT=y
 CONFIG_VFP=y
 CONFIG_NEON=y
+CONFIG_KERNEL_MODE_NEON=y
 # CONFIG_COREDUMP is not set
 CONFIG_HIBERNATION=y
 CONFIG_PM_STD_PARTITION="/dev/mmcblk0p2"
@@ -152,7 +153,7 @@ CONFIG_AUTOFS4_FS=y
 CONFIG_VFAT_FS=y
 CONFIG_TMPFS=y
 CONFIG_CONFIGFS_FS=y
-# CONFIG_MISC_FILESYSTEMS is not set
+CONFIG_SQUASHFS=y
 # CONFIG_NETWORK_FILESYSTEMS is not set
 CONFIG_NLS_CODEPAGE_437=y
 CONFIG_NLS_ISO8859_1=y
diff --git a/FunKey/board/funkey/patches/pixman/0002-aarch64NEON_asm.patch b/FunKey/board/funkey/patches/pixman/0002-aarch64NEON_asm.patch
new file mode 100644
index 0000000..1f27440
--- /dev/null
+++ b/FunKey/board/funkey/patches/pixman/0002-aarch64NEON_asm.patch
@@ -0,0 +1,6526 @@
+From 7b128ae8c56b1055a93573004148a98465d79857 Mon Sep 17 00:00:00 2001
+From: Mizuki Asakura <ed6e117f@gmail.com>
+Date: Sun, 17 Apr 2016 20:16:12 +0900
+Subject: [PATCH] [mod] added aarch64 bilinear implementations (ver.4.1)
+
+Since aarch64 has different neon syntax from aarch32 and has no
+support for (older) arm-simd,
+there are no SIMD accelerations for pixman on aarch64.
+
+We need new implementations.
+
+This patch also contains Ben Avison's series of patches for aarch32
+and now the benchmark results are fine to aarch64.
+
+Please find the result at the below ticket.
+
+Added: https://bugs.freedesktop.org/show_bug.cgi?id=94758
+Signed-off-by: Mizuki Asakura <ed6e117f@gmail.com>
+---
+ configure.ac                             |   36 +-
+ pixman/Makefile.am                       |   17 +-
+ pixman/pixman-arm-neon.c                 |   23 +-
+ pixman/pixman-arm.c                      |    6 +
+ pixman/pixman-arma64-neon-asm-bilinear.S | 1275 ++++++++++
+ pixman/pixman-arma64-neon-asm.S          | 3704 ++++++++++++++++++++++++++++++
+ pixman/pixman-arma64-neon-asm.h          | 1310 +++++++++++
+ pixman/pixman-private.h                  |    7 +-
+ 8 files changed, 6374 insertions(+), 4 deletions(-)
+ create mode 100644 pixman/pixman-arma64-neon-asm-bilinear.S
+ create mode 100644 pixman/pixman-arma64-neon-asm.S
+ create mode 100644 pixman/pixman-arma64-neon-asm.h
+
+diff --git a/configure.ac b/configure.ac
+index 6b2134e..26203a8
+--- a/configure.ac
++++ b/configure.ac
+@@ -667,6 +667,40 @@ if test $enable_arm_neon = yes && test $have_arm_neon = no ; then
+    AC_MSG_ERROR([ARM NEON intrinsics not detected])
+ fi
+ 
++dnl ==========================================================================
++dnl Check if assembler is gas compatible and supports ARM-a64 NEON instructions
++have_arm_a64_neon=no
++AC_MSG_CHECKING(whether to use ARM A64 NEON assembler)
++xserver_save_CFLAGS=$CFLAGS
++CFLAGS="-x assembler-with-cpp $CFLAGS"
++AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
++.text
++.arch armv8-a
++.altmacro
++prfm pldl2strm, [x0]
++xtn v0.8b, v0.8h]])], have_arm_a64_neon=yes)
++CFLAGS=$xserver_save_CFLAGS
++
++AC_ARG_ENABLE(arm-a64-neon,
++   [AC_HELP_STRING([--disable-arm-a64-neon],
++                   [disable ARM A64 NEON fast paths])],
++   [enable_arm_a64_neon=$enableval], [enable_arm_a64_neon=auto])
++
++if test $enable_arm_a64_neon = no ; then
++   have_arm_a64_neon=disabled
++fi
++
++if test $have_arm_a64_neon = yes ; then
++   AC_DEFINE(USE_ARM_A64_NEON, 1, [use ARM A64_NEON assembly optimizations])
++fi
++
++AM_CONDITIONAL(USE_ARM_A64_NEON, test $have_arm_a64_neon = yes)
++
++AC_MSG_RESULT($have_arm_a64_neon)
++if test $enable_arm_a64_neon = yes && test $have_arm_a64_neon = no ; then # explicitly requested but the assembler check above failed
++   AC_MSG_ERROR([ARM A64 NEON intrinsics not detected])
++fi
++
+ dnl ===========================================================================
+ dnl Check for IWMMXT
+ 
+diff --git a/pixman/Makefile.am b/pixman/Makefile.am
+index 581b6f6..f1afa27
+--- a/pixman/Makefile.am
++++ b/pixman/Makefile.am
+@@ -94,6 +94,21 @@ libpixman_1_la_LIBADD += libpixman-arm-neon.la
+ ASM_CFLAGS_arm_neon=
+ endif
+ 
++# arm a64 neon code
++if USE_ARM_A64_NEON
++noinst_LTLIBRARIES += libpixman-arma64-neon.la
++libpixman_arma64_neon_la_SOURCES = \
++        pixman-arm-neon.c        \
++        pixman-arm-common.h      \
++        pixman-arma64-neon-asm.S \
++        pixman-arma64-neon-asm-bilinear.S \
++        pixman-arm-asm.h         \
++        pixman-arma64-neon-asm.h
++libpixman_1_la_LIBADD += libpixman-arma64-neon.la
++
++ASM_CFLAGS_arm_neon=
++endif
++
+ # iwmmxt code
+ if USE_ARM_IWMMXT
+ libpixman_iwmmxt_la_SOURCES = pixman-mmx.c
+diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
+index be761c9..62c9442 100644
+--- a/pixman/pixman-arm-neon.c
++++ b/pixman/pixman-arm-neon.c
+@@ -194,7 +194,7 @@ arm_neon_fill (pixman_implementation_t *imp,
+ 	       uint32_t                 _xor)
+ {
+     /* stride is always multiple of 32bit units in pixman */
+-    uint32_t byte_stride = stride * sizeof(uint32_t);
++    int32_t byte_stride = stride * sizeof(uint32_t);
+ 
+     switch (bpp)
+     {
+@@ -331,6 +331,7 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
+     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8,       b5g6r5,   neon_composite_over_8888_8_0565),
+     PIXMAN_STD_FAST_PATH (OVER, r5g6b5,   a8,       r5g6b5,   neon_composite_over_0565_8_0565),
+     PIXMAN_STD_FAST_PATH (OVER, b5g6r5,   a8,       b5g6r5,   neon_composite_over_0565_8_0565),
++    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, x8r8g8b8, neon_composite_over_8888_8888_8888),
+     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, neon_composite_over_8888_8888_8888),
+     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null,     r5g6b5,   neon_composite_over_8888_0565),
+     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null,     b5g6r5,   neon_composite_over_8888_0565),
+@@ -341,17 +342,33 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
+     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null,     a8r8g8b8, neon_composite_src_x888_8888),
+     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null,     a8b8g8r8, neon_composite_src_x888_8888),
+     PIXMAN_STD_FAST_PATH (ADD,  solid,    a8,       a8,       neon_composite_add_n_8_8),
++    PIXMAN_STD_FAST_PATH (ADD,  solid,    a8,       x8r8g8b8, neon_composite_add_n_8_8888),
+     PIXMAN_STD_FAST_PATH (ADD,  solid,    a8,       a8r8g8b8, neon_composite_add_n_8_8888),
++    PIXMAN_STD_FAST_PATH (ADD,  solid,    a8,       x8b8g8r8, neon_composite_add_n_8_8888),
+     PIXMAN_STD_FAST_PATH (ADD,  solid,    a8,       a8b8g8r8, neon_composite_add_n_8_8888),
+     PIXMAN_STD_FAST_PATH (ADD,  a8,       a8,       a8,       neon_composite_add_8_8_8),
+     PIXMAN_STD_FAST_PATH (ADD,  r5g6b5,   a8,       r5g6b5,   neon_composite_add_0565_8_0565),
+     PIXMAN_STD_FAST_PATH (ADD,  b5g6r5,   a8,       b5g6r5,   neon_composite_add_0565_8_0565),
++    PIXMAN_STD_FAST_PATH (ADD,  x8r8g8b8, a8,       x8r8g8b8, neon_composite_add_8888_8_8888),
++    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, a8,       x8r8g8b8, neon_composite_add_8888_8_8888),
++    PIXMAN_STD_FAST_PATH (ADD,  x8b8g8r8, a8,       x8b8g8r8, neon_composite_add_8888_8_8888),
++    PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, a8,       x8b8g8r8, neon_composite_add_8888_8_8888),
+     PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, a8,       a8r8g8b8, neon_composite_add_8888_8_8888),
+     PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, a8,       a8b8g8r8, neon_composite_add_8888_8_8888),
++    PIXMAN_STD_FAST_PATH (ADD,  x8r8g8b8, a8r8g8b8, x8r8g8b8, neon_composite_add_8888_8888_8888),
++    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, a8r8g8b8, x8r8g8b8, neon_composite_add_8888_8888_8888),
+     PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, a8r8g8b8, a8r8g8b8, neon_composite_add_8888_8888_8888),
++    PIXMAN_STD_FAST_PATH (ADD,  x8r8g8b8, solid,    x8r8g8b8, neon_composite_add_8888_n_8888),
++    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, solid,    x8r8g8b8, neon_composite_add_8888_n_8888),
++    PIXMAN_STD_FAST_PATH (ADD,  x8b8g8r8, solid,    x8b8g8r8, neon_composite_add_8888_n_8888),
++    PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, solid,    x8b8g8r8, neon_composite_add_8888_n_8888),
+     PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, solid,    a8r8g8b8, neon_composite_add_8888_n_8888),
+     PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, solid,    a8b8g8r8, neon_composite_add_8888_n_8888),
+     PIXMAN_STD_FAST_PATH (ADD,  a8,       null,     a8,       neon_composite_add_8_8),
++    PIXMAN_STD_FAST_PATH (ADD,  x8r8g8b8, null,     x8r8g8b8, neon_composite_add_8888_8888),
++    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, null,     x8r8g8b8, neon_composite_add_8888_8888),
++    PIXMAN_STD_FAST_PATH (ADD,  x8b8g8r8, null,     x8b8g8r8, neon_composite_add_8888_8888),
++    PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, null,     x8b8g8r8, neon_composite_add_8888_8888),
+     PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, null,     a8r8g8b8, neon_composite_add_8888_8888),
+     PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, null,     a8b8g8r8, neon_composite_add_8888_8888),
+     PIXMAN_STD_FAST_PATH (IN,   solid,    null,     a8,       neon_composite_in_n_8),
+@@ -359,7 +376,9 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
+     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, neon_composite_over_reverse_n_8888),
+     PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, r5g6b5,   neon_composite_out_reverse_8_0565),
+     PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, b5g6r5,   neon_composite_out_reverse_8_0565),
++    PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, x8r8g8b8, neon_composite_out_reverse_8_8888),
+     PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, a8r8g8b8, neon_composite_out_reverse_8_8888),
++    PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, x8b8g8r8, neon_composite_out_reverse_8_8888),
+     PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, a8b8g8r8, neon_composite_out_reverse_8_8888),
+ 
+     SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8888),
+@@ -404,6 +423,7 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
+ 
+     SIMPLE_BILINEAR_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, neon_8888_8888),
+     SIMPLE_BILINEAR_FAST_PATH (ADD, a8r8g8b8, x8r8g8b8, neon_8888_8888),
++    SIMPLE_BILINEAR_FAST_PATH (ADD, x8r8g8b8, x8r8g8b8, neon_8888_8888),
+ 
+     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, neon_8888_8_8888),
+     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, neon_8888_8_8888),
+@@ -420,6 +440,7 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
+ 
+     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, neon_8888_8_8888),
+     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, x8r8g8b8, neon_8888_8_8888),
++    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, x8r8g8b8, x8r8g8b8, neon_8888_8_8888),
+ 
+     { PIXMAN_OP_NONE },
+ };
+diff --git a/pixman/pixman-arm.c b/pixman/pixman-arm.c
+index 23374e4..734cbea 100644
+--- a/pixman/pixman-arm.c
++++ b/pixman/pixman-arm.c
+@@ -221,5 +221,11 @@ _pixman_arm_get_implementations (pixman_implementation_t *imp)
+ 	imp = _pixman_implementation_create_arm_neon (imp);
+ #endif
+ 
++#ifdef USE_ARM_A64_NEON
++    /* neon is a part of aarch64 */
++    if (!_pixman_disabled ("arm-neon"))
++        imp = _pixman_implementation_create_arm_neon (imp);
++#endif
++
+     return imp;
+ }
+diff --git a/pixman/pixman-arma64-neon-asm-bilinear.S b/pixman/pixman-arma64-neon-asm-bilinear.S
+new file mode 100644
+index 0000000..aaa4a83
+--- /dev/null
++++ b/pixman/pixman-arma64-neon-asm-bilinear.S
+@@ -0,0 +1,1275 @@
++/*
++ * Copyright © 2011 SCore Corporation
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the "Software"),
++ * to deal in the Software without restriction, including without limitation
++ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
++ * and/or sell copies of the Software, and to permit persons to whom the
++ * Software is furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice (including the next
++ * paragraph) shall be included in all copies or substantial portions of the
++ * Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ *
++ * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
++ * Author:  Taekyun Kim (tkq.kim@samsung.com)
++ */
++
++/*
++ * This file contains scaled bilinear scanline functions implemented
++ * using older siarhei's bilinear macro template.
++ *
++ * << General scanline function procedures >>
++ *  1. bilinear interpolate source pixels
++ *  2. load mask pixels
++ *  3. load destination pixels
++ *  4. duplicate mask to fill whole register
++ *  5. interleave source & destination pixels
++ *  6. apply mask to source pixels
++ *  7. combine source & destination pixels
++ *  8. Deinterleave final result
++ *  9. store destination pixels
++ *
++ * All registers with single number (e.g. src0, tmp0) are 64-bit registers.
++ * Registers with double numbers (src01, dst01) are 128-bit registers.
++ * All temp registers can be used freely outside the code block.
++ * Assume that symbol(register .req) OUT and MASK are defined at caller of these macro blocks.
++ *
++ * Remarks
++ *  There can be lots of pipeline stalls inside code block and between code blocks.
++ *  Further optimizations will be done by new macro templates using head/tail_head/tail scheme.
++ */
++
++/* Prevent the stack from becoming executable for no reason... */
++#if defined(__linux__) && defined (__ELF__)
++.section .note.GNU-stack,"",%progbits
++#endif
++
++.text
++.arch armv8-a
++.altmacro
++.p2align 2
++
++#include "pixman-private.h"
++#include "pixman-arm-asm.h"
++#include "pixman-arma64-neon-asm.h"
++
++/*
++ * Bilinear macros from pixman-arm-neon-asm.S
++ */
++
++/*
++ * Bilinear scaling support code which tries to provide pixel fetching, color
++ * format conversion, and interpolation as separate macros which can be used
++ * as the basic building blocks for constructing bilinear scanline functions.
++ */
++
++.macro bilinear_load_8888 reg1, reg2, tmp /* load 8 bytes at X from TOP into reg1 and 8 bytes STRIDE further into reg2; tmp is unused */
++    asr       WTMP1, X, #16 /* WTMP1 = X >> 16; NOTE(review): presumably X is 16.16 fixed point and WTMP1 is the 32-bit view of TMP1 - confirm in the header */
++    add       X, X, UX /* step X by UX for the next fetch */
++    add       TMP1, TOP, TMP1, lsl #2 /* TMP1 = TOP + (TMP1 << 2): 4 bytes per pixel */
++    ld1       {&reg1&.2s}, [TMP1], STRIDE /* reg1 = two 32-bit lanes, then TMP1 += STRIDE */
++    ld1       {&reg2&.2s}, [TMP1] /* reg2 = two 32-bit lanes from the advanced address */
++.endm
++
++.macro bilinear_load_0565 reg1, reg2, tmp /* 16-bit-pixel variant: gather 4 bytes from two addresses into reg2, then expand via the conversion macro */
++    asr       WTMP1, X, #16 /* WTMP1 = X >> 16 */
++    add       X, X, UX /* step X by UX for the next fetch */
++    add       TMP1, TOP, TMP1, lsl #1 /* TMP1 = TOP + (TMP1 << 1): 2 bytes per pixel */
++    ld1       {&reg2&.s}[0], [TMP1], STRIDE /* lane 0 = 4 bytes at TMP1, then TMP1 += STRIDE */
++    ld1       {&reg2&.s}[1], [TMP1] /* lane 1 = 4 bytes from the advanced address */
++    convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp /* conversion macro is defined outside this chunk; presumably leaves x888 data in reg1/reg2 - confirm */
++.endm
++
++.macro bilinear_load_and_vertical_interpolate_two_8888 \
++                    acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2 /* two loads, each blended with the byte weights in v28/v29 */
++
++    bilinear_load_8888 reg1, reg2, tmp1 /* first fetch: reg1 and reg2 */
++    umull     &acc1&.8h, &reg1&.8b, v28.8b /* acc1 = reg1 * v28, widened to 16 bits */
++    umlal     &acc1&.8h, &reg2&.8b, v29.8b /* acc1 += reg2 * v29 */
++    bilinear_load_8888 reg3, reg4, tmp2 /* second fetch at the already advanced X */
++    umull     &acc2&.8h, &reg3&.8b, v28.8b /* acc2 = reg3 * v28, widened to 16 bits */
++    umlal     &acc2&.8h, &reg4&.8b, v29.8b /* acc2 += reg4 * v29 */
++.endm
++
++.macro bilinear_load_and_vertical_interpolate_four_8888 \
++                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
++                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi /* runs the two-fetch macro twice: once with the x* set, once with the y* set */
++
++    bilinear_load_and_vertical_interpolate_two_8888 \
++                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi /* xacc2lo/xacc2hi land in the tmp1/tmp2 slots of the callee */
++    bilinear_load_and_vertical_interpolate_two_8888 \
++                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi /* same for the y* register set */
++.endm
++
++.macro vzip reg1, reg2 /* in-place zip of two 8-byte registers; clobbers v24 */
++    zip1      v24.8b, reg1, reg2 /* v24 = interleaved low halves of reg1 and reg2 */
++    zip2      reg2,   reg1, reg2 /* reg2 = interleaved high halves */
++    mov       reg1,   v24.8b /* reg1 = saved low-half result */
++.endm
++
++.macro vuzp reg1, reg2 /* in-place unzip of two 8-byte registers; clobbers v24 */
++    uzp1     v24.8b, reg1, reg2 /* v24 = even-indexed bytes of reg1:reg2 */
++    uzp2     reg2,   reg1, reg2 /* reg2 = odd-indexed bytes of reg1:reg2 */
++    mov      reg1,   v24.8b /* reg1 = saved even-byte result */
++.endm
++
++.macro bilinear_load_and_vertical_interpolate_two_0565 \
++                acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi /* acc2lo/acc2hi are not referenced in this body */
++    asr       WTMP1, X, #16 /* first address index = X >> 16 */
++    add       X, X, UX
++    add       TMP1, TOP, TMP1, lsl #1 /* 2 bytes per pixel */
++    asr       WTMP2, X, #16 /* second address index from the advanced X */
++    add       X, X, UX
++    add       TMP2, TOP, TMP2, lsl #1
++    ld1       {&acc2&.s}[0], [TMP1], STRIDE /* acc2 is first used as staging for the raw 16-bit pixels */
++    ld1       {&acc2&.s}[2], [TMP2], STRIDE
++    ld1       {&acc2&.s}[1], [TMP1] /* lanes 1 and 3 come from the addresses advanced by STRIDE */
++    ld1       {&acc2&.s}[3], [TMP2]
++    convert_0565_to_x888 acc2, reg3, reg2, reg1 /* conversion macro defined outside this chunk; presumably splits into per-channel registers - confirm */
++    vzip      &reg1&.8b, &reg3&.8b /* each vzip clobbers v24 */
++    vzip      &reg2&.8b, &reg4&.8b
++    vzip      &reg3&.8b, &reg4&.8b
++    vzip      &reg1&.8b, &reg2&.8b
++    umull     &acc1&.8h, &reg1&.8b, v28.8b /* acc1 = reg1 * v28 + reg2 * v29 (16-bit lanes) */
++    umlal     &acc1&.8h, &reg2&.8b, v29.8b
++    umull     &acc2&.8h, &reg3&.8b, v28.8b /* acc2 is overwritten here: reg3 * v28 + reg4 * v29 */
++    umlal     &acc2&.8h, &reg4&.8b, v29.8b
++.endm
++
++.macro bilinear_load_and_vertical_interpolate_four_0565 \
++                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
++                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi /* same steps as the two_0565 macro for an x* and a y* set, with the two sequences interleaved */
++
++    asr       WTMP1, X, #16 /* x set: first address index */
++    add       X, X, UX
++    add       TMP1, TOP, TMP1, lsl #1
++    asr       WTMP2, X, #16 /* x set: second address index */
++    add       X, X, UX
++    add       TMP2, TOP, TMP2, lsl #1
++    ld1       {&xacc2&.s}[0], [TMP1], STRIDE /* xacc2 used as staging for raw 16-bit pixels */
++    ld1       {&xacc2&.s}[2], [TMP2], STRIDE
++    ld1       {&xacc2&.s}[1], [TMP1]
++    ld1       {&xacc2&.s}[3], [TMP2]
++    convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1 /* conversion macro defined outside this chunk */
++    asr       WTMP1, X, #16 /* y set: first address index */
++    add       X, X, UX
++    add       TMP1, TOP, TMP1, lsl #1
++    asr       WTMP2, X, #16 /* y set: second address index */
++    add       X, X, UX
++    add       TMP2, TOP, TMP2, lsl #1
++    ld1       {&yacc2&.s}[0], [TMP1], STRIDE /* from here y-set loads alternate with x-set vzip steps */
++    vzip      &xreg1&.8b, &xreg3&.8b
++    ld1       {&yacc2&.s}[2], [TMP2], STRIDE
++    vzip      &xreg2&.8b, &xreg4&.8b
++    ld1       {&yacc2&.s}[1], [TMP1]
++    vzip      &xreg3&.8b, &xreg4&.8b
++    ld1       {&yacc2&.s}[3], [TMP2]
++    vzip      &xreg1&.8b, &xreg2&.8b
++    convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
++    umull     &xacc1&.8h, &xreg1&.8b, v28.8b /* x-set multiplies alternate with y-set vzip steps */
++    vzip      &yreg1&.8b, &yreg3&.8b
++    umlal     &xacc1&.8h, &xreg2&.8b, v29.8b /* xacc1 = xreg1 * v28 + xreg2 * v29 */
++    vzip      &yreg2&.8b, &yreg4&.8b
++    umull     &xacc2&.8h, &xreg3&.8b, v28.8b /* xacc2 is overwritten with the weighted sum */
++    vzip      &yreg3&.8b, &yreg4&.8b
++    umlal     &xacc2&.8h, &xreg4&.8b, v29.8b
++    vzip      &yreg1&.8b, &yreg2&.8b
++    umull     &yacc1&.8h, &yreg1&.8b, v28.8b /* yacc1 = yreg1 * v28 + yreg2 * v29 */
++    umlal     &yacc1&.8h, &yreg2&.8b, v29.8b
++    umull     &yacc2&.8h, &yreg3&.8b, v28.8b /* yacc2 = yreg3 * v28 + yreg4 * v29 */
++    umlal     &yacc2&.8h, &yreg4&.8b, v29.8b
++.endm
++
++.macro bilinear_store_8888 numpix, tmp1, tmp2 /* store numpix 32-bit pixels from v0 (and v1) to OUT; tmp1/tmp2 are unused */
++.if numpix == 4
++    st1       {v0.2s, v1.2s}, [OUT], #16 /* 16 bytes from v0 and v1, then OUT += 16 */
++.elseif numpix == 2
++    st1       {v0.2s}, [OUT], #8 /* 8 bytes from v0, then OUT += 8 */
++.elseif numpix == 1
++    st1       {v0.s}[0], [OUT], #4 /* lane 0 of v0, then OUT += 4 */
++.else
++    .error bilinear_store_8888 numpix is unsupported
++.endif
++.endm
++
++.macro bilinear_store_0565 numpix, tmp1, tmp2 /* rearrange v0-v3, convert to 16-bit pixels in v1, store numpix of them to OUT */
++    vuzp    v0.8b, v1.8b /* each vuzp clobbers v24 */
++    vuzp    v2.8b, v3.8b
++    vuzp    v1.8b, v3.8b
++    vuzp    v0.8b, v2.8b
++    convert_8888_to_0565 v2, v1, v0, v1, tmp1, tmp2 /* conversion macro defined outside this chunk; the result stored below is v1 */
++.if numpix == 4
++    st1       {v1.4h}, [OUT], #8 /* four halfwords, then OUT += 8 */
++.elseif numpix == 2
++    st1       {v1.s}[0], [OUT], #4 /* 4 bytes from lane 0, then OUT += 4 */
++.elseif numpix == 1
++    st1       {v1.h}[0], [OUT], #2 /* one halfword, then OUT += 2 */
++.else
++    .error bilinear_store_0565 numpix is unsupported
++.endif
++.endm
++
++
++/*
++ * Macros for loading mask pixels into register 'mask'.
++ * dup must be done somewhere else.
++ */
++.macro bilinear_load_mask_x numpix, mask /* mask format x: expands to nothing */
++.endm
++
++.macro bilinear_load_mask_8 numpix, mask /* load numpix mask bytes from MASK into the low lane of mask, advancing MASK */
++.if numpix == 4
++    ld1         {&mask&.s}[0], [MASK], #4 /* 4 bytes into 32-bit lane 0, then MASK += 4 */
++.elseif numpix == 2
++    ld1         {&mask&.h}[0], [MASK], #2 /* 2 bytes into 16-bit lane 0, then MASK += 2 */
++.elseif numpix == 1
++    ld1         {&mask&.b}[0], [MASK], #1 /* 1 byte into byte lane 0, then MASK += 1 */
++.else
++    .error bilinear_load_mask_8 numpix is unsupported
++.endif
++    prfm        PREFETCH_MODE, [MASK, #prefetch_offset] /* prefetch hint ahead of MASK; PREFETCH_MODE and prefetch_offset are defined outside this chunk */
++.endm
++
++.macro bilinear_load_mask mask_fmt, numpix, mask /* dispatcher: expands to bilinear_load_mask_<mask_fmt> */
++    bilinear_load_mask_&mask_fmt numpix, mask
++.endm
++
++
++/*
++ * Macros for loading destination pixels into register 'dst0' and 'dst1'.
++ * Interleave should be done somewhere else.
++ */
++.macro bilinear_load_dst_0565_src numpix, dst0, dst1, dst01 /* 0565 destination with op src: expands to nothing */
++.endm
++
++.macro bilinear_load_dst_8888_src numpix, dst0, dst1, dst01 /* 8888 destination with op src: expands to nothing */
++.endm
++
++.macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01 /* read numpix 32-bit pixels at OUT without advancing OUT */
++.if numpix == 4
++    ld1         {&dst0&.2s, &dst1&.2s}, [OUT] /* 16 bytes into dst0 and dst1 */
++.elseif numpix == 2
++    ld1         {&dst0&.2s}, [OUT] /* 8 bytes into dst0 */
++.elseif numpix == 1
++    ld1         {&dst0&.s}[0], [OUT] /* 4 bytes into lane 0 of dst0 */
++.else
++    .error bilinear_load_dst_8888 numpix is unsupported
++.endif
++    mov         &dst01&.d[0], &dst0&.d[0] /* dst01 low half = dst0 */
++    mov         &dst01&.d[1], &dst1&.d[0] /* dst01 high half = dst1 (executed for every numpix) */
++    prfm        PREFETCH_MODE, [OUT, #(prefetch_offset * 4)] /* prefetch hint ahead of OUT; offset scaled by 4 */
++.endm
++
++.macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01 /* op over: plain 8888 load */
++    bilinear_load_dst_8888 numpix, dst0, dst1, dst01
++.endm
++
++.macro bilinear_load_dst_8888_add numpix, dst0, dst1, dst01 /* op add: plain 8888 load */
++    bilinear_load_dst_8888 numpix, dst0, dst1, dst01
++.endm
++
++.macro bilinear_load_dst dst_fmt, op, numpix, dst0, dst1, dst01 /* dispatcher: expands to bilinear_load_dst_<dst_fmt>_<op> */
++    bilinear_load_dst_&dst_fmt&_&op numpix, dst0, dst1, dst01
++.endm
++
++/*
++ * Macros for duplicating partially loaded mask to fill entire register.
++ * We will apply mask to interleaved source pixels, that is
++ *  (r0, r1, r2, r3, g0, g1, g2, g3) x (m0, m1, m2, m3, m0, m1, m2, m3)
++ *  (b0, b1, b2, b3, a0, a1, a2, a3) x (m0, m1, m2, m3, m0, m1, m2, m3)
++ * So, we need to duplicate loaded mask into whole register.
++ *
++ * For two pixel case
++ *  (r0, r1, x, x, g0, g1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
++ *  (b0, b1, x, x, a0, a1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
++ * We can do some optimizations for this including last pixel cases.
++ */
++.macro bilinear_duplicate_mask_x numpix, mask /* mask format x: expands to nothing */
++.endm
++
++.macro bilinear_duplicate_mask_8 numpix, mask /* replicate the loaded low lane across the 8-byte register */
++.if numpix == 4
++    dup         &mask&.2s, &mask&.s[0] /* copy 32-bit lane 0 into both 32-bit lanes */
++.elseif numpix == 2
++    dup         &mask&.4h, &mask&.h[0] /* copy 16-bit lane 0 into all four 16-bit lanes */
++.elseif numpix == 1
++    dup         &mask&.8b, &mask&.b[0] /* copy byte 0 into all eight bytes */
++.else
++    .error bilinear_duplicate_mask_8 is unsupported
++.endif
++.endm
++
++.macro bilinear_duplicate_mask mask_fmt, numpix, mask /* dispatcher: expands to bilinear_duplicate_mask_<mask_fmt> */
++    bilinear_duplicate_mask_&mask_fmt numpix, mask
++.endm
++
++/*
++ * Macros for interleaving src and dst pixels to rrrr gggg bbbb aaaa form.
++ * Interleave should be done when mask is enabled or operator is 'over'.
++ */
++.macro bilinear_interleave src0, src1, src01, dst0, dst1, dst01 /* two vuzp passes over the src pair and the dst pair, then repack the 128-bit views */
++    vuzp       &src0&.8b, &src1&.8b /* each vuzp clobbers v24 */
++    vuzp       &dst0&.8b, &dst1&.8b
++    vuzp       &src0&.8b, &src1&.8b /* second pass over the same pair */
++    vuzp       &dst0&.8b, &dst1&.8b
++    mov        &src01&.d[1], &src1&.d[0] /* src01 high half = src1 */
++    mov        &src01&.d[0], &src0&.d[0] /* src01 low half = src0 */
++    mov        &dst01&.d[1], &dst1&.d[0] /* dst01 high half = dst1 */
++    mov        &dst01&.d[0], &dst0&.d[0] /* dst01 low half = dst0 */
++.endm
++
++.macro bilinear_interleave_src_dst_x_src \
++                numpix, src0, src1, src01, dst0, dst1, dst01 /* no mask, op src: expands to nothing */
++.endm
++
++.macro bilinear_interleave_src_dst_x_over \
++                numpix, src0, src1, src01, dst0, dst1, dst01 /* no mask, op over: interleave; numpix is not forwarded */
++
++    bilinear_interleave src0, src1, src01, dst0, dst1, dst01
++.endm
++
++.macro bilinear_interleave_src_dst_x_add \
++                numpix, src0, src1, src01, dst0, dst1, dst01 /* no mask, op add: interleave */
++    bilinear_interleave src0, src1, src01, dst0, dst1, dst01
++.endm
++
++.macro bilinear_interleave_src_dst_8_src \
++                numpix, src0, src1, src01, dst0, dst1, dst01 /* 8-bit mask, op src: interleave */
++
++    bilinear_interleave src0, src1, src01, dst0, dst1, dst01
++.endm
++
++.macro bilinear_interleave_src_dst_8_over \
++                numpix, src0, src1, src01, dst0, dst1, dst01 /* 8-bit mask, op over: interleave */
++
++    bilinear_interleave src0, src1, src01, dst0, dst1, dst01
++.endm
++
++.macro bilinear_interleave_src_dst_8_add \
++                numpix, src0, src1, src01, dst0, dst1, dst01 /* 8-bit mask, op add: interleave */
++
++    bilinear_interleave src0, src1, src01, dst0, dst1, dst01
++.endm
++
++.macro bilinear_interleave_src_dst \
++                mask_fmt, op, numpix, src0, src1, src01, dst0, dst1, dst01 /* dispatcher keyed on mask_fmt and op */
++
++    bilinear_interleave_src_dst_&mask_fmt&_&op \
++                numpix, src0, src1, src01, dst0, dst1, dst01
++.endm
++
++
++/*
++ * Macros for applying masks to src pixels. (see combine_mask_u() function)
++ * src, dst should be in interleaved form.
++ * mask register should be in form (m0, m1, m2, m3).
++ */
++.macro bilinear_apply_mask_to_src_x \
++                numpix, src0, src1, src01, mask, \
++                tmp01, tmp23, tmp45, tmp67 /* mask format x: expands to nothing */
++.endm
++
++.macro bilinear_apply_mask_to_src_8 \
++                numpix, src0, src1, src01, mask, \
++                tmp01, tmp23, tmp45, tmp67 /* per byte: src = (t + ((t + 128) >> 8) + 128) >> 8 with t = src * mask */
++
++    umull           &tmp01&.8h, &src0&.8b, &mask&.8b /* t = src0 * mask as 16-bit products */
++    umull           &tmp23&.8h, &src1&.8b, &mask&.8b /* same for src1 */
++    /* bubbles */
++    urshr           &tmp45&.8h, &tmp01&.8h, #8 /* (t + 128) >> 8, rounding shift */
++    urshr           &tmp67&.8h, &tmp23&.8h, #8
++    /* bubbles */
++    raddhn          &src0&.8b, &tmp45&.8h, &tmp01&.8h /* rounding add, keep the high byte: a rounded divide by 255 */
++    raddhn          &src1&.8b, &tmp67&.8h, &tmp23&.8h
++    mov             &src01&.d[0], &src0&.d[0] /* repack the 128-bit view: low half = src0 */
++    mov             &src01&.d[1], &src1&.d[0] /* high half = src1 */
++.endm
++
++.macro bilinear_apply_mask_to_src \
++                mask_fmt, numpix, src0, src1, src01, mask, \
++                tmp01, tmp23, tmp45, tmp67 /* dispatcher: expands to bilinear_apply_mask_to_src_<mask_fmt> */
++
++    bilinear_apply_mask_to_src_&mask_fmt \
++                numpix, src0, src1, src01, mask, \
++                tmp01, tmp23, tmp45, tmp67
++.endm
++
++
++/*
++ * Macros for combining src and destination pixels.
++ * Interleave or not is depending on operator 'op'.
++ */
++.macro bilinear_combine_src \
++                numpix, src0, src1, src01, dst0, dst1, dst01, \
++                tmp01, tmp23, tmp45, tmp67, tmp8 /* op src: expands to nothing */
++.endm
++
++.macro bilinear_combine_over \
++                numpix, src0, src1, src01, dst0, dst1, dst01, \
++                tmp01, tmp23, tmp45, tmp67, tmp8 /* src = saturating(src + dst * ~k / 255), k taken from the upper lane of src1 */
++
++    dup         &tmp8&.2s, &src1&.s[1] /* broadcast the upper 32-bit lane of src1; presumably the alpha bytes in the interleaved layout - confirm */
++    /* bubbles */
++    mvn         &tmp8&.8b, &tmp8&.8b /* bitwise NOT, i.e. 255 - value per byte */
++    /* bubbles */
++    umull       &tmp01&.8h, &dst0&.8b, &tmp8&.8b /* t = dst0 * inverted value, 16-bit products */
++    /* bubbles */
++    umull       &tmp23&.8h, &dst1&.8b, &tmp8&.8b /* same for dst1 */
++    /* bubbles */
++    urshr       &tmp45&.8h, &tmp01&.8h, #8 /* (t + 128) >> 8 */
++    urshr       &tmp67&.8h, &tmp23&.8h, #8
++    /* bubbles */
++    raddhn      &dst0&.8b, &tmp45&.8h, &tmp01&.8h /* rounded divide by 255, narrowed back to bytes */
++    raddhn      &dst1&.8b, &tmp67&.8h, &tmp23&.8h
++    mov         &dst01&.d[0], &dst0&.d[0] /* repack dst01 from dst0 and dst1 */
++    mov         &dst01&.d[1], &dst1&.d[0]
++    /* bubbles */
++    uqadd       &src0&.8b, &dst0&.8b, &src0&.8b /* unsigned saturating add of scaled dst and src */
++    uqadd       &src1&.8b, &dst1&.8b, &src1&.8b
++    mov         &src01&.d[0], &src0&.d[0] /* repack src01 from src0 and src1 */
++    mov         &src01&.d[1], &src1&.d[0]
++.endm
++
++.macro bilinear_combine_add \
++                numpix, src0, src1, src01, dst0, dst1, dst01, \
++                tmp01, tmp23, tmp45, tmp67, tmp8 /* op add: src = saturating(src + dst); tmp registers are unused */
++
++    uqadd       &src0&.8b, &dst0&.8b, &src0&.8b /* unsigned saturating add per byte */
++    uqadd       &src1&.8b, &dst1&.8b, &src1&.8b
++    mov         &src01&.d[0], &src0&.d[0] /* repack src01 from src0 and src1 */
++    mov         &src01&.d[1], &src1&.d[0]
++.endm
++
++.macro bilinear_combine \
++                op, numpix, src0, src1, src01, dst0, dst1, dst01, \
++                tmp01, tmp23, tmp45, tmp67, tmp8 /* dispatcher: expands to bilinear_combine_<op> */
++
++    bilinear_combine_&op \
++                numpix, src0, src1, src01, dst0, dst1, dst01, \
++                tmp01, tmp23, tmp45, tmp67, tmp8
++.endm
++
++/*
++ * Macros for final deinterleaving of destination pixels if needed.
++ */
++.macro bilinear_deinterleave numpix, dst0, dst1, dst01 /* two vuzp passes over dst0/dst1, then repack dst01; numpix is unused */
++    vuzp       &dst0&.8b, &dst1&.8b /* each vuzp clobbers v24 */
++    /* bubbles */
++    vuzp       &dst0&.8b, &dst1&.8b
++    mov        &dst01&.d[0], &dst0&.d[0] /* dst01 low half = dst0 */
++    mov        &dst01&.d[1], &dst1&.d[0] /* dst01 high half = dst1 */
++.endm
++
++.macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01 /* no mask, op src: expands to nothing */
++.endm
++
++.macro bilinear_deinterleave_dst_x_over numpix, dst0, dst1, dst01 /* no mask, op over */
++    bilinear_deinterleave numpix, dst0, dst1, dst01
++.endm
++
++.macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01 /* no mask, op add */
++    bilinear_deinterleave numpix, dst0, dst1, dst01
++.endm
++
++.macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01 /* 8-bit mask, op src */
++    bilinear_deinterleave numpix, dst0, dst1, dst01
++.endm
++
++.macro bilinear_deinterleave_dst_8_over numpix, dst0, dst1, dst01 /* 8-bit mask, op over */
++    bilinear_deinterleave numpix, dst0, dst1, dst01
++.endm
++
++.macro bilinear_deinterleave_dst_8_add numpix, dst0, dst1, dst01 /* 8-bit mask, op add */
++    bilinear_deinterleave numpix, dst0, dst1, dst01
++.endm
++
++.macro bilinear_deinterleave_dst mask_fmt, op, numpix, dst0, dst1, dst01 /* dispatcher keyed on mask_fmt and op */
++    bilinear_deinterleave_dst_&mask_fmt&_&op numpix, dst0, dst1, dst01
++.endm
++
++
++.macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op
++    bilinear_load_&src_fmt v0, v1, v2    /* note: this macro never advances v12/v15; callers do that */
++    bilinear_load_mask mask_fmt, 1, v4
++    bilinear_load_dst dst_fmt, op, 1, v18, v19, v9
++    umull     v2.8h, v0.8b, v28.8b    /* vertical blend: v2 = v0 * v28 ... */
++    umlal     v2.8h, v1.8b, v29.8b    /* ... + v1 * v29 (the template loads v28/v29 from WWT/WWB) */
++    /* 5 cycles bubble */
++    ushll     v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS    /* horizontal blend with weight w = v15.h[0]: */
++    umlsl     v0.4s, v2.4h, v15.h[0]    /* v0 = low(v2) * (2^BITS - w) ... */
++    umlal2    v0.4s, v2.8h, v15.h[0]    /* ... + high(v2) * w */
++    /* 5 cycles bubble */
++    bilinear_duplicate_mask mask_fmt, 1, v4
++    shrn      v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)    /* shift out the scaling of both blends */
++    /* 3 cycles bubble */
++    xtn       v0.8b, v0.8h    /* narrow each channel to 8 bits */
++    /* 1 cycle bubble */
++    bilinear_interleave_src_dst \
++                mask_fmt, op, 1, v0, v1, v0, v18, v19, v9
++    bilinear_apply_mask_to_src \
++                mask_fmt, 1, v0, v1, v0, v4, \
++                v3, v8, v10, v11
++    bilinear_combine \
++                op, 1, v0, v1, v0, v18, v19, v9, \
++                v3, v8, v10, v11, v5
++    bilinear_deinterleave_dst mask_fmt, op, 1, v0, v1, v0
++    bilinear_store_&dst_fmt 1, v17, v18
++.endm
++
++.macro bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op
++    bilinear_load_and_vertical_interpolate_two_&src_fmt \
++                v1, v11, v18, v19, v20, v21, v22, v23
++    bilinear_load_mask mask_fmt, 2, v4
++    bilinear_load_dst dst_fmt, op, 2, v18, v19, v9
++    ushll     v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS    /* pixel 0, horizontal blend with w = v15.h[0] */
++    umlsl     v0.4s, v1.4h, v15.h[0]
++    umlal2    v0.4s, v1.8h, v15.h[0]
++    ushll     v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS    /* pixel 1, horizontal blend with w = v15.h[4] */
++    umlsl     v10.4s, v11.4h, v15.h[4]
++    umlal2    v10.4s, v11.8h, v15.h[4]
++    shrn      v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)    /* pixel 0 into the low half of v0 */
++    shrn2     v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)    /* pixel 1 into the high half of v0 */
++    bilinear_duplicate_mask mask_fmt, 2, v4
++    ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)    /* weights for the next pixels from v12 */
++    add       v12.8h, v12.8h, v13.8h    /* step v12 by v13 */
++    xtn       v0.8b, v0.8h    /* narrow each channel to 8 bits */
++    bilinear_interleave_src_dst \
++                mask_fmt, op, 2, v0, v1, v0, v18, v19, v9
++    bilinear_apply_mask_to_src \
++                mask_fmt, 2, v0, v1, v0, v4, \
++                v3, v8, v10, v11
++    bilinear_combine \
++                op, 2, v0, v1, v0, v18, v19, v9, \
++                v3, v8, v10, v11, v5
++    bilinear_deinterleave_dst mask_fmt, op, 2, v0, v1, v0
++    bilinear_store_&dst_fmt 2, v16, v17
++.endm
++
++.macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op
++    bilinear_load_and_vertical_interpolate_four_&src_fmt \
++                v1, v11, v4,  v5,  v6,  v7,  v22, v23 \
++                v3, v9,  v16, v17, v20, v21, v18, v19
++    prfm      PREFETCH_MODE, [TMP1, PF_OFFS]    /* prefetch at TMP1 + PF_OFFS ... */
++    sub       TMP1, TMP1, STRIDE
++    prfm      PREFETCH_MODE, [TMP1, PF_OFFS]    /* ... and again one STRIDE lower */
++    ushll     v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS    /* pixel 0, horizontal blend with w = v15.h[0] */
++    umlsl     v0.4s, v1.4h, v15.h[0]
++    umlal2    v0.4s, v1.8h, v15.h[0]
++    ushll     v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS    /* pixel 1, w = v15.h[4] */
++    umlsl     v10.4s, v11.4h, v15.h[4]
++    umlal2    v10.4s, v11.8h, v15.h[4]
++    ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)    /* new weights for pixels 2 and 3 */
++    ushll     v2.4s, v3.4h, #BILINEAR_INTERPOLATION_BITS    /* pixel 2, w = v15.h[0] */
++    umlsl     v2.4s, v3.4h, v15.h[0]
++    umlal2    v2.4s, v3.8h, v15.h[0]
++    ushll     v8.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS    /* pixel 3, w = v15.h[4] */
++    umlsl     v8.4s, v9.4h, v15.h[4]
++    umlal2    v8.4s, v9.8h, v15.h[4]
++    add       v12.8h, v12.8h, v13.8h
++    shrn      v0.4h,  v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)    /* v0 = pixels 0 and 1 */
++    shrn2     v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++    shrn      v2.4h,  v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)    /* v2 = pixels 2 and 3 */
++    shrn2     v2.8h,  v8.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++    bilinear_load_mask mask_fmt, 4, v4
++    bilinear_duplicate_mask mask_fmt, 4, v4
++    ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)    /* weights for the next invocation */
++    xtn       v0.8b, v0.8h
++    xtn       v1.8b, v2.8h
++    add       v12.8h, v12.8h, v13.8h
++    bilinear_load_dst dst_fmt, op, 4, v2, v3, v21
++    bilinear_interleave_src_dst \
++                mask_fmt, op, 4, v0, v1, v0, v2, v3, v11
++    bilinear_apply_mask_to_src \
++                mask_fmt, 4, v0, v1, v0, v4, \
++                v6, v8, v9, v10
++    bilinear_combine \
++                op, 4, v0, v1, v0, v2, v3, v1, \
++                v6, v8, v9, v10, v23
++    bilinear_deinterleave_dst mask_fmt, op, 4, v0, v1, v0    /* NOTE(review): bilinear_combine above gets v1 as both src1 and dst01 -- confirm this is safe for op = over */
++    bilinear_store_&dst_fmt 4, v6, v7
++.endm
++
++.set BILINEAR_FLAG_USE_MASK,        1    /* generated function takes a MASK pointer; tested by generate_bilinear_scanline_func */
++.set BILINEAR_FLAG_USE_ALL_NEON_REGS,    2    /* NOTE(review): not tested by generate_bilinear_scanline_func -- confirm it is still needed */
++
++/*
++ * Main template macro for generating NEON optimized bilinear scanline functions.
++ *
++ * The bilinear scanline generator macro takes the following arguments:
++ *  fname            - name of the function to generate
++ *  src_fmt, dst_fmt        - source/destination color format (8888 or 0565)
++ *  src/dst_bpp_shift        - (1 << bpp_shift) is the size of src/dst pixel in bytes
++ *  process_last_pixel        - code block that interpolates one pixel and does not
++ *                  update horizontal weight
++ *  process_two_pixels        - code block that interpolates two pixels and updates
++ *                  horizontal weight
++ *  process_four_pixels        - code block that interpolates four pixels and updates
++ *                  horizontal weight
++ *  process_pixblock_head    - head part of middle loop
++ *  process_pixblock_tail    - tail part of middle loop
++ *  process_pixblock_tail_head    - tail_head of middle loop
++ *  pixblock_size        - number of pixels processed in a single middle loop
++ *  prefetch_distance        - prefetch in the source image by that many pixels ahead
++ *  flags            - bitwise OR of BILINEAR_FLAG_* values (0 for none)
++ */
++
++.macro generate_bilinear_scanline_func \
++    fname, \
++    src_fmt, dst_fmt, src_bpp_shift, dst_bpp_shift, \
++    bilinear_process_last_pixel, \
++    bilinear_process_two_pixels, \
++    bilinear_process_four_pixels, \
++    bilinear_process_pixblock_head, \
++    bilinear_process_pixblock_tail, \
++    bilinear_process_pixblock_tail_head, \
++    pixblock_size, \
++    prefetch_distance, \
++    flags
++    /* flags: OR of BILINEAR_FLAG_* bits; BILINEAR_FLAG_USE_MASK selects the variant with a MASK argument. */
++pixman_asm_function fname
++.if pixblock_size == 8    /* only 4 and 8 pixel blocks are supported */
++.elseif pixblock_size == 4
++.else
++    .error unsupported pixblock size
++.endif
++
++.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0    /* unmasked variant: all arguments arrive in x0-x7 */
++    OUT       .req    x0
++    TOP       .req    x1
++    BOTTOM    .req    x2
++    WT        .req    x3
++    WWT       .req    w3
++    WB        .req    x4
++    WWB       .req    w4
++    X         .req    w5
++    UX        .req    w6
++    WIDTH     .req    x7
++    TMP1      .req    x10
++    WTMP1     .req    w10
++    TMP2      .req    x11
++    WTMP2     .req    w11
++    PF_OFFS   .req    x12
++    TMP3      .req    x13
++    WTMP3     .req    w13
++    TMP4      .req    x14
++    WTMP4     .req    w14
++    STRIDE    .req    x15
++    DUMMY     .req    x30
++
++    stp       x29, x30, [sp, -16]!
++    mov       x29, sp
++    sub       sp, sp, 112    /* reserve the save area before anything is stored into it */
++    sub       x29, x29, 64
++    st1       {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32    /* save the low halves of v8-v15 */
++    st1       {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32    /* x29 is back at the frame record here */
++    stp       x10, x11, [x29, -80]
++    stp       x12, x13, [x29, -96]
++    stp       x14, x15, [x29, -112]
++.else    /* masked variant: MASK is in x1, so WIDTH no longer fits in x0-x7 */
++    OUT       .req      x0
++    MASK      .req      x1
++    TOP       .req      x2
++    BOTTOM    .req      x3
++    WT        .req      x4
++    WWT       .req      w4
++    WB        .req      x5
++    WWB       .req      w5
++    X         .req      w6
++    UX        .req      w7
++    WIDTH     .req      x8
++    TMP1      .req      x10
++    WTMP1     .req      w10
++    TMP2      .req      x11
++    WTMP2     .req      w11
++    PF_OFFS   .req      x12
++    TMP3      .req      x13
++    WTMP3     .req      w13
++    TMP4      .req      x14
++    WTMP4     .req      w14
++    STRIDE    .req      x15
++    DUMMY     .req      x30
++
++    .set prefetch_offset, prefetch_distance
++
++    stp      x29, x30, [sp, -16]!
++    mov      x29, sp
++    sub      sp, sp, 120    /* reserve the save area first: AArch64 has no red zone, so data below sp can be overwritten by a signal frame */
++    sub      x29, x29, 64
++    st1      {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32    /* save the low halves of v8-v15 */
++    st1      {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32    /* x29 is back at the frame record here */
++    stp      x10, x11, [x29, -80]
++    stp      x12, x13, [x29, -96]
++    stp      x14, x15, [x29, -112]
++    str      x8, [x29, -120]
++    ldr      w8, [x29, 16]    /* WIDTH is read from the caller stack, just above the 16-byte frame record */
++.endif
++
++    mov      WTMP1, #prefetch_distance
++    umull    PF_OFFS, WTMP1, UX    /* PF_OFFS = prefetch_distance * UX, still in fixed point */
++
++    sub      STRIDE, BOTTOM, TOP    /* only the distance between the two rows is kept */
++    .unreq   BOTTOM
++
++    cmp      WIDTH, #0
++    ble      300f    /* nothing to do for WIDTH <= 0 */
++
++    dup      v12.8h, X
++    dup      v13.8h, UX
++    dup      v28.8b, WWT    /* v28/v29: vertical weights, replicated per byte */
++    dup      v29.8b, WWB
++    mov      v25.d[0], v12.d[1]
++    mov      v26.d[0], v13.d[0]
++    add      v25.4h, v25.4h, v26.4h
++    mov      v12.d[1], v25.d[0]    /* v12 = X in lanes 0-3 and X + UX in lanes 4-7 */
++
++    /* ensure good destination alignment  */
++    cmp       WIDTH, #1
++    blt       100f
++    tst       OUT, #(1 << dst_bpp_shift)
++    beq       100f
++    ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)    /* v15 = horizontal weights taken from v12 */
++    add       v12.8h, v12.8h, v13.8h
++    bilinear_process_last_pixel
++    sub       WIDTH, WIDTH, #1
++100:
++    add       v13.8h, v13.8h, v13.8h    /* from here on v12 advances by 2 * UX per step */
++    ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
++    add       v12.8h, v12.8h, v13.8h
++
++    cmp       WIDTH, #2
++    blt       100f
++    tst       OUT, #(1 << (dst_bpp_shift + 1))
++    beq       100f
++    bilinear_process_two_pixels
++    sub       WIDTH, WIDTH, #2
++100:
++.if pixblock_size == 8
++    cmp       WIDTH, #4
++    blt       100f
++    tst       OUT, #(1 << (dst_bpp_shift + 2))
++    beq       100f
++    bilinear_process_four_pixels
++    sub       WIDTH, WIDTH, #4
++100:
++.endif
++    subs      WIDTH, WIDTH, #pixblock_size    /* main loop: whole pixel blocks, software pipelined */
++    blt       100f
++    asr       PF_OFFS, PF_OFFS, #(16 - src_bpp_shift)    /* fixed-point pixel distance to a byte offset */
++    bilinear_process_pixblock_head
++    subs      WIDTH, WIDTH, #pixblock_size
++    blt       500f
++0:
++    bilinear_process_pixblock_tail_head
++    subs      WIDTH, WIDTH, #pixblock_size
++    bge       0b
++500:
++    bilinear_process_pixblock_tail
++100:
++.if pixblock_size == 8
++    tst       WIDTH, #4
++    beq       200f
++    bilinear_process_four_pixels
++200:
++.endif
++    /* handle the remaining trailing pixels */
++    tst       WIDTH, #2
++    beq       200f
++    bilinear_process_two_pixels
++200:
++    tst       WIDTH, #1
++    beq       300f
++    bilinear_process_last_pixel
++300:
++
++.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0    /* epilogue mirrors the matching prologue */
++    sub       x29, x29, 64
++    ld1       {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
++    ld1       {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
++    ldp       x10, x11, [x29, -80]
++    ldp       x12, x13, [x29, -96]
++    ldp       x14, x15, [x29, -112]
++    mov       sp, x29    /* release the save area */
++    ldp       x29, x30, [sp], 16
++.else
++    sub       x29, x29, 64
++    ld1       {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
++    ld1       {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
++    ldp       x10, x11, [x29, -80]
++    ldp       x12, x13, [x29, -96]
++    ldp       x14, x15, [x29, -112]
++    ldr       x8, [x29, -120]
++    mov       sp, x29    /* release the save area */
++    ldp       x29, x30, [sp], 16
++.endif
++    ret
++
++    .unreq    OUT
++    .unreq    TOP
++    .unreq    WT
++    .unreq    WWT
++    .unreq    WB
++    .unreq    WWB
++    .unreq    X
++    .unreq    UX
++    .unreq    WIDTH
++    .unreq    TMP1
++    .unreq    WTMP1
++    .unreq    TMP2
++    .unreq    PF_OFFS
++    .unreq    TMP3
++    .unreq    TMP4
++    .unreq    STRIDE
++.if ((flags) & BILINEAR_FLAG_USE_MASK) != 0
++    .unreq    MASK
++.endif
++
++.endfunc
++
++.endm
++
++/* src_8888_8_8888: op = src, src_fmt = 8888, mask_fmt = 8, dst_fmt = 8888 */
++.macro bilinear_src_8888_8_8888_process_last_pixel
++    bilinear_interpolate_last_pixel 8888, 8, 8888, src
++.endm
++
++.macro bilinear_src_8888_8_8888_process_two_pixels
++    bilinear_interpolate_two_pixels 8888, 8, 8888, src
++.endm
++
++.macro bilinear_src_8888_8_8888_process_four_pixels
++    bilinear_interpolate_four_pixels 8888, 8, 8888, src
++.endm
++
++.macro bilinear_src_8888_8_8888_process_pixblock_head
++    bilinear_src_8888_8_8888_process_four_pixels    /* the head does the whole block; the tail below is empty */
++.endm
++
++.macro bilinear_src_8888_8_8888_process_pixblock_tail
++.endm
++
++.macro bilinear_src_8888_8_8888_process_pixblock_tail_head
++    bilinear_src_8888_8_8888_process_pixblock_tail
++    bilinear_src_8888_8_8888_process_pixblock_head
++.endm
++
++/* src_8888_8_0565: op = src, src_fmt = 8888, mask_fmt = 8, dst_fmt = 0565 */
++.macro bilinear_src_8888_8_0565_process_last_pixel
++    bilinear_interpolate_last_pixel 8888, 8, 0565, src
++.endm
++
++.macro bilinear_src_8888_8_0565_process_two_pixels
++    bilinear_interpolate_two_pixels 8888, 8, 0565, src
++.endm
++
++.macro bilinear_src_8888_8_0565_process_four_pixels
++    bilinear_interpolate_four_pixels 8888, 8, 0565, src
++.endm
++
++.macro bilinear_src_8888_8_0565_process_pixblock_head
++    bilinear_src_8888_8_0565_process_four_pixels    /* the head does the whole block; the tail below is empty */
++.endm
++
++.macro bilinear_src_8888_8_0565_process_pixblock_tail
++.endm
++
++.macro bilinear_src_8888_8_0565_process_pixblock_tail_head
++    bilinear_src_8888_8_0565_process_pixblock_tail
++    bilinear_src_8888_8_0565_process_pixblock_head
++.endm
++
++/* src_0565_8_x888: op = src, src_fmt = 0565, mask_fmt = 8; the x888 destination uses dst_fmt = 8888 */
++.macro bilinear_src_0565_8_x888_process_last_pixel
++    bilinear_interpolate_last_pixel 0565, 8, 8888, src
++.endm
++
++.macro bilinear_src_0565_8_x888_process_two_pixels
++    bilinear_interpolate_two_pixels 0565, 8, 8888, src
++.endm
++
++.macro bilinear_src_0565_8_x888_process_four_pixels
++    bilinear_interpolate_four_pixels 0565, 8, 8888, src
++.endm
++
++.macro bilinear_src_0565_8_x888_process_pixblock_head
++    bilinear_src_0565_8_x888_process_four_pixels    /* the head does the whole block; the tail below is empty */
++.endm
++
++.macro bilinear_src_0565_8_x888_process_pixblock_tail
++.endm
++
++.macro bilinear_src_0565_8_x888_process_pixblock_tail_head
++    bilinear_src_0565_8_x888_process_pixblock_tail
++    bilinear_src_0565_8_x888_process_pixblock_head
++.endm
++
++/* src_0565_8_0565: op = src, src_fmt = 0565, mask_fmt = 8, dst_fmt = 0565 */
++.macro bilinear_src_0565_8_0565_process_last_pixel
++    bilinear_interpolate_last_pixel 0565, 8, 0565, src
++.endm
++
++.macro bilinear_src_0565_8_0565_process_two_pixels
++    bilinear_interpolate_two_pixels 0565, 8, 0565, src
++.endm
++
++.macro bilinear_src_0565_8_0565_process_four_pixels
++    bilinear_interpolate_four_pixels 0565, 8, 0565, src
++.endm
++
++.macro bilinear_src_0565_8_0565_process_pixblock_head
++    bilinear_src_0565_8_0565_process_four_pixels    /* the head does the whole block; the tail below is empty */
++.endm
++
++.macro bilinear_src_0565_8_0565_process_pixblock_tail
++.endm
++
++.macro bilinear_src_0565_8_0565_process_pixblock_tail_head
++    bilinear_src_0565_8_0565_process_pixblock_tail
++    bilinear_src_0565_8_0565_process_pixblock_head
++.endm
++
++/* over_8888_8888: op = over, src_fmt = 8888, mask_fmt = x (no mask), dst_fmt = 8888 */
++.macro bilinear_over_8888_8888_process_last_pixel
++    bilinear_interpolate_last_pixel 8888, x, 8888, over
++.endm
++
++.macro bilinear_over_8888_8888_process_two_pixels
++    bilinear_interpolate_two_pixels 8888, x, 8888, over
++.endm
++
++.macro bilinear_over_8888_8888_process_four_pixels
++    bilinear_interpolate_four_pixels 8888, x, 8888, over
++.endm
++
++.macro bilinear_over_8888_8888_process_pixblock_head
++    asr         WTMP1, X, #16    /* integer part of the 16.16 x position */
++    add         X, X, UX
++    add         TMP1, TOP, TMP1, lsl #2    /* TMP1 = TOP + 4 * index: pixel 0 */
++    asr         WTMP2, X, #16
++    add         X, X, UX
++    add         TMP2, TOP, TMP2, lsl #2    /* pixel 1 */
++
++    ld1         {v22.2s}, [TMP1], STRIDE    /* row at TOP, then step by STRIDE (= BOTTOM - TOP in the template) */
++    ld1         {v23.2s}, [TMP1]    /* row at BOTTOM */
++    asr         WTMP3, X, #16
++    add         X, X, UX
++    add         TMP3, TOP, TMP3, lsl #2    /* pixel 2 */
++    umull       v8.8h, v22.8b, v28.8b    /* vertical blend of pixel 0 into v8 */
++    umlal       v8.8h, v23.8b, v29.8b
++
++    ld1         {v22.2s}, [TMP2], STRIDE
++    ld1         {v23.2s}, [TMP2]
++    asr         WTMP4, X, #16
++    add         X, X, UX
++    add         TMP4, TOP, TMP4, lsl #2    /* pixel 3 */
++    umull       v9.8h, v22.8b, v28.8b    /* vertical blend of pixel 1 into v9 */
++    umlal       v9.8h, v23.8b, v29.8b
++
++    ld1         {v22.2s}, [TMP3], STRIDE
++    ld1         {v23.2s}, [TMP3]
++    umull       v10.8h, v22.8b, v28.8b    /* vertical blend of pixel 2 into v10; consumed by the tail */
++    umlal       v10.8h, v23.8b, v29.8b
++
++    ushll       v0.4s, v8.4h, #BILINEAR_INTERPOLATION_BITS    /* horizontal blend of pixel 0, w = v15.h[0] */
++    umlsl       v0.4s, v8.4h, v15.h[0]
++    umlal2      v0.4s, v8.8h, v15.h[0]
++
++    prfm        PREFETCH_MODE, [TMP4, PF_OFFS]
++    ld1         {v16.2s}, [TMP4], STRIDE
++    ld1         {v17.2s}, [TMP4]
++    prfm        PREFETCH_MODE, [TMP4, PF_OFFS]
++    umull       v11.8h, v16.8b, v28.8b    /* vertical blend of pixel 3 into v11; consumed by the tail */
++    umlal       v11.8h, v17.8b, v29.8b
++
++    ushll       v1.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS    /* horizontal blend of pixel 1, w = v15.h[4] */
++    umlsl       v1.4s, v9.4h, v15.h[4]
++    umlal2      v1.4s, v9.8h, v15.h[4]
++    ushr        v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)    /* weights for pixels 2 and 3 */
++    add         v12.8h, v12.8h, v13.8h
++.endm
++
++.macro bilinear_over_8888_8888_process_pixblock_tail
++    ushll       v2.4s, v10.4h, #BILINEAR_INTERPOLATION_BITS    /* horizontal blend of pixel 2 (v10 from the head) */
++    umlsl       v2.4s, v10.4h, v15.h[0]
++    umlal2      v2.4s, v10.8h, v15.h[0]
++    ushll       v3.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS    /* horizontal blend of pixel 3 (v11 from the head) */
++    umlsl       v3.4s, v11.4h, v15.h[4]
++    umlal2      v3.4s, v11.8h, v15.h[4]
++    shrn        v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++    shrn2       v0.8h, v1.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++    shrn        v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++    ushr        v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)    /* weights for the next block */
++    shrn2       v2.8h, v3.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++    xtn         v6.8b, v0.8h    /* v6/v7 = the four interpolated source pixels */
++    xtn         v7.8b, v2.8h
++    ld1         {v2.2s, v3.2s}, [OUT]    /* load the four destination pixels */
++    prfm        PREFETCH_MODE, [OUT, #(prefetch_offset * 4)]
++    vuzp        v6.8b, v7.8b
++    vuzp        v2.8b, v3.8b
++    vuzp        v6.8b, v7.8b
++    vuzp        v2.8b, v3.8b
++    dup         v4.2s, v7.s[1]    /* presumably the source alpha bytes after the two unzip passes -- TODO confirm */
++    mvn         v4.8b, v4.8b    /* v4 = 255 - v4 */
++    umull       v11.8h, v2.8b, v4.8b    /* dst * v4 ... */
++    umull       v2.8h,  v3.8b, v4.8b
++    urshr       v1.8h, v11.8h, #8    /* ... then urshr + raddhn approximate a rounded divide by 255 */
++    urshr       v10.8h, v2.8h, #8
++    raddhn      v3.8b, v10.8h, v2.8h
++    raddhn      v2.8b, v1.8h, v11.8h
++    uqadd       v6.8b, v2.8b,  v6.8b    /* saturating add of the scaled dst to src */
++    uqadd       v7.8b, v3.8b,  v7.8b
++    vuzp        v6.8b, v7.8b
++    vuzp        v6.8b, v7.8b
++    add         v12.8h, v12.8h, v13.8h
++    st1         {v6.2s, v7.2s}, [OUT], #16    /* store four pixels and advance OUT by 16 bytes */
++.endm
++
++.macro bilinear_over_8888_8888_process_pixblock_tail_head
++                                            ushll       v2.4s, v10.4h, #BILINEAR_INTERPOLATION_BITS    /* right column: tail work for the previous block */
++    asr         WTMP1, X, #16    /* left column: head work for the next block, interleaved with the tail */
++    add         X, X, UX
++    add         TMP1, TOP, TMP1, lsl #2
++                                            umlsl       v2.4s, v10.4h, v15.h[0]
++    asr         WTMP2, X, #16
++    add         X, X, UX
++    add         TMP2, TOP, TMP2, lsl #2
++                                            umlal2      v2.4s, v10.8h, v15.h[0]
++                                            ushll       v3.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
++    ld1         {v20.2s}, [TMP1], STRIDE
++                                            umlsl       v3.4s, v11.4h, v15.h[4]
++                                            umlal2      v3.4s, v11.8h, v15.h[4]
++    ld1         {v21.2s}, [TMP1]
++    umull       v8.8h, v20.8b, v28.8b
++    umlal       v8.8h, v21.8b, v29.8b
++                                            shrn        v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++                                            shrn2       v0.8h, v1.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++                                            shrn        v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++                                            ushr        v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
++    ld1         {v22.2s}, [TMP2], STRIDE
++                                            shrn2       v2.8h, v3.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++                                            xtn         v6.8b, v0.8h
++    ld1         {v23.2s}, [TMP2]
++    umull       v9.8h, v22.8b, v28.8b
++    asr         WTMP3, X, #16
++    add         X, X, UX
++    add         TMP3, TOP, TMP3, lsl #2
++    asr         WTMP4, X, #16
++    add         X, X, UX
++    add         TMP4, TOP, TMP4, lsl #2
++    umlal       v9.8h, v23.8b, v29.8b
++                                            xtn         v7.8b, v2.8h
++                                            ld1         {v2.2s, v3.2s}, [OUT]
++                                            prfm        PREFETCH_MODE, [OUT, PF_OFFS]
++    ld1         {v22.2s}, [TMP3], STRIDE
++                                            vuzp        v6.8b, v7.8b
++                                            vuzp        v2.8b, v3.8b
++                                            vuzp        v6.8b, v7.8b
++                                            vuzp        v2.8b, v3.8b
++                                            dup         v4.2s, v7.s[1]
++    ld1         {v23.2s}, [TMP3]
++                                            mvn         v4.8b, v4.8b
++    umull       v10.8h, v22.8b, v28.8b    /* v10 now belongs to the next block ... */
++    umlal       v10.8h, v23.8b, v29.8b
++                                            umull       v11.8h, v2.8b, v4.8b
++                                            umull        v2.8h, v3.8b, v4.8b
++    ushll       v0.4s, v8.4h, #BILINEAR_INTERPOLATION_BITS
++    umlsl       v0.4s, v8.4h, v15.h[0]
++                                            urshr       v1.8h, v11.8h, #8
++    umlal2      v0.4s, v8.8h, v15.h[0]
++                                            urshr       v8.8h, v2.8h, #8    /* ... so v8 is the scratch here, where the plain tail uses v10 */
++                                            raddhn      v3.8b, v8.8h, v2.8h
++                                            raddhn      v2.8b, v1.8h, v11.8h
++    prfm        PREFETCH_MODE, [TMP4, PF_OFFS]
++    ld1         {v16.2s}, [TMP4], STRIDE
++                                            uqadd       v6.8b, v2.8b, v6.8b
++                                            uqadd       v7.8b, v3.8b, v7.8b
++    ld1         {v17.2s}, [TMP4]
++    prfm        PREFETCH_MODE, [TMP4, PF_OFFS]
++    umull       v11.8h, v16.8b, v28.8b
++    umlal       v11.8h, v17.8b, v29.8b
++                                            vuzp        v6.8b, v7.8b
++    ushll       v1.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS
++                                            vuzp        v6.8b, v7.8b
++    umlsl       v1.4s, v9.4h, v15.h[4]
++                                            add         v12.8h, v12.8h, v13.8h
++    umlal2      v1.4s, v9.8h, v15.h[4]
++    ushr        v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
++    add         v12.8h, v12.8h, v13.8h
++                                            st1         {v6.2s, v7.2s}, [OUT], #16
++.endm
++
++/* over_8888_8_8888: op = over, src_fmt = 8888, mask_fmt = 8, dst_fmt = 8888 */
++.macro bilinear_over_8888_8_8888_process_last_pixel
++    bilinear_interpolate_last_pixel 8888, 8, 8888, over
++.endm
++
++.macro bilinear_over_8888_8_8888_process_two_pixels
++    bilinear_interpolate_two_pixels 8888, 8, 8888, over
++.endm
++
++.macro bilinear_over_8888_8_8888_process_four_pixels
++    bilinear_interpolate_two_pixels 8888, 8, 8888, over    /* four pixels are done as two two-pixel steps, */
++    bilinear_interpolate_two_pixels 8888, 8, 8888, over    /* not via bilinear_interpolate_four_pixels */
++.endm
++
++.macro bilinear_over_8888_8_8888_process_pixblock_head
++    bilinear_over_8888_8_8888_process_four_pixels    /* the head does the whole block; the tail below is empty */
++.endm
++
++.macro bilinear_over_8888_8_8888_process_pixblock_tail
++.endm
++
++.macro bilinear_over_8888_8_8888_process_pixblock_tail_head
++     bilinear_over_8888_8_8888_process_pixblock_tail
++     bilinear_over_8888_8_8888_process_pixblock_head
++.endm
++
++/* add_8888_8888: op = add, src_fmt = 8888, mask_fmt = x (no mask), dst_fmt = 8888 */
++.macro bilinear_add_8888_8888_process_last_pixel
++    bilinear_interpolate_last_pixel 8888, x, 8888, add
++.endm
++
++.macro bilinear_add_8888_8888_process_two_pixels
++    bilinear_interpolate_two_pixels 8888, x, 8888, add
++.endm
++
++.macro bilinear_add_8888_8888_process_four_pixels
++    bilinear_interpolate_two_pixels 8888, x, 8888, add    /* four pixels are done as two two-pixel steps, */
++    bilinear_interpolate_two_pixels 8888, x, 8888, add    /* not via bilinear_interpolate_four_pixels */
++.endm
++
++.macro bilinear_add_8888_8888_process_pixblock_head
++    bilinear_add_8888_8888_process_four_pixels    /* the head does the whole block; the tail below is empty */
++.endm
++
++.macro bilinear_add_8888_8888_process_pixblock_tail
++.endm
++
++.macro bilinear_add_8888_8888_process_pixblock_tail_head
++    bilinear_add_8888_8888_process_pixblock_tail
++    bilinear_add_8888_8888_process_pixblock_head
++.endm
++
++/* add_8888_8_8888: op = add, src_fmt = 8888, mask_fmt = 8, dst_fmt = 8888 */
++.macro bilinear_add_8888_8_8888_process_last_pixel
++    bilinear_interpolate_last_pixel 8888, 8, 8888, add
++.endm
++
++.macro bilinear_add_8888_8_8888_process_two_pixels
++    bilinear_interpolate_two_pixels 8888, 8, 8888, add
++.endm
++
++.macro bilinear_add_8888_8_8888_process_four_pixels
++    bilinear_interpolate_four_pixels 8888, 8, 8888, add
++.endm
++
++.macro bilinear_add_8888_8_8888_process_pixblock_head
++    bilinear_add_8888_8_8888_process_four_pixels    /* the head does the whole block; the tail below is empty */
++.endm
++
++.macro bilinear_add_8888_8_8888_process_pixblock_tail
++.endm
++
++.macro bilinear_add_8888_8_8888_process_pixblock_tail_head
++    bilinear_add_8888_8_8888_process_pixblock_tail
++    bilinear_add_8888_8_8888_process_pixblock_head
++.endm
++
++
++/* Bilinear scanline functions (the last three arguments are pixblock_size, prefetch_distance, flags) */
++generate_bilinear_scanline_func \
++    pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_neon, \
++    8888, 8888, 2, 2, \
++    bilinear_src_8888_8_8888_process_last_pixel, \
++    bilinear_src_8888_8_8888_process_two_pixels, \
++    bilinear_src_8888_8_8888_process_four_pixels, \
++    bilinear_src_8888_8_8888_process_pixblock_head, \
++    bilinear_src_8888_8_8888_process_pixblock_tail, \
++    bilinear_src_8888_8_8888_process_pixblock_tail_head, \
++    4, 28, BILINEAR_FLAG_USE_MASK
++
++generate_bilinear_scanline_func \
++    pixman_scaled_bilinear_scanline_8888_8_0565_SRC_asm_neon, \
++    8888, 0565, 2, 1, \
++    bilinear_src_8888_8_0565_process_last_pixel, \
++    bilinear_src_8888_8_0565_process_two_pixels, \
++    bilinear_src_8888_8_0565_process_four_pixels, \
++    bilinear_src_8888_8_0565_process_pixblock_head, \
++    bilinear_src_8888_8_0565_process_pixblock_tail, \
++    bilinear_src_8888_8_0565_process_pixblock_tail_head, \
++    4, 28, BILINEAR_FLAG_USE_MASK
++
++generate_bilinear_scanline_func \
++    pixman_scaled_bilinear_scanline_0565_8_x888_SRC_asm_neon, \
++    0565, 8888, 1, 2, \
++    bilinear_src_0565_8_x888_process_last_pixel, \
++    bilinear_src_0565_8_x888_process_two_pixels, \
++    bilinear_src_0565_8_x888_process_four_pixels, \
++    bilinear_src_0565_8_x888_process_pixblock_head, \
++    bilinear_src_0565_8_x888_process_pixblock_tail, \
++    bilinear_src_0565_8_x888_process_pixblock_tail_head, \
++    4, 28, BILINEAR_FLAG_USE_MASK
++
++generate_bilinear_scanline_func \
++    pixman_scaled_bilinear_scanline_0565_8_0565_SRC_asm_neon, \
++    0565, 0565, 1, 1, \
++    bilinear_src_0565_8_0565_process_last_pixel, \
++    bilinear_src_0565_8_0565_process_two_pixels, \
++    bilinear_src_0565_8_0565_process_four_pixels, \
++    bilinear_src_0565_8_0565_process_pixblock_head, \
++    bilinear_src_0565_8_0565_process_pixblock_tail, \
++    bilinear_src_0565_8_0565_process_pixblock_tail_head, \
++    4, 28, BILINEAR_FLAG_USE_MASK
++
++generate_bilinear_scanline_func \
++    pixman_scaled_bilinear_scanline_8888_8888_OVER_asm_neon, \
++    8888, 8888, 2, 2, \
++    bilinear_over_8888_8888_process_last_pixel, \
++    bilinear_over_8888_8888_process_two_pixels, \
++    bilinear_over_8888_8888_process_four_pixels, \
++    bilinear_over_8888_8888_process_pixblock_head, \
++    bilinear_over_8888_8888_process_pixblock_tail, \
++    bilinear_over_8888_8888_process_pixblock_tail_head, \
++    4, 28, 0
++
++generate_bilinear_scanline_func \
++    pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_neon, \
++    8888, 8888, 2, 2, \
++    bilinear_over_8888_8_8888_process_last_pixel, \
++    bilinear_over_8888_8_8888_process_two_pixels, \
++    bilinear_over_8888_8_8888_process_four_pixels, \
++    bilinear_over_8888_8_8888_process_pixblock_head, \
++    bilinear_over_8888_8_8888_process_pixblock_tail, \
++    bilinear_over_8888_8_8888_process_pixblock_tail_head, \
++    4, 28, BILINEAR_FLAG_USE_MASK
++
++generate_bilinear_scanline_func \
++    pixman_scaled_bilinear_scanline_8888_8888_ADD_asm_neon, \
++    8888, 8888, 2, 2, \
++    bilinear_add_8888_8888_process_last_pixel, \
++    bilinear_add_8888_8888_process_two_pixels, \
++    bilinear_add_8888_8888_process_four_pixels, \
++    bilinear_add_8888_8888_process_pixblock_head, \
++    bilinear_add_8888_8888_process_pixblock_tail, \
++    bilinear_add_8888_8888_process_pixblock_tail_head, \
++    4, 28, 0
++
++generate_bilinear_scanline_func \
++    pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_neon, \
++    8888, 8888, 2, 2, \
++    bilinear_add_8888_8_8888_process_last_pixel, \
++    bilinear_add_8888_8_8888_process_two_pixels, \
++    bilinear_add_8888_8_8888_process_four_pixels, \
++    bilinear_add_8888_8_8888_process_pixblock_head, \
++    bilinear_add_8888_8_8888_process_pixblock_tail, \
++    bilinear_add_8888_8_8888_process_pixblock_tail_head, \
++    4, 28, BILINEAR_FLAG_USE_MASK
+diff --git a/pixman/pixman-arma64-neon-asm.S b/pixman/pixman-arma64-neon-asm.S
+new file mode 100644
+index 0000000..18ace0e
+--- /dev/null
++++ b/pixman/pixman-arma64-neon-asm.S
+@@ -0,0 +1,3704 @@
++/*
++ * Copyright © 2009 Nokia Corporation
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the "Software"),
++ * to deal in the Software without restriction, including without limitation
++ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
++ * and/or sell copies of the Software, and to permit persons to whom the
++ * Software is furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice (including the next
++ * paragraph) shall be included in all copies or substantial portions of the
++ * Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ *
++ * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
++ */
++
++/*
++ * This file contains implementations of NEON optimized pixel processing
++ * functions. There is no full and detailed tutorial, but some functions
++ * (those which are exposing some new or interesting features) are
++ * extensively commented and can be used as examples.
++ *
++ * You may want to have a look at the comments for following functions:
++ *  - pixman_composite_over_8888_0565_asm_neon
++ *  - pixman_composite_over_n_8_0565_asm_neon
++ */
++
++/* Prevent the stack from becoming executable for no reason... */
++#if defined(__linux__) && defined(__ELF__)
++.section .note.GNU-stack,"",%progbits
++#endif
++
++.text
++.arch armv8-a
++
++.altmacro
++.p2align 2
++
++#include "pixman-private.h"
++#include "pixman-arm-asm.h"
++#include "pixman-arma64-neon-asm.h"
++
++/* Global configuration options and preferences */
++
++/*
++ * The code can optionally make use of unaligned memory accesses to improve
++ * performance of handling leading/trailing pixels for each scanline.
++ * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for
++ * example in linux if unaligned memory accesses are not configured to
++ * generate exceptions.
++ */
++.set RESPECT_STRICT_ALIGNMENT, 1
++
++/*
++ * Set default prefetch type. There is a choice between the following options:
++ *
++ * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
++ * as NOP to workaround some HW bugs or for whatever other reason)
++ *
++ * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
++ * advanced prefetch introduces heavy overhead)
++ *
++ * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
++ * which can run ARM and NEON instructions simultaneously so that extra ARM
++ * instructions do not add (many) extra cycles, but improve prefetch efficiency)
++ *
++ * Note: some types of function can't support advanced prefetch and fallback
++ *       to simple one (those which handle 24bpp pixels)
++ */
++.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
++
++/* Prefetch distance in pixels for simple prefetch */
++.set PREFETCH_DISTANCE_SIMPLE, 64
++
++/*
++ * Implementation of pixman_composite_over_8888_0565_asm_neon
++ *
++ * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and
++ * performs OVER compositing operation. Function fast_composite_over_8888_0565
++ * from pixman-fast-path.c does the same in C and can be used as a reference.
++ *
++ * First we need to have some NEON assembly code which can do the actual
++ * operation on the pixels and provide it to the template macro.
++ *
++ * Template macro quite conveniently takes care of emitting all the necessary
++ * code for memory reading and writing (including quite tricky cases of
++ * handling unaligned leading/trailing pixels), so we only need to deal with
++ * the data in NEON registers.
++ *
++ * NEON registers allocation in general is recommended to be the following:
++ * v0,  v1,  v2,  v3  - contain loaded source pixel data
++ * v4,  v5,  v6,  v7  - contain loaded destination pixels (if they are needed)
++ * v24, v25, v26, v27 - contain loading mask pixel data (if mask is used)
++ * v28, v29, v30, v31 - place for storing the result (destination pixels)
++ *
++ * As can be seen above, four 64-bit NEON registers are used for keeping
++ * intermediate pixel data and up to 8 pixels can be processed in one step
++ * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
++ *
++ * This particular function uses the following registers allocation:
++ * v0,  v1,  v2,  v3  - contain loaded source pixel data
++ * v4,  v5            - contain loaded destination pixels (they are needed)
++ * v28, v29           - place for storing the result (destination pixels)
++ */
++
++/*
++ * Step one. We need to have some code to do some arithmetics on pixel data.
++ * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
++ * back-to-back, they take pixel data from {v0, v1, v2, v3} and {v4, v5},
++ * perform all the needed calculations and write the result to {v28, v29}.
++ * The rationale for having two macros and not just one will be explained
++ * later. In practice, any single monolithic function which does the work can
++ * be split into two parts in any arbitrary way without affecting correctness.
++ *
++ * There is one special trick here too. Common template macro can optionally
++ * make our life a bit easier by doing R, G, B, A color components
++ * deinterleaving for 32bpp pixel formats (and this feature is used in
++ * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that
++ * instead of having 8 packed pixels in {v0, v1, v2, v3} registers, we
++ * actually use v0 register for blue channel (a vector of eight 8-bit
++ * values), v1 register for green, v2 for red and v3 for alpha. This
++ * simple conversion can be also done with a few NEON instructions:
++ *
++ * Packed to planar conversion: // vuzp8 is a wrapper macro
++ *  vuzp8 v0, v1
++ *  vuzp8 v2, v3
++ *  vuzp8 v1, v3
++ *  vuzp8 v0, v2
++ *
++ * Planar to packed conversion: // vzip8 is a wrapper macro
++ *  vzip8 v0, v2
++ *  vzip8 v1, v3
++ *  vzip8 v2, v3
++ *  vzip8 v0, v1
++ *
++ * But pixels can be loaded directly in planar format using LD4 / b NEON
++ * instruction. It is 1 cycle slower than LD1 / s, so this is not always
++ * desirable, that's why deinterleaving is optional.
++ *
++ * But anyway, here is the code:
++ */
++
++.macro pixman_composite_over_8888_0565_process_pixblock_head /* OVER 8888 -> 0565: unpack dst, scale by ~alpha */
++    /* convert 8 r5g6b5 pixel data from {v4} to planar 8-bit format
++       and put data into v6 - red, v7 - green, v30 - blue */
++    mov         v4.d[1], v5.d[0]  /* join both 64-bit halves into one 128-bit v4 */
++    shrn        v6.8b, v4.8h, #8  /* bits 15..8 of each pixel: rrrrrggg */
++    shrn        v7.8b, v4.8h, #3  /* bits 10..3 of each pixel: ggggggbb */
++    sli         v4.8h, v4.8h, #5  /* copy the 5 blue bits up into bits 9..5 */
++    sri         v6.8b, v6.8b, #5  /* red: keep top 5 bits, refill low 3 from them */
++    mvn         v3.8b, v3.8b      /* invert source alpha */
++    sri         v7.8b, v7.8b, #6  /* green: keep top 6 bits, refill low 2 from them */
++    shrn        v30.8b, v4.8h, #2 /* blue: bits 9..2 of the doubled blue field */
++    /* now do alpha blending, storing results in 8-bit planar format
++       into v20 - red, v23 - green, v22 - blue */
++    umull       v10.8h, v3.8b, v6.8b   /* (255 - alpha) * dst red, widened to 16 bit */
++    umull       v11.8h, v3.8b, v7.8b   /* (255 - alpha) * dst green */
++    umull       v12.8h, v3.8b, v30.8b  /* (255 - alpha) * dst blue */
++    urshr       v17.8h, v10.8h, #8     /* rounding >> 8: first step of the divide by 255 */
++    urshr       v18.8h, v11.8h, #8
++    urshr       v19.8h, v12.8h, #8
++    raddhn      v20.8b, v10.8h, v17.8h /* rounding add, keep high byte: completes the /255 */
++    raddhn      v23.8b, v11.8h, v18.8h
++    raddhn      v22.8b, v12.8h, v19.8h
++.endm
++
++.macro pixman_composite_over_8888_0565_process_pixblock_tail /* add source, repack to r5g6b5 */
++    /* ... continue alpha blending */
++    uqadd       v17.8b, v2.8b, v20.8b  /* red   = src red   + scaled dst red, saturating */
++    uqadd       v18.8b, v0.8b, v22.8b  /* blue  = src blue  + scaled dst blue */
++    uqadd       v19.8b, v1.8b, v23.8b  /* green = src green + scaled dst green */
++    /* convert the result to r5g6b5 and store it into {v14} */
++    ushll       v14.8h, v17.8b, #7     /* red << 7 ... */
++    sli         v14.8h, v14.8h, #1     /* ... and one more: red now in the top byte of each lane */
++    ushll       v8.8h, v19.8b, #7      /* same widening for green */
++    sli         v8.8h, v8.8h, #1
++    ushll       v9.8h, v18.8b, #7      /* same widening for blue */
++    sli         v9.8h, v9.8h, #1
++    sri         v14.8h, v8.8h, #5      /* insert green below the top 5 (red) bits */
++    sri         v14.8h, v9.8h, #11     /* insert blue below the top 11 (red+green) bits */
++    mov         v28.d[0], v14.d[0]     /* low 4 pixels to the first result register */
++    mov         v29.d[0], v14.d[1]     /* high 4 pixels to the second result register */
++.endm
++
++/*
++ * OK, now we got almost everything that we need. Using the above two
++ * macros, the work can be done right. But now we want to optimize
++ * it a bit. ARM Cortex-A8 is an in-order core, and benefits really
++ * a lot from good code scheduling and software pipelining.
++ *
++ * Let's construct some code, which will run in the core main loop.
++ * Some pseudo-code of the main loop will look like this:
++ *   head
++ *   while (...) {
++ *     tail
++ *     head
++ *   }
++ *   tail
++ *
++ * It may look a bit weird, but this setup allows to hide instruction
++ * latencies better and also utilize dual-issue capability more
++ * efficiently (make pairs of load-store and ALU instructions).
++ *
++ * So what we need now is a '*_tail_head' macro, which will be used
++ * in the core main loop. A trivial straightforward implementation
++ * of this macro would look like this:
++ *
++ *   pixman_composite_over_8888_0565_process_pixblock_tail
++ *   st1         {v28.4h, v29.4h}, [DST_W], #16
++ *   ld1         {v4.4h, v5.4h}, [DST_R], #16
++ *   ld4         {v0.2s, v1.2s, v2.2s, v3.2s}, [SRC], #32
++ *   pixman_composite_over_8888_0565_process_pixblock_head
++ *   cache_preload 8, 8
++ *
++ * Now it also got some VLD/VST instructions. We simply can't move from
++ * processing one block of pixels to the other one with just arithmetics.
++ * The previously processed data needs to be written to memory and new
++ * data needs to be fetched. Fortunately, this main loop does not deal
++ * with partial leading/trailing pixels and can load/store a full block
++ * of pixels in a bulk. Additionally, destination buffer is already
++ * 16 bytes aligned here (which is good for performance).
++ *
++ * New things here are DST_R, DST_W, SRC and MASK identifiers. These
++ * are the aliases for ARM registers which are used as pointers for
++ * accessing data. We maintain separate pointers for reading and writing
++ * destination buffer (DST_R and DST_W).
++ *
++ * Another new thing is 'cache_preload' macro. It is used for prefetching
++ * data into CPU L2 cache and improve performance when dealing with large
++ * images which are far larger than cache size. It uses one argument
++ * (actually two, but they need to be the same here) - number of pixels
++ * in a block. Looking into 'pixman-arma64-neon-asm.h' can provide some
++ * details about this macro. Moreover, if good performance is needed
++ * the code from this macro needs to be copied into '*_tail_head' macro
++ * and mixed with the rest of code for optimal instructions scheduling.
++ * We are actually doing it below.
++ *
++ * Now after all the explanations, here is the optimized code.
++ * Different instruction streams (originating from '*_head', '*_tail'
++ * and 'cache_preload' macro) use different indentation levels for
++ * better readability. Actually taking the code from one of these
++ * indentation levels and ignoring a few LD/ST instructions would
++ * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
++ * macro!
++ */
++
++#if 1
++
++.macro pixman_composite_over_8888_0565_process_pixblock_tail_head /* tail of block N interleaved with head of block N+1 */
++        uqadd       v17.8b, v2.8b, v20.8b      /* tail: red = src + scaled dst, saturating */
++    ld1         {v4.4h, v5.4h}, [DST_R], #16   /* head: next 8 destination pixels */
++    mov         v4.d[1], v5.d[0]
++        uqadd       v18.8b, v0.8b, v22.8b
++        uqadd       v19.8b, v1.8b, v23.8b
++    shrn        v6.8b, v4.8h, #8
++    fetch_src_pixblock                         /* issued only after the tail has read v0-v2 */
++    shrn        v7.8b, v4.8h, #3
++    sli         v4.8h, v4.8h, #5
++        ushll       v14.8h, v17.8b, #7
++        sli         v14.8h, v14.8h, #1
++                                    PF add PF_X, PF_X, #8
++        ushll       v8.8h, v19.8b, #7
++        sli         v8.8h, v8.8h,  #1
++                                    PF tst PF_CTL, #0xF
++    sri         v6.8b, v6.8b, #5
++                                    PF beq 10f
++                                    PF add PF_X, PF_X, #8
++10:
++    mvn         v3.8b, v3.8b                   /* head: invert the newly fetched source alpha */
++                                    PF beq 10f
++                                    PF sub PF_CTL, PF_CTL, #1
++10:
++    sri         v7.8b, v7.8b, #6
++    shrn        v30.8b, v4.8h, #2
++    umull       v10.8h, v3.8b, v6.8b
++                                    PF lsl DUMMY, PF_X, #src_bpp_shift
++                                    PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
++    umull       v11.8h, v3.8b, v7.8b
++    umull       v12.8h, v3.8b, v30.8b
++                                    PF lsl DUMMY, PF_X, #dst_bpp_shift
++                                    PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
++        sri         v14.8h, v8.8h, #5
++                                    PF cmp PF_X, ORIG_W
++        ushll       v9.8h, v18.8b, #7
++        sli         v9.8h, v9.8h, #1
++    urshr       v17.8h, v10.8h, #8
++                                    PF ble 10f
++                                    PF sub PF_X, PF_X, ORIG_W
++10:
++    urshr       v19.8h, v11.8h, #8
++    urshr       v18.8h, v12.8h, #8
++                                    PF ble 10f
++                                    PF subs PF_CTL, PF_CTL, #0x10
++10:
++        sri         v14.8h, v9.8h, #11
++        mov         v28.d[0], v14.d[0]
++        mov         v29.d[0], v14.d[1]
++                                    PF ble 10f
++                                    PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
++                                    PF ldrsb DUMMY, [PF_SRC, DUMMY]
++                                    PF add PF_SRC, PF_SRC, #1
++10:
++    raddhn      v20.8b, v10.8h, v17.8h
++    raddhn      v23.8b, v11.8h, v19.8h
++                                    PF ble 10f
++                                    PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
++                                    PF ldrsb DUMMY, [PF_DST, DUMMY]
++                                    PF add PF_DST, PF_DST, #1 /* advance PF_DST from itself, not from PF_SRC */
++10:
++    raddhn      v22.8b, v12.8h, v18.8h
++        st1         {v14.8h}, [DST_W], #16     /* tail: store the 8 finished r5g6b5 pixels */
++.endm
++
++#else
++
++/* If we did not care much about the performance, we would just use this... */
++.macro pixman_composite_over_8888_0565_process_pixblock_tail_head /* unscheduled reference variant */
++    pixman_composite_over_8888_0565_process_pixblock_tail  /* finish the current block into v14 */
++    st1         {v14.8h}, [DST_W], #16                     /* store the 8 finished pixels */
++    ld1         {v4.4h, v5.4h}, [DST_R], #16               /* v4/v5: the head macro reads v5.d[0] */
++    fetch_src_pixblock
++    pixman_composite_over_8888_0565_process_pixblock_head  /* start blending the next block */
++    cache_preload 8, 8                                     /* both arguments: pixels per block */
++.endm
++
++#endif
++
++/*
++ * And now the final part. We are using 'generate_composite_function' macro
++ * to put all the stuff together. We are specifying the name of the function
++ * which we want to get, number of bits per pixel for the source, mask and
++ * destination (0 if unused, like mask in this case). Next come some bit
++ * flags:
++ *   FLAG_DST_READWRITE      - tells that the destination buffer is both read
++ *                             and written, for write-only buffer we would use
++ *                             FLAG_DST_WRITEONLY flag instead
++ *   FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
++ *                             and separate color channels for 32bpp format.
++ * The next things are:
++ *  - the number of pixels processed per iteration (8 in this case, because
++ *    that's the maximum that can fit into four 64-bit NEON registers).
++ *  - prefetch distance, measured in pixel blocks. In this case it is 5 times
++ *    by 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal
++ *    prefetch distance can be selected by running some benchmarks.
++ *
++ * After that we specify some macros, these are 'default_init',
++ * 'default_cleanup' here which are empty (but it is possible to have custom
++ * init/cleanup macros to be able to save/restore some extra NEON registers
++ * like d8-d15 or do anything else) followed by
++ * 'pixman_composite_over_8888_0565_process_pixblock_head',
++ * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
++ * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
++ * which we got implemented above.
++ *
++ * The last part is the NEON registers allocation scheme.
++ */
++generate_composite_function \
++    pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, /* name; src, mask (0 = unused), dst bpp */ \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, /* dst is read and written; planar 32bpp data */ \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    default_init, /* no custom init code */ \
++    default_cleanup, /* no custom cleanup code */ \
++    pixman_composite_over_8888_0565_process_pixblock_head, \
++    pixman_composite_over_8888_0565_process_pixblock_tail, \
++    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    4,  /* dst_r_basereg */ \
++    0,  /* src_basereg   */ \
++    24  /* mask_basereg  */
++
++/******************************************************************************/
++
++.macro pixman_composite_over_n_0565_process_pixblock_head /* solid OVER 0565: unpack dst, scale by v3 */
++    /* convert 8 r5g6b5 pixel data from {v4} to planar 8-bit format
++       and put data into v6 - red, v7 - green, v30 - blue */
++    mov         v4.d[1], v5.d[0]  /* join both 64-bit halves into one 128-bit v4 */
++    shrn        v6.8b, v4.8h, #8  /* bits 15..8 of each pixel: rrrrrggg */
++    shrn        v7.8b, v4.8h, #3  /* bits 10..3 of each pixel: ggggggbb */
++    sli         v4.8h, v4.8h, #5  /* copy the 5 blue bits up into bits 9..5 */
++    sri         v6.8b, v6.8b, #5  /* red: keep top 5 bits, refill low 3 from them */
++    sri         v7.8b, v7.8b, #6  /* green: keep top 6 bits, refill low 2 from them */
++    shrn        v30.8b, v4.8h, #2 /* blue: bits 9..2 of the doubled blue field */
++    /* now do alpha blending, storing results in 8-bit planar format
++       into v20 - red, v23 - green, v22 - blue */
++    umull       v10.8h, v3.8b, v6.8b   /* v3 is used as-is here; no mvn in this macro */
++    umull       v11.8h, v3.8b, v7.8b
++    umull       v12.8h, v3.8b, v30.8b
++    urshr       v13.8h, v10.8h, #8     /* rounding >> 8: first step of the divide by 255 */
++    urshr       v14.8h, v11.8h, #8
++    urshr       v15.8h, v12.8h, #8
++    raddhn      v20.8b, v10.8h, v13.8h /* rounding add, keep high byte: completes the /255 */
++    raddhn      v23.8b, v11.8h, v14.8h
++    raddhn      v22.8b, v12.8h, v15.8h
++.endm
++
++.macro pixman_composite_over_n_0565_process_pixblock_tail /* add solid colour, repack to r5g6b5 */
++    /* ... continue alpha blending */
++    uqadd       v17.8b, v2.8b, v20.8b  /* v2 + scaled dst red, saturating */
++    uqadd       v18.8b, v0.8b, v22.8b  /* v0 + scaled dst blue */
++    uqadd       v19.8b, v1.8b, v23.8b  /* v1 + scaled dst green */
++    /* convert the result to r5g6b5 and store it into {v14} */
++    ushll       v14.8h, v17.8b, #7     /* red << 7 ... */
++    sli         v14.8h, v14.8h, #1     /* ... and one more: red now in the top byte of each lane */
++    ushll       v8.8h, v19.8b, #7      /* same widening for green */
++    sli         v8.8h, v8.8h, #1
++    ushll       v9.8h, v18.8b, #7      /* same widening for blue */
++    sli         v9.8h, v9.8h, #1
++    sri         v14.8h, v8.8h, #5      /* insert green below the top 5 bits */
++    sri         v14.8h, v9.8h, #11     /* insert blue below the top 11 bits */
++    mov         v28.d[0], v14.d[0]     /* low 4 pixels to the first result register */
++    mov         v29.d[0], v14.d[1]     /* high 4 pixels to the second result register */
++.endm
++
++/* TODO: expand macros and do better instructions scheduling */
++.macro pixman_composite_over_n_0565_process_pixblock_tail_head /* plain tail, load/store, head; not interleaved */
++    pixman_composite_over_n_0565_process_pixblock_tail   /* finish the current block into v14 */
++    ld1         {v4.4h, v5.4h}, [DST_R], #16             /* next 8 destination pixels */
++    st1         {v14.8h}, [DST_W], #16                   /* store before head reuses v14 as scratch */
++    pixman_composite_over_n_0565_process_pixblock_head   /* start blending the next block */
++    cache_preload 8, 8                                   /* both arguments: pixels per block */
++.endm
++
++.macro pixman_composite_over_n_0565_init /* splat the four bytes of w4 into v0-v3 once, up front */
++    mov         v3.s[0], w4       /* w4: presumably the solid source colour - confirm in the template */
++    dup         v0.8b, v3.b[0]    /* byte 0 to every lane of v0 */
++    dup         v1.8b, v3.b[1]    /* byte 1 to every lane of v1 */
++    dup         v2.8b, v3.b[2]    /* byte 2 to every lane of v2 */
++    dup         v3.8b, v3.b[3]    /* byte 3 to every lane of v3 (used as alpha below) */
++    mvn         v3.8b, v3.8b      /* invert source alpha */
++.endm
++
++generate_composite_function \
++    pixman_composite_over_n_0565_asm_neon, 0, 0, 16, /* name; src and mask bpp 0, dst 16 bpp */ \
++    FLAG_DST_READWRITE, /* destination is read and written */ \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    pixman_composite_over_n_0565_init, /* custom init: prepares v0-v3 */ \
++    default_cleanup, \
++    pixman_composite_over_n_0565_process_pixblock_head, \
++    pixman_composite_over_n_0565_process_pixblock_tail, \
++    pixman_composite_over_n_0565_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    4,  /* dst_r_basereg */ \
++    0,  /* src_basereg   */ \
++    24  /* mask_basereg  */
++
++/******************************************************************************/
++
++.macro pixman_composite_src_8888_0565_process_pixblock_head /* widen v0-v2 into the top byte of 16-bit lanes */
++    ushll       v8.8h,  v1.8b,  #7   /* v1 << 7 ... */
++    sli         v8.8h,  v8.8h,  #1   /* ... and one more: v1 << 8 */
++    ushll       v14.8h, v2.8b,  #7   /* v2 << 7 ... */
++    sli         v14.8h, v14.8h, #1   /* ... v2 << 8; v14 becomes the packed result in the tail */
++    ushll       v9.8h,  v0.8b,  #7   /* v0 << 7 ... */
++    sli         v9.8h,  v9.8h,  #1   /* ... v0 << 8 */
++.endm
++
++.macro pixman_composite_src_8888_0565_process_pixblock_tail /* merge the three widened channels into 16-bit pixels */
++    sri         v14.8h, v8.8h, #5    /* insert v8 below the top 5 bits of v14 */
++    sri         v14.8h, v9.8h, #11   /* insert v9 below the top 11 bits of v14 */
++    mov         v28.d[0], v14.d[0]   /* low 4 pixels to the first result register */
++    mov         v29.d[0], v14.d[1]   /* high 4 pixels to the second result register */
++.endm
++
++.macro pixman_composite_src_8888_0565_process_pixblock_tail_head /* tail of block N interleaved with head of block N+1 */
++        sri         v14.8h, v8.8h, #5      /* tail: insert v8 below the top 5 bits */
++                                    PF add PF_X, PF_X, #8
++                                    PF tst PF_CTL, #0xF
++    fetch_src_pixblock                     /* v0-v2 of block N were already widened into v8/v9/v14 */
++                                    PF beq 10f
++                                    PF add PF_X, PF_X, #8
++                                    PF sub PF_CTL, PF_CTL, #1
++10:
++        sri         v14.8h, v9.8h, #11     /* tail: insert v9 below the top 11 bits */
++        mov         v28.d[0], v14.d[0]
++        mov         v29.d[0], v14.d[1]
++                                    PF cmp PF_X, ORIG_W
++                                    PF lsl DUMMY, PF_X, #src_bpp_shift
++                                    PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
++    ushll       v8.8h, v1.8b, #7           /* head: start widening the new block */
++    sli         v8.8h, v8.8h, #1
++        st1        {v14.8h}, [DST_W], #16  /* store must precede the head rewrite of v14 below */
++                                    PF ble 10f
++                                    PF sub PF_X, PF_X, ORIG_W
++                                    PF subs PF_CTL, PF_CTL, #0x10
++10:
++    ushll       v14.8h, v2.8b, #7
++    sli         v14.8h, v14.8h, #1
++                                    PF ble 10f
++                                    PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
++                                    PF ldrsb DUMMY, [PF_SRC, DUMMY]
++                                    PF add PF_SRC, PF_SRC, #1
++10:
++    ushll       v9.8h, v0.8b, #7
++    sli         v9.8h, v9.8h, #1
++.endm
++
++generate_composite_function \
++    pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, /* name; src 32 bpp, no mask, dst 16 bpp */ \
++    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, /* dst is only written; planar 32bpp data */ \
++    8, /* number of pixels, processed in a single block */ \
++    10, /* prefetch distance */ \
++    default_init, /* no custom init code */ \
++    default_cleanup, /* no custom cleanup code; base registers left unspecified */ \
++    pixman_composite_src_8888_0565_process_pixblock_head, \
++    pixman_composite_src_8888_0565_process_pixblock_tail, \
++    pixman_composite_src_8888_0565_process_pixblock_tail_head
++
++/******************************************************************************/
++
++.macro pixman_composite_src_0565_8888_process_pixblock_head /* unpack 8 r5g6b5 pixels into planar v28-v31 */
++    mov         v0.d[1], v1.d[0]    /* join both 64-bit halves into one 128-bit v0 */
++    shrn        v30.8b, v0.8h, #8   /* bits 15..8 of each pixel: rrrrrggg */
++    shrn        v29.8b, v0.8h, #3   /* bits 10..3 of each pixel: ggggggbb */
++    sli         v0.8h,  v0.8h, #5   /* copy the 5 blue bits up into bits 9..5 */
++    movi        v31.8b, #255        /* fourth output plane is the constant 255 */
++    sri         v30.8b, v30.8b, #5  /* red: keep top 5 bits, refill low 3 from them */
++    sri         v29.8b, v29.8b, #6  /* green: keep top 6 bits, refill low 2 from them */
++    shrn        v28.8b, v0.8h, #2   /* blue: bits 9..2 of the doubled blue field */
++.endm
++
++.macro pixman_composite_src_0565_8888_process_pixblock_tail /* intentionally empty: head already fills v28-v31 */
++.endm
++
++/* TODO: expand macros and do better instructions scheduling */
++.macro pixman_composite_src_0565_8888_process_pixblock_tail_head /* plain tail, store, fetch, head; not interleaved */
++    pixman_composite_src_0565_8888_process_pixblock_tail            /* expands to nothing */
++    st4         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32      /* interleave the 4 planes on store */
++    fetch_src_pixblock
++    pixman_composite_src_0565_8888_process_pixblock_head            /* unpack the next block */
++    cache_preload 8, 8                                              /* both arguments: pixels per block */
++.endm
++
++generate_composite_function \
++    pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, /* name; src 16 bpp, no mask, dst 32 bpp */ \
++    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, /* dst is only written; planar 32bpp data */ \
++    8, /* number of pixels, processed in a single block */ \
++    10, /* prefetch distance */ \
++    default_init, /* no custom init code */ \
++    default_cleanup, /* no custom cleanup code; base registers left unspecified */ \
++    pixman_composite_src_0565_8888_process_pixblock_head, \
++    pixman_composite_src_0565_8888_process_pixblock_tail, \
++    pixman_composite_src_0565_8888_process_pixblock_tail_head
++
++/******************************************************************************/
++
++.macro pixman_composite_add_8_8_process_pixblock_head /* ADD: per-byte saturating src + dst over 32 bytes */
++    uqadd       v28.8b, v0.8b, v4.8b   /* bytes 0..7:   v0 + v4, clamped at 255 */
++    uqadd       v29.8b, v1.8b, v5.8b   /* bytes 8..15 */
++    uqadd       v30.8b, v2.8b, v6.8b   /* bytes 16..23 */
++    uqadd       v31.8b, v3.8b, v7.8b   /* bytes 24..31 */
++.endm
++
++.macro pixman_composite_add_8_8_process_pixblock_tail /* intentionally empty: head already fills v28-v31 */
++.endm
++
++.macro pixman_composite_add_8_8_process_pixblock_tail_head /* store block N, load and add block N+1 */
++    fetch_src_pixblock
++                                    PF add PF_X, PF_X, #32
++                                    PF tst PF_CTL, #0xF
++    ld1         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32    /* next 32 destination bytes */
++                                    PF beq 10f
++                                    PF add PF_X, PF_X, #32
++                                    PF sub PF_CTL, PF_CTL, #1
++10:
++        st1     {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 /* previous result, before v28 is rewritten */
++                                    PF cmp PF_X, ORIG_W
++                                    PF lsl DUMMY, PF_X, #src_bpp_shift
++                                    PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
++                                    PF lsl DUMMY, PF_X, #dst_bpp_shift
++                                    PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
++                                    PF ble 10f
++                                    PF sub PF_X, PF_X, ORIG_W
++                                    PF subs PF_CTL, PF_CTL, #0x10
++10:
++    uqadd       v28.8b, v0.8b, v4.8b   /* same four saturating adds as the head macro */
++                                    PF ble 10f
++                                    PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
++                                    PF ldrsb DUMMY, [PF_SRC, DUMMY]
++                                    PF add PF_SRC, PF_SRC, #1
++                                    PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
++                                    PF ldrsb DUMMY, [PF_DST, DUMMY]
++                                    PF add PF_DST, PF_DST, #1
++10:
++    uqadd       v29.8b, v1.8b, v5.8b
++    uqadd       v30.8b, v2.8b, v6.8b
++    uqadd       v31.8b, v3.8b, v7.8b
++.endm
++
++generate_composite_function \
++    pixman_composite_add_8_8_asm_neon, 8, 0, 8, /* name; src 8 bpp, no mask, dst 8 bpp */ \
++    FLAG_DST_READWRITE, /* destination is read and written */ \
++    32, /* number of pixels, processed in a single block */ \
++    10, /* prefetch distance */ \
++    default_init, /* no custom init code */ \
++    default_cleanup, /* no custom cleanup code; base registers left unspecified */ \
++    pixman_composite_add_8_8_process_pixblock_head, \
++    pixman_composite_add_8_8_process_pixblock_tail, \
++    pixman_composite_add_8_8_process_pixblock_tail_head
++
++/******************************************************************************/
++
++.macro pixman_composite_add_8888_8888_process_pixblock_tail_head /* as the add_8_8 variant, but PF_X steps by 8 pixels */
++    fetch_src_pixblock
++                                    PF add PF_X, PF_X, #8
++                                    PF tst PF_CTL, #0xF
++    ld1         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32    /* next 32 destination bytes */
++                                    PF beq 10f
++                                    PF add PF_X, PF_X, #8
++                                    PF sub PF_CTL, PF_CTL, #1
++10:
++        st1     {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 /* previous result, before v28 is rewritten */
++                                    PF cmp PF_X, ORIG_W
++                                    PF lsl DUMMY, PF_X, #src_bpp_shift
++                                    PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
++                                    PF lsl DUMMY, PF_X, #dst_bpp_shift
++                                    PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
++                                    PF ble 10f
++                                    PF sub PF_X, PF_X, ORIG_W
++                                    PF subs PF_CTL, PF_CTL, #0x10
++10:
++    uqadd       v28.8b, v0.8b, v4.8b   /* per-byte saturating src + dst */
++                                    PF ble 10f
++                                    PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
++                                    PF ldrsb DUMMY, [PF_SRC, DUMMY]
++                                    PF add PF_SRC, PF_SRC, #1
++                                    PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
++                                    PF ldrsb DUMMY, [PF_DST, DUMMY]
++                                    PF add PF_DST, PF_DST, #1
++10:
++    uqadd       v29.8b, v1.8b, v5.8b
++    uqadd       v30.8b, v2.8b, v6.8b
++    uqadd       v31.8b, v3.8b, v7.8b
++.endm
++
++generate_composite_function \
++    pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, /* name; src 32 bpp, no mask, dst 32 bpp */ \
++    FLAG_DST_READWRITE, /* no deinterleave flag: the add is per byte */ \
++    8, /* number of pixels, processed in a single block */ \
++    10, /* prefetch distance */ \
++    default_init, /* no custom init code */ \
++    default_cleanup, /* no custom cleanup code */ \
++    pixman_composite_add_8_8_process_pixblock_head, /* head and tail reused from add_8_8 */ \
++    pixman_composite_add_8_8_process_pixblock_tail, \
++    pixman_composite_add_8888_8888_process_pixblock_tail_head
++
++generate_composite_function_single_scanline \
++    pixman_composite_scanline_add_asm_neon, 32, 0, 32, /* name; src 32 bpp, no mask, dst 32 bpp */ \
++    FLAG_DST_READWRITE, /* destination is read and written */ \
++    8, /* number of pixels, processed in a single block */ \
++    default_init, /* this generator is given no prefetch distance */ \
++    default_cleanup, \
++    pixman_composite_add_8_8_process_pixblock_head, /* head and tail reused from add_8_8 */ \
++    pixman_composite_add_8_8_process_pixblock_tail, \
++    pixman_composite_add_8888_8888_process_pixblock_tail_head
++
++/******************************************************************************/
++
++.macro pixman_composite_out_reverse_8888_8888_process_pixblock_head /* dst * (255 - src alpha), 16-bit products */
++    mvn         v24.8b, v3.8b  /* get inverted alpha */
++    /* do alpha blending */
++    umull       v8.8h, v24.8b, v4.8b    /* destination plane in v4 */
++    umull       v9.8h, v24.8b, v5.8b    /* destination plane in v5 */
++    umull       v10.8h, v24.8b, v6.8b   /* destination plane in v6 */
++    umull       v11.8h, v24.8b, v7.8b   /* destination plane in v7 */
++.endm
++
++.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail /* reduce the 16-bit products to 8 bit (/255) */
++    urshr       v14.8h, v8.8h, #8       /* rounding >> 8: first step of the divide by 255 */
++    urshr       v15.8h, v9.8h, #8
++    urshr       v16.8h, v10.8h, #8
++    urshr       v17.8h, v11.8h, #8
++    raddhn      v28.8b, v14.8h, v8.8h   /* rounding add, keep high byte: completes the /255 */
++    raddhn      v29.8b, v15.8h, v9.8h
++    raddhn      v30.8b, v16.8h, v10.8h
++    raddhn      v31.8b, v17.8h, v11.8h
++.endm
++
++.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head /* tail of block N interleaved with head of block N+1 */
++     ld4        {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32   /* next destination block, deinterleaved */
++        urshr       v14.8h, v8.8h, #8                        /* tail: reduce the previous products */
++                                    PF add PF_X, PF_X, #8
++                                    PF tst PF_CTL, #0xF
++        urshr       v15.8h, v9.8h, #8
++        urshr       v16.8h, v10.8h, #8
++        urshr       v17.8h, v11.8h, #8
++                                    PF beq 10f
++                                    PF add PF_X, PF_X, #8
++                                    PF sub PF_CTL, PF_CTL, #1
++10:
++        raddhn      v28.8b, v14.8h, v8.8h
++        raddhn      v29.8b, v15.8h, v9.8h
++                                    PF cmp PF_X, ORIG_W
++        raddhn      v30.8b, v16.8h, v10.8h
++        raddhn      v31.8b, v17.8h, v11.8h
++    fetch_src_pixblock
++                                    PF lsl DUMMY, PF_X, #src_bpp_shift
++                                    PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
++    mvn         v22.8b, v3.8b                                /* head: inverted alpha goes to v22 here, not v24 */
++                                    PF lsl DUMMY, PF_X, #dst_bpp_shift
++                                    PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
++         st4        {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 /* store the finished block, interleaved */
++                                    PF ble 10f
++                                    PF sub PF_X, PF_X, ORIG_W
++10:
++    umull      v8.8h, v22.8b, v4.8b                          /* head: products for the next block */
++                                    PF ble 10f
++                                    PF subs PF_CTL, PF_CTL, #0x10
++10:
++    umull      v9.8h, v22.8b, v5.8b
++                                    PF ble 10f
++                                    PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
++                                    PF ldrsb DUMMY, [PF_SRC, DUMMY]
++                                    PF add PF_SRC, PF_SRC, #1
++10:
++    umull      v10.8h, v22.8b, v6.8b
++                                    PF ble 10f
++                                    PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
++                                    PF ldrsb DUMMY, [PF_DST, DUMMY]
++                                    PF add PF_DST, PF_DST, #1
++10:
++     umull     v11.8h, v22.8b, v7.8b
++.endm
++
++generate_composite_function_single_scanline \
++    pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, /* name; src 32 bpp, no mask, dst 32 bpp */ \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, /* dst is read and written; planar 32bpp data */ \
++    8, /* number of pixels, processed in a single block */ \
++    default_init, /* this generator is given no prefetch distance */ \
++    default_cleanup, \
++    pixman_composite_out_reverse_8888_8888_process_pixblock_head, \
++    pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \
++    pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
++
++/******************************************************************************/
++
++.macro pixman_composite_over_8888_8888_process_pixblock_head /* OVER starts exactly like OUT_REVERSE */
++    pixman_composite_out_reverse_8888_8888_process_pixblock_head /* dst * (255 - src alpha) products */
++.endm
++
++.macro pixman_composite_over_8888_8888_process_pixblock_tail /* OUT_REVERSE result plus the source */
++    pixman_composite_out_reverse_8888_8888_process_pixblock_tail /* v28-v31 = scaled destination */
++    uqadd       v28.8b, v0.8b, v28.8b   /* add source plane v0, saturating */
++    uqadd       v29.8b, v1.8b, v29.8b   /* add source plane v1 */
++    uqadd       v30.8b, v2.8b, v30.8b   /* add source plane v2 */
++    uqadd       v31.8b, v3.8b, v31.8b   /* add source plane v3 */
++.endm
++
++.macro pixman_composite_over_8888_8888_process_pixblock_tail_head /* tail of block N interleaved with head of block N+1 */
++     ld4        {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32   /* next destination block, deinterleaved */
++        urshr       v14.8h, v8.8h, #8                        /* tail: reduce the previous products */
++                                    PF add PF_X, PF_X, #8
++                                    PF tst PF_CTL, #0xF
++        urshr       v15.8h, v9.8h, #8
++        urshr       v16.8h, v10.8h, #8
++        urshr       v17.8h, v11.8h, #8
++                                    PF beq 10f
++                                    PF add PF_X, PF_X, #8
++                                    PF sub PF_CTL, PF_CTL, #1
++10:
++        raddhn      v28.8b, v14.8h, v8.8h
++        raddhn      v29.8b, v15.8h, v9.8h
++                                    PF cmp PF_X, ORIG_W
++        raddhn      v30.8b, v16.8h, v10.8h
++        raddhn      v31.8b, v17.8h, v11.8h
++        uqadd       v28.8b, v0.8b, v28.8b                    /* tail: add the old source before it is refetched */
++        uqadd       v29.8b, v1.8b, v29.8b
++        uqadd       v30.8b, v2.8b, v30.8b
++        uqadd       v31.8b, v3.8b, v31.8b
++    fetch_src_pixblock
++                                    PF lsl DUMMY, PF_X, #src_bpp_shift
++                                    PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
++    mvn        v22.8b, v3.8b                                 /* head: inverted alpha goes to v22 here, not v24 */
++                                    PF lsl DUMMY, PF_X, #dst_bpp_shift
++                                    PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
++         st4        {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 /* store the finished block, interleaved */
++                                    PF ble 10f
++                                    PF sub PF_X, PF_X, ORIG_W
++10:
++    umull      v8.8h, v22.8b, v4.8b                          /* head: products for the next block */
++                                    PF ble 10f
++                                    PF subs PF_CTL, PF_CTL, #0x10
++10:
++    umull      v9.8h, v22.8b, v5.8b
++                                    PF ble 10f
++                                    PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
++                                    PF ldrsb DUMMY, [PF_SRC, DUMMY]
++                                    PF add PF_SRC, PF_SRC, #1
++10:
++    umull      v10.8h, v22.8b, v6.8b
++                                    PF ble 10f
++                                    PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
++                                    PF ldrsb DUMMY, [PF_DST, DUMMY]
++                                    PF add PF_DST, PF_DST, #1
++10:
++    umull      v11.8h, v22.8b, v7.8b
++.endm
++
++generate_composite_function \
++    pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, /* generated symbol; then presumably src, mask, dst bits per pixel -- TODO confirm against the macro definition */ \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    default_init, \
++    default_cleanup, \
++    pixman_composite_over_8888_8888_process_pixblock_head, /* the three over_8888_8888 pixblock macros defined above */ \
++    pixman_composite_over_8888_8888_process_pixblock_tail, \
++    pixman_composite_over_8888_8888_process_pixblock_tail_head
++
++generate_composite_function_single_scanline \
++    pixman_composite_scanline_over_asm_neon, 32, 0, 32, /* single-scanline variant built from the same over_8888_8888 pixblock macros */ \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    default_init, /* unlike the generate_composite_function call above, no prefetch distance is passed here */ \
++    default_cleanup, \
++    pixman_composite_over_8888_8888_process_pixblock_head, \
++    pixman_composite_over_8888_8888_process_pixblock_tail, \
++    pixman_composite_over_8888_8888_process_pixblock_tail_head
++
++/******************************************************************************/
++
++.macro pixman_composite_over_n_8888_process_pixblock_head /* multiplies each destination channel by the inverted alpha in v24; NOTE(review): the over_n_8888 generate_composite_function call in this section passes the over_8888_8888 head, not this macro */
++    /* deinterleaved source pixels in {v0, v1, v2, v3} */
++    /* inverted alpha in {v24} */
++    /* destination pixels in {v4, v5, v6, v7} */
++    umull       v8.8h, v24.8b, v4.8b /* widening multiply: 16-bit products land in v8-v11 */
++    umull       v9.8h, v24.8b, v5.8b
++    umull       v10.8h, v24.8b, v6.8b
++    umull       v11.8h, v24.8b, v7.8b
++.endm
++
++.macro pixman_composite_over_n_8888_process_pixblock_tail /* turns the 16-bit products in v8-v11 into bytes and adds v0-v3; NOTE(review): the over_n_8888 generate_composite_function call in this section passes the over_8888_8888 tail, not this macro */
++    urshr       v14.8h, v8.8h, #8 /* v14 = (v8 + 128) >> 8 */
++    urshr       v15.8h, v9.8h, #8
++    urshr       v16.8h, v10.8h, #8
++    urshr       v17.8h, v11.8h, #8
++    raddhn      v28.8b, v14.8h, v8.8h /* v28 = (v8 + v14 + 128) >> 8, i.e. the product divided by 255 with rounding */
++    raddhn      v29.8b, v15.8h, v9.8h
++    raddhn      v30.8b, v16.8h, v10.8h
++    raddhn      v31.8b, v17.8h, v11.8h
++    uqadd       v28.8b, v0.8b, v28.8b /* unsigned saturating add of v0-v3 */
++    uqadd       v29.8b, v1.8b, v29.8b
++    uqadd       v30.8b, v2.8b, v30.8b
++    uqadd       v31.8b, v3.8b, v31.8b
++.endm
++
++.macro pixman_composite_over_n_8888_process_pixblock_tail_head /* combined step: finishes the block whose products sit in v8-v11, then loads the next destination block and multiplies it by v24; PF-prefixed lines are prefetch bookkeeping */
++        urshr       v14.8h, v8.8h, #8 /* v14 = (v8 + 128) >> 8 */
++        urshr       v15.8h, v9.8h, #8
++        urshr       v16.8h, v10.8h, #8
++        urshr       v17.8h, v11.8h, #8
++        raddhn      v28.8b, v14.8h, v8.8h /* v28 = (v8 + v14 + 128) >> 8, i.e. the product divided by 255 with rounding */
++        raddhn      v29.8b, v15.8h, v9.8h
++        raddhn      v30.8b, v16.8h, v10.8h
++        raddhn      v31.8b, v17.8h, v11.8h
++    ld4         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 /* next 32 destination bytes, de-interleaved into v4-v7 */
++        uqadd       v28.8b, v0.8b, v28.8b /* saturating add of v0-v3 (set once by the init macro, never reloaded here) */
++                                    PF add PF_X, PF_X, #8
++                                    PF tst PF_CTL, #0x0F
++                                    PF beq 10f /* low 4 bits of PF_CTL are zero: skip the extra advance */
++                                    PF add PF_X, PF_X, #8
++                                    PF sub PF_CTL, PF_CTL, #1
++10:
++        uqadd       v29.8b, v1.8b, v29.8b
++        uqadd       v30.8b, v2.8b, v30.8b
++        uqadd       v31.8b, v3.8b, v31.8b
++                                    PF cmp PF_X, ORIG_W /* flags consumed by the next PF ble */
++    umull       v8.8h, v24.8b, v4.8b /* products for the next block: v8-v11 = v24 * v4-v7 */
++                                    PF lsl DUMMY, PF_X, #dst_bpp_shift
++                                    PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
++    umull       v9.8h, v24.8b, v5.8b
++                                    PF ble 10f /* PF_X <= ORIG_W: no wrap of PF_X */
++                                    PF sub PF_X, PF_X, ORIG_W
++10:
++    umull       v10.8h, v24.8b, v6.8b
++                                    PF subs PF_CTL, PF_CTL, #0x10 /* NOTE(review): runs unconditionally here; the over_8888_8888 tail_head guards the same subs with a PF ble -- confirm intended */
++    umull       v11.8h, v24.8b, v7.8b
++                                    PF ble 10f /* tests the flags set by the subs above */
++                                    PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
++                                    PF ldrsb DUMMY, [PF_DST, DUMMY] /* touches one byte at PF_DST + (DST_STRIDE << dst_bpp_shift) */
++                                    PF add PF_DST, PF_DST, #1
++10:
++        st4         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 /* write the finished block, re-interleaved */
++.endm
++
++.macro pixman_composite_over_n_8888_init /* setup: spreads the four bytes of w4 (presumably the solid source colour -- TODO confirm) over v0-v3 and derives v24 */
++    mov         v3.s[0], w4 /* stage the 32-bit value in lane 0 of v3 */
++    dup         v0.8b, v3.b[0] /* byte 0 replicated to all 8 lanes of v0 */
++    dup         v1.8b, v3.b[1]
++    dup         v2.8b, v3.b[2]
++    dup         v3.8b, v3.b[3] /* v3 is overwritten last because it also holds the staged value */
++    mvn         v24.8b, v3.8b  /* get inverted alpha */
++.endm
++
++generate_composite_function \
++    pixman_composite_over_n_8888_asm_neon, 0, 0, 32, /* generated symbol; then presumably src, mask, dst bits per pixel (0 = none fetched) -- TODO confirm */ \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    pixman_composite_over_n_8888_init, \
++    default_cleanup, \
++    pixman_composite_over_8888_8888_process_pixblock_head, /* head and tail are the over_8888_8888 macros, not the over_n_8888 ones */ \
++    pixman_composite_over_8888_8888_process_pixblock_tail, \
++    pixman_composite_over_n_8888_process_pixblock_tail_head
++
++/******************************************************************************/
++
++.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head /* combined step for OVER_REVERSE: destination pixels are loaded into v0-v3, the constant from the init macro stays in v4-v7; PF-prefixed lines are prefetch bookkeeping */
++        urshr       v14.8h, v8.8h, #8 /* v14 = (v8 + 128) >> 8 */
++                                    PF add PF_X, PF_X, #8
++                                    PF tst PF_CTL, #0xF
++        urshr       v15.8h, v9.8h, #8
++        urshr       v12.8h, v10.8h, #8
++        urshr       v13.8h, v11.8h, #8
++                                    PF beq 10f /* low 4 bits of PF_CTL are zero: skip the extra advance */
++                                    PF add PF_X, PF_X, #8
++                                    PF sub PF_CTL, PF_CTL, #1
++10:
++        raddhn      v28.8b, v14.8h, v8.8h /* v28 = (v8 + v14 + 128) >> 8, i.e. the product divided by 255 with rounding */
++        raddhn      v29.8b, v15.8h, v9.8h
++                                    PF cmp PF_X, ORIG_W /* flags consumed by the PF blt branches further down */
++        raddhn      v30.8b, v12.8h, v10.8h
++        raddhn      v31.8b, v13.8h, v11.8h
++        uqadd       v28.8b, v0.8b, v28.8b /* saturating add of the previously loaded destination block in v0-v3 */
++        uqadd       v29.8b, v1.8b, v29.8b
++        uqadd       v30.8b, v2.8b, v30.8b
++        uqadd       v31.8b, v3.8b, v31.8b
++    ld4         {v0.8b, v1.8b, v2.8b, v3.8b}, [DST_R], #32 /* next 32 destination bytes, de-interleaved into v0-v3 */
++    mvn         v22.8b, v3.8b /* v22 = bitwise NOT of the destination v3 plane (255 - v3 per byte) */
++                                    PF lsl DUMMY, PF_X, #dst_bpp_shift
++                                    PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
++        st4         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 /* write the finished block, re-interleaved */
++                                    PF blt 10f /* note: blt here, whereas the over_8888_8888 tail_head uses ble at the same points */
++                                    PF sub PF_X, PF_X, ORIG_W
++10:
++    umull       v8.8h, v22.8b, v4.8b /* products for the next block: v8-v11 = v22 * v4-v7 */
++                                    PF blt 10f
++                                    PF subs PF_CTL, PF_CTL, #0x10 /* when executed, sets the flags tested by the blt below */
++10:
++    umull       v9.8h, v22.8b, v5.8b
++    umull       v10.8h, v22.8b, v6.8b
++                                    PF blt 10f
++                                    PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
++                                    PF ldrsb DUMMY, [PF_DST, DUMMY] /* touches one byte at PF_DST + (DST_STRIDE << dst_bpp_shift) */
++                                    PF add PF_DST, PF_DST, #1
++10:
++    umull       v11.8h, v22.8b, v7.8b
++.endm
++
++.macro pixman_composite_over_reverse_n_8888_init /* setup: spreads the four bytes of w4 (presumably the solid colour -- TODO confirm) over v4-v7 */
++    mov         v7.s[0], w4 /* stage the 32-bit value in lane 0 of v7 */
++    dup         v4.8b, v7.b[0] /* byte 0 replicated to all 8 lanes of v4 */
++    dup         v5.8b, v7.b[1]
++    dup         v6.8b, v7.b[2]
++    dup         v7.8b, v7.b[3] /* v7 is overwritten last because it also holds the staged value */
++.endm
++
++generate_composite_function \
++    pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, /* generated symbol; then presumably src, mask, dst bits per pixel (0 = none fetched) -- TODO confirm */ \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    pixman_composite_over_reverse_n_8888_init, \
++    default_cleanup, \
++    pixman_composite_over_8888_8888_process_pixblock_head, /* head and tail are reused from over_8888_8888 */ \
++    pixman_composite_over_8888_8888_process_pixblock_tail, \
++    pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    0,  /* dst_r_basereg */ \
++    4,  /* src_basereg   */ \
++    24  /* mask_basereg  */
++
++/******************************************************************************/
++
++.macro pixman_composite_over_8888_8_0565_process_pixblock_head /* head for OVER onto a 16 bpp destination: scales source channels v8-v11 by the mask in v24 and expands the packed destination in v4/v5 to bytes */
++    umull       v0.8h,  v24.8b, v8.8b    /* IN for SRC pixels (part1) */
++    umull       v1.8h,  v24.8b, v9.8b
++    umull       v2.8h,  v24.8b, v10.8b
++    umull       v3.8h,  v24.8b, v11.8b
++        mov         v4.d[1], v5.d[0] /* join the two 64-bit halves (v4, v5) into one vector of eight 16-bit pixels */
++        shrn        v25.8b,  v4.8h, #8 /* convert DST_R data to 32-bpp (part1) */
++        shrn        v26.8b,  v4.8h, #3
++        sli         v4.8h,   v4.8h, #5 /* v4 = (v4 << 5) with the low 5 bits kept, duplicating the low field upward */
++    urshr       v17.8h, v0.8h,  #8    /* IN for SRC pixels (part2) */
++    urshr       v18.8h, v1.8h,  #8
++    urshr       v19.8h, v2.8h,  #8
++    urshr       v20.8h, v3.8h,  #8
++    raddhn      v0.8b,  v0.8h,  v17.8h /* v0 = (v0 + v17 + 128) >> 8, i.e. the product divided by 255 with rounding */
++    raddhn      v1.8b,  v1.8h,  v18.8h
++    raddhn      v2.8b,  v2.8h,  v19.8h
++    raddhn      v3.8b,  v3.8h,  v20.8h
++        sri         v25.8b, v25.8b, #5 /* convert DST_R data to 32-bpp (part2) */
++        sri         v26.8b, v26.8b, #6
++    mvn         v3.8b,  v3.8b /* v3 = 255 - (mask-scaled v11) per byte */
++        shrn        v30.8b, v4.8h,  #2 /* third destination field as a byte */
++    umull       v18.8h, v3.8b, v25.8b     /* now do alpha blending */
++    umull       v19.8h, v3.8b, v26.8b
++    umull       v20.8h, v3.8b, v30.8b
++.endm
++
++.macro pixman_composite_over_8888_8_0565_process_pixblock_tail /* finishes the blend from the products in v18-v20 and repacks the three channels into eight 16-bit pixels split across v28/v29 */
++    /* 3 cycle bubble (after umull; the note originally named vmull.u8 -- timing not re-verified here) */
++    urshr       v5.8h, v18.8h, #8
++    urshr       v6.8h, v19.8h, #8
++    urshr       v7.8h, v20.8h, #8
++    raddhn      v17.8b, v18.8h, v5.8h /* (x + ((x + 128) >> 8) + 128) >> 8: product divided by 255 with rounding */
++    raddhn      v19.8b, v19.8h, v6.8h
++    raddhn      v18.8b, v20.8h, v7.8h
++    uqadd       v5.8b, v2.8b,  v17.8b /* saturating add of the mask-scaled source channels v0-v2 */
++    /* 1 cycle bubble */
++    uqadd       v6.8b, v0.8b,  v18.8b
++    uqadd       v7.8b, v1.8b,  v19.8b
++    ushll       v14.8h, v5.8b, #7    /* convert to 16bpp */
++    sli         v14.8h, v14.8h, #1 /* together with the ushll #7 above: v14 = v5 << 8 in each 16-bit lane */
++    ushll       v18.8h, v7.8b, #7
++    sli         v18.8h, v18.8h, #1
++    ushll       v19.8h, v6.8b, #7
++    sli         v19.8h, v19.8h, #1
++    sri         v14.8h, v18.8h, #5 /* keep the top 5 bits of v14, insert v18 >> 5 below them */
++    /* 1 cycle bubble */
++    sri         v14.8h, v19.8h, #11 /* keep the top 11 bits of v14, insert v19 >> 11 into the low 5 bits */
++    mov         v28.d[0], v14.d[0] /* split the packed result: low 4 pixels to v28, high 4 pixels to v29 */
++    mov         v29.d[0], v14.d[1]
++.endm
++
++.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head /* combined step; only the #else branch below is assembled: tail, store, loads, head, with no interleaving */
++#if 0 /* NOTE(review): disabled interleaved variant; it writes v11 as a temporary (urshr v11) before reading v11 as a source channel (umull v23), which looks like a register clash -- possibly why it is compiled out */
++    ld1         {v4.8h}, [DST_R], #16
++    shrn        v25.8b,  v4.8h,  #8
++    fetch_mask_pixblock
++    shrn        v26.8b,  v4.8h,  #3
++    fetch_src_pixblock
++    umull       v22.8h,  v24.8b, v10.8b
++        urshr       v13.8h, v18.8h, #8
++        urshr       v11.8h, v19.8h, #8
++        urshr       v15.8h, v20.8h, #8
++        raddhn      v17.8b, v18.8h, v13.8h
++        raddhn      v19.8b, v19.8h, v11.8h
++        raddhn      v18.8b, v20.8h, v15.8h
++        uqadd       v17.8b, v2.8b, v17.8b
++    umull       v21.8h,  v24.8b, v9.8b
++        uqadd       v18.8b, v0.8b, v18.8b
++        uqadd       v19.8b, v1.8b, v19.8b
++        ushll       v14.8h, v17.8b, #7
++        sli         v14.8h, v14.8h, #1
++    umull       v20.8h,  v24.8b, v8.8b
++        ushll       v18.8h,  v18.8b, #7
++        sli         v18.8h,  v18.8h, #1
++        ushll       v19.8h,  v19.8b, #7
++        sli         v19.8h,  v19.8h, #1
++        sri         v14.8h,  v18.8h, #5
++    umull       v23.8h,  v24.8b, v11.8b
++        sri         v14.8h,  v19.8h, #11
++        mov         v28.d[0], v14.d[0]
++        mov         v29.d[0], v14.d[1]
++
++    cache_preload 8, 8
++    
++    sli         v4.8h,  v4.8h,   #5
++    urshr       v16.8h, v20.8h,  #8
++    urshr       v17.8h, v21.8h,  #8
++    urshr       v18.8h, v22.8h,  #8
++    urshr       v19.8h, v23.8h,  #8
++    raddhn      v0.8b,  v20.8h, v16.8h
++    raddhn      v1.8b,  v21.8h, v17.8h
++    raddhn      v2.8b,  v22.8h, v18.8h
++    raddhn      v3.8b,  v23.8h, v19.8h
++    sri         v25.8b,  v25.8b,  #5
++    sri         v26.8b,  v26.8b,  #6
++    mvn         v3.8b,  v3.8b
++    shrn        v30.8b, v4.8h,  #2
++    st1         {v14.8h}, [DST_W], #16
++    umull       v18.8h, v3.8b, v25.8b
++    umull       v19.8h, v3.8b, v26.8b
++    umull       v20.8h, v3.8b, v30.8b
++#else
++    pixman_composite_over_8888_8_0565_process_pixblock_tail /* finish the current block into v28/v29 */
++    st1         {v28.4h, v29.4h}, [DST_W], #16 /* store 8 packed 16-bit pixels */
++    ld1         {v4.4h, v5.4h}, [DST_R], #16 /* load the next 8 destination pixels as two 64-bit halves */
++    fetch_mask_pixblock /* helpers defined elsewhere; presumably reload the mask and source registers -- TODO confirm */
++    fetch_src_pixblock
++    pixman_composite_over_8888_8_0565_process_pixblock_head /* start the next block */
++#endif
++.endm
++
++generate_composite_function \
++    pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, /* generated symbol; then presumably src, mask, dst bits per pixel -- TODO confirm */ \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    default_init_need_all_regs, /* init/cleanup helpers defined elsewhere */ \
++    default_cleanup_need_all_regs, \
++    pixman_composite_over_8888_8_0565_process_pixblock_head, \
++    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
++    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    4,  /* dst_r_basereg */ \
++    8,  /* src_basereg   */ \
++    24  /* mask_basereg  */
++
++/******************************************************************************/
++
++/*
++ * This function needs a special initialization of the solid source.
++ * In this AArch64 version the solid source pixel is taken from w4 (not
++ * from the stack), split into color components and replicated in the
++ * v8-v11 registers. The init macro below does not save any registers
++ * and the matching 'cleanup' macro is empty, even though the low halves
++ * of v8-v15 are callee saved in the AArch64 procedure call standard.
++ * NOTE(review): confirm that generate_composite_function (defined
++ * elsewhere) preserves v8-v15 around the generated function body.
++ */
++.macro pixman_composite_over_n_8_0565_init /* setup: spreads the four bytes of w4 over v8-v11, the registers the over_8888_8_0565 head reads as source channels */
++    mov         v11.s[0], w4 /* stage the 32-bit value in lane 0 of v11 */
++    dup         v8.8b, v11.b[0] /* byte 0 replicated to all 8 lanes of v8 */
++    dup         v9.8b, v11.b[1]
++    dup         v10.8b, v11.b[2]
++    dup         v11.8b, v11.b[3] /* v11 is overwritten last because it also holds the staged value */
++.endm
++
++.macro pixman_composite_over_n_8_0565_cleanup /* empty: expands to no instructions */
++.endm
++
++generate_composite_function \
++    pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, /* generated symbol; then presumably src, mask, dst bits per pixel (0 = none fetched) -- TODO confirm */ \
++    FLAG_DST_READWRITE, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    pixman_composite_over_n_8_0565_init, \
++    pixman_composite_over_n_8_0565_cleanup, \
++    pixman_composite_over_8888_8_0565_process_pixblock_head, /* pixblock macros are shared with over_8888_8_0565 */ \
++    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
++    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    4,  /* dst_r_basereg */ \
++    8,  /* src_basereg   */ \
++    24  /* mask_basereg  */
++
++/******************************************************************************/
++
++.macro pixman_composite_over_8888_n_0565_init /* setup: fills v24, the register the over_8888_8_0565 head uses as mask, with one constant byte */
++    mov         v24.s[0], w6 /* stage the 32-bit value of w6 (presumably the solid mask -- TODO confirm) in lane 0 */
++    dup         v24.8b, v24.b[3] /* only byte 3 of that value is kept, replicated to all 8 lanes */
++.endm
++
++.macro pixman_composite_over_8888_n_0565_cleanup /* empty: expands to no instructions */
++.endm
++
++generate_composite_function \
++    pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, /* generated symbol; then presumably src, mask, dst bits per pixel (0 = none fetched) -- TODO confirm */ \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    pixman_composite_over_8888_n_0565_init, \
++    pixman_composite_over_8888_n_0565_cleanup, \
++    pixman_composite_over_8888_8_0565_process_pixblock_head, /* pixblock macros are shared with over_8888_8_0565 */ \
++    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
++    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    4,  /* dst_r_basereg */ \
++    8,  /* src_basereg   */ \
++    24  /* mask_basereg  */
++
++/******************************************************************************/
++
++.macro pixman_composite_src_0565_0565_process_pixblock_head /* empty: this operation does no arithmetic on the pixels */
++.endm
++
++.macro pixman_composite_src_0565_0565_process_pixblock_tail /* empty, see head */
++.endm
++
++.macro pixman_composite_src_0565_0565_process_pixblock_tail_head /* per-block step: store v0-v3, then fetch the next source block */
++    st1     {v0.4h, v1.4h, v2.4h, v3.4h}, [DST_W], #32 /* 32 bytes = sixteen 16-bit pixels, written unchanged */
++    fetch_src_pixblock /* helper defined elsewhere; src_basereg is 0 in the invocation below, so presumably it refills v0-v3 */
++    cache_preload 16, 16 /* prefetch helper defined elsewhere */
++.endm
++
++generate_composite_function \
++    pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, /* generated symbol; then presumably src, mask, dst bits per pixel (0 = none fetched) -- TODO confirm */ \
++    FLAG_DST_WRITEONLY, \
++    16, /* number of pixels, processed in a single block */ \
++    10, /* prefetch distance */ \
++    default_init, \
++    default_cleanup, \
++    pixman_composite_src_0565_0565_process_pixblock_head, \
++    pixman_composite_src_0565_0565_process_pixblock_tail, \
++    pixman_composite_src_0565_0565_process_pixblock_tail_head, \
++    0, /* dst_w_basereg */ \
++    0, /* dst_r_basereg */ \
++    0, /* src_basereg   */ \
++    0  /* mask_basereg  */
++
++/******************************************************************************/
++
++.macro pixman_composite_src_n_8_process_pixblock_head /* empty: the fill value is prepared once by the init macro */
++.endm
++
++.macro pixman_composite_src_n_8_process_pixblock_tail /* empty, see head */
++.endm
++
++.macro pixman_composite_src_n_8_process_pixblock_tail_head /* per-block step of the 8 bpp solid fill: store the constant registers v0-v3 */
++    st1         {v0.8b, v1.8b, v2.8b, v3.8b}, [DST_W], #32 /* 32 bytes = 32 pixels; post-index immediate spelled with '#' like every other store in this file */
++.endm
++
++.macro pixman_composite_src_n_8_init /* setup: fills v0-v3 with the low byte of w4 (presumably the solid fill value -- TODO confirm) */
++    mov         v0.s[0], w4 /* stage the 32-bit value in lane 0 of v0 */
++    dup         v3.8b, v0.b[0] /* byte 0 replicated to all 8 lanes */
++    dup         v2.8b, v0.b[0]
++    dup         v1.8b, v0.b[0]
++    dup         v0.8b, v0.b[0] /* v0 is overwritten last because it also holds the staged value */
++.endm
++
++.macro pixman_composite_src_n_8_cleanup /* empty: expands to no instructions */
++.endm
++
++generate_composite_function \
++    pixman_composite_src_n_8_asm_neon, 0, 0, 8, /* generated symbol; then presumably src, mask, dst bits per pixel (0 = none fetched) -- TODO confirm */ \
++    FLAG_DST_WRITEONLY, \
++    32, /* number of pixels, processed in a single block */ \
++    0,  /* prefetch distance */ \
++    pixman_composite_src_n_8_init, \
++    pixman_composite_src_n_8_cleanup, \
++    pixman_composite_src_n_8_process_pixblock_head, \
++    pixman_composite_src_n_8_process_pixblock_tail, \
++    pixman_composite_src_n_8_process_pixblock_tail_head, \
++    0, /* dst_w_basereg */ \
++    0, /* dst_r_basereg */ \
++    0, /* src_basereg   */ \
++    0  /* mask_basereg  */
++
++/******************************************************************************/
++
++.macro pixman_composite_src_n_0565_process_pixblock_head /* empty: the fill value is prepared once by the init macro */
++.endm
++
++.macro pixman_composite_src_n_0565_process_pixblock_tail /* empty, see head */
++.endm
++
++.macro pixman_composite_src_n_0565_process_pixblock_tail_head /* per-block step of the 16 bpp solid fill: store the constant registers v0-v3 */
++    st1     {v0.4h, v1.4h, v2.4h, v3.4h}, [DST_W], #32 /* 32 bytes = sixteen 16-bit pixels */
++.endm
++
++.macro pixman_composite_src_n_0565_init /* setup: fills v0-v3 with the low 16 bits of w4 (presumably the solid fill value -- TODO confirm) */
++    mov         v0.s[0], w4 /* stage the 32-bit value in lane 0 of v0 */
++    dup         v3.4h, v0.h[0] /* halfword 0 replicated to all 4 lanes */
++    dup         v2.4h, v0.h[0]
++    dup         v1.4h, v0.h[0]
++    dup         v0.4h, v0.h[0] /* v0 is overwritten last because it also holds the staged value */
++.endm
++
++.macro pixman_composite_src_n_0565_cleanup /* empty: expands to no instructions */
++.endm
++
++generate_composite_function \
++    pixman_composite_src_n_0565_asm_neon, 0, 0, 16, /* generated symbol; then presumably src, mask, dst bits per pixel (0 = none fetched) -- TODO confirm */ \
++    FLAG_DST_WRITEONLY, \
++    16, /* number of pixels, processed in a single block */ \
++    0,  /* prefetch distance */ \
++    pixman_composite_src_n_0565_init, \
++    pixman_composite_src_n_0565_cleanup, \
++    pixman_composite_src_n_0565_process_pixblock_head, \
++    pixman_composite_src_n_0565_process_pixblock_tail, \
++    pixman_composite_src_n_0565_process_pixblock_tail_head, \
++    0, /* dst_w_basereg */ \
++    0, /* dst_r_basereg */ \
++    0, /* src_basereg   */ \
++    0  /* mask_basereg  */
++
++/******************************************************************************/
++
++.macro pixman_composite_src_n_8888_process_pixblock_head /* empty: the fill value is prepared once by the init macro */
++.endm
++
++.macro pixman_composite_src_n_8888_process_pixblock_tail /* empty, see head */
++.endm
++
++.macro pixman_composite_src_n_8888_process_pixblock_tail_head /* per-block step of the 32 bpp solid fill: store the constant registers v0-v3 */
++    st1         {v0.2s, v1.2s, v2.2s, v3.2s}, [DST_W], #32 /* 32 bytes = eight 32-bit pixels */
++.endm
++
++.macro pixman_composite_src_n_8888_init /* setup: fills v0-v3 with the 32-bit value of w4 (presumably the solid fill value -- TODO confirm) */
++    mov         v0.s[0], w4 /* stage the 32-bit value in lane 0 of v0 */
++    dup         v3.2s, v0.s[0] /* word 0 replicated to both lanes */
++    dup         v2.2s, v0.s[0]
++    dup         v1.2s, v0.s[0]
++    dup         v0.2s, v0.s[0] /* v0 is overwritten last because it also holds the staged value */
++.endm
++
++.macro pixman_composite_src_n_8888_cleanup /* empty: expands to no instructions */
++.endm
++
++generate_composite_function \
++    pixman_composite_src_n_8888_asm_neon, 0, 0, 32, /* generated symbol; then presumably src, mask, dst bits per pixel (0 = none fetched) -- TODO confirm */ \
++    FLAG_DST_WRITEONLY, \
++    8, /* number of pixels, processed in a single block */ \
++    0, /* prefetch distance */ \
++    pixman_composite_src_n_8888_init, \
++    pixman_composite_src_n_8888_cleanup, \
++    pixman_composite_src_n_8888_process_pixblock_head, \
++    pixman_composite_src_n_8888_process_pixblock_tail, \
++    pixman_composite_src_n_8888_process_pixblock_tail_head, \
++    0, /* dst_w_basereg */ \
++    0, /* dst_r_basereg */ \
++    0, /* src_basereg   */ \
++    0  /* mask_basereg  */
++
++/******************************************************************************/
++
++.macro pixman_composite_src_8888_8888_process_pixblock_head /* empty: this operation does no arithmetic on the pixels */
++.endm
++
++.macro pixman_composite_src_8888_8888_process_pixblock_tail /* empty, see head */
++.endm
++
++.macro pixman_composite_src_8888_8888_process_pixblock_tail_head /* per-block step: store v0-v3, then fetch the next source block */
++    st1  {v0.2s, v1.2s, v2.2s, v3.2s}, [DST_W], #32 /* 32 bytes = eight 32-bit pixels, written unchanged */
++    fetch_src_pixblock /* helper defined elsewhere; src_basereg is 0 in the invocation below, so presumably it refills v0-v3 */
++    cache_preload 8, 8 /* prefetch helper defined elsewhere */
++.endm
++
++generate_composite_function \
++    pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, /* generated symbol; then presumably src, mask, dst bits per pixel (0 = none fetched) -- TODO confirm */ \
++    FLAG_DST_WRITEONLY, \
++    8, /* number of pixels, processed in a single block */ \
++    10, /* prefetch distance */ \
++    default_init, \
++    default_cleanup, \
++    pixman_composite_src_8888_8888_process_pixblock_head, \
++    pixman_composite_src_8888_8888_process_pixblock_tail, \
++    pixman_composite_src_8888_8888_process_pixblock_tail_head, \
++    0, /* dst_w_basereg */ \
++    0, /* dst_r_basereg */ \
++    0, /* src_basereg   */ \
++    0  /* mask_basereg  */
++
++/******************************************************************************/
++
++.macro pixman_composite_src_x888_8888_process_pixblock_head /* ORs the constant in v4 (set by the init macro) into every pixel of v0-v3 */
++    orr      v0.8b, v0.8b, v4.8b /* v4 holds 0xff000000 per 32-bit lane, so the top byte of each pixel becomes 0xff */
++    orr      v1.8b, v1.8b, v4.8b
++    orr      v2.8b, v2.8b, v4.8b
++    orr      v3.8b, v3.8b, v4.8b
++.endm
++
++.macro pixman_composite_src_x888_8888_process_pixblock_tail /* empty: all work is done in head */
++.endm
++
++.macro pixman_composite_src_x888_8888_process_pixblock_tail_head /* per-block step: store the finished block, fetch the next one and repeat the OR from the head macro */
++    st1      {v0.2s, v1.2s, v2.2s, v3.2s}, [DST_W], #32 /* 32 bytes = eight 32-bit pixels */
++    fetch_src_pixblock /* helper defined elsewhere; src_basereg is 0 in the invocation below, so presumably it refills v0-v3 */
++    orr      v0.8b, v0.8b, v4.8b /* same four ORs as the head macro */
++    orr      v1.8b, v1.8b, v4.8b
++    orr      v2.8b, v2.8b, v4.8b
++    orr      v3.8b, v3.8b, v4.8b
++    cache_preload 8, 8 /* prefetch helper defined elsewhere */
++.endm
++
++.macro pixman_composite_src_x888_8888_init /* setup: builds the constant that the head and tail_head macros OR into each pixel */
++    movi    v4.2s, #0xff, lsl 24 /* v4 = 0xff000000 in each 32-bit lane */
++.endm
++
++generate_composite_function \
++    pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, /* generated symbol; then presumably src, mask, dst bits per pixel (0 = none fetched) -- TODO confirm */ \
++    FLAG_DST_WRITEONLY, \
++    8, /* number of pixels, processed in a single block */ \
++    10, /* prefetch distance */ \
++    pixman_composite_src_x888_8888_init, \
++    default_cleanup, \
++    pixman_composite_src_x888_8888_process_pixblock_head, \
++    pixman_composite_src_x888_8888_process_pixblock_tail, \
++    pixman_composite_src_x888_8888_process_pixblock_tail_head, \
++    0, /* dst_w_basereg */ \
++    0, /* dst_r_basereg */ \
++    0, /* src_basereg   */ \
++    0  /* mask_basereg  */
++
++/******************************************************************************/
++
++.macro pixman_composite_src_n_8_8888_process_pixblock_head /* multiplies each solid source channel by the mask and starts the divide by 255 */
++    /* expecting solid source in {v0, v1, v2, v3} */
++    /* mask is in v24 (v25, v26, v27 are unused) */
++
++    /* in */
++    umull       v8.8h,  v24.8b, v0.8b /* widening multiply: 16-bit products in v8-v11 */
++    umull       v9.8h,  v24.8b, v1.8b
++    umull       v10.8h, v24.8b, v2.8b
++    umull       v11.8h, v24.8b, v3.8b
++    ursra       v8.8h,  v8.8h, #8 /* v8 += (v8 + 128) >> 8; the tail macro finishes with a rounding narrow */
++    ursra       v9.8h,  v9.8h, #8
++    ursra       v10.8h, v10.8h, #8
++    ursra       v11.8h, v11.8h, #8
++.endm
++
++.macro pixman_composite_src_n_8_8888_process_pixblock_tail /* narrows the adjusted products in v8-v11 to the result bytes in v28-v31 */
++    rshrn       v28.8b, v8.8h, #8 /* v28 = (v8 + 128) >> 8, completing the rounded divide by 255 started in head */
++    rshrn       v29.8b, v9.8h, #8
++    rshrn       v30.8b, v10.8h, #8
++    rshrn       v31.8b, v11.8h, #8
++.endm
++
++.macro pixman_composite_src_n_8_8888_process_pixblock_tail_head /* combined step: narrows and stores the current block while multiplying the next mask block; PF-prefixed lines are prefetch bookkeeping */
++    fetch_mask_pixblock /* helper defined elsewhere; presumably reloads the mask register v24 -- TODO confirm */
++                                    PF add PF_X, PF_X, #8
++        rshrn       v28.8b, v8.8h, #8 /* same narrowing as the tail macro */
++                                    PF tst PF_CTL, #0x0F
++        rshrn       v29.8b, v9.8h, #8
++                                    PF beq 10f /* low 4 bits of PF_CTL are zero: skip the extra advance */
++                                    PF add PF_X, PF_X, #8
++10:
++        rshrn      v30.8b, v10.8h, #8
++                                    PF beq 10f /* still testing the flags from the tst above */
++                                    PF sub PF_CTL, PF_CTL, #1
++10:
++        rshrn      v31.8b, v11.8h, #8
++                                    PF cmp PF_X, ORIG_W /* flags consumed by the PF ble branches further down */
++    umull          v8.8h, v24.8b, v0.8b /* products for the next block, same as the head macro */
++                                    PF lsl DUMMY, PF_X, #mask_bpp_shift
++                                    PF prfm PREFETCH_MODE, [PF_MASK, DUMMY]
++    umull          v9.8h, v24.8b, v1.8b
++                                    PF ble 10f /* PF_X <= ORIG_W: no wrap of PF_X */
++                                    PF sub PF_X, PF_X, ORIG_W
++10:
++    umull          v10.8h, v24.8b, v2.8b
++                                    PF ble 10f
++                                    PF subs PF_CTL, PF_CTL, #0x10 /* when executed, sets the flags tested by the ble below */
++10:
++    umull          v11.8h, v24.8b, v3.8b
++                                    PF ble 10f
++                                    PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift
++                                    PF ldrsb DUMMY, [PF_MASK, DUMMY] /* touches one byte at PF_MASK + (MASK_STRIDE << mask_bpp_shift) */
++                                    PF add PF_MASK, PF_MASK, #1
++10:
++        st4        {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 /* write the finished block, re-interleaved */
++    ursra       v8.8h, v8.8h, #8 /* v8 += (v8 + 128) >> 8, as at the end of the head macro */
++    ursra       v9.8h, v9.8h, #8
++    ursra       v10.8h, v10.8h, #8
++    ursra       v11.8h, v11.8h, #8
++.endm
++
++.macro pixman_composite_src_n_8_8888_init /* setup: spreads the four bytes of w4 (presumably the solid source colour -- TODO confirm) over v0-v3 */
++    mov         v3.s[0], w4 /* stage the 32-bit value in lane 0 of v3 */
++    dup         v0.8b, v3.b[0] /* byte 0 replicated to all 8 lanes of v0 */
++    dup         v1.8b, v3.b[1]
++    dup         v2.8b, v3.b[2]
++    dup         v3.8b, v3.b[3] /* v3 is overwritten last because it also holds the staged value */
++.endm
++
++.macro pixman_composite_src_n_8_8888_cleanup /* empty: expands to no instructions */
++.endm
++
++generate_composite_function \
++    pixman_composite_src_n_8_8888_asm_neon, 0, 8, 32, /* generated symbol; then presumably src, mask, dst bits per pixel (0 = none fetched) -- TODO confirm */ \
++    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    pixman_composite_src_n_8_8888_init, \
++    pixman_composite_src_n_8_8888_cleanup, \
++    pixman_composite_src_n_8_8888_process_pixblock_head, \
++    pixman_composite_src_n_8_8888_process_pixblock_tail, \
++    pixman_composite_src_n_8_8888_process_pixblock_tail_head /* last argument: no trailing comma or line continuation, so the call ends on this line; the optional basereg arguments are omitted */
++
++/******************************************************************************/
++
++.macro pixman_composite_src_n_8_8_process_pixblock_head /* multiplies four mask registers v24-v27 by the constant byte in v16 and starts the divide by 255 */
++    umull       v0.8h, v24.8b, v16.8b /* widening multiply: 16-bit products in v0-v3 */
++    umull       v1.8h, v25.8b, v16.8b
++    umull       v2.8h, v26.8b, v16.8b
++    umull       v3.8h, v27.8b, v16.8b
++    ursra       v0.8h, v0.8h,  #8 /* v0 += (v0 + 128) >> 8; the tail macro finishes with a rounding narrow */
++    ursra       v1.8h, v1.8h,  #8
++    ursra       v2.8h, v2.8h,  #8
++    ursra       v3.8h, v3.8h,  #8
++.endm
++
++.macro pixman_composite_src_n_8_8_process_pixblock_tail /* narrows the adjusted products in v0-v3 to the result bytes in v28-v31 */
++    rshrn       v28.8b, v0.8h, #8 /* v28 = (v0 + 128) >> 8, completing the rounded divide by 255 started in head */
++    rshrn       v29.8b, v1.8h, #8
++    rshrn       v30.8b, v2.8h, #8
++    rshrn       v31.8b, v3.8h, #8
++.endm
++
++.macro pixman_composite_src_n_8_8_process_pixblock_tail_head /* combined step: narrows and stores the current block while multiplying the next mask block by v16; PF-prefixed lines are prefetch bookkeeping */
++    fetch_mask_pixblock /* helper defined elsewhere; presumably reloads the mask registers v24-v27 -- TODO confirm */
++                                    PF add PF_X, PF_X, #8
++        rshrn       v28.8b, v0.8h, #8 /* same narrowing as the tail macro */
++                                    PF tst PF_CTL, #0x0F
++        rshrn       v29.8b, v1.8h, #8
++                                    PF beq 10f /* low 4 bits of PF_CTL are zero: skip the extra advance */
++                                    PF add PF_X, PF_X, #8
++10:
++        rshrn       v30.8b, v2.8h, #8
++                                    PF beq 10f /* still testing the flags from the tst above */
++                                    PF sub PF_CTL, PF_CTL, #1
++10:
++        rshrn       v31.8b, v3.8h, #8
++                                    PF cmp PF_X, ORIG_W /* flags consumed by the PF ble branches further down */
++    umull       v0.8h,  v24.8b, v16.8b /* products for the next block, same as the head macro */
++                                    PF lsl DUMMY, PF_X, #mask_bpp_shift /* immediate written with '#', matching the MASK_STRIDE shift below and the sibling macros */
++                                    PF prfm PREFETCH_MODE, [PF_MASK, DUMMY]
++    umull       v1.8h,  v25.8b, v16.8b
++                                    PF ble 10f /* PF_X <= ORIG_W: no wrap of PF_X */
++                                    PF sub PF_X, PF_X, ORIG_W
++10:
++    umull       v2.8h,  v26.8b, v16.8b
++                                    PF ble 10f
++                                    PF subs PF_CTL, PF_CTL, #0x10 /* when executed, sets the flags tested by the ble below */
++10:
++    umull       v3.8h,  v27.8b, v16.8b
++                                    PF ble 10f
++                                    PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift
++                                    PF ldrsb DUMMY, [PF_MASK, DUMMY] /* touches one byte at PF_MASK + (MASK_STRIDE << mask_bpp_shift) */
++                                    PF add PF_MASK, PF_MASK, #1
++10:
++        st1         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 /* write the finished 32 bytes (32 pixels at 8 bpp) */
++    ursra       v0.8h, v0.8h,  #8 /* v0 += (v0 + 128) >> 8, as at the end of the head macro */
++    ursra       v1.8h, v1.8h,  #8
++    ursra       v2.8h, v2.8h,  #8
++    ursra       v3.8h, v3.8h,  #8
++.endm
++
++.macro pixman_composite_src_n_8_8_init /* setup: fills v16, the constant multiplier used by the pixblock macros, with one byte of w4 */
++    mov         v16.s[0], w4 /* stage the 32-bit value of w4 (presumably the solid source -- TODO confirm) in lane 0 */
++    dup         v16.8b, v16.b[3] /* only byte 3 of that value is kept, replicated to all 8 lanes */
++.endm
++
++.macro pixman_composite_src_n_8_8_cleanup /* empty: expands to no instructions */
++.endm
++
++generate_composite_function \
++    pixman_composite_src_n_8_8_asm_neon, 0, 8, 8, /* generated symbol; then presumably src, mask, dst bits per pixel (0 = none fetched) -- TODO confirm */ \
++    FLAG_DST_WRITEONLY, \
++    32, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    pixman_composite_src_n_8_8_init, \
++    pixman_composite_src_n_8_8_cleanup, \
++    pixman_composite_src_n_8_8_process_pixblock_head, \
++    pixman_composite_src_n_8_8_process_pixblock_tail, \
++    pixman_composite_src_n_8_8_process_pixblock_tail_head
++
++/******************************************************************************/
++
++.macro pixman_composite_over_n_8_8888_process_pixblock_head /* scales the source by the mask, then multiplies the destination by the inverted scaled alpha */
++    /* expecting deinterleaved source data in {v8, v9, v10, v11} */
++    /* v8 - blue, v9 - green, v10 - red, v11 - alpha */
++    /* and destination data in {v4, v5, v6, v7} */
++    /* mask is in v24 (v25, v26, v27 are unused) */
++
++    /* in */
++    umull       v12.8h, v24.8b, v8.8b /* widening multiply: mask * source channel */
++    umull       v13.8h, v24.8b, v9.8b
++    umull       v14.8h, v24.8b, v10.8b
++    umull       v15.8h, v24.8b, v11.8b
++    urshr       v16.8h, v12.8h, #8 /* v16 = (v12 + 128) >> 8 */
++    urshr       v17.8h, v13.8h, #8
++    urshr       v18.8h, v14.8h, #8
++    urshr       v19.8h, v15.8h, #8
++    raddhn      v0.8b, v12.8h, v16.8h /* v0 = (v12 + v16 + 128) >> 8, i.e. the product divided by 255 with rounding */
++    raddhn      v1.8b, v13.8h, v17.8h
++    raddhn      v2.8b, v14.8h, v18.8h
++    raddhn      v3.8b, v15.8h, v19.8h
++    mvn         v25.8b, v3.8b  /* get inverted alpha */
++    /* source:      v0 - blue, v1 - green, v2 - red, v3 - alpha */
++    /* destination: v4 - blue, v5 - green, v6 - red, v7 - alpha */
++    /* now do alpha blending */
++    umull       v12.8h, v25.8b, v4.8b /* v12-v15 are reused for the destination products consumed by the tail macro */
++    umull       v13.8h, v25.8b, v5.8b
++    umull       v14.8h, v25.8b, v6.8b
++    umull       v15.8h, v25.8b, v7.8b
++.endm
++
++.macro pixman_composite_over_n_8_8888_process_pixblock_tail /* turns the destination products in v12-v15 into bytes and adds the mask-scaled source in v0-v3 */
++    urshr       v16.8h, v12.8h, #8 /* v16 = (v12 + 128) >> 8 */
++    urshr       v17.8h, v13.8h, #8
++    urshr       v18.8h, v14.8h, #8
++    urshr       v19.8h, v15.8h, #8
++    raddhn      v28.8b, v16.8h, v12.8h /* v28 = (v12 + v16 + 128) >> 8, i.e. the product divided by 255 with rounding */
++    raddhn      v29.8b, v17.8h, v13.8h
++    raddhn      v30.8b, v18.8h, v14.8h
++    raddhn      v31.8b, v19.8h, v15.8h
++    uqadd       v28.8b, v0.8b, v28.8b /* unsigned saturating add per byte; results stay in v28-v31 */
++    uqadd       v29.8b, v1.8b, v29.8b
++    uqadd       v30.8b, v2.8b, v30.8b
++    uqadd       v31.8b, v3.8b, v31.8b
++.endm
++
++.macro pixman_composite_over_n_8_8888_process_pixblock_tail_head  /* tail of block N interleaved with head of block N+1 */
++        urshr       v16.8h, v12.8h, #8  /* 8-space indent: tail work on the previous block (products in v12-v15) */
++     ld4        {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32  /* load next 8 destination pixels, de-interleaved into v4-v7 */
++        urshr       v17.8h, v13.8h, #8
++    fetch_mask_pixblock  /* helper defined outside this view; the head part below reads the mask from v24 */
++        urshr       v18.8h, v14.8h, #8
++                                    PF add PF_X, PF_X, #8  /* far-right PF lines: prefetch bookkeeping mixed into the arithmetic */
++        urshr       v19.8h, v15.8h, #8
++                                    PF tst PF_CTL, #0x0F
++        raddhn      v28.8b, v16.8h, v12.8h
++                                    PF beq 10f  /* skip the add when the low 4 bits of PF_CTL are zero */
++                                    PF add PF_X, PF_X, #8
++10:
++        raddhn      v29.8b, v17.8h, v13.8h
++                                    PF beq 10f  /* flags still come from the tst above: add does not set flags */
++                                    PF sub PF_CTL, PF_CTL, #1
++10:
++        raddhn      v30.8b, v18.8h, v14.8h
++                                    PF cmp PF_X, ORIG_W
++        raddhn      v31.8b, v19.8h, v15.8h
++                                    PF lsl DUMMY, PF_X, #dst_bpp_shift
++                                    PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
++    umull       v16.8h, v24.8b, v8.8b  /* 4-space indent: head work for the next block, mask (v24) * solid colour (v8-v11) */
++                                    PF lsl DUMMY, PF_X, #mask_bpp_shift
++                                    PF prfm PREFETCH_MODE, [PF_MASK, DUMMY]
++    umull       v17.8h, v24.8b, v9.8b
++                                    PF ble 10f  /* uses the flags of the cmp PF_X, ORIG_W above */
++                                    PF sub PF_X, PF_X, ORIG_W
++10:
++    umull       v18.8h, v24.8b, v10.8b
++                                    PF ble 10f
++                                    PF subs PF_CTL, PF_CTL, #0x10
++10:
++    umull       v19.8h, v24.8b, v11.8b
++                                    PF ble 10f  /* flags come from the subs if it ran, otherwise still from the cmp */
++                                    PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
++                                    PF ldrsb DUMMY, [PF_DST, DUMMY]
++                                    PF add PF_DST, PF_DST, #1
++10:
++        uqadd       v28.8b, v0.8b, v28.8b  /* tail: add the previous block's masked source (v0-v3 not yet overwritten) */
++                                    PF ble 10f
++                                    PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift
++                                    PF ldrsb DUMMY, [PF_MASK, DUMMY]
++                                    PF add PF_MASK, PF_MASK, #1
++10:
++        uqadd        v29.8b, v1.8b, v29.8b
++        uqadd        v30.8b, v2.8b, v30.8b
++        uqadd        v31.8b, v3.8b, v31.8b
++    urshr       v12.8h, v16.8h, #8  /* head: v12-v15 reused as scratch now that the tail has consumed them */
++    urshr       v13.8h, v17.8h, #8
++    urshr       v14.8h, v18.8h, #8
++    urshr       v15.8h, v19.8h, #8
++    raddhn      v0.8b, v16.8h, v12.8h  /* new masked source replaces v0-v3 only after the uqadds above */
++    raddhn      v1.8b, v17.8h, v13.8h
++    raddhn      v2.8b, v18.8h, v14.8h
++    raddhn      v3.8b, v19.8h, v15.8h
++        st4          {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32  /* store the finished previous block */
++    mvn         v25.8b, v3.8b  /* inverted alpha (255 - alpha) of the new masked source */
++    umull       v12.8h, v25.8b, v4.8b  /* inverted alpha * new destination; narrowed by the next tail */
++    umull       v13.8h, v25.8b, v5.8b
++    umull       v14.8h, v25.8b, v6.8b
++    umull       v15.8h, v25.8b, v7.8b
++.endm
++
++.macro pixman_composite_over_n_8_8888_init  /* splits the 32-bit value in w4 into four per-byte broadcast vectors */
++    mov         v11.s[0], w4  /* stage w4 in lane 0 of v11 (presumably the solid source colour -- confirm against caller) */
++    dup         v8.8b, v11.b[0]  /* v8..v10 = byte 0..2 replicated across 8 lanes */
++    dup         v9.8b, v11.b[1]
++    dup         v10.8b, v11.b[2]
++    dup         v11.8b, v11.b[3]  /* done last because it overwrites the staging register v11 */
++.endm
++
++.macro pixman_composite_over_n_8_8888_cleanup  /* empty: expands to no instructions */
++.endm
++
++generate_composite_function \
++    pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, /* presumably src/mask/dst bits per pixel, 0 = solid -- confirm */ \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    pixman_composite_over_n_8_8888_init, \
++    pixman_composite_over_n_8_8888_cleanup, \
++    pixman_composite_over_n_8_8888_process_pixblock_head, \
++    pixman_composite_over_n_8_8888_process_pixblock_tail, \
++    pixman_composite_over_n_8_8888_process_pixblock_tail_head
++
++/******************************************************************************/
++
++.macro pixman_composite_over_n_8_8_process_pixblock_head  /* first half of one 32-byte block */
++    umull       v0.8h,  v24.8b, v8.8b  /* mask bytes (v24-v27) * v8, which the init macro fills with byte 3 of w4 */
++    umull       v1.8h,  v25.8b, v8.8b
++    umull       v2.8h,  v26.8b, v8.8b
++    umull       v3.8h,  v27.8b, v8.8b
++    urshr       v10.8h, v0.8h,  #8
++    urshr       v11.8h, v1.8h,  #8
++    urshr       v12.8h, v2.8h,  #8
++    urshr       v13.8h, v3.8h,  #8
++    raddhn      v0.8b,  v0.8h,  v10.8h  /* urshr + raddhn: rounded narrowing of the products to 8 bits, kept in v0-v3 */
++    raddhn      v1.8b,  v1.8h,  v11.8h
++    raddhn      v2.8b,  v2.8h,  v12.8h
++    raddhn      v3.8b,  v3.8h,  v13.8h
++    mvn         v24.8b, v0.8b  /* bitwise inverse (255 - x) of v0-v3; overwrites the mask registers */
++    mvn         v25.8b, v1.8b
++    mvn         v26.8b, v2.8b
++    mvn         v27.8b, v3.8b
++    umull       v10.8h, v24.8b, v4.8b  /* inverse * destination (v4-v7); the tail macro narrows v10-v13 */
++    umull       v11.8h, v25.8b, v5.8b
++    umull       v12.8h, v26.8b, v6.8b
++    umull       v13.8h, v27.8b, v7.8b
++.endm
++
++.macro pixman_composite_over_n_8_8_process_pixblock_tail  /* second half: consumes v10-v13 and v0-v3 from the head */
++    urshr       v14.8h, v10.8h,  #8
++    urshr       v15.8h, v11.8h,  #8
++    urshr       v16.8h, v12.8h, #8
++    urshr       v17.8h, v13.8h, #8
++    raddhn      v28.8b, v14.8h, v10.8h  /* urshr + raddhn: rounded narrowing of the 16-bit products to 8 bits */
++    raddhn      v29.8b, v15.8h, v11.8h
++    raddhn      v30.8b, v16.8h, v12.8h
++    raddhn      v31.8b, v17.8h, v13.8h
++    uqadd       v28.8b, v0.8b,  v28.8b  /* saturating add of v0-v3; final bytes are left in v28-v31 */
++    uqadd       v29.8b, v1.8b,  v29.8b
++    uqadd       v30.8b, v2.8b,  v30.8b
++    uqadd       v31.8b, v3.8b,  v31.8b
++.endm
++
++/* TODO: expand macros and do better instructions scheduling */
++.macro pixman_composite_over_n_8_8_process_pixblock_tail_head  /* non-interleaved: tail of block N, then head of block N+1 */
++    ld1         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32  /* next 32 destination bytes; the tail below does not read v4-v7 */
++    pixman_composite_over_n_8_8_process_pixblock_tail  /* finish the previous block into v28-v31 */
++    fetch_mask_pixblock  /* helper defined outside this view; the head reads the mask from v24-v27 */
++    cache_preload 32, 32
++    st1         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32  /* store the previous block's result */
++    pixman_composite_over_n_8_8_process_pixblock_head  /* start the next block */
++.endm
++
++.macro pixman_composite_over_n_8_8_init  /* prepares the single constant vector used by the head macro */
++    mov         v8.s[0], w4  /* stage the 32-bit value from w4 in lane 0 of v8 */
++    dup         v8.8b, v8.b[3]  /* keep only byte 3, replicated to 8 lanes (presumably the alpha byte -- confirm pixel layout) */
++.endm
++
++.macro pixman_composite_over_n_8_8_cleanup  /* empty: expands to no instructions */
++.endm
++
++generate_composite_function \
++    pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, /* presumably src/mask/dst bits per pixel, 0 = solid -- confirm */ \
++    FLAG_DST_READWRITE, \
++    32, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    pixman_composite_over_n_8_8_init, \
++    pixman_composite_over_n_8_8_cleanup, \
++    pixman_composite_over_n_8_8_process_pixblock_head, \
++    pixman_composite_over_n_8_8_process_pixblock_tail, \
++    pixman_composite_over_n_8_8_process_pixblock_tail_head
++
++/******************************************************************************/
++
++.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
++    /*
++     * 'combine_mask_ca' replacement
++     *
++     * input:  solid src (n) in {v8,  v9,  v10, v11}
++     *         dest in          {v4,  v5,  v6,  v7 }
++     *         mask in          {v24, v25, v26, v27}
++     * output: updated src in   {v0,  v1,  v2,  v3 }
++     *         updated mask in  {v24, v25, v26, v3 }
++     */
++    umull       v0.8h,  v24.8b, v8.8b  /* per-channel mask * matching solid colour channel */
++    umull       v1.8h,  v25.8b, v9.8b
++    umull       v2.8h,  v26.8b, v10.8b
++    umull       v3.8h,  v27.8b, v11.8b
++    umull       v12.8h, v11.8b, v25.8b  /* solid alpha (v11) * mask channels; note v12 pairs with v25 and v13 with v24 */
++    umull       v13.8h, v11.8b, v24.8b
++    umull       v14.8h, v11.8b, v26.8b
++    urshr       v15.8h, v0.8h,  #8
++    urshr       v16.8h, v1.8h,  #8
++    urshr       v17.8h, v2.8h,  #8
++    raddhn      v0.8b,  v0.8h,  v15.8h  /* urshr + raddhn: rounded narrowing to 8 bits */
++    raddhn      v1.8b,  v1.8h,  v16.8h
++    raddhn      v2.8b,  v2.8h,  v17.8h
++    urshr       v15.8h, v13.8h, #8  /* v15-v17 reused as scratch for the alpha * mask products */
++    urshr       v16.8h, v12.8h, #8
++    urshr       v17.8h, v14.8h, #8
++    urshr       v18.8h, v3.8h,  #8
++    raddhn      v24.8b, v13.8h, v15.8h  /* v13 holds the v24 product, so the result returns to v24 */
++    raddhn      v25.8b, v12.8h, v16.8h  /* v12 holds the v25 product, so the result returns to v25 */
++    raddhn      v26.8b, v14.8h, v17.8h
++    raddhn      v3.8b,  v3.8h,  v18.8h
++    /*
++     * 'combine_over_ca' replacement
++     *
++     * output: updated dest in {v28, v29, v30, v31}
++     */
++    mvn         v24.8b, v24.8b  /* invert the per-channel values (255 - x) */
++    mvn         v25.8b, v25.8b
++    mvn         v26.8b, v26.8b
++    mvn         v27.8b, v3.8b  /* fourth channel is inverted from v3 into v27, leaving v3 intact for the tail */
++    umull       v12.8h, v24.8b, v4.8b  /* inverted values * destination; the tail macro narrows v12-v15 */
++    umull       v13.8h, v25.8b, v5.8b
++    umull       v14.8h, v26.8b, v6.8b
++    umull       v15.8h, v27.8b, v7.8b
++.endm
++
++.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
++    /* ... continue 'combine_over_ca' replacement */
++    urshr       v16.8h, v12.8h, #8  /* v12-v15: 16-bit products left by the head macro */
++    urshr       v17.8h, v13.8h, #8
++    urshr       v18.8h, v14.8h, #8
++    urshr       v19.8h, v15.8h, #8
++    raddhn      v28.8b, v16.8h, v12.8h  /* urshr + raddhn: rounded narrowing to 8 bits */
++    raddhn      v29.8b, v17.8h, v13.8h
++    raddhn      v30.8b, v18.8h, v14.8h
++    raddhn      v31.8b, v19.8h, v15.8h
++    uqadd       v28.8b, v0.8b,  v28.8b  /* saturating add of the updated source (v0-v3); result in v28-v31 */
++    uqadd       v29.8b, v1.8b,  v29.8b
++    uqadd       v30.8b, v2.8b,  v30.8b
++    uqadd       v31.8b, v3.8b,  v31.8b
++.endm
++
++.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head  /* tail of block N, then head of block N+1 */
++        urshr       v16.8h, v12.8h, #8  /* 8-space indent: inlined copy of the tail macro for the previous block */
++        urshr       v17.8h, v13.8h, #8
++    ld4         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32  /* next 8 destination pixels, de-interleaved into v4-v7 */
++        urshr       v18.8h, v14.8h, #8
++        urshr       v19.8h, v15.8h, #8
++        raddhn      v28.8b, v16.8h, v12.8h
++        raddhn      v29.8b, v17.8h, v13.8h
++        raddhn      v30.8b, v18.8h, v14.8h
++        raddhn      v31.8b, v19.8h, v15.8h
++    fetch_mask_pixblock  /* helper defined outside this view; the head reads the mask from v24-v27 */
++        uqadd       v28.8b, v0.8b, v28.8b  /* must precede the head call below, which overwrites v0-v3 */
++        uqadd       v29.8b, v1.8b, v29.8b
++        uqadd       v30.8b, v2.8b, v30.8b
++        uqadd       v31.8b, v3.8b, v31.8b
++    cache_preload 8, 8
++    pixman_composite_over_n_8888_8888_ca_process_pixblock_head  /* start the next block; does not write v28-v31 */
++    st4         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32  /* store the finished previous block */
++.endm
++
++.macro pixman_composite_over_n_8888_8888_ca_init  /* splits the 32-bit value in w4 into four per-byte broadcast vectors */
++    mov         v13.s[0], w4  /* stage w4 in lane 0 of scratch register v13 */
++    dup         v8.8b, v13.b[0]  /* v8..v11 = byte 0..3 replicated across 8 lanes */
++    dup         v9.8b, v13.b[1]
++    dup         v10.8b, v13.b[2]
++    dup         v11.8b, v13.b[3]
++.endm
++
++.macro pixman_composite_over_n_8888_8888_ca_cleanup  /* empty: expands to no instructions */
++.endm
++
++generate_composite_function \
++    pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, /* presumably src/mask/dst bits per pixel, 0 = solid -- confirm */ \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    pixman_composite_over_n_8888_8888_ca_init, \
++    pixman_composite_over_n_8888_8888_ca_cleanup, \
++    pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
++    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
++    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
++
++/******************************************************************************/
++
++.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_head
++    /*
++     * 'combine_mask_ca' replacement
++     *
++     * input:  solid src (n) in {v8,  v9,  v10, v11}  [B, G, R, A]
++     *         mask in          {v24, v25, v26}       [B, G, R]
++     * output: updated src in   {v0,  v1,  v2 }       [B, G, R]
++     *         updated mask in  {v24, v25, v26}       [B, G, R]
++     */
++    umull       v0.8h,  v24.8b, v8.8b  /* per-channel mask * matching solid colour channel */
++    umull       v1.8h,  v25.8b, v9.8b
++    umull       v2.8h,  v26.8b, v10.8b
++    umull       v12.8h, v11.8b, v24.8b  /* solid alpha (v11) * each mask channel */
++    umull       v13.8h, v11.8b, v25.8b
++    umull       v14.8h, v11.8b, v26.8b
++    urshr       v15.8h, v0.8h,  #8
++    urshr       v16.8h, v1.8h,  #8
++    urshr       v17.8h, v2.8h,  #8
++    raddhn      v0.8b,  v0.8h,  v15.8h  /* urshr + raddhn: rounded narrowing to 8 bits */
++    raddhn      v1.8b,  v1.8h,  v16.8h
++    raddhn      v2.8b,  v2.8h,  v17.8h
++    urshr       v19.8h, v12.8h, #8
++    urshr       v20.8h, v13.8h, #8
++    urshr       v21.8h, v14.8h, #8
++    raddhn      v24.8b, v12.8h, v19.8h
++    raddhn      v25.8b, v13.8h, v20.8h
++    /*
++     * convert 8 r5g6b5 pixel data from {v4} to planar 8-bit format
++     * and put data into v16 - blue, v17 - green, v18 - red
++     */
++       mov         v4.d[1], v5.d[0]  /* merge the low half of v5 into the high half of v4 so v4.8h holds all 8 pixels */
++       shrn        v17.8b, v4.8h,  #3
++       shrn        v18.8b, v4.8h,  #8
++    raddhn      v26.8b, v14.8h, v21.8h  /* third mask channel, interleaved with the pixel conversion */
++       sli         v4.8h,  v4.8h,  #5
++       sri         v18.8b, v18.8b, #5  /* sri by 5 / 6: copy the top bits into the low bits to widen the channel to 8 bits */
++       sri         v17.8b, v17.8b, #6
++    /*
++     * 'combine_over_ca' replacement
++     *
++     * output: updated dest in v16 - blue, v17 - green, v18 - red
++     */
++    mvn         v24.8b, v24.8b  /* invert the per-channel values (255 - x) */
++    mvn         v25.8b, v25.8b
++       shrn       v16.8b, v4.8h,  #2
++    mvn         v26.8b, v26.8b
++    umull       v5.8h, v16.8b, v24.8b  /* destination channel * inverted value; the tail macro narrows v5-v7 */
++    umull       v6.8h, v17.8b, v25.8b
++    umull       v7.8h, v18.8b, v26.8b
++.endm
++
++.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail
++    /* ... continue 'combine_over_ca' replacement */
++    urshr       v13.8h, v5.8h, #8  /* v5-v7: 16-bit products left by the head macro */
++    urshr       v14.8h, v6.8h, #8
++    urshr       v15.8h, v7.8h, #8
++    raddhn      v16.8b, v13.8h, v5.8h  /* urshr + raddhn: rounded narrowing to 8 bits */
++    raddhn      v17.8b, v14.8h, v6.8h
++    raddhn      v18.8b, v15.8h, v7.8h
++    uqadd       v16.8b, v0.8b, v16.8b  /* saturating add of the updated source (v0-v2) */
++    uqadd       v17.8b, v1.8b, v17.8b
++    uqadd       v18.8b, v2.8b, v18.8b
++    /*
++     * convert the results in v16, v17, v18 to r5g6b5 and store
++     * them into {v14}
++     */
++    ushll       v14.8h, v18.8b, #7  /* ushll #7 + sli #1: place the 8-bit channel in the top byte of each 16-bit lane */
++    sli         v14.8h, v14.8h, #1
++    ushll       v12.8h, v17.8b, #7
++    sli         v12.8h, v12.8h, #1
++    ushll       v13.8h, v16.8b, #7
++    sli         v13.8h, v13.8h, #1
++    sri         v14.8h, v12.8h, #5  /* insert v12 below the top 5 bits of v14 */
++    sri         v14.8h, v13.8h, #11  /* insert v13 below the top 11 bits of v14 */
++    mov         v28.d[0], v14.d[0]  /* split the packed pixels across the low halves of v28 and v29 */
++    mov         v29.d[0], v14.d[1]
++.endm
++
++.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head  /* tail of block N interleaved with head of block N+1 */
++    fetch_mask_pixblock  /* helper defined outside this view; the head part reads the mask from v24-v26 */
++        urshr       v13.8h, v5.8h, #8  /* 8-space indent: tail work on the previous block (products in v5-v7) */
++        urshr       v14.8h, v6.8h, #8
++    ld1         {v4.8h}, [DST_R], #16  /* next 8 destination pixels, 16 bits each */
++        urshr       v15.8h, v7.8h, #8
++        raddhn      v16.8b, v13.8h, v5.8h
++        raddhn      v17.8b, v14.8h, v6.8h
++        raddhn      v18.8b, v15.8h, v7.8h
++    mov         v5.d[0], v4.d[1]  /* safe only after the raddhn above has consumed the old v5 product */
++            /* process_pixblock_head */
++            /*
++             * 'combine_mask_ca' replacement
++             *
++             * input:  solid src (n) in {v8,  v9,  v10, v11}  [B, G, R, A]
++             *         mask in          {v24, v25, v26}       [B, G, R]
++             * output: updated src in   {v0,  v1,  v2 }       [B, G, R]
++             *         updated mask in  {v24, v25, v26}       [B, G, R]
++             */
++        uqadd       v16.8b, v0.8b, v16.8b  /* tail: must run before the umulls below overwrite v0-v2 */
++        uqadd       v17.8b, v1.8b, v17.8b
++        uqadd       v18.8b, v2.8b, v18.8b
++            umull       v0.8h,  v24.8b, v8.8b  /* 12-space indent: head work for the next block */
++            umull       v1.8h,  v25.8b, v9.8b
++            umull       v2.8h,  v26.8b, v10.8b
++        /*
++         * convert the result in v16, v17, v18 to r5g6b5 and store
++         * it into {v14}
++         */
++        ushll       v14.8h, v18.8b, #7  /* ushll #7 + sli #1: place the 8-bit channel in the top byte of each lane */
++        sli         v14.8h, v14.8h, #1
++        ushll       v18.8h, v16.8b, #7
++        sli         v18.8h, v18.8h, #1
++        ushll       v19.8h, v17.8b, #7
++        sli         v19.8h, v19.8h, #1
++            umull       v12.8h, v11.8b, v24.8b
++        sri         v14.8h, v19.8h, #5
++            umull       v13.8h, v11.8b, v25.8b
++            umull       v15.8h, v11.8b, v26.8b  /* v15 here (the stand-alone head uses v14): v14 holds the packed result until the final st1 */
++        sri         v14.8h, v18.8h, #11
++        mov         v28.d[0], v14.d[0]
++        mov         v29.d[0], v14.d[1]
++    cache_preload 8, 8
++            urshr       v16.8h, v0.8h,  #8
++            urshr       v17.8h, v1.8h,  #8
++            urshr       v18.8h, v2.8h,  #8
++            raddhn      v0.8b,  v0.8h,  v16.8h
++            raddhn      v1.8b,  v1.8h,  v17.8h
++            raddhn      v2.8b,  v2.8h,  v18.8h
++            urshr       v19.8h, v12.8h, #8
++            urshr       v20.8h, v13.8h, #8
++            urshr       v21.8h, v15.8h, #8
++            raddhn      v24.8b, v12.8h, v19.8h
++            raddhn      v25.8b, v13.8h, v20.8h
++                /*
++                 * convert 8 r5g6b5 pixel data from {v4, v5} to planar
++             * 8-bit format and put data into v16 - blue, v17 - green,
++             * v18 - red
++                 */
++		mov         v4.d[1], v5.d[0]  /* writes back the half copied out at the top of this macro; v4.d[1] is unchanged in between */
++                shrn        v17.8b, v4.8h,  #3
++                shrn        v18.8b, v4.8h,  #8
++            raddhn      v26.8b, v15.8h, v21.8h
++                sli         v4.8h,  v4.8h,  #5
++                sri         v17.8b, v17.8b, #6
++                sri         v18.8b, v18.8b, #5
++            /*
++             * 'combine_over_ca' replacement
++             *
++             * output: updated dest in v16 - blue, v17 - green, v18 - red
++             */
++            mvn         v24.8b, v24.8b
++            mvn         v25.8b, v25.8b
++                shrn        v16.8b, v4.8h,  #2
++            mvn         v26.8b, v26.8b
++            umull       v5.8h, v16.8b, v24.8b  /* products for the next tail */
++            umull       v6.8h, v17.8b, v25.8b
++            umull       v7.8h, v18.8b, v26.8b
++    st1         {v14.8h}, [DST_W], #16  /* store the previous block's 8 packed pixels */
++.endm
++
++.macro pixman_composite_over_n_8888_0565_ca_init  /* splits the 32-bit value in w4 into four per-byte broadcast vectors */
++    mov         v13.s[0], w4  /* stage w4 in lane 0 of scratch register v13 */
++    dup         v8.8b, v13.b[0]  /* v8..v11 = byte 0..3 replicated across 8 lanes ([B, G, R, A] per the head comment) */
++    dup         v9.8b, v13.b[1]
++    dup         v10.8b, v13.b[2]
++    dup         v11.8b, v13.b[3]
++.endm
++
++.macro pixman_composite_over_n_8888_0565_ca_cleanup  /* empty: expands to no instructions */
++.endm
++
++generate_composite_function \
++    pixman_composite_over_n_8888_0565_ca_asm_neon, 0, 32, 16, /* presumably src/mask/dst bits per pixel, 0 = solid -- confirm */ \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    pixman_composite_over_n_8888_0565_ca_init, \
++    pixman_composite_over_n_8888_0565_ca_cleanup, \
++    pixman_composite_over_n_8888_0565_ca_process_pixblock_head, \
++    pixman_composite_over_n_8888_0565_ca_process_pixblock_tail, \
++    pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
++
++/******************************************************************************/
++
++.macro pixman_composite_in_n_8_process_pixblock_head
++    /* expecting source data in {v0, v1, v2, v3} */
++    /* and destination data in {v4, v5, v6, v7} */
++    umull       v8.8h,  v4.8b,  v3.8b  /* destination * v3; only v3 of the source registers is read here */
++    umull       v9.8h,  v5.8b,  v3.8b
++    umull       v10.8h, v6.8b,  v3.8b
++    umull       v11.8h, v7.8b,  v3.8b  /* 16-bit products stay in v8-v11 for the tail macro */
++.endm
++
++.macro pixman_composite_in_n_8_process_pixblock_tail  /* narrows the head's products (v8-v11) into v28-v31 */
++    urshr       v14.8h, v8.8h,  #8
++    urshr       v15.8h, v9.8h,  #8
++    urshr       v12.8h, v10.8h, #8
++    urshr       v13.8h, v11.8h, #8
++    raddhn      v28.8b, v8.8h,  v14.8h  /* urshr + raddhn: rounded narrowing of each 16-bit product to 8 bits */
++    raddhn      v29.8b, v9.8h,  v15.8h
++    raddhn      v30.8b, v10.8h, v12.8h
++    raddhn      v31.8b, v11.8h, v13.8h
++.endm
++
++.macro pixman_composite_in_n_8_process_pixblock_tail_head  /* non-interleaved: tail of block N, then head of block N+1 */
++    pixman_composite_in_n_8_process_pixblock_tail  /* finish the previous block into v28-v31 */
++    ld1         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32  /* next 32 destination bytes */
++    cache_preload 32, 32
++    pixman_composite_in_n_8_process_pixblock_head  /* start the next block; writes v8-v11 only */
++    st1         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32  /* store the previous block's result */
++.endm
++
++.macro pixman_composite_in_n_8_init  /* prepares the single constant vector read by the head macro */
++    mov         v3.s[0], w4  /* stage the 32-bit value from w4 in lane 0 of v3 */
++    dup         v3.8b, v3.b[3]  /* keep only byte 3, replicated to 8 lanes (presumably the alpha byte -- confirm pixel layout) */
++.endm
++
++.macro pixman_composite_in_n_8_cleanup  /* empty: expands to no instructions */
++.endm
++
++generate_composite_function \
++    pixman_composite_in_n_8_asm_neon, 0, 0, 8, /* presumably src/mask/dst bits per pixel, 0 = solid or absent -- confirm */ \
++    FLAG_DST_READWRITE, \
++    32, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    pixman_composite_in_n_8_init, \
++    pixman_composite_in_n_8_cleanup, \
++    pixman_composite_in_n_8_process_pixblock_head, \
++    pixman_composite_in_n_8_process_pixblock_tail, \
++    pixman_composite_in_n_8_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    4,  /* dst_r_basereg */ \
++    0,  /* src_basereg   */ \
++    24  /* mask_basereg  */
++
++.macro pixman_composite_add_n_8_8_process_pixblock_head
++    /* expecting source data in {v8, v9, v10, v11} */
++    /* v8 - blue, v9 - green, v10 - red, v11 - alpha */
++    /* and destination data in {v4, v5, v6, v7} */
++    /* mask is in v24, v25, v26, v27 */
++    umull       v0.8h, v24.8b, v11.8b  /* mask * v11; only v11 of the source registers is read here */
++    umull       v1.8h, v25.8b, v11.8b
++    umull       v2.8h, v26.8b, v11.8b
++    umull       v3.8h, v27.8b, v11.8b
++    urshr       v12.8h, v0.8h, #8
++    urshr       v13.8h, v1.8h, #8
++    urshr       v14.8h, v2.8h, #8
++    urshr       v15.8h, v3.8h, #8
++    raddhn      v0.8b, v0.8h, v12.8h  /* urshr + raddhn: rounded narrowing of each 16-bit product to 8 bits */
++    raddhn      v1.8b, v1.8h, v13.8h
++    raddhn      v2.8b, v2.8h, v14.8h
++    raddhn      v3.8b, v3.8h, v15.8h
++    uqadd       v28.8b, v0.8b, v4.8b  /* saturating add with the destination; the final result is complete here */
++    uqadd       v29.8b, v1.8b, v5.8b
++    uqadd       v30.8b, v2.8b, v6.8b
++    uqadd       v31.8b, v3.8b, v7.8b
++.endm
++
++.macro pixman_composite_add_n_8_8_process_pixblock_tail  /* empty: the head macro already leaves the result in v28-v31 */
++.endm
++
++/* TODO: expand macros and do better instructions scheduling */
++.macro pixman_composite_add_n_8_8_process_pixblock_tail_head  /* non-interleaved: store block N, then compute block N+1 */
++    pixman_composite_add_n_8_8_process_pixblock_tail  /* expands to nothing; kept for the usual tail/head structure */
++    st1         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32  /* store the previous result before the head overwrites v28-v31 */
++    ld1         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32  /* next 32 destination bytes */
++    fetch_mask_pixblock  /* helper defined outside this view; the head reads the mask from v24-v27 */
++    cache_preload 32, 32
++    pixman_composite_add_n_8_8_process_pixblock_head
++.endm
++
++.macro pixman_composite_add_n_8_8_init  /* prepares the single constant vector read by the head macro */
++    mov         v11.s[0], w4  /* stage the 32-bit value from w4 in lane 0 of v11 */
++    dup         v11.8b, v11.b[3]  /* keep only byte 3, replicated to 8 lanes (the head comment calls v11 alpha) */
++.endm
++
++.macro pixman_composite_add_n_8_8_cleanup  /* empty: expands to no instructions */
++.endm
++
++generate_composite_function \
++    pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, /* presumably src/mask/dst bits per pixel, 0 = solid -- confirm */ \
++    FLAG_DST_READWRITE, \
++    32, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    pixman_composite_add_n_8_8_init, \
++    pixman_composite_add_n_8_8_cleanup, \
++    pixman_composite_add_n_8_8_process_pixblock_head, \
++    pixman_composite_add_n_8_8_process_pixblock_tail, \
++    pixman_composite_add_n_8_8_process_pixblock_tail_head
++
++/******************************************************************************/
++
++.macro pixman_composite_add_8_8_8_process_pixblock_head
++    /* expecting source data in {v0, v1, v2, v3} */
++    /* destination data in {v4, v5, v6, v7} */
++    /* mask in {v24, v25, v26, v27} */
++    umull       v8.8h, v24.8b, v0.8b  /* mask * source, byte by byte */
++    umull       v9.8h, v25.8b, v1.8b
++    umull       v10.8h, v26.8b, v2.8b
++    umull       v11.8h, v27.8b, v3.8b
++    urshr       v0.8h, v8.8h, #8  /* v0 and v1 are reused as scratch: the source has been consumed by the umulls */
++    urshr       v1.8h, v9.8h, #8
++    urshr       v12.8h, v10.8h, #8
++    urshr       v13.8h, v11.8h, #8
++    raddhn      v0.8b, v0.8h, v8.8h  /* urshr + raddhn: rounded narrowing of each 16-bit product to 8 bits */
++    raddhn      v1.8b, v1.8h, v9.8h
++    raddhn      v2.8b, v12.8h, v10.8h
++    raddhn      v3.8b, v13.8h, v11.8h
++    uqadd       v28.8b, v0.8b, v4.8b  /* saturating add with the destination; the final result is complete here */
++    uqadd       v29.8b, v1.8b, v5.8b
++    uqadd       v30.8b, v2.8b, v6.8b
++    uqadd       v31.8b, v3.8b, v7.8b
++.endm
++
++.macro pixman_composite_add_8_8_8_process_pixblock_tail  /* empty: the head macro already leaves the result in v28-v31 */
++.endm
++
++/* TODO: expand macros and do better instructions scheduling */
++.macro pixman_composite_add_8_8_8_process_pixblock_tail_head  /* non-interleaved: store block N, then compute block N+1 */
++    pixman_composite_add_8_8_8_process_pixblock_tail  /* expands to nothing; kept for the usual tail/head structure */
++    st1         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32  /* store the previous result before the head overwrites v28-v31 */
++    ld1         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32  /* next 32 destination bytes */
++    fetch_mask_pixblock  /* helpers defined outside this view; the head reads mask v24-v27 and source v0-v3 */
++    fetch_src_pixblock
++    cache_preload 32, 32
++    pixman_composite_add_8_8_8_process_pixblock_head
++.endm
++
++.macro pixman_composite_add_8_8_8_init  /* empty: no constant registers to set up */
++.endm
++
++.macro pixman_composite_add_8_8_8_cleanup  /* empty: expands to no instructions */
++.endm
++
++generate_composite_function \
++    pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, /* presumably src/mask/dst bits per pixel -- confirm */ \
++    FLAG_DST_READWRITE, \
++    32, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    pixman_composite_add_8_8_8_init, \
++    pixman_composite_add_8_8_8_cleanup, \
++    pixman_composite_add_8_8_8_process_pixblock_head, \
++    pixman_composite_add_8_8_8_process_pixblock_tail, \
++    pixman_composite_add_8_8_8_process_pixblock_tail_head
++
++/******************************************************************************/
++
++.macro pixman_composite_add_8888_8888_8888_process_pixblock_head
++    /* expecting source data in {v0, v1, v2, v3} */
++    /* destination data in {v4, v5, v6, v7} */
++    /* mask in {v24, v25, v26, v27} */
++    umull       v8.8h,  v27.8b, v0.8b  /* every source channel is scaled by v27 alone; v24-v26 are not read */
++    umull       v9.8h,  v27.8b, v1.8b
++    umull       v10.8h, v27.8b, v2.8b
++    umull       v11.8h, v27.8b, v3.8b
++    /* 1 cycle bubble */
++    ursra       v8.8h,  v8.8h,  #8  /* x += (x + 128) >> 8: first step of the rounded narrowing finished by the tail */
++    ursra       v9.8h,  v9.8h,  #8
++    ursra       v10.8h, v10.8h, #8
++    ursra       v11.8h, v11.8h, #8
++.endm
++
++.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail  /* consumes v8-v11 from the head; result in v28-v31 */
++    /* 2 cycle bubble */
++    rshrn       v28.8b, v8.8h,  #8  /* rounding narrow: completes the 16-bit to 8-bit reduction begun by ursra */
++    rshrn       v29.8b, v9.8h,  #8
++    rshrn       v30.8b, v10.8h, #8
++    rshrn       v31.8b, v11.8h, #8
++    uqadd       v28.8b, v4.8b,  v28.8b  /* saturating add with the destination (v4-v7) */
++    uqadd       v29.8b, v5.8b,  v29.8b
++    uqadd       v30.8b, v6.8b,  v30.8b
++    uqadd       v31.8b, v7.8b,  v31.8b
++.endm
++
++.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head  /* tail of block N interleaved with head of block N+1 */
++    fetch_src_pixblock  /* helper defined outside this view; the umulls below read the source from v0-v3 */
++        rshrn       v28.8b, v8.8h,  #8  /* 8-space indent: tail work on the previous block (v8-v11) */
++    fetch_mask_pixblock
++        rshrn       v29.8b, v9.8h,  #8
++    umull       v8.8h,  v27.8b, v0.8b  /* 4-space indent: head work; each v8-v11 is rewritten only after its rshrn */
++        rshrn       v30.8b, v10.8h, #8
++    umull       v9.8h,  v27.8b, v1.8b
++        rshrn       v31.8b, v11.8h, #8
++    umull       v10.8h, v27.8b, v2.8b
++    umull       v11.8h, v27.8b, v3.8b
++        uqadd       v28.8b, v4.8b,  v28.8b  /* uses the old destination, so it must precede the ld4 below */
++        uqadd       v29.8b, v5.8b,  v29.8b
++        uqadd       v30.8b, v6.8b,  v30.8b
++        uqadd       v31.8b, v7.8b,  v31.8b
++    ursra       v8.8h,  v8.8h,  #8
++    ld4         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32  /* next 8 destination pixels, de-interleaved into v4-v7 */
++    ursra       v9.8h,  v9.8h,  #8
++        st4         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32  /* store the finished previous block */
++    ursra       v10.8h, v10.8h, #8
++
++    cache_preload 8, 8
++
++    ursra       v11.8h, v11.8h, #8
++.endm
++
++generate_composite_function \
++    pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, /* presumably src/mask/dst bits per pixel -- confirm */ \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    10, /* prefetch distance */ \
++    default_init, \
++    default_cleanup, \
++    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
++    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
++    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    4,  /* dst_r_basereg */ \
++    0,  /* src_basereg   */ \
++    24  /* mask_basereg  */
++
++generate_composite_function_single_scanline \
++    pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, /* reuses the add_8888_8888_8888 pixblock macros below */ \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    default_init, \
++    default_cleanup, \
++    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
++    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
++    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    4,  /* dst_r_basereg */ \
++    0,  /* src_basereg   */ \
++    24  /* mask_basereg  */
++
++/******************************************************************************/
++
++generate_composite_function \
++    pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, /* reuses the add_8888_8888_8888 pixblock macros below */ \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    default_init, \
++    default_cleanup, \
++    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
++    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
++    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    4,  /* dst_r_basereg */ \
++    0,  /* src_basereg   */ \
++    27  /* mask_basereg: v27 is the only mask register the shared head macro reads */
++
++/******************************************************************************/
++
++.macro pixman_composite_add_n_8_8888_init  /* fills v0-v3, the registers the shared head macro reads as source */
++    mov         v3.s[0], w4  /* stage the 32-bit value from w4 in lane 0 of v3 */
++    dup         v0.8b, v3.b[0]  /* v0..v2 = byte 0..2 replicated across 8 lanes */
++    dup         v1.8b, v3.b[1]
++    dup         v2.8b, v3.b[2]
++    dup         v3.8b, v3.b[3]  /* done last because it overwrites the staging register v3 */
++.endm
++
++.macro pixman_composite_add_n_8_8888_cleanup  /* empty: expands to no instructions */
++.endm
++
++generate_composite_function \
++    pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, /* reuses the add_8888_8888_8888 pixblock macros below */ \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    pixman_composite_add_n_8_8888_init, \
++    pixman_composite_add_n_8_8888_cleanup, \
++    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
++    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
++    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    4,  /* dst_r_basereg */ \
++    0,  /* src_basereg   */ \
++    27  /* mask_basereg: v27 is the only mask register the shared head macro reads */
++
++/******************************************************************************/
++
++.macro pixman_composite_add_8888_n_8888_init  /* fills v27, the register the shared head macro reads as mask */
++    mov         v27.s[0], w6  /* stage the 32-bit value from w6 (presumably the solid mask argument -- confirm against caller) */
++    dup         v27.8b, v27.b[3]  /* keep only byte 3, replicated to 8 lanes */
++.endm
++
++.macro pixman_composite_add_8888_n_8888_cleanup  /* empty: expands to no instructions */
++.endm
++
++generate_composite_function \
++    pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, /* reuses the add_8888_8888_8888 pixblock macros below */ \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    pixman_composite_add_8888_n_8888_init, \
++    pixman_composite_add_8888_n_8888_cleanup, \
++    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
++    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
++    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    4,  /* dst_r_basereg */ \
++    0,  /* src_basereg   */ \
++    27  /* mask_basereg  */
++
++/******************************************************************************/
++
++.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
++    /* expecting source data in {v0, v1, v2, v3} */
++    /* destination data in {v4, v5, v6, v7} */
++    /* solid mask is in v15 */
++
++    /* 'in' */
++    umull       v11.8h, v15.8b, v3.8b  /* mask (v15) * each source channel */
++    umull       v10.8h, v15.8b, v2.8b
++    umull       v9.8h,  v15.8b, v1.8b
++    umull       v8.8h,  v15.8b, v0.8b
++    urshr       v16.8h, v11.8h, #8
++    urshr       v14.8h, v10.8h, #8
++    urshr       v13.8h,  v9.8h, #8
++    urshr       v12.8h,  v8.8h, #8
++    raddhn      v3.8b, v11.8h, v16.8h  /* urshr + raddhn: rounded narrowing; the masked source replaces v0-v3 */
++    raddhn      v2.8b, v10.8h, v14.8h
++    raddhn      v1.8b,  v9.8h, v13.8h
++    raddhn      v0.8b,  v8.8h, v12.8h
++    mvn         v24.8b, v3.8b  /* get inverted alpha */
++    /* now do alpha blending */
++    umull       v8.8h, v24.8b, v4.8b  /* inverted alpha * destination; the tail macro narrows v8-v11 */
++    umull       v9.8h, v24.8b, v5.8b
++    umull       v10.8h, v24.8b, v6.8b
++    umull       v11.8h, v24.8b, v7.8b
++.endm
++
++.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail  /* narrows the head's products (v8-v11) into v28-v31 */
++    urshr       v16.8h, v8.8h, #8
++    urshr       v17.8h, v9.8h, #8
++    urshr       v18.8h, v10.8h, #8
++    urshr       v19.8h, v11.8h, #8
++    raddhn      v28.8b, v16.8h, v8.8h  /* urshr + raddhn: rounded narrowing; the source (v0-v3) is not added here */
++    raddhn      v29.8b, v17.8h, v9.8h
++    raddhn      v30.8b, v18.8h, v10.8h
++    raddhn      v31.8b, v19.8h, v11.8h
++.endm
++
++/* TODO: expand macros and do better instructions scheduling */
++.macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head  /* reuses the _n_8888 head/tail and adds a mask fetch */
++    ld4        {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32  /* next 8 destination pixels; the tail below does not read v4-v7 */
++    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail  /* finish the previous block into v28-v31 */
++    fetch_src_pixblock  /* helper defined outside this view */
++    cache_preload 8, 8
++    fetch_mask_pixblock  /* per-pixel mask; the head reads it from v15 */
++    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head  /* start the next block */
++    st4        {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32  /* store the previous block's result */
++.endm
++
++generate_composite_function_single_scanline \
++    pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, /* presumably src/mask/dst bits per pixel -- confirm */ \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    default_init_need_all_regs, \
++    default_cleanup_need_all_regs, \
++    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \
++    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \
++    pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head, /* comma added: do not rely on whitespace separation */ \
++    28, /* dst_w_basereg */ \
++    4,  /* dst_r_basereg */ \
++    0,  /* src_basereg   */ \
++    12  /* mask_basereg: the shared head macro reads the mask from v15 */
++
++/******************************************************************************/
++
++.macro pixman_composite_over_8888_n_8888_process_pixblock_head  /* identical to the out_reverse head */
++    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head  /* leaves masked source in v0-v3, products in v8-v11 */
++.endm
++
++.macro pixman_composite_over_8888_n_8888_process_pixblock_tail  /* out_reverse tail plus the masked source */
++    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail  /* narrowed products land in v28-v31 */
++    uqadd       v28.8b, v0.8b, v28.8b  /* saturating add of the masked source (v0-v3) */
++    uqadd       v29.8b, v1.8b, v29.8b
++    uqadd       v30.8b, v2.8b, v30.8b
++    uqadd       v31.8b, v3.8b, v31.8b
++.endm
++
++/* TODO: expand macros and do better instructions scheduling */
++.macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
++    ld4        {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
++    pixman_composite_over_8888_n_8888_process_pixblock_tail
++    fetch_src_pixblock
++    cache_preload 8, 8
++    pixman_composite_over_8888_n_8888_process_pixblock_head
++    st4        {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
++.endm
++
++.macro pixman_composite_over_8888_n_8888_init
++    mov         v15.s[0], w6
++    dup         v15.8b, v15.b[3]
++.endm
++
++.macro pixman_composite_over_8888_n_8888_cleanup
++.endm
++
++generate_composite_function \
++    pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    pixman_composite_over_8888_n_8888_init, \
++    pixman_composite_over_8888_n_8888_cleanup, \
++    pixman_composite_over_8888_n_8888_process_pixblock_head, \
++    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
++    pixman_composite_over_8888_n_8888_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    4,  /* dst_r_basereg */ \
++    0,  /* src_basereg   */ \
++    12  /* mask_basereg  */
++
++/******************************************************************************/
++
++/* Like the 8888_n_8888 tail_head, but also fetches a mask block per iteration. TODO: expand macros and do better instructions scheduling */
++.macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
++    ld4        {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 /* read 32 dst bytes de-interleaved into v4-v7 */
++    pixman_composite_over_8888_n_8888_process_pixblock_tail
++    fetch_src_pixblock
++    cache_preload 8, 8
++    fetch_mask_pixblock
++    pixman_composite_over_8888_n_8888_process_pixblock_head
++    st4        {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 /* write v28-v31 re-interleaved */
++.endm
++
++generate_composite_function \
++    pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    default_init_need_all_regs, \
++    default_cleanup_need_all_regs, \
++    pixman_composite_over_8888_n_8888_process_pixblock_head, /* head/tail reused from the 8888_n_8888 variant */ \
++    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
++    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, /* comma-terminated like every other argument */ \
++    28, /* dst_w_basereg */ \
++    4,  /* dst_r_basereg */ \
++    0,  /* src_basereg   */ \
++    12  /* mask_basereg  */
++
++generate_composite_function_single_scanline \
++    pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    default_init_need_all_regs, \
++    default_cleanup_need_all_regs, \
++    pixman_composite_over_8888_n_8888_process_pixblock_head, /* head/tail reused from the 8888_n_8888 variant */ \
++    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
++    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, /* comma-terminated like every other argument */ \
++    28, /* dst_w_basereg */ \
++    4,  /* dst_r_basereg */ \
++    0,  /* src_basereg   */ \
++    12  /* mask_basereg  */
++
++/******************************************************************************/
++
++/* Same instruction sequence as the 8888_8888_8888 tail_head; used with an 8-bit mask. TODO: expand macros and do better instructions scheduling */
++.macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
++    ld4        {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 /* read 32 dst bytes de-interleaved into v4-v7 */
++    pixman_composite_over_8888_n_8888_process_pixblock_tail
++    fetch_src_pixblock
++    cache_preload 8, 8
++    fetch_mask_pixblock
++    pixman_composite_over_8888_n_8888_process_pixblock_head
++    st4        {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 /* write v28-v31 re-interleaved */
++.endm
++
++generate_composite_function \
++    pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    default_init_need_all_regs, \
++    default_cleanup_need_all_regs, \
++    pixman_composite_over_8888_n_8888_process_pixblock_head, /* head/tail reused from the 8888_n_8888 variant */ \
++    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
++    pixman_composite_over_8888_8_8888_process_pixblock_tail_head, /* comma-terminated like every other argument */ \
++    28, /* dst_w_basereg */ \
++    4,  /* dst_r_basereg */ \
++    0,  /* src_basereg   */ \
++    15  /* mask_basereg  */
++
++/******************************************************************************/
++
++.macro pixman_composite_src_0888_0888_process_pixblock_head /* empty: no per-block arithmetic is performed */
++.endm
++
++.macro pixman_composite_src_0888_0888_process_pixblock_tail /* empty: no per-block arithmetic is performed */
++.endm
++
++.macro pixman_composite_src_0888_0888_process_pixblock_tail_head /* store the current block, then fetch the next */
++    st3     {v0.8b, v1.8b, v2.8b}, [DST_W], #24 /* write 24 bytes from v0-v2 as 3-byte structures, advance DST_W */
++    fetch_src_pixblock
++    cache_preload 8, 8
++.endm
++
++generate_composite_function \
++    pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
++    FLAG_DST_WRITEONLY, \
++    8, /* number of pixels, processed in a single block */ \
++    10, /* prefetch distance */ \
++    default_init, \
++    default_cleanup, \
++    pixman_composite_src_0888_0888_process_pixblock_head, \
++    pixman_composite_src_0888_0888_process_pixblock_tail, \
++    pixman_composite_src_0888_0888_process_pixblock_tail_head, \
++    0, /* dst_w_basereg */ \
++    0, /* dst_r_basereg */ \
++    0, /* src_basereg   */ \
++    0  /* mask_basereg  */
++
++/******************************************************************************/
++
++.macro pixman_composite_src_0888_8888_rev_process_pixblock_head /* swaps v0 and v2, using v31 as the temporary */
++    mov    v31.8b, v2.8b
++    mov    v2.8b, v0.8b
++    mov    v0.8b, v31.8b
++.endm
++
++.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail /* empty: the head already leaves the result in v0-v3 */
++.endm
++
++.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head /* store current block, fetch next, redo the swap */
++    st4    {v0.8b, v1.8b, v2.8b, v3.8b}, [DST_W], #32 /* write v0-v3 as 4-byte structures, advance DST_W */
++    fetch_src_pixblock
++    mov    v31.8b, v2.8b /* same v0 <-> v2 swap as in the head macro */
++    mov    v2.8b, v0.8b
++    mov    v0.8b, v31.8b
++    cache_preload 8, 8
++.endm
++
++.macro pixman_composite_src_0888_8888_rev_init
++    eor    v3.8b, v3.8b, v3.8b /* v3 = 0, so the fourth byte stored by st4 is zero */
++.endm
++
++generate_composite_function \
++    pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
++    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    10, /* prefetch distance */ \
++    pixman_composite_src_0888_8888_rev_init, \
++    default_cleanup, \
++    pixman_composite_src_0888_8888_rev_process_pixblock_head, \
++    pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
++    pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
++    0, /* dst_w_basereg */ \
++    0, /* dst_r_basereg */ \
++    0, /* src_basereg   */ \
++    0  /* mask_basereg  */
++
++/******************************************************************************/
++
++.macro pixman_composite_src_0888_0565_rev_process_pixblock_head /* widens v1 and v2 into the top byte of 16-bit lanes */
++    ushll       v8.8h, v1.8b, #7 /* v8 = v1 << 7, widened to 16 bit */
++    sli         v8.8h, v8.8h, #1 /* one more bit of shift, so v8 = v1 << 8 */
++    ushll       v9.8h, v2.8b, #7
++    sli         v9.8h, v9.8h, #1 /* v9 = v2 << 8 */
++.endm
++
++.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail /* packs v0:v1:v2 as 5:6:5 bits into v14 */
++    ushll       v14.8h, v0.8b, #7
++    sli         v14.8h, v14.8h, #1 /* v14 = v0 << 8 */
++    sri         v14.8h, v8.8h, #5 /* insert v8 >> 5, keeping the top 5 bits of v14 */
++    sri         v14.8h, v9.8h, #11 /* insert v9 >> 11, keeping the top 11 bits of v14 */
++    mov         v28.d[0], v14.d[0] /* split the 8 packed pixels into the low halves of v28 and v29 */
++    mov         v29.d[0], v14.d[1]
++.endm
++
++.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head /* tail (deeper indent) interleaved with fetch and head */
++        ushll       v14.8h, v0.8b, #7
++        sli         v14.8h, v14.8h, #1
++    fetch_src_pixblock
++        sri         v14.8h, v8.8h, #5
++        sri         v14.8h, v9.8h, #11
++        mov         v28.d[0], v14.d[0]
++        mov         v29.d[0], v14.d[1]
++    ushll       v8.8h, v1.8b, #7
++    sli         v8.8h, v8.8h, #1
++        st1     {v14.8h}, [DST_W], #16 /* write the 8 packed 16-bit pixels, advance DST_W */
++    ushll       v9.8h, v2.8b, #7
++    sli         v9.8h, v9.8h, #1
++.endm
++
++generate_composite_function \
++    pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
++    FLAG_DST_WRITEONLY, \
++    8, /* number of pixels, processed in a single block */ \
++    10, /* prefetch distance */ \
++    default_init, \
++    default_cleanup, \
++    pixman_composite_src_0888_0565_rev_process_pixblock_head, \
++    pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
++    pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    0, /* dst_r_basereg */ \
++    0, /* src_basereg   */ \
++    0  /* mask_basereg  */
++
++/******************************************************************************/
++
++.macro pixman_composite_src_pixbuf_8888_process_pixblock_head /* widening multiply of v0, v1, v2 by v3 */
++    umull       v8.8h, v3.8b, v0.8b
++    umull       v9.8h, v3.8b, v1.8b
++    umull       v10.8h, v3.8b, v2.8b
++.endm
++
++.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail /* scales the products back to 8 bit; v0 and v2 land in swapped order */
++    urshr       v11.8h, v8.8h, #8 /* t = (x + 128) >> 8 */
++    mov         v30.8b, v31.8b /* three moves: swap v31 and v3 through v30 */
++    mov         v31.8b, v3.8b
++    mov         v3.8b, v30.8b
++    urshr       v12.8h, v9.8h, #8
++    urshr       v13.8h, v10.8h, #8
++    raddhn      v30.8b, v11.8h, v8.8h /* (x + t + 128) >> 8; the v0 product goes to v30 */
++    raddhn      v29.8b, v12.8h, v9.8h
++    raddhn      v28.8b, v13.8h, v10.8h /* the v2 product goes to v28 */
++.endm
++
++.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head /* tail (deeper indent) interleaved with fetch, head and prefetch */
++        urshr       v11.8h, v8.8h, #8 /* t = (x + 128) >> 8 */
++        mov         v30.8b, v31.8b /* three moves: swap v31 and v3 through v30 */
++        mov         v31.8b, v3.8b
++        mov         v3.8b, v30.8b /* completes the swap from the v30 temporary, as in the tail macro */
++        urshr       v12.8h, v9.8h, #8
++        urshr       v13.8h, v10.8h, #8
++    fetch_src_pixblock
++        raddhn      v30.8b, v11.8h, v8.8h /* (x + t + 128) >> 8; the v0 product goes to v30 */
++                                    PF add PF_X, PF_X, #8
++                                    PF tst PF_CTL, #0xF
++                                    PF beq 10f
++                                    PF add PF_X, PF_X, #8
++                                    PF sub PF_CTL, PF_CTL, #1
++10: /* PF-prefixed lines are prefetch bookkeeping; PF itself is defined outside this section */
++        raddhn      v29.8b, v12.8h, v9.8h
++        raddhn      v28.8b, v13.8h, v10.8h /* the v2 product goes to v28 */
++    umull       v8.8h, v3.8b, v0.8b /* head for the block just fetched */
++    umull       v9.8h, v3.8b, v1.8b
++    umull       v10.8h, v3.8b, v2.8b
++         st4    {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 /* write the finished block, advance DST_W */
++                                    PF cmp PF_X, ORIG_W
++                                    PF lsl DUMMY, PF_X, src_bpp_shift
++                                    PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
++                                    PF ble 10f
++                                    PF sub PF_X, PF_X, ORIG_W
++                                    PF subs PF_CTL, PF_CTL, #0x10
++                                    PF ble 10f
++                                    PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
++                                    PF ldrsb DUMMY, [PF_SRC, DUMMY]
++                                    PF add PF_SRC, PF_SRC, #1
++10:
++.endm
++
++generate_composite_function \
++    pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
++    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    10, /* prefetch distance */ \
++    default_init, \
++    default_cleanup, \
++    pixman_composite_src_pixbuf_8888_process_pixblock_head, \
++    pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
++    pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
++    28, /* dst_w_basereg: the macros above leave their result in v28-v31 */ \
++    0, /* dst_r_basereg */ \
++    0, /* src_basereg: the macros above read v0-v3 */ \
++    0  /* mask_basereg  */
++
++/******************************************************************************/
++
++.macro pixman_composite_src_rpixbuf_8888_process_pixblock_head /* widening multiply of v0, v1, v2 by v3 */
++    umull       v8.8h, v3.8b, v0.8b
++    umull       v9.8h, v3.8b, v1.8b
++    umull       v10.8h, v3.8b, v2.8b
++.endm
++
++.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail /* scales the products back to 8 bit, keeping v0/v1/v2 order */
++    urshr       v11.8h, v8.8h, #8 /* t = (x + 128) >> 8 */
++    mov         v30.8b, v31.8b /* three moves: swap v31 and v3 through v30 */
++    mov         v31.8b, v3.8b
++    mov         v3.8b, v30.8b
++    urshr       v12.8h, v9.8h, #8
++    urshr       v13.8h, v10.8h, #8
++    raddhn      v28.8b, v11.8h, v8.8h /* (x + t + 128) >> 8; the v0 product goes to v28 */
++    raddhn      v29.8b, v12.8h, v9.8h
++    raddhn      v30.8b, v13.8h, v10.8h /* the v2 product goes to v30 */
++.endm
++
++.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head /* tail (deeper indent) interleaved with fetch, head and prefetch */
++        urshr       v11.8h, v8.8h, #8
++        mov         v30.8b, v31.8b
++        mov         v31.8b, v3.8b
++        mov         v3.8b, v30.8b
++        urshr       v12.8h, v9.8h, #8
++        urshr       v13.8h, v10.8h, #8
++    fetch_src_pixblock
++        raddhn      v28.8b, v11.8h, v8.8h
++                                    PF add PF_X, PF_X, #8
++                                    PF tst PF_CTL, #0xF
++                                    PF beq 10f
++                                    PF add PF_X, PF_X, #8
++                                    PF sub PF_CTL, PF_CTL, #1
++10: /* PF-prefixed lines are prefetch bookkeeping; PF itself is defined outside this section */
++        raddhn      v29.8b, v12.8h, v9.8h
++        raddhn      v30.8b, v13.8h, v10.8h
++    umull       v8.8h, v3.8b, v0.8b /* head for the block just fetched */
++    umull       v9.8h, v3.8b, v1.8b
++    umull       v10.8h, v3.8b, v2.8b
++         st4    {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 /* write the finished block, advance DST_W */
++                                    PF cmp PF_X, ORIG_W
++                                    PF lsl DUMMY, PF_X, src_bpp_shift
++                                    PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
++                                    PF ble 10f
++                                    PF sub PF_X, PF_X, ORIG_W
++                                    PF subs PF_CTL, PF_CTL, #0x10
++                                    PF ble 10f
++                                    PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
++                                    PF ldrsb DUMMY, [PF_SRC, DUMMY]
++                                    PF add PF_SRC, PF_SRC, #1
++10:
++.endm
++
++generate_composite_function \
++    pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \
++    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    10, /* prefetch distance */ \
++    default_init, \
++    default_cleanup, \
++    pixman_composite_src_rpixbuf_8888_process_pixblock_head, \
++    pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \
++    pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    0, /* dst_r_basereg */ \
++    0, /* src_basereg   */ \
++    0  /* mask_basereg  */
++
++/******************************************************************************/
++
++.macro pixman_composite_over_0565_8_0565_process_pixblock_head /* unpacks src and dst, then forms the mask-weighted products */
++    /* mask is in v15 */
++    mov         v4.d[0], v8.d[0] /* gather the low halves of v8/v9 (src_basereg 8) into v4 */
++    mov         v4.d[1], v9.d[0]
++    mov         v13.d[0], v10.d[0] /* gather the low halves of v10/v11 (dst_r_basereg 10) into v13 */
++    mov         v13.d[1], v11.d[0]
++    convert_0565_to_x888 v4, v2, v1, v0
++    convert_0565_to_x888 v13, v6, v5, v4
++    /* source pixel data is in      {v0, v1, v2, XX} */
++    /* destination pixel data is in {v4, v5, v6, XX} */
++    mvn         v7.8b,  v15.8b /* v7 = bitwise NOT of the mask */
++    umull       v10.8h, v15.8b, v2.8b /* source channels * mask */
++    umull       v9.8h,  v15.8b, v1.8b
++    umull       v8.8h,  v15.8b, v0.8b
++    umull       v11.8h, v7.8b,  v4.8b /* destination channels * inverted mask; finished in the tail */
++    umull       v12.8h, v7.8b,  v5.8b
++    umull       v13.8h, v7.8b,  v6.8b
++    urshr       v19.8h, v10.8h, #8 /* t = (x + 128) >> 8 */
++    urshr       v18.8h, v9.8h,  #8
++    urshr       v17.8h, v8.8h,  #8
++    raddhn      v2.8b,  v10.8h, v19.8h /* (x + t + 128) >> 8: masked source back in v0-v2 */
++    raddhn      v1.8b,  v9.8h,  v18.8h
++    raddhn      v0.8b,  v8.8h,  v17.8h
++.endm
++
++.macro pixman_composite_over_0565_8_0565_process_pixblock_tail /* scales the dst products, adds them to the source, repacks to 0565 */
++    urshr       v17.8h, v11.8h,  #8
++    urshr       v18.8h, v12.8h,  #8
++    urshr       v19.8h, v13.8h,  #8
++    raddhn      v28.8b, v17.8h, v11.8h
++    raddhn      v29.8b, v18.8h, v12.8h
++    raddhn      v30.8b, v19.8h, v13.8h
++    uqadd       v0.8b,  v0.8b,  v28.8b /* unsigned saturating add of the two weighted parts */
++    uqadd       v1.8b,  v1.8b,  v29.8b
++    uqadd       v2.8b,  v2.8b,  v30.8b
++    /* 32bpp result is in {v0, v1, v2, XX} */
++    convert_8888_to_0565 v2, v1, v0, v14, v30, v13
++    mov         v28.d[0], v14.d[0] /* split the 8 packed pixels into the low halves of v28 and v29 */
++    mov         v29.d[0], v14.d[1]
++.endm
++
++/* Combined tail+head step: finishes one pixel block and starts the next. TODO: expand macros and do better instructions scheduling */
++.macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head
++    fetch_mask_pixblock
++    pixman_composite_over_0565_8_0565_process_pixblock_tail
++    fetch_src_pixblock
++    ld1        {v10.4h, v11.4h}, [DST_R], #16 /* read 16 dst bytes into v10/v11, advance DST_R */
++    cache_preload 8, 8
++    pixman_composite_over_0565_8_0565_process_pixblock_head
++    st1        {v14.8h}, [DST_W], #16 /* write the packed result left in v14 by the tail */
++.endm
++
++generate_composite_function \
++    pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \
++    FLAG_DST_READWRITE, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    default_init_need_all_regs, \
++    default_cleanup_need_all_regs, \
++    pixman_composite_over_0565_8_0565_process_pixblock_head, \
++    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
++    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    10,  /* dst_r_basereg */ \
++    8,  /* src_basereg   */ \
++    15  /* mask_basereg  */
++
++/******************************************************************************/
++
++.macro pixman_composite_over_0565_n_0565_init /* fills v15.8b with byte 3 of w6 */
++    mov         v15.s[0], w6 /* NOTE(review): w6 presumably holds the solid mask value - confirm against the caller */
++    dup         v15.8b, v15.b[3] /* broadcast the top byte of that word to all 8 lanes */
++.endm
++
++.macro pixman_composite_over_0565_n_0565_cleanup /* intentionally empty */
++.endm
++
++generate_composite_function \
++    pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \
++    FLAG_DST_READWRITE, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    pixman_composite_over_0565_n_0565_init, \
++    pixman_composite_over_0565_n_0565_cleanup, \
++    pixman_composite_over_0565_8_0565_process_pixblock_head, /* pixel-block macros reused from the 0565_8_0565 variant */ \
++    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
++    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    10,  /* dst_r_basereg */ \
++    8,  /* src_basereg   */ \
++    15  /* mask_basereg  */
++
++/******************************************************************************/
++
++.macro pixman_composite_add_0565_8_0565_process_pixblock_head /* unpacks src and dst, then scales the source by the mask */
++    /* mask is in v15 */
++    mov         v4.d[0], v8.d[0] /* gather the low halves of v8/v9 (src_basereg 8) into v4 */
++    mov         v4.d[1], v9.d[0]
++    mov         v13.d[0], v10.d[0] /* gather the low halves of v10/v11 (dst_r_basereg 10) into v13 */
++    mov         v13.d[1], v11.d[0]
++    convert_0565_to_x888 v4,  v2, v1, v0
++    convert_0565_to_x888 v13, v6, v5, v4
++    /* source pixel data is in      {v0, v1, v2, XX} */
++    /* destination pixel data is in {v4, v5, v6, XX} */
++    umull       v9.8h,  v15.8b, v2.8b /* source channels * mask */
++    umull       v8.8h,  v15.8b, v1.8b
++    umull       v7.8h,  v15.8b, v0.8b
++    urshr       v12.8h, v9.8h,  #8 /* t = (x + 128) >> 8 */
++    urshr       v11.8h, v8.8h,  #8
++    urshr       v10.8h, v7.8h,  #8
++    raddhn      v2.8b,  v9.8h,  v12.8h /* (x + t + 128) >> 8: masked source back in v0-v2 */
++    raddhn      v1.8b,  v8.8h,  v11.8h
++    raddhn      v0.8b,  v7.8h,  v10.8h
++.endm
++
++.macro pixman_composite_add_0565_8_0565_process_pixblock_tail /* adds the unscaled destination and repacks to 0565 */
++    uqadd       v0.8b,  v0.8b,  v4.8b /* unsigned saturating add with the destination channels */
++    uqadd       v1.8b,  v1.8b,  v5.8b
++    uqadd       v2.8b,  v2.8b,  v6.8b
++    /* 32bpp result is in {v0, v1, v2, XX} */
++    convert_8888_to_0565 v2, v1, v0, v14, v30, v13
++    mov         v28.d[0], v14.d[0] /* split the 8 packed pixels into the low halves of v28 and v29 */
++    mov         v29.d[0], v14.d[1]
++.endm
++
++/* Combined tail+head step: finishes one pixel block and starts the next. TODO: expand macros and do better instructions scheduling */
++.macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
++    fetch_mask_pixblock
++    pixman_composite_add_0565_8_0565_process_pixblock_tail
++    fetch_src_pixblock
++    ld1        {v10.4h, v11.4h}, [DST_R], #16 /* read 16 dst bytes into v10/v11, advance DST_R */
++    cache_preload 8, 8
++    pixman_composite_add_0565_8_0565_process_pixblock_head
++    st1        {v14.8h}, [DST_W], #16 /* write the packed result left in v14 by the tail */
++.endm
++
++generate_composite_function \
++    pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \
++    FLAG_DST_READWRITE, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    default_init_need_all_regs, \
++    default_cleanup_need_all_regs, \
++    pixman_composite_add_0565_8_0565_process_pixblock_head, \
++    pixman_composite_add_0565_8_0565_process_pixblock_tail, \
++    pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    10, /* dst_r_basereg */ \
++    8,  /* src_basereg   */ \
++    15  /* mask_basereg  */
++
++/******************************************************************************/
++
++.macro pixman_composite_out_reverse_8_0565_process_pixblock_head /* unpacks dst and multiplies it by the inverted v15 */
++    /* mask is in v15 (v15 is passed as src_basereg in the invocation below) */
++    mov         v12.d[0], v10.d[0] /* gather the low halves of v10/v11 (dst_r_basereg 10) into v12 */
++    mov         v12.d[1], v11.d[0]
++    convert_0565_to_x888 v12, v6, v5, v4
++    /* destination pixel data is in {v4, v5, v6, xx} */
++    mvn         v24.8b, v15.8b /* get inverted alpha */
++    /* now do alpha blending */
++    umull       v8.8h,  v24.8b, v4.8b
++    umull       v9.8h,  v24.8b, v5.8b
++    umull       v10.8h, v24.8b, v6.8b
++.endm
++
++.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail /* scales the products back to 8 bit and repacks to 0565 */
++    urshr       v11.8h, v8.8h, #8 /* t = (x + 128) >> 8 */
++    urshr       v12.8h, v9.8h, #8
++    urshr       v13.8h, v10.8h, #8
++    raddhn      v0.8b, v11.8h, v8.8h /* (x + t + 128) >> 8 */
++    raddhn      v1.8b, v12.8h, v9.8h
++    raddhn      v2.8b, v13.8h, v10.8h
++    /* 32bpp result is in {v0, v1, v2, XX} */
++    convert_8888_to_0565 v2, v1, v0, v14, v12, v3
++    mov         v28.d[0], v14.d[0] /* split the 8 packed pixels into the low halves of v28 and v29 */
++    mov         v29.d[0], v14.d[1]
++.endm
++
++/* Combined tail+head step: finishes one pixel block and starts the next. TODO: expand macros and do better instructions scheduling */
++.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head
++    fetch_src_pixblock
++    pixman_composite_out_reverse_8_0565_process_pixblock_tail
++    ld1        {v10.4h, v11.4h}, [DST_R], #16 /* read 16 dst bytes into v10/v11, advance DST_R */
++    cache_preload 8, 8
++    pixman_composite_out_reverse_8_0565_process_pixblock_head
++    st1        {v14.8h}, [DST_W], #16 /* write the packed result left in v14 by the tail */
++.endm
++
++generate_composite_function \
++    pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \
++    FLAG_DST_READWRITE, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    default_init_need_all_regs, \
++    default_cleanup_need_all_regs, \
++    pixman_composite_out_reverse_8_0565_process_pixblock_head, \
++    pixman_composite_out_reverse_8_0565_process_pixblock_tail, \
++    pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    10,  /* dst_r_basereg */ \
++    15, /* src_basereg   */ \
++    0   /* mask_basereg  */
++
++/******************************************************************************/
++
++.macro pixman_composite_out_reverse_8_8888_process_pixblock_head /* multiplies v4-v7 by the inverted v0 */
++    /* src is in v0 */
++    /* destination pixel data is in {v4, v5, v6, v7} */
++    mvn         v1.8b, v0.8b /* get inverted alpha */
++    /* now do alpha blending */
++    umull       v8.8h, v1.8b, v4.8b
++    umull       v9.8h, v1.8b, v5.8b
++    umull       v10.8h, v1.8b, v6.8b
++    umull       v11.8h, v1.8b, v7.8b
++.endm
++
++.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail /* scales the four products back to 8 bit */
++    urshr       v14.8h, v8.8h, #8 /* t = (x + 128) >> 8 */
++    urshr       v15.8h, v9.8h, #8
++    urshr       v12.8h, v10.8h, #8
++    urshr       v13.8h, v11.8h, #8
++    raddhn      v28.8b, v14.8h, v8.8h /* (x + t + 128) >> 8 */
++    raddhn      v29.8b, v15.8h, v9.8h
++    raddhn      v30.8b, v12.8h, v10.8h
++    raddhn      v31.8b, v13.8h, v11.8h
++    /* 32bpp result is in {v28, v29, v30, v31} */
++.endm
++
++/* Combined tail+head step: finishes one pixel block and starts the next. TODO: expand macros and do better instructions scheduling */
++.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail_head
++    fetch_src_pixblock
++    pixman_composite_out_reverse_8_8888_process_pixblock_tail
++    ld4       {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 /* read 32 dst bytes de-interleaved into v4-v7 */
++    cache_preload 8, 8
++    pixman_composite_out_reverse_8_8888_process_pixblock_head
++    st4       {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 /* write v28-v31 (set by the tail above) re-interleaved */
++.endm
++
++generate_composite_function \
++    pixman_composite_out_reverse_8_8888_asm_neon, 8, 0, 32, \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    default_init, \
++    default_cleanup, \
++    pixman_composite_out_reverse_8_8888_process_pixblock_head, \
++    pixman_composite_out_reverse_8_8888_process_pixblock_tail, \
++    pixman_composite_out_reverse_8_8888_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    4, /* dst_r_basereg */ \
++    0, /* src_basereg   */ \
++    0   /* mask_basereg  */
++    
++/******************************************************************************/
++
++generate_composite_function_nearest_scanline \
++    pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    default_init, \
++    default_cleanup, \
++    pixman_composite_over_8888_8888_process_pixblock_head, \
++    pixman_composite_over_8888_8888_process_pixblock_tail, \
++    pixman_composite_over_8888_8888_process_pixblock_tail_head /* no basereg arguments given here, unlike the next invocation */
++
++generate_composite_function_nearest_scanline \
++    pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    default_init, \
++    default_cleanup, \
++    pixman_composite_over_8888_0565_process_pixblock_head, \
++    pixman_composite_over_8888_0565_process_pixblock_tail, \
++    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    4,  /* dst_r_basereg */ \
++    0,  /* src_basereg   */ \
++    24  /* mask_basereg  */
++
++generate_composite_function_nearest_scanline \
++    pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \
++    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    default_init, \
++    default_cleanup, \
++    pixman_composite_src_8888_0565_process_pixblock_head, \
++    pixman_composite_src_8888_0565_process_pixblock_tail, \
++    pixman_composite_src_8888_0565_process_pixblock_tail_head /* last argument: no trailing comma or line continuation */
++
++generate_composite_function_nearest_scanline \
++    pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \
++    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    default_init, \
++    default_cleanup, \
++    pixman_composite_src_0565_8888_process_pixblock_head, \
++    pixman_composite_src_0565_8888_process_pixblock_tail, \
++    pixman_composite_src_0565_8888_process_pixblock_tail_head /* no basereg arguments given for this invocation */
++
++generate_composite_function_nearest_scanline \
++    pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon, 32, 8, 16, \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    default_init_need_all_regs, \
++    default_cleanup_need_all_regs, \
++    pixman_composite_over_8888_8_0565_process_pixblock_head, \
++    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
++    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    4,  /* dst_r_basereg */ \
++    8,  /* src_basereg   */ \
++    24  /* mask_basereg  */
++
++generate_composite_function_nearest_scanline \
++    pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_neon, 16, 8, 16, \
++    FLAG_DST_READWRITE, \
++    8, /* number of pixels, processed in a single block */ \
++    default_init_need_all_regs, \
++    default_cleanup_need_all_regs, \
++    pixman_composite_over_0565_8_0565_process_pixblock_head, /* same pixel-block macros as the non-scaled over_0565_8_0565 */ \
++    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
++    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    10,  /* dst_r_basereg */ \
++    8,  /* src_basereg   */ \
++    15  /* mask_basereg  */
++
++/******************************************************************************/
++
++/*
++ * Bilinear scaling support code which tries to provide pixel fetching, color
++ * format conversion, and interpolation as separate macros which can be used
++ * as the basic building blocks for constructing bilinear scanline functions.
++ */
++
++.macro bilinear_load_8888 reg1, reg2, tmp /* loads 8 bytes from each of two rows and steps X; tmp is unused */
++    asr       TMP1, X, #16 /* TMP1 = X >> 16 (presumably the integer part of a 16.16 coordinate) */
++    add       X, X, UX /* advance X by UX for the next fetch */
++    add       TMP1, TOP, TMP1, lsl #2 /* address = TOP + index * 4 bytes */
++    ld1       {&reg1&.2s}, [TMP1], STRIDE /* first row into reg1, then move TMP1 by STRIDE */
++    ld1       {&reg2&.2s}, [TMP1] /* second row into reg2 */
++.endm
++
++.macro bilinear_load_0565 reg1, reg2, tmp /* 16bpp variant: loads 4 bytes per row, then expands via the convert macro */
++    asr       TMP1, X, #16
++    add       X, X, UX
++    add       TMP1, TOP, TMP1, lsl #1 /* address = TOP + index * 2 bytes */
++    ld1       {&reg2&.s}[0], [TMP1], STRIDE /* first row into lane 0, then move TMP1 by STRIDE */
++    ld1       {&reg2&.s}[1], [TMP1] /* second row into lane 1 */
++    convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
++.endm
++
++.macro bilinear_load_and_vertical_interpolate_two_8888 \
++                    acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
++    /* Two loads via bilinear_load_8888, each followed by acc = first_row * v28 + second_row * v29. */
++    bilinear_load_8888 reg1, reg2, tmp1
++    umull     &acc1&.8h, &reg1&.8b, v28.8b /* acc1 = reg1 * v28, widened to 16 bit */
++    umlal     &acc1&.8h, &reg2&.8b, v29.8b /* acc1 += reg2 * v29 */
++    bilinear_load_8888 reg3, reg4, tmp2
++    umull     &acc2&.8h, &reg3&.8b, v28.8b
++    umlal     &acc2&.8h, &reg4&.8b, v29.8b
++.endm
++
++.macro bilinear_load_and_vertical_interpolate_four_8888 \
++                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
++                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
++    /* Runs the two-load macro twice: once with the x* arguments, once with the y* arguments. */
++    bilinear_load_and_vertical_interpolate_two_8888 \
++                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
++    bilinear_load_and_vertical_interpolate_two_8888 \
++                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
++.endm
++
++.macro vzip reg1, reg2 /* in-place zip of two registers: reg1 = zip1(reg1, reg2), reg2 = zip2(reg1, reg2) */
++    umov      TMP4, v31.d[0] /* save the low 64 bits of v31, which is used as scratch below */
++    zip1      v31.8b, reg1, reg2
++    zip2      reg2,   reg1, reg2
++    mov       reg1,   v31.8b
++    mov       v31.d[0], TMP4 /* restore the saved low 64 bits of v31 */
++.endm
++
++.macro vuzp reg1, reg2 /* in-place unzip of two registers: reg1 = uzp1(reg1, reg2), reg2 = uzp2(reg1, reg2) */
++    umov      TMP4, v31.d[0] /* save the low 64 bits of v31, which is used as scratch below */
++    uzp1      v31.8b, reg1, reg2
++    uzp2      reg2,   reg1, reg2
++    mov       reg1,   v31.8b
++    mov       v31.d[0], TMP4 /* restore the saved low 64 bits of v31 */
++.endm
++
++.macro bilinear_load_and_vertical_interpolate_two_0565 \
++                acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
++    asr       TMP1, X, #16 /* first fetch position: TMP1 = TOP + (X >> 16) * 2 bytes */
++    add       X, X, UX
++    add       TMP1, TOP, TMP1, lsl #1
++    asr       TMP2, X, #16 /* second fetch position, after stepping X by UX */
++    add       X, X, UX
++    add       TMP2, TOP, TMP2, lsl #1
++    ld1       {&acc2&.s}[0], [TMP1], STRIDE /* first-row words go to lanes 0 and 2 */
++    ld1       {&acc2&.s}[2], [TMP2], STRIDE
++    ld1       {&acc2&.s}[1], [TMP1] /* words from STRIDE bytes further go to lanes 1 and 3 */
++    ld1       {&acc2&.s}[3], [TMP2]
++    convert_0565_to_x888 acc2, reg3, reg2, reg1
++    vzip      &reg1&.8b, &reg3&.8b /* four zips regroup the converted bytes before the multiplies */
++    vzip      &reg2&.8b, &reg4&.8b
++    vzip      &reg3&.8b, &reg4&.8b
++    vzip      &reg1&.8b, &reg2&.8b
++    umull     &acc1&.8h, &reg1&.8b, v28.8b /* acc1 = reg1 * v28 + reg2 * v29 */
++    umlal     &acc1&.8h, &reg2&.8b, v29.8b
++    umull     &acc2&.8h, &reg3&.8b, v28.8b /* acc2 = reg3 * v28 + reg4 * v29 */
++    umlal     &acc2&.8h, &reg4&.8b, v29.8b
++.endm
++
++.macro bilinear_load_and_vertical_interpolate_four_0565 \
++                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
++                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
++    asr       TMP1, X, #16 /* x* half: two fetch positions, same addressing as the two_0565 macro */
++    add       X, X, UX
++    add       TMP1, TOP, TMP1, lsl #1
++    asr       TMP2, X, #16
++    add       X, X, UX
++    add       TMP2, TOP, TMP2, lsl #1
++    ld1       {&xacc2&.s}[0], [TMP1], STRIDE
++    ld1       {&xacc2&.s}[2], [TMP2], STRIDE
++    ld1       {&xacc2&.s}[1], [TMP1]
++    ld1       {&xacc2&.s}[3], [TMP2]
++    convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
++    asr       TMP1, X, #16 /* y* half: two more fetch positions; its loads are interleaved with the x* zips */
++    add       X, X, UX
++    add       TMP1, TOP, TMP1, lsl #1
++    asr       TMP2, X, #16
++    add       X, X, UX
++    add       TMP2, TOP, TMP2, lsl #1
++    ld1       {&yacc2&.s}[0], [TMP1], STRIDE
++    vzip      &xreg1&.8b, &xreg3&.8b
++    ld1       {&yacc2&.s}[2], [TMP2], STRIDE
++    vzip      &xreg2&.8b, &xreg4&.8b
++    ld1       {&yacc2&.s}[1], [TMP1]
++    vzip      &xreg3&.8b, &xreg4&.8b
++    ld1       {&yacc2&.s}[3], [TMP2]
++    vzip      &xreg1&.8b, &xreg2&.8b
++    convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
++    umull     &xacc1&.8h, &xreg1&.8b, v28.8b /* x* multiplies are interleaved with the y* zips */
++    vzip      &yreg1&.8b, &yreg3&.8b
++    umlal     &xacc1&.8h, &xreg2&.8b, v29.8b
++    vzip      &yreg2&.8b, &yreg4&.8b
++    umull     &xacc2&.8h, &xreg3&.8b, v28.8b
++    vzip      &yreg3&.8b, &yreg4&.8b
++    umlal     &xacc2&.8h, &xreg4&.8b, v29.8b
++    vzip      &yreg1&.8b, &yreg2&.8b
++    umull     &yacc1&.8h, &yreg1&.8b, v28.8b /* yacc1 = yreg1 * v28 + yreg2 * v29 */
++    umlal     &yacc1&.8h, &yreg2&.8b, v29.8b
++    umull     &yacc2&.8h, &yreg3&.8b, v28.8b /* yacc2 = yreg3 * v28 + yreg4 * v29 */
++    umlal     &yacc2&.8h, &yreg4&.8b, v29.8b
++.endm
++
++.macro bilinear_store_8888 numpix, tmp1, tmp2  /* stores numpix (4, 2 or 1) 32-bit pixels from v0/v1 at OUT and advances OUT; tmp1/tmp2 are not used here */
++.if numpix == 4
++    st1       {v0.2s, v1.2s}, [OUT], #16  /* 4 pixels = 16 bytes */
++.elseif numpix == 2
++    st1       {v0.2s}, [OUT], #8  /* 2 pixels = 8 bytes */
++.elseif numpix == 1
++    st1       {v0.s}[0], [OUT], #4  /* 1 pixel = lane 0 only */
++.else
++    .error bilinear_store_8888 numpix is unsupported
++.endif
++.endm
++
++.macro bilinear_store_0565 numpix, tmp1, tmp2  /* converts the result in v0-v3 to 16bpp and stores numpix (4, 2 or 1) pixels at OUT, advancing OUT */
++    vuzp      v0.8b, v1.8b  /* four unzip steps over v0-v3; presumably this separates the colour channels for the conversion below - vuzp is defined outside this chunk */
++    vuzp      v2.8b, v3.8b
++    vuzp      v1.8b, v3.8b
++    vuzp      v0.8b, v2.8b
++    convert_8888_to_0565 v2, v1, v0, v1, tmp1, tmp2  /* v1 is passed both as an input and as the fourth argument; the stores below read the packed pixels from v1 */
++.if numpix == 4
++    st1       {v1.4h}, [OUT], #8  /* 4 pixels = 8 bytes */
++.elseif numpix == 2
++    st1       {v1.s}[0], [OUT], #4  /* 2 pixels = 4 bytes */
++.elseif numpix == 1
++    st1       {v1.h}[0], [OUT], #2  /* 1 pixel = 2 bytes */
++.else
++    .error bilinear_store_0565 numpix is unsupported
++.endif
++.endm
++
++.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt  /* interpolates and stores a single pixel; expects the horizontal weight in v15.h[0] */
++    bilinear_load_&src_fmt v0, v1, v2
++    umull     v2.8h, v0.8b, v28.8b  /* vertical blend: v2 = v0 * v28 ... */
++    umlal     v2.8h, v1.8b, v29.8b  /* ... + v1 * v29 */
++    /* 5 cycles bubble */
++    ushll     v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS  /* low half scaled by 2^BITS */
++    umlsl     v0.4s, v2.4h, v15.h[0]  /* minus low half * weight */
++    umlal2    v0.4s, v2.8h, v15.h[0]  /* plus high half * weight */
++    /* 5 cycles bubble */
++    shrn      v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)  /* drop the scale of both blends */
++    /* 3 cycles bubble */
++    xtn       v0.8b, v0.8h  /* narrow to 8 bits per channel */
++    /* 1 cycle bubble */
++    bilinear_store_&dst_fmt 1, v3, v4
++.endm
++
++.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt  /* interpolates and stores two pixels, then refreshes the weights in v15 for the next pair */
++    bilinear_load_and_vertical_interpolate_two_&src_fmt \
++                v1, v11, v2, v3, v20, v21, v22, v23
++    ushll     v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS  /* first pixel: horizontal blend of v1 with weight v15.h[0] */
++    umlsl     v0.4s, v1.4h, v15.h[0]
++    umlal2    v0.4s, v1.8h, v15.h[0]
++    ushll     v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS  /* second pixel: same with v11 and weight v15.h[4] */
++    umlsl     v10.4s, v11.4h, v15.h[4]
++    umlal2    v10.4s, v11.8h, v15.h[4]
++    shrn      v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++    shrn2     v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++    ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)  /* next weights from the top bits of v12 */
++    add       v12.8h, v12.8h, v13.8h  /* advance the v12 accumulators by v13 */
++    xtn       v0.8b, v0.8h
++    bilinear_store_&dst_fmt 2, v3, v4
++.endm
++
++.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt  /* generic (non-pipelined) path: interpolates and stores four pixels */
++    bilinear_load_and_vertical_interpolate_four_&src_fmt \
++                v1, v11, v14, v20, v16, v17, v22, v23 \
++                v3, v9,  v24, v25, v26, v27, v18, v19
++    prfm      PREFETCH_MODE, [TMP1, PF_OFFS]  /* prefetch PF_OFFS bytes past TMP1 */
++    sub       TMP1, TMP1, STRIDE
++    ushll     v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS  /* pixel 0: v1 with weight v15.h[0] */
++    umlsl     v0.4s, v1.4h, v15.h[0]
++    umlal2    v0.4s, v1.8h, v15.h[0]
++    ushll     v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS  /* pixel 1: v11 with weight v15.h[4] */
++    umlsl     v10.4s, v11.4h, v15.h[4]
++    umlal2    v10.4s, v11.8h, v15.h[4]
++    ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)  /* weights for pixels 2 and 3 */
++    ushll     v2.4s, v3.4h, #BILINEAR_INTERPOLATION_BITS  /* pixel 2: v3 with weight v15.h[0] */
++    umlsl     v2.4s, v3.4h, v15.h[0]
++    umlal2    v2.4s, v3.8h, v15.h[0]
++    ushll     v8.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS  /* pixel 3: v9 with weight v15.h[4] */
++    prfm      PREFETCH_MODE, [TMP2, PF_OFFS]
++    umlsl     v8.4s, v9.4h, v15.h[4]
++    umlal2    v8.4s, v9.8h, v15.h[4]
++    add       v12.8h, v12.8h, v13.8h
++    shrn      v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++    shrn2     v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++    shrn      v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++    shrn2     v2.8h, v8.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++    ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)  /* weights for the next group */
++    xtn       v0.8b, v0.8h  /* pixels 0-1 end up in v0, pixels 2-3 in v1 */
++    xtn       v1.8b, v2.8h
++    add       v12.8h, v12.8h, v13.8h
++    bilinear_store_&dst_fmt 4, v3, v4
++.endm
++
++.macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt  /* pipeline head: uses the format-specific _head macro if its have_ symbol is defined */
++.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
++    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head
++.else
++    bilinear_interpolate_four_pixels src_fmt, dst_fmt  /* fallback: the generic macro does the whole group here */
++.endif
++.endm
++
++.macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt  /* pipeline tail: expands to nothing unless the format-specific have_ symbol is defined */
++.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
++    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail
++.endif
++.endm
++
++.macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt  /* loop body: format-specific combined tail+head if available */
++.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
++    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head
++.else
++    bilinear_interpolate_four_pixels src_fmt, dst_fmt  /* fallback: one full generic group per iteration */
++.endif
++.endm
++
++.macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt  /* 8-pixel pipeline head: format-specific macro if defined, else built from the 4-pixel ones */
++.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
++    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head
++.else
++    bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
++    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
++.endif
++.endm
++
++.macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt  /* 8-pixel pipeline tail: format-specific macro if defined, else the 4-pixel tail */
++.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
++    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail
++.else
++    bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
++.endif
++.endm
++
++.macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt  /* 8-pixel loop body: format-specific macro if defined, else two 4-pixel tail_head steps */
++.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
++    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head
++.else
++    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
++    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
++.endif
++.endm
++
++.set BILINEAR_FLAG_UNROLL_4,          0  /* no bit set: generate_bilinear_scanline_func emits its 4-pixels-per-iteration loop */
++.set BILINEAR_FLAG_UNROLL_8,          1  /* bit 0: emit the 8-pixels-per-iteration loop instead */
++.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2  /* bit 1: not tested by generate_bilinear_scanline_func below */
++
++/*
++ * Main template macro for generating NEON optimized bilinear scanline functions.
++ *
++ * Bilinear scanline scaler macro template uses the following arguments:
++ *  fname             - name of the function to generate
++ *  src_fmt           - source color format (8888 or 0565)
++ *  dst_fmt           - destination color format (8888 or 0565)
++ *  src_bpp_shift     - (1 << src_bpp_shift) is the size of source pixel in bytes
++ *  dst_bpp_shift     - (1 << dst_bpp_shift) is the size of destination pixel in bytes
++ *  prefetch_distance - prefetch in the source image by that many pixels ahead
++ *  flags             - bitwise OR of the BILINEAR_FLAG_* values defined above
++ */
++
++.macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
++                                       src_bpp_shift, dst_bpp_shift, \
++                                       prefetch_distance, flags
++/* Emits function fname, which scales one scanline of WIDTH pixels from the TOP/BOTTOM rows into OUT. */
++pixman_asm_function fname
++    OUT       .req      x0
++    TOP       .req      x1
++    BOTTOM    .req      x2
++    WT        .req      x3
++    WB        .req      x4
++    X         .req      x5
++    UX        .req      x6
++    WIDTH     .req      x7
++    TMP1      .req      x8
++    TMP2      .req      x9
++    PF_OFFS   .req      x10
++    TMP3      .req      x11
++    TMP4      .req      x12
++    STRIDE    .req      x13
++    /* sign-extend the 32-bit values in w3-w7 (WT, WB, X, UX, WIDTH) to 64 bits */
++    sxtw      x3, w3
++    sxtw      x4, w4
++    sxtw      x5, w5
++    sxtw      x6, w6
++    sxtw      x7, w7
++    /* frame below the x29/x30 pair: 64 bytes for the low halves of v8-v15, then 48 bytes for x8-x13 */
++    stp       x29, x30, [sp, -16]!
++    mov       x29, sp
++    sub       sp,  sp, 112  /* push all registers */
++    sub       x29, x29, 64
++    st1       {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32
++    st1       {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32
++    stp        x8,  x9, [x29, -80]  /* after the two post-increments x29 is back at the frame record */
++    stp       x10, x11, [x29, -96]
++    stp       x12, x13, [x29, -112]
++    /* PF_OFFS = prefetch_distance * UX; turned into a byte offset just before the main loop */
++    mov       PF_OFFS, #prefetch_distance
++    mul       PF_OFFS, PF_OFFS, UX
++    /* STRIDE = byte distance from the TOP row to the BOTTOM row; BOTTOM is not needed afterwards */
++    subs      STRIDE, BOTTOM, TOP
++    .unreq    BOTTOM
++    /* WIDTH <= 0: nothing to do, jump to the epilogue */
++    cmp       WIDTH, #0
++    ble       300f
++    /* v12 = X in all lanes (upper four lanes then advanced by UX), v13 = UX, v28/v29 = WT/WB */
++    dup       v12.8h, w5
++    dup       v13.8h, w6
++    dup       v28.8b, w3
++    dup       v29.8b, w4
++    mov       v25.d[0], v12.d[1]
++    mov       v26.d[0], v13.d[0]
++    add       v25.4h, v25.4h, v26.4h
++    mov       v12.d[1], v25.d[0]
++
++    /* ensure good destination alignment  */
++    cmp       WIDTH, #1
++    blt       100f
++    tst       OUT, #(1 << dst_bpp_shift)
++    beq       100f
++    ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
++    add       v12.8h, v12.8h, v13.8h
++    bilinear_interpolate_last_pixel src_fmt, dst_fmt
++    sub       WIDTH, WIDTH, #1
++100:
++    add       v13.8h, v13.8h, v13.8h  /* from here on v12 advances by 2 * UX per step */
++    ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
++    add       v12.8h, v12.8h, v13.8h
++
++    cmp       WIDTH, #2
++    blt       100f
++    tst       OUT, #(1 << (dst_bpp_shift + 1))
++    beq       100f
++    bilinear_interpolate_two_pixels src_fmt, dst_fmt
++    sub       WIDTH, WIDTH, #2
++100:
++.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0
++/*********** 8 pixels per iteration *****************/
++    cmp       WIDTH, #4
++    blt       100f
++    tst       OUT, #(1 << (dst_bpp_shift + 2))
++    beq       100f
++    bilinear_interpolate_four_pixels src_fmt, dst_fmt
++    sub       WIDTH, WIDTH, #4
++100:
++    subs      WIDTH, WIDTH, #8
++    blt       100f
++    asr       PF_OFFS, PF_OFFS, #(16 - src_bpp_shift)  /* (prefetch_distance * UX >> 16) source pixels, in bytes */
++    bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
++    subs      WIDTH, WIDTH, #8
++    blt       500f
++1000:
++    bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
++    subs      WIDTH, WIDTH, #8
++    bge       1000b
++500:
++    bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
++100:
++    tst       WIDTH, #4
++    beq       200f
++    bilinear_interpolate_four_pixels src_fmt, dst_fmt
++200:
++.else
++/*********** 4 pixels per iteration *****************/
++    subs      WIDTH, WIDTH, #4
++    blt       100f
++    asr       PF_OFFS, PF_OFFS, #(16 - src_bpp_shift)  /* (prefetch_distance * UX >> 16) source pixels, in bytes */
++    bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
++    subs      WIDTH, WIDTH, #4
++    blt       500f
++1000:
++    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
++    subs      WIDTH, WIDTH, #4
++    bge       1000b
++500:
++    bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
++100:
++/****************************************************/
++.endif
++    /* handle the remaining trailing pixels */
++    tst       WIDTH, #2
++    beq       200f
++    bilinear_interpolate_two_pixels src_fmt, dst_fmt
++200:
++    tst       WIDTH, #1
++    beq       300f
++    bilinear_interpolate_last_pixel src_fmt, dst_fmt
++300:
++    sub       x29, x29, 64
++    ld1       {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32
++    ld1       {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32
++    ldp        x8,  x9, [x29, -80]
++    ldp       x10, x11, [x29, -96]
++    ldp       x12, x13, [x29, -112]  /* must match the prologue stp slot; -104 reloaded the wrong values */
++    mov       sp, x29
++    ldp       x29, x30, [sp], 16
++    ret
++
++    .unreq    OUT
++    .unreq    TOP
++    .unreq    WT
++    .unreq    WB
++    .unreq    X
++    .unreq    UX
++    .unreq    WIDTH
++    .unreq    TMP1
++    .unreq    TMP2
++    .unreq    PF_OFFS
++    .unreq    TMP3
++    .unreq    TMP4
++    .unreq    STRIDE
++.endfunc
++
++.endm
++
++/*****************************************************************************/
++
++.set have_bilinear_interpolate_four_pixels_8888_8888, 1  /* makes the .ifdef in bilinear_interpolate_four_pixels_{head,tail,tail_head} select the macros below */
++
++.macro bilinear_interpolate_four_pixels_8888_8888_head  /* pipeline head: loads and row-blends four 32bpp positions into v8-v11 and starts the horizontal blend in v0/v1 */
++    asr       TMP1, X, #16  /* pixel index = X >> 16 */
++    add       X, X, UX
++    add       TMP1, TOP, TMP1, lsl #2  /* 4 bytes per pixel */
++    asr       TMP2, X, #16
++    add       X, X, UX
++    add       TMP2, TOP, TMP2, lsl #2
++
++    ld1       {v22.2s}, [TMP1], STRIDE  /* 8 bytes at TMP1, then TMP1 += STRIDE */
++    ld1       {v23.2s}, [TMP1]  /* 8 bytes from the row STRIDE bytes further on */
++    asr       TMP3, X, #16
++    add       X, X, UX
++    add       TMP3, TOP, TMP3, lsl #2
++    umull     v8.8h, v22.8b, v28.8b  /* v8 = v22 * v28 + v23 * v29 */
++    umlal     v8.8h, v23.8b, v29.8b
++
++    ld1       {v22.2s}, [TMP2], STRIDE
++    ld1       {v23.2s}, [TMP2]
++    asr       TMP4, X, #16
++    add       X, X, UX
++    add       TMP4, TOP, TMP4, lsl #2
++    umull     v9.8h, v22.8b, v28.8b
++    umlal     v9.8h, v23.8b, v29.8b
++
++    ld1       {v22.2s}, [TMP3], STRIDE
++    ld1       {v23.2s}, [TMP3]
++    umull     v10.8h, v22.8b, v28.8b
++    umlal     v10.8h, v23.8b, v29.8b
++
++    ushll     v0.4s, v8.4h, #BILINEAR_INTERPOLATION_BITS  /* pixel 0: complete horizontal blend with weight v15.h[0] */
++    umlsl     v0.4s, v8.4h, v15.h[0]
++    umlal2    v0.4s, v8.8h, v15.h[0]
++
++    prfm      PREFETCH_MODE, [TMP4, PF_OFFS]  /* prefetch ahead in the first row ... */
++    ld1       {v16.2s}, [TMP4], STRIDE
++    ld1       {v17.2s}, [TMP4]
++    prfm      PREFETCH_MODE, [TMP4, PF_OFFS]  /* ... and, after TMP4 += STRIDE, in the second row */
++    umull     v11.8h, v16.8b, v28.8b
++    umlal     v11.8h, v17.8b, v29.8b
++
++    ushll     v1.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS  /* pixel 1: started here; the umlal2 step is done by _tail / _tail_head */
++    umlsl     v1.4s, v9.4h, v15.h[4]
++.endm
++
++.macro bilinear_interpolate_four_pixels_8888_8888_tail  /* pipeline tail: finishes the group started by _head (v0, v1, v8-v11) and stores four pixels */
++    umlal2    v1.4s, v9.8h, v15.h[4]  /* completes pixel 1 */
++    ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)  /* weights for pixels 2 and 3 */
++    ushll     v2.4s, v10.4h, #BILINEAR_INTERPOLATION_BITS  /* pixel 2 from v10 */
++    umlsl     v2.4s, v10.4h, v15.h[0]
++    umlal2    v2.4s, v10.8h, v15.h[0]
++    ushll     v3.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS  /* pixel 3 from v11 */
++    umlsl     v3.4s, v11.4h, v15.h[4]
++    umlal2    v3.4s, v11.8h, v15.h[4]
++    add       v12.8h, v12.8h, v13.8h
++    shrn      v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++    shrn2     v0.8h, v1.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++    shrn      v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++    ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)  /* weights for whatever follows */
++    shrn2     v2.8h, v3.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++    xtn       v6.8b, v0.8h  /* pixels 0-1 */
++    xtn       v7.8b, v2.8h  /* pixels 2-3 */
++    add       v12.8h, v12.8h, v13.8h
++    st1       {v6.2s, v7.2s}, [OUT], #16  /* 4 pixels = 16 bytes */
++.endm
++
++.macro bilinear_interpolate_four_pixels_8888_8888_tail_head  /* loop body: 8-space-indented lines finish the previous group (as _tail, narrowing via v4), 4-space lines start the next (as _head) */
++    asr       TMP1, X, #16
++    add       X, X, UX
++    add       TMP1, TOP, TMP1, lsl #2
++    asr       TMP2, X, #16
++    add       X, X, UX
++    add       TMP2, TOP, TMP2, lsl #2
++        umlal2    v1.4s, v9.8h, v15.h[4]  /* previous group: completes pixel 1 */
++        ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
++        ushll     v2.4s, v10.4h, #BILINEAR_INTERPOLATION_BITS
++        umlsl     v2.4s, v10.4h, v15.h[0]
++        umlal2    v2.4s, v10.8h, v15.h[0]
++        ushll     v3.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
++    ld1       {v20.2s}, [TMP1], STRIDE  /* next group: loads go to v20/v21 while v8-v11 are still in use */
++        umlsl     v3.4s, v11.4h, v15.h[4]
++        umlal2    v3.4s, v11.8h, v15.h[4]
++    ld1       {v21.2s}, [TMP1]
++    umull     v8.8h, v20.8b, v28.8b
++    umlal     v8.8h, v21.8b, v29.8b
++        shrn      v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++        shrn2     v0.8h, v1.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++        shrn      v4.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)  /* v4 rather than v2, unlike _tail */
++    ld1       {v22.2s}, [TMP2], STRIDE
++        shrn2     v4.8h, v3.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++        add       v12.8h, v12.8h, v13.8h
++    ld1       {v23.2s}, [TMP2]
++    umull     v9.8h, v22.8b, v28.8b
++    asr       TMP3, X, #16
++    add       X, X, UX
++    add       TMP3, TOP, TMP3, lsl #2
++    asr       TMP4, X, #16
++    add       X, X, UX
++    add       TMP4, TOP, TMP4, lsl #2
++    umlal     v9.8h, v23.8b, v29.8b
++    ld1       {v22.2s}, [TMP3], STRIDE
++        ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)  /* weights for the next group's first pair */
++    ld1       {v23.2s}, [TMP3]
++    umull     v10.8h, v22.8b, v28.8b
++    umlal     v10.8h, v23.8b, v29.8b
++        xtn       v6.8b, v0.8h  /* v0 must be narrowed before the next line overwrites it */
++    ushll     v0.4s, v8.4h, #BILINEAR_INTERPOLATION_BITS
++        xtn       v7.8b, v4.8h
++    umlsl     v0.4s, v8.4h, v15.h[0]
++    umlal2    v0.4s, v8.8h, v15.h[0]
++    prfm      PREFETCH_MODE, [TMP4, PF_OFFS]
++    ld1       {v16.2s}, [TMP4], STRIDE
++        add       v12.8h, v12.8h, v13.8h
++    ld1       {v17.2s}, [TMP4]
++    prfm      PREFETCH_MODE, [TMP4, PF_OFFS]
++    umull     v11.8h, v16.8b, v28.8b
++    umlal     v11.8h, v17.8b, v29.8b
++        st1       {v6.2s, v7.2s}, [OUT], #16  /* previous group: store 4 pixels */
++    ushll     v1.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS  /* next group: pixel 1 left unfinished, as at the end of _head */
++    umlsl     v1.4s, v9.4h, v15.h[4]
++.endm
++
++/*****************************************************************************/
++/* Template instantiations; arguments are fname, src_fmt, dst_fmt, src_bpp_shift, dst_bpp_shift, prefetch_distance, flags. */
++generate_bilinear_scanline_func \
++    pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \
++    2, 2, 28, BILINEAR_FLAG_UNROLL_4
++
++generate_bilinear_scanline_func \
++    pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, \
++    2, 1, 28, BILINEAR_FLAG_UNROLL_8 | BILINEAR_FLAG_USE_ALL_NEON_REGS
++
++generate_bilinear_scanline_func \
++    pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, \
++    1, 2, 28, BILINEAR_FLAG_UNROLL_4
++
++generate_bilinear_scanline_func \
++    pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, \
++    1, 1, 28, BILINEAR_FLAG_UNROLL_4
+diff --git a/pixman/pixman-arma64-neon-asm.h b/pixman/pixman-arma64-neon-asm.h
+new file mode 100644
+index 0000000..0389d12
+--- /dev/null
++++ b/pixman/pixman-arma64-neon-asm.h
+@@ -0,0 +1,1310 @@
++/*
++ * Copyright © 2009 Nokia Corporation
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the "Software"),
++ * to deal in the Software without restriction, including without limitation
++ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
++ * and/or sell copies of the Software, and to permit persons to whom the
++ * Software is furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice (including the next
++ * paragraph) shall be included in all copies or substantial portions of the
++ * Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ *
++ * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
++ */
++
++/*
++ * This file contains a macro ('generate_composite_function') which can
++ * construct 2D image processing functions, based on a common template.
++ * Any combinations of source, destination and mask images with 8bpp,
++ * 16bpp, 24bpp, 32bpp color formats are supported.
++ *
++ * This macro takes care of:
++ *  - handling of leading and trailing unaligned pixels
++ *  - doing most of the work related to L2 cache preload
++ *  - encourages the use of software pipelining for better instructions
++ *    scheduling
++ *
++ * The user of this macro has to provide some configuration parameters
++ * (bit depths for the images, prefetch distance, etc.) and a set of
++ * macros, which should implement basic code chunks responsible for
++ * pixels processing. See 'pixman-arma64-neon-asm.S' file for the usage
++ * examples.
++ *
++ * TODO:
++ *  - try overlapped pixel method (from Ian Rickards) when processing
++ *    exactly two blocks of pixels
++ *  - maybe add an option to do reverse scanline processing
++ */
++
++/*
++ * Bit flags for 'generate_composite_function' macro which are used
++ * to tune generated functions behavior.
++ */
++.set FLAG_DST_WRITEONLY,       0
++.set FLAG_DST_READWRITE,       1
++.set FLAG_DEINTERLEAVE_32BPP,  2
++
++/*
++ * Constants for selecting preferable prefetch type.
++ */
++.set PREFETCH_TYPE_NONE,       0 /* No prefetch at all */
++.set PREFETCH_TYPE_SIMPLE,     1 /* A simple, fixed-distance-ahead prefetch */
++.set PREFETCH_TYPE_ADVANCED,   2 /* Advanced fine-grained prefetch */
++
++/*
++ * prefetch mode
++ * available modes are:
++ * pldl1keep
++ * pldl1strm
++ * pldl2keep
++ * pldl2strm
++ * pldl3keep
++ * pldl3strm
++ */
++#define PREFETCH_MODE pldl1keep
++
++/*
++ * Definitions of supplementary pixld/pixst macros (for partial load/store of
++ * pixel data).
++ */
++
++.macro pixldst1 op, elem_size, reg1, mem_operand, abits  /* op on one register (arrangement elem_size), post-increment 8 bytes; abits is not used */
++    op {v&reg1&.&elem_size}, [&mem_operand&], #8
++.endm
++
++.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits  /* op on two registers, post-increment 16 bytes; abits is not used */
++    op {v&reg1&.&elem_size, v&reg2&.&elem_size}, [&mem_operand&], #16
++.endm
++
++.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits  /* op on four registers, post-increment 32 bytes; abits is not used */
++    op {v&reg1&.&elem_size, v&reg2&.&elem_size, v&reg3&.&elem_size, v&reg4&.&elem_size}, [&mem_operand&], #32
++.endm
++
++.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits, bytes  /* op on the single lane idx of one register, post-increment by bytes; abits is not used */
++    op {v&reg1&.&elem_size}[idx], [&mem_operand&], #&bytes&
++.endm
++
++.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand  /* op on three registers, post-increment 24 bytes (used with ld3/st3 for 24bpp) */
++    op {v&reg1&.&elem_size, v&reg2&.&elem_size, v&reg3&.&elem_size}, [&mem_operand&], #24
++.endm
++
++.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand  /* op on lane idx of three registers, post-increment 3 bytes (one 24bpp pixel) */
++    op {v&reg1&.&elem_size, v&reg2&.&elem_size, v&reg3&.&elem_size}[idx], [&mem_operand&], #3
++.endm
++
++.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits  /* emits op for numbytes (32/16/8/4/2/1) bytes; the register or lane used is picked relative to basereg by size */
++.if numbytes == 32  /* four registers: basereg+4 .. basereg+7 */
++    .if elem_size==32
++        pixldst4 op, 2s, %(basereg+4), %(basereg+5), \
++                              %(basereg+6), %(basereg+7), mem_operand, abits
++    .elseif elem_size==16
++        pixldst4 op, 4h, %(basereg+4), %(basereg+5), \
++                              %(basereg+6), %(basereg+7), mem_operand, abits
++    .else
++        pixldst4 op, 8b, %(basereg+4), %(basereg+5), \
++                              %(basereg+6), %(basereg+7), mem_operand, abits
++    .endif
++.elseif numbytes == 16  /* two registers: basereg+2, basereg+3 */
++    .if elem_size==32
++          pixldst2 op, 2s, %(basereg+2), %(basereg+3), mem_operand, abits
++    .elseif elem_size==16
++          pixldst2 op, 4h, %(basereg+2), %(basereg+3), mem_operand, abits
++    .else
++          pixldst2 op, 8b, %(basereg+2), %(basereg+3), mem_operand, abits
++    .endif
++.elseif numbytes == 8  /* one register: basereg+1 */
++    .if elem_size==32
++        pixldst1 op, 2s, %(basereg+1), mem_operand, abits
++    .elseif elem_size==16
++        pixldst1 op, 4h, %(basereg+1), mem_operand, abits
++    .else
++        pixldst1 op, 8b, %(basereg+1), mem_operand, abits
++    .endif
++.elseif numbytes == 4  /* upper 4 bytes of basereg+0: one word lane, or element-sized lanes when RESPECT_STRICT_ALIGNMENT is set */
++    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)
++        pixldst0 op, s, %(basereg+0), 1, mem_operand, abits, 4
++    .elseif elem_size == 16
++        pixldst0 op, h, %(basereg+0), 2, mem_operand, abits, 2
++        pixldst0 op, h, %(basereg+0), 3, mem_operand, abits, 2
++    .else
++        pixldst0 op, b, %(basereg+0), 4, mem_operand, abits, 1
++        pixldst0 op, b, %(basereg+0), 5, mem_operand, abits, 1
++        pixldst0 op, b, %(basereg+0), 6, mem_operand, abits, 1
++        pixldst0 op, b, %(basereg+0), 7, mem_operand, abits, 1
++    .endif
++.elseif numbytes == 2  /* bytes 2-3 of basereg+0 */
++    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
++        pixldst0 op, h, %(basereg+0), 1, mem_operand, abits, 2
++    .else
++        pixldst0 op, b, %(basereg+0), 2, mem_operand, abits, 1
++        pixldst0 op, b, %(basereg+0), 3, mem_operand, abits, 1
++    .endif
++.elseif numbytes == 1  /* byte 1 of basereg+0 */
++        pixldst0 op, b, %(basereg+0), 1, mem_operand, abits, 1
++.else
++    .error "unsupported size: numbytes"
++.endif
++.endm
++
++.macro pixld numpix, bpp, basereg, mem_operand, abits=0  /* loads numpix pixels of bpp bits each from mem_operand; expands to nothing when bpp is 0 */
++.if bpp > 0
++.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)  /* ld4 splits the four bytes of each pixel across four registers */
++    pixldst4 ld4, 8b, %(basereg+4), %(basereg+5), \
++                      %(basereg+6), %(basereg+7), mem_operand, abits
++.elseif (bpp == 24) && (numpix == 8)  /* 24bpp always goes through ld3, into three registers */
++    pixldst3 ld3, 8b, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
++.elseif (bpp == 24) && (numpix == 4)
++    pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
++    pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
++    pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
++    pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
++.elseif (bpp == 24) && (numpix == 2)
++    pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
++    pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
++.elseif (bpp == 24) && (numpix == 1)
++    pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
++.else
++    pixldst %(numpix * bpp / 8), ld1, %(bpp), basereg, mem_operand, abits
++.endif
++.endif
++.endm
++
++.macro pixst numpix, bpp, basereg, mem_operand, abits=0  /* stores numpix pixels of bpp bits each to mem_operand; expands to nothing when bpp is 0 */
++.if bpp > 0
++.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)  /* st4 re-interleaves four per-byte registers into pixels */
++    pixldst4 st4, 8b, %(basereg+4), %(basereg+5), \
++                      %(basereg+6), %(basereg+7), mem_operand, abits
++.elseif (bpp == 24) && (numpix == 8)  /* 24bpp always goes through st3, from three registers */
++    pixldst3 st3, 8b, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
++.elseif (bpp == 24) && (numpix == 4)
++    pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
++    pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
++    pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
++    pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
++.elseif (bpp == 24) && (numpix == 2)
++    pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
++    pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
++.elseif (bpp == 24) && (numpix == 1)
++    pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
++.elseif numpix * bpp == 32 && abits == 32  /* 32 bits with abits == 32: passing elem_size 32 makes pixldst emit a single word-lane store */
++    pixldst 4, st1, 32, basereg, mem_operand, abits
++.elseif numpix * bpp == 16 && abits == 16  /* likewise a single halfword-lane store */
++    pixldst 2, st1, 16, basereg, mem_operand, abits
++.else
++    pixldst %(numpix * bpp / 8), st1, %(bpp), basereg, mem_operand, abits
++.endif
++.endif
++.endm
++
++.macro pixld_a numpix, bpp, basereg, mem_operand  /* pixld with abits set to the total size in bits, capped at 128 */
++.if (bpp * numpix) <= 128
++    pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix)
++.else
++    pixld numpix, bpp, basereg, mem_operand, 128
++.endif
++.endm
++
++.macro pixst_a numpix, bpp, basereg, mem_operand  /* pixst with abits set to the total size in bits, capped at 128 */
++.if (bpp * numpix) <= 128
++    pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix)
++.else
++    pixst numpix, bpp, basereg, mem_operand, 128
++.endif
++.endm
++
++/*
++ * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register
++ * aliases to be defined)
++ */
++.macro pixld1_s elem_size, reg1, mem_operand  /* fills one 64-bit register with scaled fetches: four 16-bit pixels or two 32-bit pixels */
++.if elem_size == 16
++    asr     TMP1, VX, #16  /* pixel index = VX >> 16 */
++    adds    VX, VX, UNIT_X  /* step VX; the flags decide whether the wrap loop runs */
++    bmi     55f  /* negative result: skip the wrap loop */
++5:  subs    VX, VX, SRC_WIDTH_FIXED  /* otherwise subtract SRC_WIDTH_FIXED until VX is negative again */
++    bpl     5b
++55:
++    add     TMP1, mem_operand, TMP1, lsl #1  /* 2 bytes per pixel */
++    asr     TMP2, VX, #16
++    adds    VX, VX, UNIT_X
++    bmi     55f
++5:  subs    VX, VX, SRC_WIDTH_FIXED
++    bpl     5b
++55:
++    add     TMP2, mem_operand, TMP2, lsl #1
++    ld1     {v&reg1&.h}[0], [TMP1]  /* loads trail the address computations, alternating TMP1/TMP2 */
++    asr     TMP1, VX, #16
++    adds    VX, VX, UNIT_X
++    bmi     55f
++5:  subs    VX, VX, SRC_WIDTH_FIXED
++    bpl     5b
++55:
++    add     TMP1, mem_operand, TMP1, lsl #1
++    ld1     {v&reg1&.h}[1], [TMP2]
++    asr     TMP2, VX, #16
++    adds    VX, VX, UNIT_X
++    bmi     55f
++5:  subs    VX, VX, SRC_WIDTH_FIXED
++    bpl     5b
++55:
++    add     TMP2, mem_operand, TMP2, lsl #1
++    ld1     {v&reg1&.h}[2], [TMP1]
++    ld1     {v&reg1&.h}[3], [TMP2]
++.elseif elem_size == 32
++    asr     TMP1, VX, #16
++    adds    VX, VX, UNIT_X
++    bmi     55f
++5:  subs    VX, VX, SRC_WIDTH_FIXED
++    bpl     5b
++55:
++    add     TMP1, mem_operand, TMP1, lsl #2  /* 4 bytes per pixel */
++    asr     TMP2, VX, #16
++    adds    VX, VX, UNIT_X
++    bmi     55f
++5:  subs    VX, VX, SRC_WIDTH_FIXED
++    bpl     5b
++55:
++    add     TMP2, mem_operand, TMP2, lsl #2
++    ld1     {v&reg1&.s}[0], [TMP1]
++    ld1     {v&reg1&.s}[1], [TMP2]
++.else
++    .error "unsupported"
++.endif
++.endm
++
++.macro pixld2_s elem_size, reg1, reg2, mem_operand  /* fills two registers; only the .else branch is ever assembled */
++.if 0 /* elem_size == 32 */
++    mov     TMP1, VX, asr #16  /* NOTE(review): this disabled branch looks like 32-bit ARM syntax (e.g. the ":32" alignment hint below) - confirm before enabling */
++    add     VX, VX, UNIT_X, asl #1
++    add     TMP1, mem_operand, TMP1, asl #2
++    mov     TMP2, VX, asr #16
++    sub     VX, VX, UNIT_X
++    add     TMP2, mem_operand, TMP2, asl #2
++    ld1     {v&reg1&.s}[0], [TMP1]
++    mov     TMP1, VX, asr #16
++    add     VX, VX, UNIT_X, asl #1
++    add     TMP1, mem_operand, TMP1, asl #2
++    ld1     {v&reg2&.s}[0], [TMP2, :32]
++    mov     TMP2, VX, asr #16
++    add     VX, VX, UNIT_X
++    add     TMP2, mem_operand, TMP2, asl #2
++    ld1     {v&reg1&.s}[1], [TMP1]
++    ld1     {v&reg2&.s}[1], [TMP2]
++.else
++    pixld1_s elem_size, reg1, mem_operand
++    pixld1_s elem_size, reg2, mem_operand
++.endif
++.endm
++
++.macro pixld0_s elem_size, reg1, idx, mem_operand  /* scaled fetch of one pixel into lane idx; expands to nothing for any elem_size other than 16 or 32 */
++.if elem_size == 16
++    asr     TMP1, VX, #16  /* pixel index = VX >> 16 */
++    adds    VX, VX, UNIT_X
++    bmi     55f  /* negative result: skip the wrap loop */
++5:  subs    VX, VX, SRC_WIDTH_FIXED  /* otherwise subtract SRC_WIDTH_FIXED until VX is negative again */
++    bpl     5b
++55:
++    add     TMP1, mem_operand, TMP1, lsl #1
++    ld1     {v&reg1&.h}[idx], [TMP1]
++.elseif elem_size == 32
++    asr     DUMMY, VX, #16  /* same index computation, routed through DUMMY */
++    mov     TMP1, DUMMY
++    adds    VX, VX, UNIT_X
++    bmi     55f
++5:  subs    VX, VX, SRC_WIDTH_FIXED
++    bpl     5b
++55:
++    add     TMP1, mem_operand, TMP1, lsl #2
++    ld1     {v&reg1&.s}[idx], [TMP1]
++.endif
++.endm
++
++.macro pixld_s_internal numbytes, elem_size, basereg, mem_operand  /* scaled-fetch counterpart of pixldst: same register/lane choice per numbytes */
++.if numbytes == 32  /* basereg+4 .. basereg+7, then deinterleave if enabled */
++    pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand
++    pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand
++    pixdeinterleave elem_size, %(basereg+4)
++.elseif numbytes == 16  /* basereg+2, basereg+3 */
++    pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand
++.elseif numbytes == 8  /* basereg+1 */
++    pixld1_s elem_size, %(basereg+1), mem_operand
++.elseif numbytes == 4  /* upper 4 bytes of basereg+0 */
++    .if elem_size == 32
++        pixld0_s elem_size, %(basereg+0), 1, mem_operand
++    .elseif elem_size == 16
++        pixld0_s elem_size, %(basereg+0), 2, mem_operand
++        pixld0_s elem_size, %(basereg+0), 3, mem_operand
++    .else
++        pixld0_s elem_size, %(basereg+0), 4, mem_operand
++        pixld0_s elem_size, %(basereg+0), 5, mem_operand
++        pixld0_s elem_size, %(basereg+0), 6, mem_operand
++        pixld0_s elem_size, %(basereg+0), 7, mem_operand
++    .endif
++.elseif numbytes == 2  /* bytes 2-3 of basereg+0 */
++    .if elem_size == 16
++        pixld0_s elem_size, %(basereg+0), 1, mem_operand
++    .else
++        pixld0_s elem_size, %(basereg+0), 2, mem_operand
++        pixld0_s elem_size, %(basereg+0), 3, mem_operand
++    .endif
++.elseif numbytes == 1  /* byte 1 of basereg+0 */
++    pixld0_s elem_size, %(basereg+0), 1, mem_operand
++.else
++    .error "unsupported size: numbytes"
++.endif
++.endm
++
++.macro pixld_s numpix, bpp, basereg, mem_operand  /* scaled fetch of numpix pixels of bpp bits each; expands to nothing when bpp is 0 */
++.if bpp > 0
++    pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand
++.endif
++.endm
++
++.macro vuzp8 reg1, reg2  /* in-place byte unzip: v<reg1> = uzp1(reg1, reg2), v<reg2> = uzp2(reg1, reg2); v16 is the scratch register */
++    umov DUMMY, v16.d[0]  /* park the low 64 bits of v16 in DUMMY */
++    uzp1 v16.8b,     v&reg1&.8b, v&reg2&.8b
++    uzp2 v&reg2&.8b, v&reg1&.8b, v&reg2&.8b
++    mov  v&reg1&.8b, v16.8b
++    mov  v16.d[0], DUMMY  /* restore the low 64 bits; NOTE(review): only the low half of v16 is put back - confirm no caller relies on its upper half or passes register 16 */
++.endm
++
++.macro vzip8 reg1, reg2  /* in-place byte zip: v<reg1> = zip1(reg1, reg2), v<reg2> = zip2(reg1, reg2); v16 is the scratch register */
++    umov DUMMY, v16.d[0]  /* park the low 64 bits of v16 in DUMMY */
++    zip1 v16.8b,     v&reg1&.8b, v&reg2&.8b
++    zip2 v&reg2&.8b, v&reg1&.8b, v&reg2&.8b
++    mov  v&reg1&.8b, v16.8b
++    mov  v16.d[0], DUMMY  /* restore the low 64 bits; NOTE(review): only the low half of v16 is put back - confirm no caller relies on its upper half or passes register 16 */
++.endm
++
++/* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
++.macro pixdeinterleave bpp, basereg  /* operates on basereg .. basereg+3; expands to nothing unless bpp is 32 and DEINTERLEAVE_32BPP_ENABLED is non-zero */
++.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
++    vuzp8 %(basereg+0), %(basereg+1)
++    vuzp8 %(basereg+2), %(basereg+3)
++    vuzp8 %(basereg+1), %(basereg+3)
++    vuzp8 %(basereg+0), %(basereg+2)
++.endif
++.endm
++
++/* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
++.macro pixinterleave bpp, basereg  /* operates on basereg .. basereg+3; expands to nothing unless bpp is 32 and DEINTERLEAVE_32BPP_ENABLED is non-zero */
++.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
++    vzip8 %(basereg+0), %(basereg+2)
++    vzip8 %(basereg+1), %(basereg+3)
++    vzip8 %(basereg+2), %(basereg+3)
++    vzip8 %(basereg+0), %(basereg+1)
++.endif
++.endm
++
++/*
++ * This is a macro for implementing cache preload. The main idea is that
++ * cache preload logic is mostly independent from the rest of pixels
++ * processing code. It starts at the top left pixel and moves forward
++ * across pixels and can jump across scanlines. Prefetch distance is
++ * handled in an 'incremental' way: it starts from 0 and advances to the
++ * optimal distance over time. After reaching optimal prefetch distance,
++ * it is kept constant. There are some checks which prevent prefetching
++ * unneeded pixel lines below the image (but it still can prefetch a bit
++ * more data on the right side of the image - not a big issue and may
++ * be actually helpful when rendering text glyphs). Additional trick is
++ * the use of LDR instruction for prefetch instead of PLD when moving to
++ * the next line, the point is that we have a high chance of getting TLB
++ * miss in this case, and PLD would be useless.
++ *
++ * This sounds like it may introduce a noticeable overhead (when working with
++ * fully cached data). But in reality, due to having a separate pipeline and
++ * instruction queue for NEON unit in ARM Cortex-A8, normal ARM code can
++ * execute simultaneously with NEON and be completely shadowed by it. Thus
++ * we get no performance overhead at all (*). This looks like a very nice
++ * feature of Cortex-A8, if used wisely. We don't have a hardware prefetcher,
++ * but still can implement some rather advanced prefetch logic in software
++ * for almost zero cost!
++ *
++ * (*) The overhead of the prefetcher is visible when running some trivial
++ * pixels processing like simple copy. Anyway, having prefetch is a must
++ * when working with the graphics data.
++ */
++.macro PF a, x:vararg
++.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED) /* any other prefetch type: expands to nothing */
++    a x /* emit the wrapped instruction unchanged */
++.endif
++.endm
++
++.macro cache_preload std_increment, boost_increment
++.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0) /* at least one buffer is read */
++.if std_increment != 0
++    PF add PF_X, PF_X, #std_increment /* regular advance of the prefetch position */
++.endif
++    PF tst PF_CTL, #0xF /* low 4 bits of PF_CTL: prefetch distance counter */
++    PF beq 71f /* counter is zero: no extra boost */
++    PF add PF_X, PF_X, #boost_increment
++    PF sub PF_CTL, PF_CTL, #1
++71:
++    PF cmp PF_X, ORIG_W /* flags are consumed by the first ble below */
++.if src_bpp_shift >= 0
++    PF lsl DUMMY, PF_X, #src_bpp_shift /* pixel index -> byte offset */
++    PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
++.endif
++.if dst_r_bpp != 0
++    PF lsl DUMMY, PF_X, #dst_bpp_shift /* pixel index -> byte offset */
++    PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
++.endif
++.if mask_bpp_shift >= 0
++    PF lsl DUMMY, PF_X, #mask_bpp_shift /* pixel index -> byte offset */
++    PF prfm PREFETCH_MODE, [PF_MASK, DUMMY]
++.endif
++    PF ble 71f /* PF_X <= ORIG_W: still inside the current scanline */
++    PF sub PF_X, PF_X, ORIG_W /* wrap the prefetch position */
++    PF subs PF_CTL, PF_CTL, #0x10 /* one unit in the bits above the low 4 */
++71:
++    PF ble 72f /* skip the loads below when the last flag-setting result was <= */
++.if src_bpp_shift >= 0
++    PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift /* stride -> byte offset */
++    PF ldrsb DUMMY, [PF_SRC, DUMMY] /* real load (not prfm) one stride ahead */
++    PF add PF_SRC, PF_SRC, #1 /* NOTE(review): advances by 1 byte, not by one stride - confirm intended */
++.endif
++.if dst_r_bpp != 0
++    PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift /* stride -> byte offset */
++    PF ldrsb DUMMY, [PF_DST, DUMMY] /* real load (not prfm) one stride ahead */
++    PF add PF_DST, PF_DST, #1 /* NOTE(review): advances by 1 byte, not by one stride - confirm intended */
++.endif
++.if mask_bpp_shift >= 0
++    PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift /* stride -> byte offset */
++    PF ldrsb DUMMY, [PF_MASK, DUMMY] /* real load (not prfm) one stride ahead */
++    PF add PF_MASK, PF_MASK, #1 /* NOTE(review): advances by 1 byte, not by one stride - confirm intended */
++.endif
++72:
++.endif
++.endm
++
++.macro cache_preload_simple
++.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE) /* any other prefetch type: expands to nothing */
++.if src_bpp > 0
++    prfm PREFETCH_MODE, [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)] /* fixed byte offset ahead of SRC */
++.endif
++.if dst_r_bpp > 0
++    prfm PREFETCH_MODE, [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)] /* fixed byte offset ahead of DST_R */
++.endif
++.if mask_bpp > 0
++    prfm PREFETCH_MODE, [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)] /* fixed byte offset ahead of MASK */
++.endif
++.endif
++.endm
++
++.macro fetch_mask_pixblock
++    pixld       pixblock_size, mask_bpp, \
++                (mask_basereg - pixblock_size * mask_bpp / 64), MASK /* load one pixel block of mask data from MASK */
++.endm
++
++/*
++ * Macro which is used to process leading pixels until the destination
++ * pointer is properly aligned (at a 16-byte boundary). The whole body is
++ * compiled out when the destination format is 24bpp (dst_w_bpp == 24).
++ */
++.macro ensure_destination_ptr_alignment process_pixblock_head, \
++                                        process_pixblock_tail, \
++                                        process_pixblock_tail_head
++.if dst_w_bpp != 24
++    tst         DST_R, #0xF
++    beq         52f /* low 4 address bits are clear: already aligned */
++
++.if src_bpp > 0 || mask_bpp > 0 || dst_r_bpp > 0 /* load phase, only if something is read */
++.irp lowbit, 1, 2, 4, 8, 16
++local skip1
++.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
++.if lowbit < 16 /* we don't need more than 16-byte alignment */
++    tst         DST_R, #lowbit
++    beq         51f /* this address bit is clear: skip this chunk */
++.endif
++    pixld_src   (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
++    pixld       (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
++.if dst_r_bpp > 0
++    pixld_a     (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
++.else
++    add         DST_R, DST_R, #lowbit /* destination is not read: only step the read pointer */
++.endif
++    PF add      PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
++    sub         W, W, #(lowbit * 8 / dst_w_bpp) /* these pixels are consumed from the width */
++51:
++.endif
++.endr
++.endif
++    pixdeinterleave src_bpp, src_basereg
++    pixdeinterleave mask_bpp, mask_basereg
++    pixdeinterleave dst_r_bpp, dst_r_basereg
++
++    process_pixblock_head
++    cache_preload 0, pixblock_size
++    cache_preload_simple
++    process_pixblock_tail
++
++    pixinterleave dst_w_bpp, dst_w_basereg
++
++.irp lowbit, 1, 2, 4, 8, 16 /* store phase, same chunk sizes as the load phase */
++.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
++.if lowbit < 16 /* we don't need more than 16-byte alignment */
++    tst         DST_W, #lowbit
++    beq         51f /* this address bit is clear: skip this chunk */
++.endif
++.if src_bpp == 0 && mask_bpp == 0 && dst_r_bpp == 0 /* load phase was compiled out, so W is adjusted here */
++    sub         W, W, #(lowbit * 8 / dst_w_bpp)
++.endif
++    pixst_a     (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
++51:
++.endif
++.endr
++.endif
++52:
++.endm
++
++/*
++ * Special code for processing up to (pixblock_size - 1) remaining
++ * trailing pixels. As SIMD processing performs operation on
++ * pixblock_size pixels, anything smaller than this has to be loaded
++ * and stored in a special way. Loading and storing of pixel data is
++ * performed in such a way that we fill some 'slots' in the NEON
++ * registers (some slots naturally are unused), then perform compositing
++ * operation as usual. In the end, the data is taken from these 'slots'
++ * and saved to memory.
++ *
++ * cache_preload_flag - prefetch code is emitted only when this
++ *                      is not 0
++ * dst_aligned_flag   - when not 0, the pixld_a/pixst_a variants are
++ *                      used for the destination buffer
++ */
++.macro process_trailing_pixels cache_preload_flag, \
++                               dst_aligned_flag, \
++                               process_pixblock_head, \
++                               process_pixblock_tail, \
++                               process_pixblock_tail_head
++    tst         W, #(pixblock_size - 1)
++    beq         52f /* no trailing pixels left */
++.if src_bpp > 0 || mask_bpp > 0 || dst_r_bpp > 0 /* load phase, only if something is read */
++.irp chunk_size, 16, 8, 4, 2, 1
++.if pixblock_size > chunk_size
++    tst         W, #chunk_size
++    beq         51f /* this bit of W is clear: no chunk of this size */
++    pixld_src   chunk_size, src_bpp, src_basereg, SRC
++    pixld       chunk_size, mask_bpp, mask_basereg, MASK
++.if dst_aligned_flag != 0
++    pixld_a     chunk_size, dst_r_bpp, dst_r_basereg, DST_R
++.else
++    pixld       chunk_size, dst_r_bpp, dst_r_basereg, DST_R
++.endif
++.if cache_preload_flag != 0
++    PF add      PF_X, PF_X, #chunk_size
++.endif
++51:
++.endif
++.endr
++.endif
++    pixdeinterleave src_bpp, src_basereg
++    pixdeinterleave mask_bpp, mask_basereg
++    pixdeinterleave dst_r_bpp, dst_r_basereg
++
++    process_pixblock_head
++.if cache_preload_flag != 0
++    cache_preload 0, pixblock_size
++    cache_preload_simple
++.endif
++    process_pixblock_tail
++    pixinterleave dst_w_bpp, dst_w_basereg
++.irp chunk_size, 16, 8, 4, 2, 1 /* store phase, same chunk sizes as the load phase */
++.if pixblock_size > chunk_size
++    tst         W, #chunk_size
++    beq         51f /* this bit of W is clear: no chunk of this size */
++.if dst_aligned_flag != 0
++    pixst_a     chunk_size, dst_w_bpp, dst_w_basereg, DST_W
++.else
++    pixst       chunk_size, dst_w_bpp, dst_w_basereg, DST_W
++.endif
++51:
++.endif
++.endr
++52:
++.endm
++
++/*
++ * Macro which performs all the operations needed to switch to the next
++ * scanline, and branches back to start_of_loop_label unless all the
++ * scanlines are already processed.
++ */
++.macro advance_to_next_scanline start_of_loop_label
++    mov         W, ORIG_W /* reload the width for the next scanline */
++    add         DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift /* step by one stride */
++.if src_bpp != 0
++    add         SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift
++.endif
++.if mask_bpp != 0
++    add         MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift
++.endif
++.if (dst_w_bpp != 24)
++    sub         DST_W, DST_W, W, lsl #dst_bpp_shift /* step back by one scanline width */
++.endif
++.if (src_bpp != 24) && (src_bpp != 0)
++    sub         SRC, SRC, W, lsl #src_bpp_shift /* step back by one scanline width */
++.endif
++.if (mask_bpp != 24) && (mask_bpp != 0)
++    sub         MASK, MASK, W, lsl #mask_bpp_shift /* step back by one scanline width */
++.endif
++    subs        H, H, #1 /* sets the flags tested by bge below */
++    mov         DST_R, DST_W /* read pointer restarts at the write pointer */
++    bge         start_of_loop_label
++.endm
++
++/*
++ * Registers are allocated in the following way by default:
++ * v0, v1, v2, v3     - reserved for loading source pixel data
++ * v4, v5, v6, v7     - reserved for loading destination pixel data
++ * v24, v25, v26, v27 - reserved for loading mask pixel data
++ * v28, v29, v30, v31 - final destination pixel data for writeback to memory
++ */
++.macro generate_composite_function fname, \
++                                   src_bpp_, \
++                                   mask_bpp_, \
++                                   dst_w_bpp_, \
++                                   flags, \
++                                   pixblock_size_, \
++                                   prefetch_distance, \
++                                   init, \
++                                   cleanup, \
++                                   process_pixblock_head, \
++                                   process_pixblock_tail, \
++                                   process_pixblock_tail_head, \
++                                   dst_w_basereg_ = 28, \
++                                   dst_r_basereg_ = 4, \
++                                   src_basereg_   = 0, \
++                                   mask_basereg_  = 24
++
++    pixman_asm_function fname
++    stp         x29, x30, [sp, -16]!
++    mov         x29, sp
++    sub         sp,   sp, 232  /* push all registers */
++    sub         x29, x29, 64 /* x29 -> start of the v8-v15 save area */
++    st1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32
++    st1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32 /* x29 is back to its value before the sub */
++    stp          x8,   x9, [x29, -80]
++    stp         x10,  x11, [x29, -96]
++    stp         x12,  x13, [x29, -112]
++    stp         x14,  x15, [x29, -128]
++    stp         x16,  x17, [x29, -144]
++    stp         x18,  x19, [x29, -160]
++    stp         x20,  x21, [x29, -176]
++    stp         x22,  x23, [x29, -192]
++    stp         x24,  x25, [x29, -208]
++    stp         x26,  x27, [x29, -224]
++    str         x28, [x29, -232]
++
++/*
++ * Select prefetch type for this function. If prefetch distance is
++ * set to 0, no prefetch is used. If one of the color formats is 24bpp,
++ * SIMPLE prefetch has to be used instead of ADVANCED.
++ */
++    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
++.if prefetch_distance == 0
++    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
++.elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
++        ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24))
++    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
++.endif
++
++/*
++ * Make some macro arguments globally visible and accessible
++ * from other macros
++ */
++    .set src_bpp, src_bpp_
++    .set mask_bpp, mask_bpp_
++    .set dst_w_bpp, dst_w_bpp_
++    .set pixblock_size, pixblock_size_
++    .set dst_w_basereg, dst_w_basereg_
++    .set dst_r_basereg, dst_r_basereg_
++    .set src_basereg, src_basereg_
++    .set mask_basereg, mask_basereg_
++
++    .macro pixld_src x:vararg
++        pixld x
++    .endm
++    .macro fetch_src_pixblock
++        pixld_src   pixblock_size, src_bpp, \
++                    (src_basereg - pixblock_size * src_bpp / 64), SRC
++    .endm
++/*
++ * Assign symbolic names to registers
++ */
++    W           .req       x0      /* width (is updated during processing) */
++    H           .req       x1      /* height (is updated during processing) */
++    DST_W       .req       x2      /* destination buffer pointer for writes */
++    DST_STRIDE  .req       x3      /* destination image stride */
++    SRC         .req       x4      /* source buffer pointer */
++    SRC_STRIDE  .req       x5      /* source image stride */
++    MASK        .req       x6      /* mask pointer */
++    MASK_STRIDE .req       x7      /* mask stride */
++
++    DST_R       .req       x8      /* destination buffer pointer for reads */
++
++    PF_CTL      .req       x9      /* combined lines counter and prefetch */
++                                    /* distance increment counter */
++    PF_X        .req       x10     /* pixel index in a scanline for current */
++                                    /* prefetch position */
++    PF_SRC      .req       x11     /* pointer to source scanline start */
++                                    /* for prefetch purposes */
++    PF_DST      .req       x12     /* pointer to destination scanline start */
++                                    /* for prefetch purposes */
++    PF_MASK     .req       x13     /* pointer to mask scanline start */
++                                    /* for prefetch purposes */
++
++    ORIG_W      .req       x14     /* saved original width */
++    DUMMY       .req       x15     /* temporary register */
++
++    sxtw        x0, w0 /* sign-extend the low 32 bits of W */
++    sxtw        x1, w1 /* ... of H */
++    sxtw        x3, w3 /* ... of DST_STRIDE */
++    sxtw        x5, w5 /* ... of SRC_STRIDE */
++    sxtw        x7, w7 /* ... of MASK_STRIDE */
++
++    .set mask_bpp_shift, -1
++.if src_bpp == 32
++    .set src_bpp_shift, 2
++.elseif src_bpp == 24
++    .set src_bpp_shift, 0
++.elseif src_bpp == 16
++    .set src_bpp_shift, 1
++.elseif src_bpp == 8
++    .set src_bpp_shift, 0
++.elseif src_bpp == 0
++    .set src_bpp_shift, -1
++.else
++    .error "requested src bpp (src_bpp) is not supported"
++.endif
++.if mask_bpp == 32
++    .set mask_bpp_shift, 2
++.elseif mask_bpp == 24
++    .set mask_bpp_shift, 0
++.elseif mask_bpp == 8
++    .set mask_bpp_shift, 0
++.elseif mask_bpp == 0
++    .set mask_bpp_shift, -1
++.else
++    .error "requested mask bpp (mask_bpp) is not supported"
++.endif
++.if dst_w_bpp == 32
++    .set dst_bpp_shift, 2
++.elseif dst_w_bpp == 24
++    .set dst_bpp_shift, 0
++.elseif dst_w_bpp == 16
++    .set dst_bpp_shift, 1
++.elseif dst_w_bpp == 8
++    .set dst_bpp_shift, 0
++.else
++    .error "requested dst bpp (dst_w_bpp) is not supported"
++.endif
++
++.if (((flags) & FLAG_DST_READWRITE) != 0)
++    .set dst_r_bpp, dst_w_bpp
++.else
++    .set dst_r_bpp, 0
++.endif
++.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
++    .set DEINTERLEAVE_32BPP_ENABLED, 1
++.else
++    .set DEINTERLEAVE_32BPP_ENABLED, 0
++.endif
++
++.if prefetch_distance < 0 || prefetch_distance > 15
++    .error "invalid prefetch distance (prefetch_distance)"
++.endif
++
++    PF mov      PF_X, #0
++    mov         DST_R, DST_W
++
++.if src_bpp == 24
++    sub         SRC_STRIDE, SRC_STRIDE, W /* together with the next line: stride -= 3 * W */
++    sub         SRC_STRIDE, SRC_STRIDE, W, lsl #1
++.endif
++.if mask_bpp == 24
++    sub         MASK_STRIDE, MASK_STRIDE, W /* together with the next line: stride -= 3 * W */
++    sub         MASK_STRIDE, MASK_STRIDE, W, lsl #1
++.endif
++.if dst_w_bpp == 24
++    sub         DST_STRIDE, DST_STRIDE, W /* together with the next line: stride -= 3 * W */
++    sub         DST_STRIDE, DST_STRIDE, W, lsl #1
++.endif
++
++/*
++ * Setup advanced prefetcher initial state
++ */
++    PF mov      PF_SRC, SRC
++    PF mov      PF_DST, DST_R
++    PF mov      PF_MASK, MASK
++    /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
++    PF lsl      DUMMY, H, #4
++    PF mov      PF_CTL, DUMMY
++    PF add      PF_CTL, PF_CTL, #(prefetch_distance - 0x10)
++
++    init
++    subs        H, H, #1 /* sets the flags tested by blt 9f below */
++    mov         ORIG_W, W
++    blt         9f
++    cmp         W, #(pixblock_size * 2)
++    blt         800f /* narrow image: use the loop at label 800 */
++/*
++ * This is the start of the pipelined loop, which is optimized for
++ * long scanlines
++ */
++0:
++    ensure_destination_ptr_alignment process_pixblock_head, \
++                                     process_pixblock_tail, \
++                                     process_pixblock_tail_head
++
++    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
++    pixld_a     pixblock_size, dst_r_bpp, \
++                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
++    fetch_src_pixblock
++    pixld       pixblock_size, mask_bpp, \
++                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
++    PF add      PF_X, PF_X, #pixblock_size
++    process_pixblock_head
++    cache_preload 0, pixblock_size
++    cache_preload_simple
++    subs        W, W, #(pixblock_size * 2)
++    blt         200f
++
++100:
++    process_pixblock_tail_head
++    cache_preload_simple
++    subs        W, W, #pixblock_size
++    bge         100b
++
++200:
++    process_pixblock_tail
++    pixst_a     pixblock_size, dst_w_bpp, \
++                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
++
++    /* Process the remaining trailing pixels in the scanline */
++    process_trailing_pixels 1, 1, \
++                            process_pixblock_head, \
++                            process_pixblock_tail, \
++                            process_pixblock_tail_head
++    advance_to_next_scanline 0b
++
++    cleanup
++1000:
++    /* pop all registers */
++    sub         x29, x29, 64
++    ld1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
++    ld1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
++    ldp          x8,   x9, [x29, -80]
++    ldp         x10,  x11, [x29, -96]
++    ldp         x12,  x13, [x29, -112]
++    ldp         x14,  x15, [x29, -128]
++    ldp         x16,  x17, [x29, -144]
++    ldp         x18,  x19, [x29, -160]
++    ldp         x20,  x21, [x29, -176]
++    ldp         x22,  x23, [x29, -192]
++    ldp         x24,  x25, [x29, -208]
++    ldp         x26,  x27, [x29, -224]
++    ldr         x28, [x29, -232]
++    mov         sp, x29
++    ldp         x29, x30, [sp], 16 
++    ret  /* exit */
++/*
++ * This is the start of the loop, designed to process images with small width
++ * (less than pixblock_size * 2 pixels). In this case neither pipelining
++ * nor prefetch are used.
++ */
++800:
++.if src_bpp_shift >= 0
++    PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
++    PF prfm PREFETCH_MODE, [SRC, DUMMY]
++.endif
++.if dst_r_bpp != 0
++    PF lsl  DUMMY, DST_STRIDE, #dst_bpp_shift
++    PF prfm PREFETCH_MODE, [DST_R, DUMMY]
++.endif
++.if mask_bpp_shift >= 0
++    PF lsl  DUMMY, MASK_STRIDE, #mask_bpp_shift
++    PF prfm PREFETCH_MODE, [MASK, DUMMY]
++.endif
++    /* Process exactly pixblock_size pixels if needed */
++    tst         W, #pixblock_size
++    beq         100f
++    pixld       pixblock_size, dst_r_bpp, \
++                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
++    fetch_src_pixblock
++    pixld       pixblock_size, mask_bpp, \
++                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
++    process_pixblock_head
++    process_pixblock_tail
++    pixst       pixblock_size, dst_w_bpp, \
++                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
++100:
++    /* Process the remaining trailing pixels in the scanline */
++    process_trailing_pixels 0, 0, \
++                            process_pixblock_head, \
++                            process_pixblock_tail, \
++                            process_pixblock_tail_head
++    advance_to_next_scanline 800b
++9:
++    cleanup
++    /* pop all registers */
++    sub         x29, x29, 64
++    ld1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
++    ld1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
++    ldp          x8,   x9, [x29, -80]
++    ldp         x10,  x11, [x29, -96]
++    ldp         x12,  x13, [x29, -112]
++    ldp         x14,  x15, [x29, -128]
++    ldp         x16,  x17, [x29, -144]
++    ldp         x18,  x19, [x29, -160]
++    ldp         x20,  x21, [x29, -176]
++    ldp         x22,  x23, [x29, -192]
++    ldp         x24,  x25, [x29, -208]
++    ldp         x26,  x27, [x29, -224]
++    ldr         x28, [x29, -232]
++    mov         sp, x29
++    ldp         x29, x30, [sp], 16 
++    ret  /* exit */
++
++    .purgem     fetch_src_pixblock
++    .purgem     pixld_src
++
++    .unreq      SRC
++    .unreq      MASK
++    .unreq      DST_R
++    .unreq      DST_W
++    .unreq      ORIG_W
++    .unreq      W
++    .unreq      H
++    .unreq      SRC_STRIDE
++    .unreq      DST_STRIDE
++    .unreq      MASK_STRIDE
++    .unreq      PF_CTL
++    .unreq      PF_X
++    .unreq      PF_SRC
++    .unreq      PF_DST
++    .unreq      PF_MASK
++    .unreq      DUMMY
++    .endfunc
++.endm
++
++/*
++ * A simplified variant of function generation template for a single
++ * scanline processing (for implementing pixman combine functions)
++ */
++.macro generate_composite_function_scanline        use_nearest_scaling, \
++                                                   fname, \
++                                                   src_bpp_, \
++                                                   mask_bpp_, \
++                                                   dst_w_bpp_, \
++                                                   flags, \
++                                                   pixblock_size_, \
++                                                   init, \
++                                                   cleanup, \
++                                                   process_pixblock_head, \
++                                                   process_pixblock_tail, \
++                                                   process_pixblock_tail_head, \
++                                                   dst_w_basereg_ = 28, \
++                                                   dst_r_basereg_ = 4, \
++                                                   src_basereg_   = 0, \
++                                                   mask_basereg_  = 24
++
++    pixman_asm_function fname
++    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE /* scanline functions never prefetch */
++
++/*
++ * Make some macro arguments globally visible and accessible
++ * from other macros
++ */
++    .set src_bpp, src_bpp_
++    .set mask_bpp, mask_bpp_
++    .set dst_w_bpp, dst_w_bpp_
++    .set pixblock_size, pixblock_size_
++    .set dst_w_basereg, dst_w_basereg_
++    .set dst_r_basereg, dst_r_basereg_
++    .set src_basereg, src_basereg_
++    .set mask_basereg, mask_basereg_
++    
++.if use_nearest_scaling != 0
++    /*
++     * Assign symbolic names to registers for nearest scaling
++     */
++    W           .req        x0
++    DST_W       .req        x1
++    SRC         .req        x2
++    VX          .req        x3
++    UNIT_X      .req        x4
++    SRC_WIDTH_FIXED .req    x5
++    MASK        .req        x6
++    TMP1        .req        x8
++    TMP2        .req        x9
++    DST_R       .req        x10
++    DUMMY       .req        x30
++
++    .macro pixld_src x:vararg
++        pixld_s x
++    .endm
++
++    sxtw        x0, w0 /* sign-extend the low 32 bits of W */
++    sxtw        x3, w3 /* ... of VX */
++    sxtw        x4, w4 /* ... of UNIT_X */
++    sxtw        x5, w5 /* ... of SRC_WIDTH_FIXED */
++
++    stp         x29, x30, [sp, -16]!
++    mov         x29, sp
++    sub         sp, sp, 88 /* 64 bytes for v8-v15, 16 for x8/x9, 8 for x10 */
++    sub         x29, x29, 64 /* x29 -> start of the v8-v15 save area */
++    st1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
++    st1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 /* x29 is back to its value before the sub */
++    stp         x8, x9, [x29, -80]
++    str         x10, [x29, -88] /* both exit paths must reload x10 from this slot */
++.else
++    /*
++     * Assign symbolic names to registers
++     */
++    W           .req        x0      /* width (is updated during processing) */
++    DST_W       .req        x1      /* destination buffer pointer for writes */
++    SRC         .req        x2      /* source buffer pointer */
++    MASK        .req        x3      /* mask pointer */
++    DST_R       .req        x4      /* destination buffer pointer for reads */
++    DUMMY       .req        x30
++
++    .macro pixld_src x:vararg
++        pixld x
++    .endm
++
++    sxtw        x0, w0 /* sign-extend the low 32 bits of W */
++
++    stp         x29, x30, [sp, -16]!
++    mov         x29, sp
++    sub         sp, sp, 64 /* 64 bytes for v8-v15 */
++    sub         x29, x29, 64 /* x29 -> start of the v8-v15 save area */
++    st1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
++    st1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 /* x29 is back to its value before the sub */
++.endif
++
++.if (((flags) & FLAG_DST_READWRITE) != 0)
++    .set dst_r_bpp, dst_w_bpp
++.else
++    .set dst_r_bpp, 0
++.endif
++.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
++    .set DEINTERLEAVE_32BPP_ENABLED, 1
++.else
++    .set DEINTERLEAVE_32BPP_ENABLED, 0
++.endif
++
++    .macro fetch_src_pixblock
++        pixld_src   pixblock_size, src_bpp, \
++                    (src_basereg - pixblock_size * src_bpp / 64), SRC
++    .endm
++
++    init
++    mov         DST_R, DST_W
++
++    cmp         W, #pixblock_size
++    blt         800f /* fewer than pixblock_size pixels: trailing-pixel code only */
++
++    ensure_destination_ptr_alignment process_pixblock_head, \
++                                     process_pixblock_tail, \
++                                     process_pixblock_tail_head
++
++    subs        W, W, #pixblock_size
++    blt         700f /* less than one full block left after alignment */
++
++    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
++    pixld_a     pixblock_size, dst_r_bpp, \
++                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
++    fetch_src_pixblock
++    pixld       pixblock_size, mask_bpp, \
++                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
++    process_pixblock_head
++    subs        W, W, #pixblock_size
++    blt         200f
++100:
++    process_pixblock_tail_head
++    subs        W, W, #pixblock_size
++    bge         100b
++200:
++    process_pixblock_tail
++    pixst_a     pixblock_size, dst_w_bpp, \
++                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
++700:
++    /* Process the remaining trailing pixels in the scanline (dst aligned) */
++    process_trailing_pixels 0, 1, \
++                            process_pixblock_head, \
++                            process_pixblock_tail, \
++                            process_pixblock_tail_head
++
++    cleanup
++.if use_nearest_scaling != 0
++    sub         x29, x29, 64
++    ld1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
++    ld1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
++    ldp         x8, x9, [x29, -80]
++    ldr         x10, [x29, -88] /* same slot as the str in the prologue (was -96, below sp) */
++    mov         sp, x29
++    ldp         x29, x30, [sp], 16 
++    ret  /* exit */
++.else
++    sub         x29, x29, 64
++    ld1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
++    ld1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
++    mov         sp, x29
++    ldp         x29, x30, [sp], 16 
++    ret  /* exit */
++.endif
++800:
++    /* Process the remaining trailing pixels in the scanline (dst unaligned) */
++    process_trailing_pixels 0, 0, \
++                            process_pixblock_head, \
++                            process_pixblock_tail, \
++                            process_pixblock_tail_head
++
++    cleanup
++.if use_nearest_scaling != 0
++    sub         x29, x29, 64
++    ld1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
++    ld1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
++    ldp         x8, x9, [x29, -80]
++    ldr         x10, [x29, -88]
++    mov         sp, x29
++    ldp         x29, x30, [sp], 16 
++    ret  /* exit */
++
++    .unreq      DUMMY
++    .unreq      DST_R
++    .unreq      SRC
++    .unreq      W
++    .unreq      VX
++    .unreq      UNIT_X
++    .unreq      TMP1
++    .unreq      TMP2
++    .unreq      DST_W
++    .unreq      MASK
++    .unreq      SRC_WIDTH_FIXED
++
++.else
++    sub         x29, x29, 64
++    ld1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
++    ld1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
++    mov          sp, x29
++    ldp          x29, x30, [sp], 16 
++    ret  /* exit */
++
++    .unreq      DUMMY
++    .unreq      SRC
++    .unreq      MASK
++    .unreq      DST_R
++    .unreq      DST_W
++    .unreq      W
++.endif
++
++    .purgem     fetch_src_pixblock
++    .purgem     pixld_src
++
++    .endfunc
++.endm
++
++.macro generate_composite_function_single_scanline x:vararg
++    generate_composite_function_scanline 0, x /* use_nearest_scaling = 0 */
++.endm
++
++.macro generate_composite_function_nearest_scanline x:vararg
++    generate_composite_function_scanline 1, x /* use_nearest_scaling = 1 */
++.endm
++
++/* Default prologue/epilogue, nothing special needs to be done: both macros are intentionally empty */
++
++.macro default_init
++.endm
++
++.macro default_cleanup
++.endm
++
++/*
++ * Prologue/epilogue variant for code which needs to use all the NEON
++ * registers. Both macros are empty here: the function templates in this
++ * file already save/restore v8-v15 in their own prologue/epilogue.
++ */
++
++.macro default_init_need_all_regs
++.endm
++
++.macro default_cleanup_need_all_regs
++.endm
++
++/******************************************************************************/
++
++/*
++ * Conversion of 8 r5g6b5 pixels packed in 128-bit register (in)
++ * into a planar a8r8g8b8 format (with a, r, g, b color components
++ * stored into 64-bit registers out_a, out_r, out_g, out_b respectively).
++ *
++ * Warning: the conversion is destructive and the original
++ *          value (in) is lost.
++ */
++.macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b
++    shrn        &out_r&.8b, &in&.8h,    #8 /* bits 15..8 of each pixel: R and top of G */
++    shrn        &out_g&.8b, &in&.8h,    #3 /* bits 10..3 of each pixel: G and top of B */
++    sli         &in&.8h,    &in&.8h,    #5 /* B is now also at bits 9..5; clobbers the input */
++    movi        &out_a&.8b, #255 /* alpha is set to 255 */
++    sri         &out_r&.8b, &out_r&.8b, #5 /* keep 5 bits of R, fill the low 3 with its top bits */
++    sri         &out_g&.8b, &out_g&.8b, #6 /* keep 6 bits of G, fill the low 2 with its top bits */
++    shrn        &out_b&.8b, &in&.8h,    #2 /* bits 9..2: 5 bits of B followed by its top 3 bits */
++.endm
++
++.macro convert_0565_to_x888 in, out_r, out_g, out_b
++    shrn        &out_r&.8b, &in&.8h,    #8 /* bits 15..8 of each 16-bit element */
++    shrn        &out_g&.8b, &in&.8h,    #3 /* bits 10..3 of each 16-bit element */
++    sli         &in&.8h,    &in&.8h,    #5 /* clobbers the input: low 5 bits are now also at bits 9..5 */
++    sri         &out_r&.8b, &out_r&.8b, #5 /* keep the top 5 bits, fill the low 3 with the top bits */
++    sri         &out_g&.8b, &out_g&.8b, #6 /* keep the top 6 bits, fill the low 2 with the top bits */
++    shrn        &out_b&.8b, &in&.8h,    #2 /* bits 9..2 of the modified input */
++.endm
++
++/*
++ * Conversion from planar a8r8g8b8 format (with r, g, b color components
++ * in 64-bit registers in_r, in_g, in_b respectively; alpha is not used)
++ * into 8 r5g6b5 pixels packed in 128-bit register (out). Requires two
++ * temporary 128-bit registers (tmp1, tmp2)
++ */
++.macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2
++    ushll       &tmp1&.8h, &in_g&.8b, #7 /* widen G; with the next shl: G << 8 */
++    shl         &tmp1&.8h, &tmp1&.8h, #1
++    ushll       &out&.8h,  &in_r&.8b, #7 /* widen R; with the next shl: R << 8 */
++    shl         &out&.8h,  &out&.8h,  #1
++    ushll       &tmp2&.8h, &in_b&.8b, #7 /* widen B; with the next shl: B << 8 */
++    shl         &tmp2&.8h, &tmp2&.8h, #1
++    sri         &out&.8h, &tmp1&.8h, #5 /* keep the top 5 bits (R), insert G below them */
++    sri         &out&.8h, &tmp2&.8h, #11 /* keep the top 11 bits (R, G), insert B in the low 5 */
++.endm
++
++/*
++ * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels
++ * returned in the (out0, out1) register pair. Requires one temporary
++ * 64-bit register (tmp). 'out1' and 'in' may overlap; the original
++ * value of 'in' is lost.
++ */
++.macro convert_four_0565_to_x888_packed in, out0, out1, tmp
++    shl         &out0&.4h, &in&.4h,   #5  /* G top 6 bits */
++    shl         &tmp&.4h,  &in&.4h,   #11 /* B top 5 bits */
++    sri         &in&.4h,   &in&.4h,   #5  /* R is ready in top bits */
++    sri         &out0&.4h, &out0&.4h, #6  /* G is ready in top bits */
++    sri         &tmp&.4h,  &tmp&.4h,  #5  /* B is ready in top bits */
++    ushr        &out1&.4h, &in&.4h,   #8  /* R is in place */
++    sri         &out0&.4h, &tmp&.4h,  #8  /* G & B is in place */
++    zip1        &tmp&.4h,  &out0&.4h, &out1&.4h  /* everything is in place */
++    zip2        &out1&.4h, &out0&.4h, &out1&.4h
++    mov         &out0&.d[0], &tmp&.d[0] /* move the zip1 result from tmp into out0 */
++.endm
+diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h
+index 73a5414..22c8ccc
+--- a/pixman/pixman-private.h
++++ b/pixman/pixman-private.h
+@@ -607,6 +607,11 @@ pixman_implementation_t *
+ _pixman_implementation_create_arm_neon (pixman_implementation_t *fallback);
+ #endif
+ 
++#ifdef USE_ARM_A64_NEON
++pixman_implementation_t *
++_pixman_implementation_create_arm_neon (pixman_implementation_t *fallback);
++#endif
++
+ #ifdef USE_MIPS_DSPR2
+ pixman_implementation_t *
+ _pixman_implementation_create_mips_dspr2 (pixman_implementation_t *fallback);
+-- 
+2.8.0
+
diff --git a/FunKey/board/funkey/patches/sdl/003-alsa-fix-excessiveio.patch b/FunKey/board/funkey/patches/sdl/003-alsa-fix-excessiveio.patch
new file mode 100644
index 0000000..46e9524
--- /dev/null
+++ b/FunKey/board/funkey/patches/sdl/003-alsa-fix-excessiveio.patch
@@ -0,0 +1,26 @@
+Merge this bug fix as it can affect performance:
+https://github.com/OpenDingux/SDL/commit/e51100dce8da9099278dac9f5affbecf6396378b
+
+--- a/src/audio/alsa/SDL_alsa_audio.c 
++++ b/src/audio/alsa/SDL_alsa_audio.c 
+@@ -479,6 +479,10 @@
+ 		return(-1);
+ 	}
+ 
++	/* Switch to blocking mode for playback */
++	/* Note: this must happen before hw/sw params are set. */
++	SDL_NAME(snd_pcm_nonblock)(pcm_handle, 0);
++
+ 	/* Figure out what the hardware is capable of */
+ 	snd_pcm_hw_params_alloca(&hwparams);
+ 	status = SDL_NAME(snd_pcm_hw_params_any)(pcm_handle, hwparams);
+@@ -611,9 +615,6 @@
+ 	}
+ 	SDL_memset(mixbuf, spec->silence, spec->size);
+ 
+-	/* Switch to blocking mode for playback */
+-	SDL_NAME(snd_pcm_nonblock)(pcm_handle, 0);
+-
+ 	/* We're ready to rock and roll. :-) */
+ 	return(0);
+ }
diff --git a/FunKey/board/funkey/patches/sdl/sdl-fix-kb-input.patch b/FunKey/board/funkey/patches/sdl/sdl-fix-kb-input.patch
new file mode 100644
index 0000000..8f7db83
--- /dev/null
+++ b/FunKey/board/funkey/patches/sdl/sdl-fix-kb-input.patch
@@ -0,0 +1,22 @@
+diff --git a/src/video/fbcon/SDL_fbevents.c b/src/video/fbcon/SDL_fbevents.c
+index 5e369a4..549a7ad 100644
+--- a/src/video/fbcon/SDL_fbevents.c
++++ b/src/video/fbcon/SDL_fbevents.c
+@@ -270,17 +270,6 @@ int FB_OpenKeyboard(_THIS)
+ 				fprintf(stderr, "vtpath = %s, fd = %d\n",
+ 					vtpath, keyboard_fd);
+ #endif /* DEBUG_KEYBOARD */
+-
+-				/* This needs to be our controlling tty
+-				   so that the kernel ioctl() calls work
+-				*/
+-				if ( keyboard_fd >= 0 ) {
+-					tty0_fd = open("/dev/tty", O_RDWR, 0);
+-					if ( tty0_fd >= 0 ) {
+-						ioctl(tty0_fd, TIOCNOTTY, 0);
+-						close(tty0_fd);
+-					}
+-				}
+ 			}
+ 		}
+  		if ( keyboard_fd < 0 ) {
diff --git a/FunKey/board/funkey/rootfs-overlay/etc/fstab b/FunKey/board/funkey/rootfs-overlay/etc/fstab
index 47ade62..a729a33 100644
--- a/FunKey/board/funkey/rootfs-overlay/etc/fstab
+++ b/FunKey/board/funkey/rootfs-overlay/etc/fstab
@@ -8,4 +8,4 @@ tmpfs           /run            tmpfs   mode=0755,nosuid,nodev  0       0
 sysfs           /sys            sysfs   defaults        0       0
 /dev/mmcblk0p3	none		swap	sw		0	0
 configfs	/sys/kernel/config	configfs	rw,relatime	0	0
-/dev/mmcblk0p4	/mnt	vfat	rw,relatime,fmask=0022,dmask=0022,iocharset=iso8859-1,shortname=mixed,errors=remount-ro	0	0
+/dev/mmcblk0p4	/mnt	vfat	rw,relatime,fmask=0022,dmask=0022,iocharset=iso8859-1,shortname=mixed	0	0
diff --git a/FunKey/board/funkey/rootfs-overlay/etc/funkey_gpio_mapping.conf b/FunKey/board/funkey/rootfs-overlay/etc/funkey_gpio_mapping.conf
index 11a4e43..b3500da 100644
--- a/FunKey/board/funkey/rootfs-overlay/etc/funkey_gpio_mapping.conf
+++ b/FunKey/board/funkey/rootfs-overlay/etc/funkey_gpio_mapping.conf
@@ -1,6 +1,6 @@
-##################################
-# Funkey GPIO keymap config file #
-##################################
+####################################
+#  Funkey GPIO keymap config file  #
+####################################
 # Format:
 #
 # 	- First all GPIO Pin numbers must be declared (integers separated by commas)
@@ -21,19 +21,20 @@
 
 
 
-###################################
-#	Pins declaration:
+########################
+#	Pins declaration   #
+########################
 0,1,2,3,4,6,7,10*,11,12,13,14,15
 
 
-###################################
-#	Mapping:
+###############
+#	Mapping   #
+###############
 #7, KEYBOARD, KEY_F, KEY_F, Fn
 #7+6, KEYBOARD, KEY_K, KEY_K, Select
 7, KEYBOARD, KEY_K, KEY_K, Select
 6, KEYBOARD, KEY_S, KEY_S, Start
 3, KEYBOARD, KEY_U, KEY_U, Up
-7+3, KEYBOARD, KEY_P, KEY_P, Quick Save
 4, KEYBOARD, KEY_L, KEY_L, Left
 7+4, KEYBOARD, KEY_J, KEY_J, Aspect ratio factor --
 1, KEYBOARD, KEY_D, KEY_D, Down
@@ -49,18 +50,13 @@
 13, KEYBOARD, KEY_Y, KEY_Y, Y
 11, KEYBOARD, KEY_X, KEY_X, X
 
+7+3, SHELL_COMMAND, snap, Fn+Up, Snapshot
 7+12, SHELL_COMMAND, quick_action_volume_up, Fn+Y, Volume++
 7+13, SHELL_COMMAND, quick_action_volume_down, Fn+A, Volume--
 7+11, SHELL_COMMAND, quick_action_bright_up, Fn+B, Brightness++
 7+14, SHELL_COMMAND, quick_action_bright_down, Fn+X, Brightness--
-10, SHELL_COMMAND, sched_shutdown 1 & signal_usr1_to_emulators, N_OE, Quick save and Poweroff because of N_OE
+10, SHELL_COMMAND, sched_shutdown 0.1, N_OE, Instant Play save and Poweroff because of N_OE
 7+15+2, SHELL_COMMAND, display_notif_system_stats, Fn+L1+L2, display system cpu and ram usage
 
-#7+12, KEYBOARD, KEY_G, KEY_G, Brightness++
-#7+14, KEYBOARD, KEY_E, KEY_E, Volume--
-#7+13, KEYBOARD, KEY_W, KEY_W, Brightness--
-#7+11, KEYBOARD, KEY_C, KEY_C, Volume++
-#10, KEYBOARD, KEY_T, KEY_T, Should Poweroff because N_OE_received
-#7+15, KEYBOARD, KEY_Q, 7+11, Launch menu
 
 
diff --git a/FunKey/board/funkey/rootfs-overlay/etc/init.d/S01first_boot b/FunKey/board/funkey/rootfs-overlay/etc/init.d/S01first_boot
index a04a285..590b8cb 100755
--- a/FunKey/board/funkey/rootfs-overlay/etc/init.d/S01first_boot
+++ b/FunKey/board/funkey/rootfs-overlay/etc/init.d/S01first_boot
@@ -4,20 +4,10 @@ THIS=$(basename $0)
 
 case "$1" in
     start)
-
-    # Check is SWAP partition already created
-    fdisk -l /dev/mmcblk0 | grep "Linux swap" > /dev/null
-    if [ $? -ne 0 ]; then
-        first_boot
-        exit $?
-    fi
-
-    # Check is share partition already created
-    fdisk -l /dev/mmcblk0 | grep "W95 FAT32" > /dev/null
-    if [ $? -ne 0 ]; then
-        first_boot
-        exit $?
-    fi
+	first_boot_ok=$(fw_printenv -n first_boot_ok 2>/dev/null)
+	if ! [ ! "${first_boot_ok}" -ne "${first_boot_ok}" ] 2> /dev/null; then
+	    first_boot
+	fi
 	;;
     stop)
 	;;
diff --git a/FunKey/board/funkey/rootfs-overlay/etc/inittab b/FunKey/board/funkey/rootfs-overlay/etc/inittab
index 005aff9..b0d4598 100644
--- a/FunKey/board/funkey/rootfs-overlay/etc/inittab
+++ b/FunKey/board/funkey/rootfs-overlay/etc/inittab
@@ -38,7 +38,7 @@ null::sysinit:/bin/ln -sf /proc/self/fd/2 /dev/stderr
 #::ctrlaltdel:/sbin/reboot
 
 # Stuff to do before rebooting
-::shutdown:/etc/init.d/rcK
-::shutdown:/sbin/swapoff -a
-::shutdown:/bin/umount -r /
-::shutdown:/bin/umount -r /mnt
+#::shutdown:/etc/init.d/rcK
+#::shutdown:/sbin/swapoff -a
+#::shutdown:/bin/umount -r /
+#::shutdown:/bin/umount -r /mnt
diff --git a/FunKey/board/funkey/rootfs-overlay/etc/issue b/FunKey/board/funkey/rootfs-overlay/etc/issue
index c3d1cfd..6acbb23 100644
--- a/FunKey/board/funkey/rootfs-overlay/etc/issue
+++ b/FunKey/board/funkey/rootfs-overlay/etc/issue
@@ -5,6 +5,6 @@
 |___|    |_____|__|__||__|\__||_____|___  |
          FUN ON A KEYCHAIN          |_____|
  -----------------------------------------------------
- Version 1.1.0
+ Version 2.0.0
  -----------------------------------------------------
 
diff --git a/FunKey/board/funkey/rootfs-overlay/etc/os-release b/FunKey/board/funkey/rootfs-overlay/etc/os-release
index 5e259a6..625c497 100644
--- a/FunKey/board/funkey/rootfs-overlay/etc/os-release
+++ b/FunKey/board/funkey/rootfs-overlay/etc/os-release
@@ -1,12 +1,12 @@
 NAME="FunKey-OS"
-VERSION="1.1.0 (Quacking Quagga)"
+VERSION="2.0.0 (Rowdy Rabbit)"
 ID=funkey
 ID_LIKE=buildroot
-PRETTY_NAME="FunKey-OS 1.1.0"
-VERSION_ID="1.1.0"
+PRETTY_NAME="FunKey-OS 2.0.0"
+VERSION_ID="2.0.0"
 HOME_URL="https://www.funkey-project.com/"
 SUPPORT_URL="https://www.funkey-project.com/"
 BUG_REPORT_URL="https://www.funkey-project.com/"
 PRIVACY_POLICY_URL="https://www.funkey-project.com"
-VERSION_CODENAME=Quacking
-UBUNTU_CODENAME=Quacking
+VERSION_CODENAME=Rowdy
+UBUNTU_CODENAME=Rowdy
diff --git a/FunKey/board/funkey/rootfs-overlay/etc/profile b/FunKey/board/funkey/rootfs-overlay/etc/profile
index 8a3692e..831311f 100755
--- a/FunKey/board/funkey/rootfs-overlay/etc/profile
+++ b/FunKey/board/funkey/rootfs-overlay/etc/profile
@@ -11,7 +11,6 @@ fi
 export PAGER='/bin/more'
 export EDITOR='/usr/bin/nano'
 export SDL_NOMOUSE=1
-export MEDNAFEN_HOME=/tmp/funkey/.mednafen
 
 # Source configuration files from /etc/profile.d
 for i in /etc/profile.d/*.sh ; do
diff --git a/FunKey/board/funkey/rootfs-overlay/etc/sw-versions b/FunKey/board/funkey/rootfs-overlay/etc/sw-versions
index fc6545a..44869b1 100644
--- a/FunKey/board/funkey/rootfs-overlay/etc/sw-versions
+++ b/FunKey/board/funkey/rootfs-overlay/etc/sw-versions
@@ -1 +1 @@
-rootfs	1.1.0
+rootfs	2.0.0
diff --git a/FunKey/board/funkey/rootfs-overlay/media b/FunKey/board/funkey/rootfs-overlay/media
new file mode 120000
index 0000000..cca5abd
--- /dev/null
+++ b/FunKey/board/funkey/rootfs-overlay/media
@@ -0,0 +1 @@
+/mnt
\ No newline at end of file
diff --git a/FunKey/board/funkey/rootfs-overlay/opk/.empty b/FunKey/board/funkey/rootfs-overlay/opk/.empty
new file mode 100644
index 0000000..e69de29
diff --git a/FunKey/board/funkey/rootfs-overlay/root/.gmenu2x b/FunKey/board/funkey/rootfs-overlay/root/.gmenu2x
new file mode 120000
index 0000000..117526c
--- /dev/null
+++ b/FunKey/board/funkey/rootfs-overlay/root/.gmenu2x
@@ -0,0 +1 @@
+/mnt/funkey/.gmenu2x
\ No newline at end of file
diff --git a/FunKey/board/funkey/rootfs-overlay/root/.profile b/FunKey/board/funkey/rootfs-overlay/root/.profile
index eb33949..cad4256 100755
--- a/FunKey/board/funkey/rootfs-overlay/root/.profile
+++ b/FunKey/board/funkey/rootfs-overlay/root/.profile
@@ -18,6 +18,14 @@ alias l='ls $LS_OPTIONS -lA'
 # alias cp='cp -i'
 # alias mv='mv -i'
 
+# Relocate HOME into the r/w partition
+export HOME=/mnt/FunKey
+mkdir -p "${HOME}"
+export MEDNAFEN_HOME=$HOME/.mednafen
+mkdir -p "${MEDNAFEN_HOME}"
+export GMENU2X_HOME="$HOME/.gmenu2x"
+mkdir -p "${GMENU2X_HOME}"
+
 # Resize the console to the terminal dimensions
 resize() {
     if [[ -t 0 && $# -eq 0 ]]; then
@@ -37,11 +45,9 @@ resize() {
 }
 
 
-# Start ampli if necessary
-echo "Start audio amplifier if necessary"
-if [[ "$(volume_get)" -ne "0" ]]; then
-        start_audio_amp 1 >/dev/null 2>&1
-fi
+# Start the audio amplifier
+echo "Start audio amplifier"
+start_audio_amp 1 >/dev/null 2>&1
 
 # Force unmute sound card and reset volume
 echo "Force unmute sound card and reset volume"
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/gamegear_launch.sh b/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/gamegear_launch.sh
index ad784a4..8364468 100755
--- a/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/gamegear_launch.sh
+++ b/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/gamegear_launch.sh
@@ -1,8 +1,10 @@
 #!/bin/sh
 
-# Security
 cp /usr/games/mednafen-09x.cfg ${MEDNAFEN_HOME}/
-export HOME=/tmp/funkey
-mkdir -p ${HOME}
-cd ${HOME}
-mednafen -fs 1 -gg.stretch full "$1"
+
+# Launch the process in background, record the PID into a file, wait
+# for the process to terminate and erase the recorded PID
+mednafen -fs 1 -gg.stretch full "$1"&
+record_pid $!
+wait $!
+erase_pid
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/gb_launch.sh b/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/gb_launch.sh
index e720655..4246680 100755
--- a/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/gb_launch.sh
+++ b/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/gb_launch.sh
@@ -1,6 +1,8 @@
 #!/bin/sh
 
-export HOME=/tmp/funkey
-mkdir -p ${HOME}
-cd ${HOME}
-sdlgnuboy --syncrtc "$1"
+# Launch the process in background, record the PID into a file, wait
+# for the process to terminate and erase the recorded PID
+sdlgnuboy --syncrtc "$1"&
+record_pid $!
+wait $!
+erase_pid
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/gba_launch_gpsp.sh b/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/gba_launch_gpsp.sh
index 583d447..fb4d910 100755
--- a/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/gba_launch_gpsp.sh
+++ b/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/gba_launch_gpsp.sh
@@ -1,6 +1,10 @@
 #!/bin/sh
 
-export HOME=/tmp/funkey
-mkdir -p ${HOME}
+# Launch the process in background, record the PID into a file, wait
+# for the process to terminate and erase the recorded PID
 cd ${HOME}
-gpsp "$1"
+gpsp "$1"&
+record_pid $!
+wait $!
+erase_pid
+
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/gba_launch_mednafen.sh b/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/gba_launch_mednafen.sh
index 583d447..88dd220 100755
--- a/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/gba_launch_mednafen.sh
+++ b/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/gba_launch_mednafen.sh
@@ -1,6 +1,8 @@
 #!/bin/sh
 
-export HOME=/tmp/funkey
-mkdir -p ${HOME}
-cd ${HOME}
-gpsp "$1"
+# Launch the process in background, record the PID into a file, wait
+# for the process to terminate and erase the recorded PID
+gpsp "$1"&
+record_pid $!
+wait $!
+erase_pid
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/lynx_launch.sh b/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/lynx_launch.sh
index b49b791..300941f 100755
--- a/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/lynx_launch.sh
+++ b/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/lynx_launch.sh
@@ -1,8 +1,10 @@
 #!/bin/sh
 
-# Security
 cp /usr/games/mednafen-09x.cfg ${MEDNAFEN_HOME}/
-export HOME=/tmp/funkey
-mkdir -p ${HOME}
-cd ${HOME}
-mednafen -fs 1 -lynx.stretch full "$1"
+
+# Launch the process in background, record the PID into a file, wait
+# for the process to terminate and erase the recorded PID
+mednafen -fs 1 -lynx.stretch full "$1"&
+record_pid $!
+wait $!
+erase_pid
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/mame_launch_mednafen.sh b/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/mame_launch_mednafen.sh
index cc29289..cca6afe 100755
--- a/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/mame_launch_mednafen.sh
+++ b/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/mame_launch_mednafen.sh
@@ -1,8 +1,10 @@
 #!/bin/sh
 
-# Security
 cp /usr/games/mednafen-09x.cfg ${MEDNAFEN_HOME}/
-export HOME=/tmp/funkey
-mkdir -p ${HOME}
-cd ${HOME}
-mednafen -sound 1 -soundrate 22050 -soundbufsize 100 -vdriver sdl -frameskip 1  -fs 0 "$1"
+
+# Launch the process in background, record the PID into a file, wait
+# for the process to terminate and erase the recorded PID
+mednafen -sound 1 -soundrate 22050 -soundbufsize 100 -vdriver sdl -frameskip 1  -fs 0 "$1"&
+record_pid $!
+wait $!
+erase_pid
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/megadrive_launch.sh b/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/megadrive_launch.sh
index 0186681..cf3805a 100755
--- a/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/megadrive_launch.sh
+++ b/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/megadrive_launch.sh
@@ -1,6 +1,8 @@
 #!/bin/sh
 
-export HOME=/tmp/funkey
-mkdir -p ${HOME}
-cd ${HOME}
-PicoDrive "$1"
+# Launch the process in background, record the PID into a file, wait
+# for the process to terminate and erase the recorded PID
+PicoDrive "$1"&
+record_pid $!
+wait $!
+erase_pid
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/nes_launch.sh b/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/nes_launch.sh
index 0555e12..e241779 100755
--- a/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/nes_launch.sh
+++ b/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/nes_launch.sh
@@ -1,7 +1,8 @@
 #!/bin/sh
 
-export HOME=/tmp/funkey
-mkdir -p ${HOME}
-cd ${HOME}
-fceux "$1"
-
+# Launch the process in background, record the PID into a file, wait
+# for the process to terminate and erase the recorded PID
+fceux "$1"&
+record_pid $!
+wait $!
+erase_pid
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/ngp_launch.sh b/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/ngp_launch.sh
index 0e50eba..54e6eb9 100755
--- a/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/ngp_launch.sh
+++ b/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/ngp_launch.sh
@@ -1,8 +1,10 @@
 #!/bin/sh
 
-# Security
 cp /usr/games/mednafen-09x.cfg ${MEDNAFEN_HOME}/
-export HOME=/tmp/funkey
-mkdir -p ${HOME}
-cd ${HOME}
-mednafen -fs 1 -ngp.stretch full "$1"
+
+# Launch the process in background, record the PID into a file, wait
+# for the process to terminate and erase the recorded PID
+mednafen -fs 1 -ngp.stretch full "$1"&
+record_pid $!
+wait $!
+erase_pid
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/pce_launch.sh b/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/pce_launch.sh
index 9bd08eb..6c3f914 100755
--- a/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/pce_launch.sh
+++ b/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/pce_launch.sh
@@ -1,8 +1,10 @@
 #!/bin/sh
 
-# Security
 cp /usr/games/mednafen-09x.cfg ${MEDNAFEN_HOME}/
-export HOME=/tmp/funkey
-mkdir -p ${HOME}
-cd ${HOME}
-mednafen -fs 1 -force_module pce_fast -pce_fast.stretch full "$1"
+
+# Launch the process in background, record the PID into a file, wait
+# for the process to terminate and erase the recorded PID
+mednafen -fs 1 -force_module pce_fast -pce_fast.stretch full "$1"&
+record_pid $!
+wait $!
+erase_pid
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/psone_launch_mednafen.sh b/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/psone_launch_mednafen.sh
index 3afc12d..b81a321 100755
--- a/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/psone_launch_mednafen.sh
+++ b/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/psone_launch_mednafen.sh
@@ -1,6 +1,8 @@
 #!/bin/sh
 
-export HOME=/tmp/funkey
-mkdir -p ${HOME}
-cd ${HOME}
-pcsx -frameskip -cdfile "$1"
+# Launch the process in background, record the PID into a file, wait
+# for the process to terminate and erase the recorded PID
+pcsx -frameskip -cdfile "$1"&
+record_pid $!
+wait $!
+erase_pid
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/psone_launch_pcsx.sh b/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/psone_launch_pcsx.sh
index f6c3e48..b2a0613 100755
--- a/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/psone_launch_pcsx.sh
+++ b/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/psone_launch_pcsx.sh
@@ -1,7 +1,8 @@
 #!/bin/sh
 
-export HOME=/tmp/funkey
-mkdir -p ${HOME}
-cd ${HOME}
-#pcsx -frameskip -cdfile "$1"
-pcsx -cdfile "$1"
+# Launch the process in background, record the PID into a file, wait
+# for the process to terminate and erase the recorded PID
+pcsx -cdfile "$1"&
+record_pid $!
+wait $!
+erase_pid
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/sms_launch.sh b/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/sms_launch.sh
index 0186681..cf3805a 100755
--- a/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/sms_launch.sh
+++ b/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/sms_launch.sh
@@ -1,6 +1,8 @@
 #!/bin/sh
 
-export HOME=/tmp/funkey
-mkdir -p ${HOME}
-cd ${HOME}
-PicoDrive "$1"
+# Launch the process in background, record the PID into a file, wait
+# for the process to terminate and erase the recorded PID
+PicoDrive "$1"&
+record_pid $!
+wait $!
+erase_pid
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/snes_launch.sh b/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/snes_launch.sh
index 5a5e718..e0b39cf 100755
--- a/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/snes_launch.sh
+++ b/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/snes_launch.sh
@@ -1,6 +1,8 @@
 #!/bin/sh
 
-export HOME=/tmp/funkey
-mkdir -p ${HOME}
-cd ${HOME}
-psnes "$1"
+# Launch the process in background, record the PID into a file, wait
+# for the process to terminate and erase the recorded PID
+psnes "$1"&
+record_pid $!
+wait $!
+erase_pid
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/snes_launch_mednafen.sh b/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/snes_launch_mednafen.sh
index 5a5e718..e0b39cf 100755
--- a/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/snes_launch_mednafen.sh
+++ b/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/snes_launch_mednafen.sh
@@ -1,6 +1,8 @@
 #!/bin/sh
 
-export HOME=/tmp/funkey
-mkdir -p ${HOME}
-cd ${HOME}
-psnes "$1"
+# Launch the process in background, record the PID into a file, wait
+# for the process to terminate and erase the recorded PID
+psnes "$1"&
+record_pid $!
+wait $!
+erase_pid
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/wonderswan_launch.sh b/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/wonderswan_launch.sh
index b274169..69f7ca2 100755
--- a/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/wonderswan_launch.sh
+++ b/FunKey/board/funkey/rootfs-overlay/usr/games/launchers/wonderswan_launch.sh
@@ -1,8 +1,10 @@
 #!/bin/sh
 
-# Security
 cp /usr/games/mednafen-09x.cfg ${MEDNAFEN_HOME}/
-export HOME=/tmp/funkey
-mkdir -p ${HOME}
-cd ${HOME}
-mednafen -fs 1 -wswan.stretch full "$1"
+
+# Launch the process in background, record the PID into a file, wait
+# for the process to terminate and erase the recorded PID
+mednafen -fs 1 -wswan.stretch full "$1"&
+record_pid $!
+wait $!
+erase_pid
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/local/lib/utils b/FunKey/board/funkey/rootfs-overlay/usr/local/lib/utils
index a770cca..ec02b68 100755
--- a/FunKey/board/funkey/rootfs-overlay/usr/local/lib/utils
+++ b/FunKey/board/funkey/rootfs-overlay/usr/local/lib/utils
@@ -17,6 +17,14 @@ if [ $? -ne 0 ]; then
 	warn "$@"
 	exit $return_code
     }
+
+    die_notif () {
+    	local return_code=$1
+	shift
+	warn "$@"
+	notif "$@"
+	exit $return_code
+    }
 fi
 
 notif () {
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/assembly_tests b/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/assembly_tests
index 1b9f2c2..12a92e5 100755
--- a/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/assembly_tests
+++ b/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/assembly_tests
@@ -16,6 +16,10 @@ if [ -f ${LOCK_FILE} ]; then
 fi
 touch ${LOCK_FILE}
 
+## Register ourselves as the running FunKey task so that we receive
+## the USR1 signal when shutting down
+echo $$ > "/var/run/funkey.pid"
+
 ## Binaries
 PROD_SCREEN_BIN="/usr/local/sbin/funkey_prod_screens"
 GET_PROC_UID="/usr/local/sbin/get_sid"
@@ -28,7 +32,7 @@ stop_loop=false
 proc_uid=$($GET_PROC_UID)
 
 ## Defines
-VERSION="1.00"
+VERSION="1.01"
 LOG_FILE="/root/logs/assembly_tests/assy_tests_${proc_uid}.log"
 [ -d $(dirname $LOG_FILE) ] || mkdir -p $(dirname $LOG_FILE)
 MAGNET_DETECTED_FILE="/root/.assembly_tests_magnet_detected"
@@ -40,6 +44,9 @@ function function_magnet_detected_ok {
 	## Kill scheduled shutdown
 	pkill sched_shutdown
 
+	## Kill the running production screen binary (SIGUSR1), discarding output
+	killall -s USR1 "${PROD_SCREEN_BIN}" > /dev/null 2>&1
+
 	## Write magnet_detected file
 	if $test_failed; then
 		echo "1" > $MAGNET_DETECTED_FILE
@@ -58,6 +65,9 @@ function function_magnet_detected_ok {
 ## Function called when SIGUSR1 is caught while NOT waiting for it
 function function_magnet_detected_ko {
 	echo "ERROR: Caught SIGUSR1 signal (magnet detected!)"
+
+	## Kill the running production screen binary (SIGUSR1), discarding output
+	killall -s USR1 "${PROD_SCREEN_BIN}" > /dev/null 2>&1
 	sync
 }
 
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/brightness_get b/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/brightness_get
index d664293..399bc37 100755
--- a/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/brightness_get
+++ b/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/brightness_get
@@ -12,7 +12,7 @@ fi
 brightness=$(fw_printenv -n brightness 2>/dev/null)
 if ! [ ! "${brightness}" -ne "${brightness}" ] 2> /dev/null; then
     brightness=${BRIGHTNESS_DEFAULT_VALUE}
-	fw_setenv brightness ${brightness}
+    fw_setenv brightness ${brightness}
 fi
 echo ${brightness}
 exit 0
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/cancel_sched_powerdown b/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/cancel_sched_powerdown
new file mode 100755
index 0000000..2952a44
--- /dev/null
+++ b/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/cancel_sched_powerdown
@@ -0,0 +1,11 @@
+#!/bin/sh
+
+# Uncomment the following line to get debug info
+#set -x
+
+if [ ${#} != 0 ]; then
+    echo "Usage: $(basename ${0})"
+    exit 1
+fi
+pkill sched_powerdown
+exit 0
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/display_notif_system_stats b/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/display_notif_system_stats
index 5c4638d..a3cbd27 100755
--- a/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/display_notif_system_stats
+++ b/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/display_notif_system_stats
@@ -1,5 +1,5 @@
 #!/bin/sh
 
-notif_set 0 "Getting system stats..."
+notif_set 0 " Getting system stats..."
 killall -s USR1 system_stats
 exit 0
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/erase_pid b/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/erase_pid
new file mode 100755
index 0000000..afb3597
--- /dev/null
+++ b/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/erase_pid
@@ -0,0 +1,12 @@
+#!/bin/sh
+
+# Uncomment the following line to get debug info
+#set -x
+
+# Check args
+if [ ${#} -ne 0 ]; then
+    echo "Usage: $(basename ${0})"
+    exit 1
+fi
+rm -f /var/run/funkey.pid
+exit 0
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/first_boot b/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/first_boot
index 2ab0721..03b022a 100755
--- a/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/first_boot
+++ b/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/first_boot
@@ -14,31 +14,40 @@ SELF=$(basename $0)
 root_part=$(cat /proc/cmdline | sed -n 's|^.*root=\([^ ]*\).*|\1|p')
 root_part_num=${root_part#/dev/mmcblk0p}
 if [ "${root_part_num}" -eq 1 ]; then
-    die 0 "recovery mode"
+    die_notif 0 "recovery mode"
 elif [ "${root_part_num}" = "{$root_part}" ]; then
-    die 1 "${root_part} is not an SD card partition, aborting"
+    die_notif 1 "${root_part} is not an SD card partition, aborting"
 elif [ "${root_part_num}" -ne 2 ]; then
-    die 2 "unknown partition layout, aborting"
+    die_notif 2 "unknown partition layout, aborting"
 fi
 let swap_part_num=${root_part_num}+1
 swap_part=/dev/mmcblk0p${swap_part_num}
-let share_part_num=${swap_part_num}+1
-share_part=/dev/mmcblk0p${share_part_num}
+let usb_part_num=${swap_part_num}+1
+usb_part=/dev/mmcblk0p${usb_part_num}
 
 check_root_id () {
-    [ $(id -u) -ne 0 ] && die 3 "this script must be run as root, aborting"
+    [ $(id -u) -ne 0 ] && die_notif 3 "this script must be run as root, aborting"
     return 0
 }
 
 resize_rootfs_partition () {
 
+    # Check if root partition is already resized
+    local rootfs_part_line=$(fdisk -l /dev/mmcblk0 2>/dev/null | grep ${root_part})
+    set ${rootfs_part_line}
+    local rootfs_part_size=${6}
+    if [ "${rootfs_part_size}" = "1G" ]; then
+	info "root partition is already resized"
+	return 0
+    fi
+
     # Check that the last partition is the rootfs partition
     local last_part_line=$(fdisk -l /dev/mmcblk0 2>/dev/null | tail -n 1)
     set ${last_part_line}
     local last_part_num=${1#/dev/mmcblk0p}
     local part_start=${3}
     if [ "${last_part_num}" != "${root_part_num}" ]; then
-	die 4 "rootfs is not the last partition. Don't know how to expand, aborting"
+	die_notif 4 "rootfs is not the last partition. Don't know how to expand, aborting"
     fi
 
     # Remove (temporarily) the rootfs partition
@@ -55,39 +64,50 @@ w
 EOF
 
     # Mark the rootfs partition as bootable
-    sfdisk -A /dev/mmcblk0 ${root_part_num} >/dev/null 2>&1 || die 7 "cannot make the rootfs partition bootable, aborting"
+    sfdisk -A /dev/mmcblk0 ${root_part_num} >/dev/null 2>&1 || die_notif 7 "cannot make the rootfs partition bootable, aborting"
 
-    return 0
-}
+    # Reload the partition table
+    partprobe /dev/mmcblk0 >/dev/null 2>&1 || die_notif 8 "cannot reload the partition table, aborting"
 
-reload_partition_table () {
-    partprobe /dev/mmcblk0 >/dev/null 2>&1 || die 9 "cannot reload the partition table, aborting"
     return 0
 }
 
 resize_rootfs_filesystem () {
+    local rootfs_line=$(df | grep /dev/root)
+    set ${rootfs_line}
+    local rootfs_size=${2}
+    if [ ${rootfs_size} -gt 1000000 ]; then
+	info "rootfs already resized"
+	return 0
+    fi
     rw
-    resize2fs ${root_part} >/dev/null 2>&1 || die 10 "cannot resize the root filesystem, aborting"
+    resize2fs ${root_part} >/dev/null 2>&1 || die_notif 9 "cannot resize the root filesystem, aborting"
     ro
     return 0
 }
 
 create_swap () {
-    mount | grep -q ${share_part}
-    if [ $? -ne 0 ]; then
 
-	# Check that the last partition is the rootfs partition
-	local last_part_line=$(fdisk -l /dev/mmcblk0 2>/dev/null | tail -n 1)
-	set ${last_part_line}
-	local last_part_num=${1#/dev/mmcblk0p}
-	if [ "$last_part_num" != "$root_part_num" ]; then
-	    die 11 "rootfs is not the last partition. Don't know how to create the backing store partition"
-	fi
+    # Check if swap partition already exists
+    fdisk -l /dev/mmcblk0 2>/dev/null | grep "Linux swap" >/dev/null 2>&1
+    if [ $? -eq 0 ]; then
+	info "swap partition already exists"
+    else
+	mount | grep -q ${usb_part}
+	if [ $? -ne 0 ]; then
 
-	# Create an additional linux swap partition
-	let swap_part_num=${last_part_num}+1
-	swap_part=/dev/mmcblk0p${swap_part_num}
-	fdisk /dev/mmcblk0 >/dev/null 2>&1 <<EOF
+	    # Check that the last partition is the rootfs partition
+	    local last_part_line=$(fdisk -l /dev/mmcblk0 2>/dev/null | tail -n 1)
+	    set ${last_part_line}
+	    local last_part_num=${1#/dev/mmcblk0p}
+	    if [ "$last_part_num" != "$root_part_num" ]; then
+		die_notif 10 "rootfs is not the last partition. Don't know how to create the backing store partition"
+	    fi
+
+	    # Create an additional linux swap partition
+	    let swap_part_num=${last_part_num}+1
+	    swap_part=/dev/mmcblk0p${swap_part_num}
+	    fdisk /dev/mmcblk0 >/dev/null 2>&1 <<EOF
 n
 p
 ${swap_part_num}
@@ -98,21 +118,35 @@ ${wap_part_num}
 82
 w
 EOF
+	fi
+    fi
+
+    # Check if swap is enabled
+    local swap_line=$(free | grep Swap)
+    set ${swap_line}
+    local swap_size=${2}
+    if [ ${swap_size} -eq 0 ]; then
 	mkswap ${swap_part} >/dev/null 2>&1
 	if [ $? -ne 0 ]; then
-	    die 14 "cannot create swap file, aborting"
+	    die_notif 11 "cannot create swap file, aborting"
 	fi
+
+	# Enable swap
+	swapon -a >/dev/null 2>&1 || die_notif 12 "cannot enable swap file, aborting"
     fi
     return 0
 }
 
-enable_swap () {
-    swapon -a >/dev/null 2>&1 || die 15 "cannot enable swap file, aborting"
-    return 0
-}
+create_usb_partition () {
 
-create_backing_store_partition () {
-    mount | grep -q ${share_part}
+    # Check if the USB partition already exists
+    fdisk -l /dev/mmcblk0 2>/dev/null | grep "W95 FAT32" >/dev/null 2>&1
+    if [ $? -eq 0 ]; then
+	info "USB partition already exists"
+	return 0
+    fi
+    
+    mount | grep -q ${usb_part}
     if [ $? -ne 0 ]; then
 
 	# Check that the last partition is the swap partition
@@ -120,76 +154,86 @@ create_backing_store_partition () {
 	set ${last_part_line}
 	local last_part_num=${1#/dev/mmcblk0p}
 	if [ "${last_part_num}" != "${swap_part_num}" ]; then
-	    die 15 "rootfs is not the last partition. Don't know how to create the backing store partition"
+	    die_notif 13 "swap is not the last partition. Don't know how to create the backing store partition"
 	fi
 
-	# Create an additional FAT32 share partition that fills the disk
-	let share_part_num=${last_part_num}+1
-	share_part=/dev/mmcblk0p${share_part_num}
+	# Create an additional FAT32 USB partition that fills the disk
+	let usb_part_num=${last_part_num}+1
+	usb_part=/dev/mmcblk0p${usb_part_num}
 	fdisk /dev/mmcblk0 >/dev/null 2>&1 <<EOF
 n
 p
-${share_part_num}
+${usb_part_num}
 
 
 t
-${share_part_num}
+${usb_part_num}
 c
 w
 EOF
 	sync
     fi
+
+    # Reload the partition table
+    partprobe /dev/mmcblk0 >/dev/null 2>&1 || die_notif 14 "cannot reload the partition table, aborting"
+
     return 0
 }
 
-format_backing_store_partition () {
+format_usb_partition () {
+
+    # Check if the USB partition is already mounted
+    mount | grep /mnt > /dev/null 2>&1
+    if [ $? -eq 0 ]; then
+	info "USB partition already mounted"
+	return 0
+    fi
 
     # Format the backing store as FAT32
-    mkfs.vfat ${share_part} >/dev/null 2>&1 || die 17 "cannot format the backing store partition"
+    mkfs.vfat ${usb_part} >/dev/null 2>&1 || die_notif 15 "cannot format the backing store partition"
     return 0
 }
 
-copy_files_to_store_partition () {
-    # Add file to force assembly tests
-    
-    
-    mount /mnt/ || die 18 "Cannot mount /mnt"
+copy_files_to_usb_partition () {
+
+    # Check if the USB partition is already mounted
+    mount | grep /mnt > /dev/null 2>&1
+    if [ $? -ne 0 ]; then
+	mount /mnt/ || die_notif 16 "Cannot mount /mnt"
+    fi
     unzip -q -o /usr/local/share/mnt_freware_games.zip -d /mnt/
-    umount /mnt/ || die 20 "Cannot unmount /mnt"
+    mkdir -p /mnt/Emulators
+    set +f
+    cp -f /usr/games/opk/*.opk /mnt/Emulators/
+    set -f
+    umount /mnt/ || die_notif 17 "Cannot unmount /mnt"
     return 0
 }
 
 check_root_id
 notif " FIRST BOOT DETECTED"
 
-notif " 1/9 RESIZE ROOT PARTITION"
+notif " 1/6 RESIZE ROOT PARTITION"
 resize_rootfs_partition
 
-notif " 2/9 RELOAD ROOT PARTITION"
-reload_partition_table
-
-notif " 3/9 RESIZE ROOT FILESYSTEM"
+notif " 2/6 RESIZE ROOT FILESYSTEM"
 resize_rootfs_filesystem
 
-notif " 4/9 CREATE SWAP"
+notif " 3/6 CREATE SWAP"
 create_swap
 
-notif " 5/9 ENABLE SWAP"
-enable_swap
+notif " 4/6 CREATE USB PARTITION"
+create_usb_partition
 
-notif " 6/9 CREATE USB PARTITION"
-create_backing_store_partition
+notif " 5/6 FORMAT USB PARTITION"
+format_usb_partition
 
-notif " 7/9 RELOAD PARTITION TABLE"
-reload_partition_table
-
-notif " 8/9 FORMAT USB PARTITION"
-format_backing_store_partition
-
-notif " 9/9 COPY FILES TO ^    USB PARTITION"
-copy_files_to_store_partition
+notif " 6/6 COPY FILES TO ^    USB PARTITION"
+copy_files_to_usb_partition
 
 notif " FIRST BOOT SETUP FINISHED!"
 
+fw_setenv first_boot_ok 1
+
 sleep 1
 clear_notif
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/get_launcher b/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/get_launcher
new file mode 100755
index 0000000..0911e80
--- /dev/null
+++ b/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/get_launcher
@@ -0,0 +1,25 @@
+#!/bin/sh
+
+# Check args
+if [ ${#} -ne 0 ]; then
+    echo "Usage: $(basename ${0})"
+    exit 1
+fi
+
+# Launcher File
+LAUNCHER_FILE=$HOME/Launchers/launcher.txt
+DEFAULT_LAUNCHER=retrofe
+
+# Check that file exists
+[ -f "$LAUNCHER_FILE" ] || set_launcher $DEFAULT_LAUNCHER >/dev/null 2>&1
+
+# Check Launcher
+launcher=$(cat "$LAUNCHER_FILE" | head -1)
+
+# If the file yielded no launcher name, fall back to the default and persist it
+[ -z "$launcher" ] && launcher=$DEFAULT_LAUNCHER && set_launcher "$launcher" >/dev/null 2>&1
+
+# Return launcher name
+echo $launcher
+
+exit 0
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/instant_play b/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/instant_play
new file mode 100755
index 0000000..42db9a0
--- /dev/null
+++ b/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/instant_play
@@ -0,0 +1,37 @@
+#!/bin/sh
+
+# Uncomment the following line to get debug info
+#set -x
+
+# Check args
+if [ ${#} -eq 0 ]; then
+    echo "Usage: $(basename ${0}) args..."
+    exit 1
+fi
+
+INSTANT_PLAY_FILE="/mnt/instant_play"
+
+# Write quick load file args
+echo -n "" > "${INSTANT_PLAY_FILE}"
+for arg in "$@"; do
+    if echo "${arg}" | egrep -q '[[:space:]]'; then
+
+	# Add quotes around arguments containing spaces
+        echo -n "\"${arg}\" " >> "${INSTANT_PLAY_FILE}"
+    else
+	echo -n "${arg} "  >> "${INSTANT_PLAY_FILE}"
+    fi
+done
+
+# Add the magic sauce to launch the process in background, record the
+# PID into a file, wait for the process to terminate and erase the
+# recorded PID
+cat << EOF >> "${INSTANT_PLAY_FILE}"
+&
+record_pid \$!
+wait \$!
+erase_pid
+EOF
+
+# Now terminate gracefully
+exec shutdown_funkey
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/kill_emulators b/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/kill_emulators
deleted file mode 100755
index 7e5efd8..0000000
--- a/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/kill_emulators
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/sh
-
-killall mednafen gpsp psnes pcsx sdlgnuboy PicoDriveBin
-exit 0
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/record_pid b/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/record_pid
new file mode 100755
index 0000000..0f98bfc
--- /dev/null
+++ b/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/record_pid
@@ -0,0 +1,12 @@
+#!/bin/sh
+
+# Uncomment the following line to get debug info
+#set -x
+
+# Check args
+if [ ${#} -ne 1 ]; then
+    echo "Usage: $(basename ${0}) pid"
+    exit 1
+fi
+echo ${1} > /var/run/funkey.pid
+exit 0
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/sched_shutdown b/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/sched_shutdown
index 00ff51f..3fa1732 100755
--- a/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/sched_shutdown
+++ b/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/sched_shutdown
@@ -1,15 +1,22 @@
 #!/bin/sh
 
-if [ ${#} != 1 ]; then
-    echo "Usage: $0	seconds_before_shutdown"
+# Uncomment the following line to get debug info
+#set -x
+
+if [ ${#} -ne 1 ] || [ "${1}" -eq 0 ] 2>/dev/null; then
+    echo "Usage: $(basename ${0}) grace_delay"
     exit 1
 fi
 
-nb_secs_to_wait=$1
+# Send USR1 signal to the running FunKey process to warn about
+# impending shutdown
+pkill -USR1 -F /var/run/funkey.pid > /dev/null 2>&1
 
-# Wait $nb_secs_to_wait seconds to catch signal USR2
-# If the signal is caught, then it means a process canceled this shutdown
-sleep ${nb_secs_to_wait}
+# Delay for the given grace period seconds to catch signal USR2.
+# If the signal is caught, then it means the running FunKey process
+# canceled this shutdown and will handle it by itself.
+sleep ${1}
 
-# Too late to cancel: init shutdown
+# Delay expired, initiate shutdown
 shutdown_funkey
+
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/set_launcher b/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/set_launcher
new file mode 100755
index 0000000..1d20bd9
--- /dev/null
+++ b/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/set_launcher
@@ -0,0 +1,18 @@
+#!/bin/sh
+
+# Check number of args
+if [ ${#} -ne 1 ]; then
+    echo "Usage: $(basename ${0}) launcher"
+    exit 1
+fi
+
+# Launcher File
+LAUNCHER_FILE=$HOME/Launchers/launcher.txt
+mkdir -p "$(dirname "$LAUNCHER_FILE")"
+
+# Check Launcher
+NEW_LAUNCHER=${1}
+echo "Setting launcher: ${NEW_LAUNCHER}"
+echo ${NEW_LAUNCHER} > "$LAUNCHER_FILE"
+
+exit 0
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/share b/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/share
index 8e29850..f63d8a5 100755
--- a/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/share
+++ b/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/share
@@ -48,7 +48,8 @@ mount_share () {
     fi
 
     # Create the directory structure if required
-    (cd /mnt; mkdir -p "Atari lynx" "Game Boy" "Game Boy Color" "Game Boy Advance" "Game Gear" "Neo Geo Pocket" "NES" "PS1" "PS1/bios" "Sega Genesis" "Sega Master System" "SNES" "WonderSwan" "PCE-TurboGrafx")
+    #(cd /mnt; mkdir -p "Applications" "Emulators" "Games" "Atari lynx" "Game Boy" "Game Boy Color" "Game Boy Advance" "Game Gear" "Neo Geo Pocket" "NES" "PS1" "PS1/bios" "Sega Genesis" "Sega Master System" "SNES" "WonderSwan" "PCE-TurboGrafx")
+    (cd /mnt; mkdir -p "Emulators" "Atari lynx" "Game Boy" "Game Boy Color" "Game Boy Advance" "Game Gear" "Neo Geo Pocket" "NES" "PS1" "PS1/bios" "Sega Genesis" "Sega Master System" "SNES" "WonderSwan" "PCE-TurboGrafx")
 
     # Check if there is a firmware update file
     if [ -f /mnt/FunKey-*.fwu ]; then
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/shutdown_funkey b/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/shutdown_funkey
index 9f1dc45..54ed4ca 100755
--- a/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/shutdown_funkey
+++ b/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/shutdown_funkey
@@ -1,5 +1,9 @@
 #!/bin/sh
 
+# Notify system, reboot in progress
+REBOOTING_FILE="/run/rebooting"
+touch $REBOOTING_FILE
+
 # Notif fullscreen "Shutting down"
 notif_set 0 "^^^^^^^^      SHUTTING DOWN...^^^^^^^^"
 
@@ -9,5 +13,8 @@ start_audio_amp 0 >/dev/null 2>&1
 # Force Read Only 
 ro
 
+# Unmount the writeable partition (-r: remount it read-only if unmounting fails)
+umount -r /mnt
+
 # Poweroff
 poweroff
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/signal_usr1_to_emulators b/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/signal_usr1_to_emulators
deleted file mode 100755
index 0d2aa02..0000000
--- a/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/signal_usr1_to_emulators
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/sh
-# This should replaced by storing the correct PID before 
-# launching an emulator and signaling only this one.
-
-RUN_ENV_VAR=$(fw_printenv -n assembly_tests 2>/dev/null)
-if [ "x${RUN_ENV_VAR}" == "x1" ]; then
-	# First this one
-	killall -s USR1 assembly_tests > /dev/null 2>&1
-
-	# Then this one
-	killall -s USR1 funkey_prod_screens > /dev/null 2>&1
-fi
-
-# Send signal to all PCSX first (time critical)
-killall -s USR1 pcsx > /dev/null 2>&1
-
-# Send signal to all other emulators
-killall -s USR1 gpsp psnes sdlgnuboy PicoDrive mednafen fceux > /dev/null 2>&1
-
-# Send signal to launcher
-killall -s USR1 retrofe > /dev/null 2>&1
-
-exit 0
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/snap b/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/snap
new file mode 100755
index 0000000..7a246a5
--- /dev/null
+++ b/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/snap
@@ -0,0 +1,33 @@
+#!/bin/sh
+
+# Uncomment the following line to get debug info
+#set -x
+
+# Check args
+if [ ${#} -ne 0 ]; then
+    echo "Usage: $(basename ${0})"
+    exit 1
+fi
+
+# Lock file (necessary since fbgrab must run in bg not to block the buttons while gaming)
+LOCK_FILE="/var/lock/snap.lock"
+if [ -f "${LOCK_FILE}" ]; then
+    echo "${LOCK_FILE} already exists"
+    exit 1
+fi
+touch "${LOCK_FILE}"
+
+# Increment name and save snapshot
+SNAPSHOT_EXT=PNG
+SNAPSHOT_DIR=$HOME/snapshots
+mkdir -p "${SNAPSHOT_DIR}"
+last=$(cd ${SNAPSHOT_DIR}; ls IMG_*.${SNAPSHOT_EXT} 2> /dev/null | tail -1 | sed 's/^IMG_0*\([0-9]\+\)\.'${SNAPSHOT_EXT}'$/\1/')
+let last=${last}+1
+snapshot_file=$(printf "IMG_%04d.${SNAPSHOT_EXT}" $last)
+notif_set 2 "   SCREEENSHOT ${snapshot_file}"
+fbgrab "${SNAPSHOT_DIR}/${snapshot_file}" >/dev/null 2>&1 &
+
+# Remove lock file
+rm -f "${LOCK_FILE}"
+
+exit 0
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/start_audio_amp b/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/start_audio_amp
index 79f7f18..50034e0 100755
--- a/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/start_audio_amp
+++ b/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/start_audio_amp
@@ -6,10 +6,16 @@ if [ ${#} -ne 1 ]; then
     exit 1
 fi
 
-# Check Enable arg
+# Check enable arg
 enable=${1}
 if [ ${enable} -eq 1 ]; then
-    echo "Turning audio amplifier ON"
+
+	# Turn ON only if volume is not null
+	if [ "$(volume_get)" -ne "0" ]; then
+    	echo "Turning audio amplifier ON"
+    else
+    	exit 0
+	fi
 elif [ ${enable} -eq 0 ]; then
     echo "Turning audio amplifier OFF"
 else
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/start_launcher b/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/start_launcher
index bcf2b43..1c10d96 100755
--- a/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/start_launcher
+++ b/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/start_launcher
@@ -1,46 +1,75 @@
 #!/bin/sh
 
-LOCK_FILE=/var/lock/launcher.lock
-PREVENT_LAUNCHER_FILE=/mnt/prevent_launcher
-PREVENT_LAUNCHER_FILE2=/boot/prevent_launcher
-QUICK_LOAD_FILE=/mnt/quick_load_cmd
+# Uncomment the following line to get debug info
+#set -x
 
-if [ -f ${LOCK_FILE} ]; then
+LOCK_FILE="/var/lock/launcher.lock"
+INSTANT_PLAY_FILE="/mnt/instant_play"
+PREVENT_LAUNCHER_FILE="/mnt/prevent_launcher"
+REBOOTING_FILE="/run/rebooting"
+
+
+if [ -f "${LOCK_FILE}" ]; then
     echo "${LOCK_FILE} already exists"
     exit 1
 fi
-touch ${LOCK_FILE}
+touch "${LOCK_FILE}"
 
-mkdir -p ${MEDNAFEN_HOME}
-cp /usr/games/lynxboot.img /usr/games/mednafen-09x.cfg ${MEDNAFEN_HOME}/
+# Sanity cmd: in case these files do not exist
+mkdir -p "${MEDNAFEN_HOME}"
+cp "/usr/games/lynxboot.img" "/usr/games/mednafen-09x.cfg" "${MEDNAFEN_HOME}/"
 
-# Launch Previous Game
-if [ -f ${QUICK_LOAD_FILE} ]; then
-    command=$(cat ${QUICK_LOAD_FILE})
-    echo "Found quick load file, restarting previous game with command:"
-    echo ${command}
-    rm ${QUICK_LOAD_FILE}
-    eval ${command}
+# Launch Previous Game if any
+if [ -f "${INSTANT_PLAY_FILE}" ]; then
+    echo "Found Instant Play file, restarting previous game with command: "$(head -n 1 "${INSTANT_PLAY_FILE}")
+    source "${INSTANT_PLAY_FILE}"
+    rm -f "${INSTANT_PLAY_FILE}"
     termfix_all
 fi
 
-# Loop to launch launcher indefinitely
+# Then loop to launch the launcher indefinitely
 while true; do
 	
     # Check if prevent launcher file present
-    if [ -f ${PREVENT_LAUNCHER_FILE} ]; then
-	echo "Found file: ${PREVENT_LAUNCHER_FILE}, not launching launcher" 
-	sleep 5
-    elif [ -f ${PREVENT_LAUNCHER_FILE2} ]; then
-	echo "Found file: ${PREVENT_LAUNCHER_FILE2}, not launching launcher" 
-	sleep 5
+    if [ -f "${PREVENT_LAUNCHER_FILE}" ]; then
+		echo "${PREVENT_LAUNCHER_FILE} file found, not starting launcher" 
+		sleep 5
     else
-	# Launch Retrofe
-	retrofe
+    	LAUNCHER=$(get_launcher)
 
-	# In case retrofe quits with errors, clear graphic VT
-	termfix_all
+		if [ "${LAUNCHER}" = "gmenu2x" ]; then
+
+		    # Launch gmenu2x in the background
+		    gmenu2x&
+		elif [ "${LAUNCHER}" = "retrofe" ]; then
+
+		    # Launch Retrofe in the background
+		    retrofe&
+		else
+		    DEFAULT_LAUNCHER=retrofe
+		    echo "Not recognized launcher: $LAUNCHER, setting $DEFAULT_LAUNCHER"
+		    set_launcher $DEFAULT_LAUNCHER
+		fi
+
+		# Record the PID into a file, wait for the
+		# process to terminate and erase the recorded PID
+		record_pid $!
+		wait $!
+		erase_pid
+
+		# In case retrofe quits with errors, clear graphic VT
+		termfix_all
+    fi
+
+    # WD to prevent 100% CPU
+    sleep 0.5
+
+    # Exit if console rebooting
+    if [ -f $REBOOTING_FILE ]; then
+    	break
     fi
 done
-rm ${LOCK_FILE}
+
+# Remove lock file and exit
+rm "${LOCK_FILE}"
 exit 0
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/system_info b/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/system_info
new file mode 100755
index 0000000..cd1fca7
--- /dev/null
+++ b/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/system_info
@@ -0,0 +1,18 @@
+#!/bin/sh
+
+. /etc/os-release
+ROOTFS_VERSION=`echo $VERSION | sed -e 's/.*-g/g/'`
+ROOTFS_DATE=`date -r /etc/os-release '+%a %b %d %Y' `
+
+KERNEL_VERSION=`uname -r`
+KERNEL_DATE=`uname -v |cut -d ' ' -f 3-5,8`
+PROCESSOR=`cat /proc/cpuinfo |head -1 |sed 's/^.*: //' `
+RAM=`cat /proc/meminfo | head -1 |sed 's/^MemTotal: \+//' `
+SWAP=`awk 'BEGIN { getline } { print "\t"$1": "$3" kB" }' /proc/swaps 2>/dev/null`
+[ "$SWAP" ] && SWAP="Swap: $SWAP\n"
+
+echo -e "Kernel version: $KERNEL_VERSION\nCompiled: $KERNEL_DATE\n
+Root FS version: $ROOTFS_VERSION\nCompiled: $ROOTFS_DATE\n
+Processor: $PROCESSOR\nUsable RAM: $RAM\n$SWAP
+Network interfaces:"
+ip -o -4 address list | awk '{if ($2 != "lo") { printf "%7s %s\n", $2, gensub("/.*$", "", "g", $4) ; outlines++ } } END { if (outlines == 0) print " (none)" }'
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/system_stats b/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/system_stats
index a6a091a..4e6ad97 100755
--- a/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/system_stats
+++ b/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/system_stats
@@ -20,16 +20,21 @@ while true; do
     if [ ${perform} -eq 1 ]; then
 
 	# Compute stats
-	cpu=$(printf "%.1f\n" $(mpstat -P ALL $UPDATE_PERIOD 1 | tail -1 | awk '{print 100-$12}'))
-	ram_mem=$(printf "%.1f\n" $(free | grep Mem | awk '{print $3/$2 * 100.0}'))
-	ram_swap=$(printf "%.1f\n" $(free | grep Swap | awk '{print $3/$2 * 100.0}'))
+	cpu=$(printf "%.0f\n" $(mpstat -P ALL $UPDATE_PERIOD 1 | tail -1 | awk '{print 100-$12}'))
+	ram_mem=$(printf "%.0f\n" $(free | grep Mem | awk '{print $3/$2 * 100.0}'))
+	ram_swap=$(printf "%.0f\n" $(free | grep Swap | awk '{print $3/$2 * 100.0}'))
+	ip_addr=$(ifconfig usb0 | grep "inet " | awk -F'[: ]+' '{ print $4 }')
 
 	# Notif
 	if [ ${notif_dirty} -eq 1 ]; then
 	    notif_clear
 	    notif_dirty=0
 	else
-	    notif_set 0 "CPU:${cpu}%% RAM:${ram_mem}%% SWAP:${ram_swap}%%"
+	    if [ "x${ip_addr}" != "x" ]; then
+		notif_set 0 " CPU:${cpu}%% RAM:${ram_mem}%% SWAP:${ram_swap}%%^IP:${ip_addr}"
+	    else
+		notif_set 0 " CPU:${cpu}%% RAM:${ram_mem}%% SWAP:${ram_swap}%%"
+	    fi
 	fi
     else
 	sleep ${UPDATE_PERIOD}
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/volume_set b/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/volume_set
index bebc413..41c9619 100755
--- a/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/volume_set
+++ b/FunKey/board/funkey/rootfs-overlay/usr/local/sbin/volume_set
@@ -21,13 +21,6 @@ volume_scaled=$(echo "a = $volume_percent * (63 - $vol_mini) / 100 + $vol_mini +
 # Get current value
 current_volume=$(volume_get)
 
-# Turn on/off audio amplifier if necessary
-if [ ${current_volume} -eq 0 -a ${volume_percent} -ne 0 ]; then
-    start_audio_amp 1
-elif [ ${current_volume} -ne 0 -a ${volume_percent} -eq 0 ]; then
-    start_audio_amp 0
-fi
-
 # Set new volume
 amixer -q sset 'Headphone' ${volume_scaled} unmute
 
@@ -35,4 +28,12 @@ amixer -q sset 'Headphone' ${volume_scaled} unmute
 if [ ${?} -eq 0 -a ${current_volume} -ne ${volume_percent} ]; then
     fw_setenv volume ${volume_percent}
 fi
+
+# Turn on/off audio amplifier if necessary
+if [ ${current_volume} -eq 0 -a ${volume_percent} -ne 0 ]; then
+    start_audio_amp 1
+elif [ ${current_volume} -ne 0 -a ${volume_percent} -eq 0 ]; then
+    start_audio_amp 0
+fi
+
 exit 0
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/share/fonts/truetype b/FunKey/board/funkey/rootfs-overlay/usr/share/fonts/truetype
new file mode 120000
index 0000000..945c9b4
--- /dev/null
+++ b/FunKey/board/funkey/rootfs-overlay/usr/share/fonts/truetype
@@ -0,0 +1 @@
+.
\ No newline at end of file
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/about.txt b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/about.txt
new file mode 100644
index 0000000..7f8db0e
--- /dev/null
+++ b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/about.txt
@@ -0,0 +1,20 @@
+GMenu2X has been created by Massimiliano "Ryo" Torromeo, and is released under the GPL-v2 license.
+
+This version is maintained by the Qi-Hardware and OpenDingux crew. The credits, as well as the latest source code can be found here:
+http://projects.qi-hardware.com/index.php/p/gmenu2x
+
+Credits for the translations:
+----
+English & Italian by Massimiliano Torromeo
+French by Paul Cercueil
+Danish by claus
+Dutch by superfly
+Spanish by pedator
+Portuguese (Portugal) by NightShadow
+Slovak by Jozef
+Swedish by Esslan and Micket
+German by fusion_power, johnnysnet and Waldteufel
+Finnish by Jontte and Atte
+Norwegian by Lithium Flower
+Russian by XaMMaX90
+Polish by Artur Rojek
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/gmenu2x.conf b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/gmenu2x.conf
new file mode 100644
index 0000000..3d6e76c
--- /dev/null
+++ b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/gmenu2x.conf
@@ -0,0 +1,7 @@
+videoBpp=16
+backlightTimeout=0
+opkPlatforms="funkey-s"
+brightnessSysfs="/sys/class/backlight/backlight"
+powerSupplySysfs="/sys/class/power_supply/axp20x-usb"
+batterySysfs="/sys/class/power_supply/axp20x-battery"
+section=1
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/input.conf b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/input.conf
new file mode 100644
index 0000000..4341eb1
--- /dev/null
+++ b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/input.conf
@@ -0,0 +1,11 @@
+accept=keyboard,97
+cancel=keyboard,98
+altleft=keyboard,109
+altright=keyboard,110
+menu=keyboard,107
+settings=keyboard,115
+up=keyboard,117
+down=keyboard,100
+left=keyboard,108
+right=keyboard,114
+home=keyboard,113
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/skins/240x240/Default/wallpapers/Abstract Blue.png b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/skins/240x240/Default/wallpapers/Abstract Blue.png
new file mode 100644
index 0000000..8ce6fed
Binary files /dev/null and b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/skins/240x240/Default/wallpapers/Abstract Blue.png differ
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/skins/240x240/Default/wallpapers/Abstract Orange.png b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/skins/240x240/Default/wallpapers/Abstract Orange.png
new file mode 100644
index 0000000..9cb8459
Binary files /dev/null and b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/skins/240x240/Default/wallpapers/Abstract Orange.png differ
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/skins/240x240/Default/wallpapers/Abstract Red.png b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/skins/240x240/Default/wallpapers/Abstract Red.png
new file mode 100644
index 0000000..941eb86
Binary files /dev/null and b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/skins/240x240/Default/wallpapers/Abstract Red.png differ
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/skins/240x240/Default/wallpapers/Cubes.png b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/skins/240x240/Default/wallpapers/Cubes.png
new file mode 100644
index 0000000..63ab98d
Binary files /dev/null and b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/skins/240x240/Default/wallpapers/Cubes.png differ
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/skins/240x240/Default/wallpapers/planet.png b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/skins/240x240/Default/wallpapers/planet.png
new file mode 100644
index 0000000..643e6ad
Binary files /dev/null and b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/skins/240x240/Default/wallpapers/planet.png differ
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Basque b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Basque
new file mode 100644
index 0000000..1895e61
--- /dev/null
+++ b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Basque
@@ -0,0 +1,129 @@
+Settings=Aukerak
+Configure GMenu2X's options=GMenu2X aukerak konfiguratu
+Activate Usb on SD=Sd-aren usb-a aktibatu
+Activate Usb on Nand=Nand-aren usb-a aktibatu
+Info about GMenu2X=GMenu2X-ri buruzkoak erakutsi
+About=Honi buruz...
+Add section=Sekzioa gehitu
+Rename section=Sekzioaren izena aldatu
+Delete section=Sekzioa ezabatu
+Scan for applications and games=Jokuak eta programak bilatu
+applications=Programak
+Edit link=Esteka aldatu
+Title=Izenburua
+Link title=Estekaren izenburua
+Description=Azalpena
+Link description=Azalpenaren esteka
+Section=sekzioa
+The section this link belongs to=Esteka honen sekzioa da
+Icon=Ikonoa
+Select an icon for the link: $1=Estekarentzako ikonoa aukeratu: $1
+Manual=Eskuliburua
+Select a graphic/textual manual or a readme=Aukeratu eskuliburua(testua edo grafikoa)
+Cpu clock frequency to set when launching this link=Programa hau abiarazteko PUZ abiadura ezarri
+Volume to set for this link=Esteka honentzako bolumena ezarri
+Parameters=Parametroak
+Parameters to pass to the application=Programara pasatuko diren parametroak
+Selector Directory=Selektorearen karpeta
+Directory to scan for the selector=Selektorearekin eskaneatuko den direktorioa
+Selector Browser=Selektorearen arakatzailea
+Allow the selector to change directory=Selektoreari direktorioz aldatzea baimendu
+Selector Filter=Selektorearen iragazkia
+Filter for the selector (Separate values with a comma)=Selektorearentzako iragazkiak(Balioak komekin banandu)
+Selector Screenshots=Selektorearen pantaila-argazkia
+Directory of the screenshots for the selector=Selektorearen pantaila-argazkien direktorioa
+Selector Aliases=Selektorearen aliasak
+File containing a list of aliases for the selector=Aliasen zerrenda duen artxiboaren izena
+Don't Leave=Ez irten
+Don't quit GMenu2X when launching this link=Lotura hau abiaraztean ez amaitu Gmenu2x
+Save last selection=Azken aukera gogoratu
+Save the last selected link and section on exit=Azken aukera eta esteka gorde irtetean
+Clock for GMenu2X=GMenu2X-ren erlojua
+Set the cpu working frequency when running GMenu2X=Gmenu2x-rentzako PUZ abiadura ezarri
+Maximum overclock=Overclock muga
+Set the maximum overclock for launching links=Ezarri daitekeen overclockik handiena
+Global Volume=Bolumen orokorra
+Set the default volume for the gp2x soundcard=Aurrezarritako bolumen maila
+Output logs=log artxiboak
+Logs the output of the links. Use the Log Viewer to read them.=esteken log-ak gorde. Log irakurlea erabili irakurtzeko.
+Number of columns=Zutabe zenbakia
+Set the number of columns of links to display on a page=Orri bakoitzeko erakutsiko diren zutabeak
+Number of rows=Ilara zenbakia
+Set the number of rows of links to display on a page=Orri bakoitzeko erakutsiko diren ilarak
+Top Bar Color=Goiko barraren kolorea
+Color of the top bar=Goian dagoen barraren kolorea
+Bottom Bar Color=Beheko barraren kolorea
+Color of the bottom bar=Behean dagoen barraren kolorea
+Selection Color=aukeratutakoaren kolorea
+Color of the selection and other interface details=Aukeratutako testuaren kolorea eta interfacearen beste aukera batzuk
+You should disable Usb Networking to do this.=Usb sarea desaktibatu beharko zenuke.
+Operation not permitted.=Baimendu gabeko operazioa.
+Language=Hizkuntza
+Set the language used by GMenu2X=Aukeratu Gmenu2x-ren hizkuntza
+Increase=Handitu
+Decrease=txikitu
+Change color component=Kolore konponentea aldatu
+Increase value=Balioa handitu
+Decrease value=Balioa txikitu
+Switch=Aldatu
+Change value=Balioa aldatu
+Edit=Editatu
+Clear=Garbitu
+Select a directory=Direktorioa aukeratu
+Select a file=Artxiboa aukeratu ezazu
+Clock (default: 200)=Maiztasuna (Aurrezarritakoa: 200)
+Volume (default: -1)=bolumena (Aurrezarritakoa: -1)
+Enter folder=Karpetan sartu
+Confirm=Berretsi
+Enter folder/Confirm=karpetan sartu/Berretsi
+Up one folder=Karpeta bat gora
+Select an application=Programa aukeratu
+Space=espazio-barra
+Shift=Shift
+Cancel=Cancelar
+OK=Ok
+Backspace=Backspace
+Skin=Maskara
+Set the skin used by GMenu2X=GMenu2X-ren maskara aukeratu
+Add link in $1=esteka sortu... $1
+Edit $1=Aldatu $1
+Delete $1 link=honen esteka ezabatu $1
+Deleting $1=Ezabatzen $1
+Are you sure?=Ziur zaude?
+Insert a name for the new section=Sekzio berrirako izena sartu
+Insert a new name for this section=Sekzio honetarako izen berria sartu
+Yes=Bai
+No=Ez
+You will lose all the links in this section.=Sekzio honetako esteka guztiak galduko dira
+Exit=Irten
+Link Scanner=Esteka bilatzailea
+Scanning SD filesystem...=SD txartela arakatzen...
+Scanning NAND filesystem...=NAND memoria arakatzen...
+$1 files found.=$1 aurkitutako artxiboak(s).
+Creating links...=Estekak sortzen...
+$1 links created.=$1 esteka sorturik(s).
+Version $1 (Build date: $2)=Bertsioa $1 (data: $2)
+Log Viewer=Log irakurlea
+Displays last launched program's output=Abiarazitako azken programa erakutsi
+Do you want to delete the log file?=Log-ak ezabatu nahi dituzu?
+USB Enabled (SD)=USB-a piztuta (SD)
+USB Enabled (Nand)=USB-a piztuta (Nand)
+Turn off=Desaktibatu
+Launching $1=Abiarazten $1
+Change page=Orria aldatu
+Page=Orria
+Scroll=scroll
+Untitled=Izenburu gabekoa
+Change GMenu2X wallpaper=GMenu2X-ren atzeko irudia aldatu
+Activate/deactivate tv-out=Tv-out piztu/itzali
+Select wallpaper=aukeratu atzekaldeko irudia
+Gamma=Gama
+Set gp2x gamma value (default: 10)=Gp2x-ren gama balioa ezarri (aurrezarritakoa: 10)
+Tv-Out encoding=tv-out-ren kodeketa
+Encoding of the tv-out signal=TV-out-ren seinalearen kodeketa
+Tweak RAM Timings=Ram denborak hobetu
+This usually speeds up the application at the cost of stability=programa azkarrago doa estabilitatearen truke.
+Gamma (default: 0)=Gama (aurrezarritakoa: 0)
+Gamma value to set when launching this link=Esteka hau abiarazteko erabiltzen den gama balioa
+Wrapper=Itzuli
+Explicitly relaunch GMenu2X after this link's execution ends=Amaitzerakoan gmenu2x birkargatu
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Catalan b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Catalan
new file mode 100644
index 0000000..87d25ab
--- /dev/null
+++ b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Catalan
@@ -0,0 +1,137 @@
+Settings=Preferències
+Configure GMenu2X's options=Configura les opcions del GMenu2X
+Activate Usb on SD=Activa USB per la SD
+Activate Usb on Nand=Activa USB per la Nand
+Info about GMenu2X=Informació del GMenu2X
+About=Informació
+Add section=Afegir secció
+Rename section=Reanomenar secció
+Delete section=Eliminar secció
+Scan for applications and games=Buscar aplicacions i jocs
+applications=Aplicacions
+Edit link=Editar enllaç
+Title=Títol
+Link title=Títol de l'enllaç
+Description=Descripció
+Link description=Descripció de l'enllaç
+Section=Secció
+The section this link belongs to=Secció a la que pertany l'enllaç
+Icon=Icona
+Select an icon for the link: $1=Selecciona una icona per l'enllaç: $1
+Manual=Manual
+Select a graphic/textual manual or a readme=Selecciona un manual gràfic/text o un "readme"
+Cpu clock frequency to set when launching this link=Ajust del rellotge de la cpu per aquest enllaç
+Volume to set for this link=Ajust del volum de l'enllaç
+Parameters=Paràmetres
+Parameters to pass to the application=Paràmetres que s'envien a l'aplicació
+Selector Directory=Directori del Selector
+Directory to scan for the selector=Directori a explorar amb el selector
+Selector Browser=Explorador del selector
+Allow the selector to change directory=Permetre al selector canviar de directori
+Selector Filter=Filtre del selector
+Filter for the selector (Separate values with a comma)=Filtre per el selector (Separar valors amb comes)
+Selector Screenshots=Captures de pantalla del selector
+Directory of the screenshots for the selector=Directori de captures de pantalla per el selector
+Selector Aliases=Alias del selector
+File containing a list of aliases for the selector=Fitxer que conté la llista d'alias per el selector
+Explicitly relaunch GMenu2X after this link's execution ends=Força recarregar el GMenu2X a l'acabar l'execució de l'enllaç
+Don't Leave=No sortir
+Don't quit GMenu2X when launching this link=No tancar GMenu2X al carregar aquest enllaç
+Save last selection=Recordar l'última selecció
+Save the last selected link and section on exit=Recordar l'última secció i enllaç seleccionat al sortir
+Clock for GMenu2X=Rellotge per al GMenu2X
+Set the cpu working frequency when running GMenu2X=Ajustar la freqüència de treball de la cpu a l'executar GMenu2X
+Maximum overclock=Overclock màxim
+Set the maximum overclock for launching links=Ajustar al màxim overclock per a carregar enllaços
+Global Volume=Volum global
+Set the default volume for the gp2x soundcard=Ajusta el volum per defecte del so a la gp2x
+Output logs=Fitxers de Log
+Logs the output of the links. Use the Log Viewer to read them.=Enregistra els Logs dels enllaços. Usa el lector de registres per llegir-los.
+Number of columns=Número de columnes
+Set the number of columns of links to display on a page=Ajusta el número de columnes d'enllaços a mostrar per pàgina
+Number of rows=Número de línies
+Set the number of rows of links to display on a page=Ajusta el número de línies d'enllaços a mostrar per pàgina
+Top Bar Color=Color de barra superior
+Color of the top bar=Color de la barra superior
+Bottom Bar Color=Color de barra inferior
+Color of the bottom bar=Color de la barra inferior
+Selection Color=Color selecció
+Color of the selection and other interface details=Color de la selecció i altres detalls de la interfície
+You should disable Usb Networking to do this.=Ha de desactivar la Xarxa per USB per fer això.
+Operation not permitted.=Operació no permesa.
+Language=Idioma
+Set the language used by GMenu2X=Ajusta l'idioma utilitzat al GMenu2X
+Increase=Augmentar
+Decrease=Reduir
+Change color component=Canviar component cromàtic
+Increase value=Incrementar valor
+Decrease value=Reduir valor
+Switch=Canviar
+Change value=Canviar valor
+Edit=Modificar
+Clear=Netejar
+Select a directory=Selecciona un directori
+Select a file=Selecciona un fitxer
+Clock (default: 200)=Freqüència (predeterminada: 200)
+Volume (default: -1)=Volum (predeterminat: -1)
+Enter folder=Entrar a la carpeta
+Wrapper=Retornar
+Confirm=Confirmar
+Enter folder/Confirm=Entrar a la carpeta/Confirmar
+Up one folder=Pujar una carpeta
+Select an application=Selecciona un programa
+Space=Espai
+Shift=Majúscules
+Cancel=Cancel·lar
+OK=Acceptar
+Backspace=Retrocés
+Skin=Tema
+Set the skin used by GMenu2X=Selecciona el tema a utilitzar al GMenu2X
+Add link in $1=Afegir enllaç a $1
+Edit $1=Modificar $1
+Delete $1 link=Eliminar l'enllaç de $1
+Deleting $1=Eliminant $1
+Are you sure?=Estàs segur?
+Insert a name for the new section=Introduir nom per a la nova secció
+Insert a new name for this section=Introduir nou nom per a la secció
+Yes=Sí
+No=No
+You will lose all the links in this section.=Es perdran tots els enllaços d'aquesta secció.
+Exit=Sortir
+Link Scanner=Buscador d'enllaços
+Scanning SD filesystem...=Explorant el sistema de fitxers de la SD...
+Scanning NAND filesystem...=Explorant el sistema de fitxers de la NAND...
+$1 files found.=$1 Fitxer(s) trobat(s).
+Creating links...=Creant enllaços...
+$1 links created.=$1 enllaç(os) creat(s).
+Version $1 (Build date: $2)=Versió $1 (Compilació: $2)
+Log Viewer=Visor de Logs
+Displays last launched program's output=Mostra la sortida de l'últim programa executat
+Do you want to delete the log file?=¿Desitja eliminar el fitxer de registre de successos?
+USB Enabled (SD)=USB Activat (SD)
+USB Enabled (Nand)=USB Activat (Nand)
+Turn off=Desactivar
+Launching $1=Executant $1
+Change page=Canviar pàgina
+Page=Pàgina
+Scroll=Desplaçament
+Untitled=Sense títol
+Change GMenu2X wallpaper=Canvia el fons del GMenu2X
+Activate/deactivate tv-out=Activa/desactiva la sortida de TV
+Select wallpaper=Selecciona la imatge de fons
+Gamma=Gamma
+Set gp2x gamma value (default: 10)=Ajustar el valor gamma de la gp2x (predeterminat: 10)
+Tv-Out encoding=Codificació de sortida de TV
+Encoding of the tv-out signal=Codificació de la senyal de sortida de TV
+Tweak RAM Timings=Modifica la sincronització de RAM
+This usually speeds up the application at the cost of stability=Normalment accelera l'aplicació a costa de l'estabilitat
+Gamma (default: 0)=Gamma (predeterminat: 0)
+Gamma value to set when launching this link=Valor de gamma que utilitzarà a l'executar aquest enllaç
+Wallpaper=Fons d'escriptori
+Configure skin=Configura el Tema
+Message Box Color=Color de caixa de text
+Message Box Border Color=Color de la vora de la caixa de text
+Message Box Selection Color=Color de la selecció de la caixa de text
+Background color of the message box=Color de fons de la caixa de text
+Border color of the message box=Color de la vora de la caixa de text
+Color of the selection of the message box=Color de la selecció de la caixa de text
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Chinese (CN) b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Chinese (CN)
new file mode 100644
index 0000000..33b8924
--- /dev/null
+++ b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Chinese (CN)	
@@ -0,0 +1,199 @@
+Add section=新建分组
+Rename section=重命名分组
+Delete section=删除分组
+Scan for applications and games=搜索程序和游戏
+applications=程序
+Edit link=编辑程序链接
+Title=程序标题
+Link title=设置程序名称
+Description=程序概述
+Link description=设置程序的简要概述
+Section=程序分组
+The section this link belongs to=选择程序所属的分组
+Icon=程序图标
+Select an icon for the link=选择程序的图标
+Manual=程序手册
+Select a manual or README file=选择程序使用手册或说明文档
+CPU Clock=CPU运行频率
+CPU clock frequency when launching this link=运行程序时CPU的运行频率(Mhz)
+Volume to set for this link=为程序设置要使用的卷
+Parameters=运行参数
+Parameters to pass to the application=设置程序的运行参数
+Selector Directory=选择程序目录
+Directory to scan for the selector=选择程序默认加载目录(如:游戏rom存放目录)
+Selector Browser=程序浏览器
+Allow the selector to change directory=允许程序选择器改变目录
+Selector Filter=选择程序筛选器
+Filter file type (separate with commas)=选择程序筛选(用逗号分割)
+Selector Screenshots=选择截图目录
+Directory of the screenshots for the selector=选择程序截图目录
+Selector Aliases=程序选择器别名
+File containing a list of aliases for the selector=包含选择器别名列表的文件
+Backdrop=背景壁纸
+Select an image backdrop=选择程序的背景壁纸
+Wrapper=重新载入系统
+Relaunch GMenu2X after this link's execution ends=结束当前程序时重新加载系统
+Don't Leave=不退出程序
+Don't quit GMenu2X when launching this link=运行程序时不注销系统
+Save last selection=记录最后选择的分组
+Save the last selected link and section on exit=记录最后运行的程序和分组
+Clock for GMenu2X=运行频率
+Set the cpu working frequency when running GMenu2X=设置CPU运行频率
+Maximum overclock=最大超频
+Set the maximum overclock for launching links=设置此软件链接最大超频时钟频率
+Global Volume=全局音量
+Set the default volume for the gp2x soundcard=选择默认输出音量
+Output logs=输出日志
+Logs the output of the links. Use the Log Viewer to read them.=记录并输出日志
+Top/Section Bar=标题栏颜色
+Color of the top and section bar=设置标题栏的颜色
+List Body=列表项颜色
+Color of the list body=设置列表项目的颜色
+Bottom Bar=底部栏颜色
+Color of the bottom bar=设置底部栏颜色
+Selection=选中项颜色
+Color of the selection and other interface details=当前被选中项目的颜色
+Message Box=对话框颜色
+Background color of the message box=设置对话框的背景颜色
+Msg Box Border=对话框边框颜色
+Border color of the message box=设置对话框边框的颜色
+Msg Box Selection=对话框选中项颜色
+Color of the selection of the message box=设置对话框中当前被选中项目的颜色
+Font=文字颜色
+Color of the font=设置系统文字的颜色
+Font Outline=文字轮廓
+Color of the font's outline=设置文字轮廓的颜色
+Alt Font=备选文字
+Color of the alternative font=设置备选文字的颜色
+Alt Font Outline=备选文字轮廓
+Color of the alternative font outline=设置备选文字轮廓的颜色
+Font Size=文字大小
+Size of text font=设置文本文字的尺寸
+Title font size=标题文字大小
+Size of title's text font=设置标题文本文字的尺寸
+Top bar height=标题栏高度
+Height of top bar=设置标题栏的高度值
+Bottom bar height=底部栏高度
+Height of bottom bar=设置底部栏的高度值
+Link Height=程序项高度
+Height of link item=设置程序项的高度值
+Section bar size=分组栏大小
+Size of section bar=设置分组栏的尺寸
+Language=系统语言
+Set the language used by GMenu2X=设置系统界面语言
+Component=项目
+Increase=增大
+Decrease=减小
+Change color component=改变颜色组合
+Increase value=增大
+Decrease value=减小
+Switch=切换
+Change value=改变值
+Edit=编辑
+Clear=清除
+Select a directory=选择一个目录
+Select a file=选择一个文件
+Enter folder=输入文件夹
+Confirm=确认
+Enter folder/Confirm=输入文件夹/确认
+Folder up=上层文件夹
+Select=选择
+Back=返回
+Space=空格
+Keys=切换键盘
+Press=选择
+Cancel=取消
+Backspace=删除
+Skin=界面主题
+Set the skin used by GMenu2X=为系统选择一个界面主题
+Add link in $1=添加链接 $1
+Edit $1=编辑 $1
+Delete $1=删除 $1
+Delete $1 link=删除链接 $1
+Deleting $1=正在删除 $1
+Are you sure?=是否确定?
+Insert a name for the new section=为新分组输入名称
+Insert a new name for this section=为此分组输入新名称
+No=否
+All links in this section will be removed.=您将删除此分组所有程序链接
+Exit=退出
+Link Scanner=程序搜索器
+Scanning SD filesystem...=搜索TF卡文件系统...
+Scanning NAND filesystem...=搜索内部存储文件系统...
+$1 files found.=$1 个文件被找到
+Creating links...=正在创建新链接...
+$1 links created.=$1 链接已创建
+Charging=正在充电
+Log Viewer=日志阅读器 
+Displays last launched program's output=显示最后运行的程序输出日志
+Do you want to delete the log file?=您是否想删除此日志文件?
+Turn off=关闭
+Launching $1=正在运行 $1
+Change page=换页
+Page=页
+Scroll=滚动
+Untitled=无标题
+Change GMenu2X wallpaper=更换系统壁纸
+Select an image to use as a wallpaper=选择一张图片用作壁纸
+Explorer=文件管理器
+Launch an application=
+File Dialog=文件对话框
+Wallpaper=系统壁纸
+Set background image=
+Configure skin=
+Umount=卸载外置TF卡
+Umount external SD=
+About=关于
+Poweroff=关机
+Default=默认值
+Wallpaper selection=壁纸选择窗口
+Select an image from the list, to use as a wallpaper=从下列选择一个图片作为壁纸
+Do you want to umount external sdcard ?=确认卸载扩展TF卡吗?
+Poweroff or reboot the device?=关机或重启设备吗？
+Reboot=重启
+Rebooting=设备重启中...
+Settings=系统设置
+Configure settings=
+Image Browser=图像浏览器
+Info about system= 
+Loading=系统加载中···
+Suspend=闲置超时关闭屏幕
+Power=电源选项
+Power menu=
+Umount external SD card?=卸载外置TF卡吗？
+Yes=是
+Complete!=已完成!
+OK=确定
+Umount Test=卸载测试
+Select an application=选择想要运行的程序
+Date & Time=时间设置
+Set system's date time=设置系统时间
+Section bar postition=标题栏位置
+Set the position of the Section Bar=设置主界面标题栏位置
+Battery profile=电池类型
+Set the battery discharge profile=设置电池类型
+Skin backdrops=程序壁纸
+Automatic load backdrops from skin pack=自动从默认主题包加载程序背景壁纸
+Screen timeout=闲置锁屏时间
+Seconds to turn display off if inactive=设置闲置超时关闭屏幕时间(秒)
+Power timeout=闲置关机时间
+Minutes to poweroff system if inactive=设置闲置超时关机时间(分钟)
+Backlight=背光亮度
+Set LCD backlight=设置屏幕背光亮度
+Global volume=全局音量
+Set the default volume for the soundcard=设置系统全局默认音量
+TV-out=电视输出
+TV-out signal encoding=设置电视输出信号类型
+TV-out connected.=电视视频线已连接
+Continue?=确定继续吗?
+Delete=删除
+Add link=新增程序链接
+Link scanner=搜索程序链接
+Select USB mode:=选择USB连接模式：
+Charger=仅充电
+USB Drive=USB驱动器
+USB Drive Connected=USB驱动器已连接
+Battery Logger=电池记录器
+Log battery power to battery.csv=
+Del battery.csv=删除电池电量记录文件
+File Manager=文件管理器
\ No newline at end of file
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Chinese (TW) b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Chinese (TW)
new file mode 100644
index 0000000..3ef6c15
--- /dev/null
+++ b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Chinese (TW)	
@@ -0,0 +1,238 @@
+Settings=設定
+Configure GMenu2X's options=調整GMenu2X的設定
+Activate Usb on SD=啟動USB模式 (掛載SD卡)
+Activate Usb on Nand=啟動USB模式 (掛載NAND)
+Info about GMenu2X=有關GMenu2X的資訊
+About=關於
+Add section=加入區段
+Rename section=重新命名區段
+Delete section=刪除區段
+Scan for applications and games=掃描遊戲和應用程式
+applications=應用程式
+Edit link=編輯連結
+Title=標題
+Link title=連結標題
+Description=說明
+Link description=連結說明
+Section=區段
+The section this link belongs to=這個連結的區段屬於
+Icon=圖示
+Select an icon for the link: $1=選擇連結的圖示: $1
+Manual=說明
+Select a graphic/textual manual or a readme=請選擇圖片(或文字)的說明書
+Cpu clock frequency to set when launching this link=啟動這個連結時的CPU頻率
+Volume to set for this link=啟動這個連結時的音量
+Parameters=參數
+Parameters to pass to the application=要傳遞到這個應用程式中的參數
+Selector Directory=選擇器資料夾
+Directory to scan for the selector=選擇區段中欲掃描的資料夾
+Selector Browser=選擇器瀏覽器
+Allow the selector to change directory=允許選擇器改變資料夾
+Selector Filter=選擇器過濾器
+Filter for the selector (Separate values with a comma)=選擇器的過濾器(以逗號分隔)
+Selector Screenshots=選擇器截圖
+Directory of the screenshots for the selector=選擇器用截圖資料夾
+Selector Aliases=選擇器別名
+File containing a list of aliases for the selector=File containing a list of aliases for the selector
+Explicitly relaunch GMenu2X after this link's execution ends=Explicitly relaunch GMenu2X after this link's execution ends
+Don't Leave=別離開
+Don't quit GMenu2X when launching this link=當啟動這個連結時請勿離開GMenu2X
+Save last selection=記住最後的選擇
+Save the last selected link and section on exit=Save the last selected link and section on exit
+Clock for GMenu2X=GMenu2X的時脈
+Set the cpu working frequency when running GMenu2X=設定GMenu2X執行時的CPU時脈
+Maximum overclock=最大超頻
+Set the maximum overclock for launching links=設定啟動連結時的最大超頻量
+Global Volume=全域音量
+Set the default volume for the gp2x soundcard=設定GP2X的預設音量
+Output logs=輸出logs
+Logs the output of the links. Use the Log Viewer to read them.=Logs the output of the links. Use the Log Viewer to read them.
+Number of columns=列數
+Set the number of columns of links to display on a page=設定一頁中的連結列數
+Number of rows=行數
+Set the number of rows of links to display on a page=設定一頁中的連結行數
+Top Bar Color=上面Bar的顏色
+Color of the top bar=上面Bar的顏色
+Bottom Bar Color=下面Bar的顏色
+Color of the bottom bar=下面Bar的顏色
+Selection Color=選項顏色
+Color of the selection and other interface details=選項和其他介面細節的顏色
+You should disable Usb Networking to do this.=在執行此操作前應先關閉USB網路
+Operation not permitted.=不允許的操作
+Language=語言
+Set the language used by GMenu2X=設定GMenu2X使用的語言
+Increase=增加
+Decrease=減少
+Change color component=調整顏色分量
+Increase value=增加數值
+Decrease value=減少數值
+Switch=切換
+Change value=切換數值
+Edit=編輯
+Clear=清除
+Select a directory=選擇資料夾
+Select a file=選擇檔案
+Clock (default: 200)=時脈 (預設: 200)
+Volume (default: -1)=音量 (預設: -1)
+Wrapper=Wrapper
+Enter folder=進入資料夾
+Confirm=確認
+Enter folder/Confirm=進入資料夾/確認
+Up one folder=返回上層
+Select an application=選擇應用程式
+Space=空白
+Shift=Shift
+Cancel=取消
+OK=OK
+Backspace=Backspace
+Skin=顏色
+Set the skin used by GMenu2X=設定Gmenu2X的顏色
+Add link in $1=於 $1 中加入連結
+Edit $1=編輯 $1
+Delete $1 link=刪除 $1 連結
+Deleting $1=正在刪除 $1
+Are you sure?=你確定嗎?
+Insert a name for the new section=插入這個新區段中的名稱
+Insert a new name for this section=插入這個區段中的新名稱
+Yes=是
+No=否
+You will lose all the links in this section.=你將會遺失所有此區段中的連結.
+Exit=離開
+Link Scanner=連結掃描器
+Scanning SD filesystem...=掃描SD中...
+Scanning NAND filesystem...=掃描NAND中...
+$1 files found.=找到$1 個檔案.
+Creating links...=創建連結中...
+$1 links created.=創建$1 個連結
+Version $1 (Build date: $2)=版本 $1 (建置日期: $2)
+Log Viewer=Log檢視器
+Displays last launched program's output=顯示最後啟動程式的輸出
+Do you want to delete the log file?=是否刪除Log檔?
+USB Enabled (SD)=開啟USB (SD)
+USB Enabled (Nand)=開啟USB (Nand)
+Turn off=關閉
+Launching $1=啟動 $1
+Change page=換頁
+Page=頁
+Scroll=捲動
+Untitled=未命名
+Change GMenu2X wallpaper=更換GMenu2X桌布
+Activate/deactivate tv-out=切換TV輸出
+Select wallpaper=選擇桌布
+Gamma=Gamma
+Set gp2x gamma value (default: 10)=設定GP2X的Gamma value (預設: 10)
+Tv-Out encoding=電視輸出格式
+Encoding of the tv-out signal=電視輸出訊號的格式
+Tweak RAM Timings=微調RAM timings
+This usually speeds up the application at the cost of stability=這通常會以穩定性為代價換取更高效能
+Gamma (default: 0)=Gamma (預設: 0)
+Gamma value to set when launching this link=啟動這個連結時的Gamma值
+Explorer=檔案總管
+Launch an application=執行程式
+File Dialog=檔案視窗
+Wallpaper=桌布
+Configure skin=設定顯示顏色
+Umount=卸載
+Umount external SD=卸載外部SDCard
+Poweroff=關機
+Poweroff/Reboot device=機器關機/重新開機
+TV=電視
+Default=預設值
+Message Box Color=對話盒顏色
+Message Box Border Color=對話盒外框顏色
+Message Box Selection Color=對話盒選擇顏色
+Font Color=字形顏色
+Font Outline Color=字形外框顏色
+Wallpaper selection=桌布選擇視窗
+Select an image from the list, to use as a wallpaper=從下面列表選擇桌布
+Do you want to umount external sdcard ?=確認卸載外部SDCard ?
+Reboot=重新開機
+Poweroff or reboot device ?=關機或重新開機 ?
+Jinyong=金庸
+Jinyong game=金庸群俠傳
+PAL=仙劍
+PAL game=仙劍奇俠傳
+PAL(MHV) game=仙劍奇俠傳(夢幻版)
+Rockbot=洛克機器人
+GMenu2X=設定
+File Manager=檔案管理器
+DOS Emulator=DOS模擬器
+Arcade Emulator(A320)=街機模擬器(A320)
+Arcade Emulator(GCW0)=街機模擬器(GCW0)
+FC/NES Emulator=FC/NES模擬器
+Hugo Emulator=PCE模擬器
+PS1 Emulator=PS1模擬器
+SMS/SMD Emulator=SMS/SMD模擬器
+SNES Emulator=SNES模擬器
+JavaME Emulator=JavaME模擬器
+MAME Emulator=街機模擬器
+PCE Emulator=PCE模擬器
+Comic Viewer=漫畫閱讀軟體
+Text Editor=文字編輯器
+Music Player=音樂播放器
+Image Viewer=圖片閱讀軟體
+ColecoVision Emulator=ColecoVision模擬器
+Atari 2600 Emulator=Atari 2600模擬器
+Atari 800/5200 Emulator=Atari 800/5200模擬器
+Amstrad CPC Emulator=Amstrad CPC模擬器
+GB/GBC Emulator=GB/GBC模擬器
+Arcade Emulator=街機模擬器
+GBA Emulator=GBA模擬器
+Intellivision Emulator=Intellivision模擬器
+PC98 Emulator=PC98模擬器
+OnScripter Emulator=OnScripter模擬器
+WS/WSC Emulator=WS/WSC模擬器
+Atari 7800 Emulator=Atari 7800模擬器
+Neo Geo Pocket Emulator=Neo Geo Pocket模擬器
+ScummVM Emulator=ScummVM模擬器
+SMS/GG Emulator=SMS/GG模擬器
+ZX Spectrum Emulator=ZX Spectrum模擬器
+Amiga500 Emulator=Amiga500模擬器
+Commodore Emulator=Commodore模擬器
+Atari Lynx Emulator=Atari Lynx模擬器
+Arkanoid Game=Arkanoid遊戲
+Apricots Game=Apricots遊戲
+ASCIIpOrtal Game=ASCIIpOrtal遊戲
+Bermuda Syndrome Game=Bermuda Syndrome遊戲
+Boulder Game=Boulder遊戲
+Cave Story Game=洞窟物語(英文)
+Cave Story Game(CN)=洞窟物語(簡體)
+Cave Story Game(TW)=洞窟物語(繁體)
+Chocolate Doom Game=Chocolate Doom遊戲
+Chroma Game=Chroma遊戲
+CZDoom Game=CZDoom遊戲
+Digger Game=Digger遊戲
+FreeDroid Classic Game=FreeDroid Classic遊戲
+Ganbare-Natsuki-San Game=Ganbare-Natsuki-San遊戲
+HHeretic Game=HHeretic遊戲
+HHexen Game=HHexen遊戲
+Homing Fever Game=Homing Fever遊戲
+JinYong Legend Game=金庸群俠傳(復刻版)
+JinYong v1.2 Game=金庸群俠傳(蒼龍逐日1.2)
+Just4Qix Game=Just4Qix遊戲
+KETM Game=KETM遊戲
+Last Mission Game=Last Mission遊戲
+Meritous Game=Meritous遊戲
+Meteoroid3D Game=Meteoroid3D遊戲
+Mr. Driller Game=Mr. Driller遊戲
+Noiz2sa Game=Noiz2sa遊戲
+OpenTitus Game=OpenTitus遊戲
+Pang Game=Pang遊戲
+ProfaDeluxe Game=ProfaDeluxe遊戲
+Puzzletube Game=Puzzletube遊戲
+Quake Game=Quake遊戲
+Rockbot Game=洛克機器人
+SDLPAL Game=仙劍奇俠傳
+SDLPAL MHV Game=仙劍奇俠傳(夢幻版)
+Shifty Pills Game=Shifty Pills遊戲
+Shisen Seki Game=Shisen Seki遊戲
+Snowman Game=Snowman遊戲
+SORR Game=怒之鐵拳
+Spartak Chess Game=Spartak Chess遊戲
+Spout Game=Spout遊戲
+Sonic Robo Blast 2 Game=Sonic Robo Blast 2遊戲
+Super Transball 2 Game=Super Transball 2遊戲
+SuperTux Game=SuperTux遊戲
+Triple Trapled Game=Triple Trapled遊戲
+Wizznic Game=Wizznic遊戲
+Wolfenstein 3D Game=Wolfenstein 3D遊戲
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Danish b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Danish
new file mode 100644
index 0000000..9be85f5
--- /dev/null
+++ b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Danish
@@ -0,0 +1,129 @@
+Settings=Indstillinger
+Configure GMenu2X's options=Konfigurer GMenu2X's Indstillinger
+Activate Usb on SD=Aktiver Usb på SD
+Activate Usb on Nand=Aktiver Usb på Nand
+Info about GMenu2X=Information om GMenu2X
+Activate/deactivate tv-out=Aktiver/deaktiver tv-udgang
+Exit GMenu2X to the official frontend=Lukker GMenu2X
+Change GMenu2X wallpaper=Skift baggrund
+About=Om
+Add section=Tilføj sektion
+Rename section=Ændre navn på sektion
+Delete section=Slet sektion
+Scan for applications and games=Skan hukommelsen for applikationer og spil
+applications=applikationer
+Edit link=Rediger genveje
+Title=Titel
+Link title=Genvejs titel
+Description=Beskrivelse
+Link description=Genvejs beskrivelse
+Section=Sektion
+The section this link belongs to=Sektionen for denne genvej
+Icon=Ikon
+Select an icon for the link=Vælg et ikon til denne genvej
+Manual=Manual
+Select a graphic/textual manual or a readme=Vælg en grafisk/tekstbaseret manual eller en readme fil
+Cpu clock frequency to set when launching this link=Cpu-clockfrekvens indstilling for denne genvej
+Volume to set for this link=Lydstyrke indstilling for denne genvej
+Parameters=Parametre
+Parameters to pass to the application=Angiv parametre for applikationen 
+Selector Directory=Selector oversigt
+Directory to scan for the selector=Angiv Mappe som selector skal skanne
+Selector Browser=Selector Browser
+Allow the selector to change directory=Tillad selector at ændre mappe
+Selector Filter=Selector filter
+Filter for the selector (Separate values with a comma)=Filter til selector (separer værdier med komma)
+Selector Screenshots= Selector Screenshots
+Directory of the screenshots for the selector=Mappe med Screenshots af selector
+Selector Aliases=Selector alias
+File containing a list of aliases for the selector=Fil som indeholder en liste over alias for selector
+Explicitly relaunch GMenu2X after this link's execution ends=Tving GMenu2X til at genstarte når denne genvej køres
+Don't Leave=Forlad ikke
+Don't quit GMenu2X when launching this link=Afslut ikke GMenu2X når denne genvej startes
+Save last selection=Gem sidste ændring
+Save the last selected link and section on exit= Gem sidst valgte genvej og sektion ved afslutning
+Clock for GMenu2X=Clockfrekvens for GMenu2X
+Set the cpu working frequency when running GMenu2X=Indstil cpu-clockfrekvens for GMenu2X
+Maximum overclock=Maksimal clockfrekvens
+Set the maximum overclock for launching links=Indstil maksimal clockfrekvens ved opstart af genvej
+Global Volume=Global lydstyrke
+Set the default volume for the gp2x soundcard=Indstil standard lydstyrke for gp2x lydkort
+Output logs=Vis logs
+Logs the output of the links. Use the Log Viewer to read them.=Danner logs for genvejene. Anvend Vis log for at åbne dem.
+Number of columns=Antal spalter
+Set the number of columns of links to display on a page=Angiv antallet af spalter for genveje per side 
+Number of rows=Antal rækker
+Set the number of rows of links to display on a page= Angiv antallet af rækker for genveje per side 
+Top Bar Color=Øverste bjælkes farve
+Color of the top bar= Øverste bjælkes farve 
+Bottom Bar Color=Nederste bjælkes farve
+Color of the bottom bar=Nederste bjælkes farve 
+Selection Color=Markørens farve
+Color of the selection and other interface details= Markøren og grænseflades farve 
+You should disable Usb Networking to do this.=Du bør fravælge USB netværket når du vælger dette
+Operation not permitted.=Dette er ikke tilladt.  
+Language=Sprog
+Set the language used by GMenu2X=Indstil sprog der anvendes i GMenu2X
+Increase=Op
+Decrease=Ned
+Change color component=Ændre farven på komponent
+Increase value=Op
+Decrease value=Ned
+Switch=Ændre
+Change value=Ændre værdi
+Edit=Rediger
+Clear=Ryd
+Select a directory=Vælg en mappe
+Select a file=Vælg en fil
+Clock (default: 200)=Clockfrekvens (normal: 200)
+Volume (default: -1)=Lydstyrke (normal: -1)
+Wrapper=Wrapper
+Enter folder=Åbn mappe
+Confirm=Bekræft
+Enter folder/Confirm=Åbn mappe/Bekræft
+Up one folder=Tilbage
+Select an application=Vælg en applikation
+Space=Mellemrum
+Shift=Skift
+Cancel=Afbryd
+OK=OK
+Backspace=Slet
+Skin=Tema
+Set the skin used by GMenu2X =Angiv tema for GMenu2X 
+Add link in $1=Tilføj genvej i $1
+Edit $1=Rediger $1
+Delete $1 link=Slet $1
+Deleting $1=Sletter $1
+Are you sure?=Er du sikker?
+Insert a name for the new section=Angiv navn for den nye sektion
+Insert a new name for this section=Angiv nyt navn for denne sektion 
+Yes=Ja
+No=Nej
+You will lose all the links in this section.=Du vil miste alle genveje i denne sektion.
+Exit=Afslut
+Link Scanner=Skan genveje
+Scanning SD filesystem...=Skanner SD filsystem...
+Scanning NAND filesystem...=Skanner NAND filsystem...
+$1 files found.=$1 fil(er) fundet.
+Creating links...=Opretter genveje...
+$1 links created.=$1 genvej(e) oprettet.
+Version $1 (Build date: $2)=Version $1 (den: $2)
+Log Viewer=Vis log
+Displays last launched program's output=Vis log fra sidst kørte program
+Do you want to delete the log file?=Vil du slette denne log fil?
+USB Enabled (SD)=USB Aktiveret (SD)
+USB Enabled (Nand)=USB Aktiveret (Nand)
+Turn off=Afbryd
+Launching $1=Starter $1
+Change page=Skift side
+Page=Side
+Scroll=Rulle
+Untitled=Ikke navngivet
+Wallpaper=Baggrund
+Configure skin=Konfigurer tema
+Message Box Color=Farve på Konfigurations vinduet
+Message Box Border Color= Farve på Konfig vinduets kant
+Message Box Selection Color=Konfig vinduets markør farve 
+Background color of the message box= Konfigurations vinduets baggrundsfarve
+Border color of the message box=Farve på Konfigurations vinduets kant
+Color of the selection of the message box=Farven på markøren i Konfigurations vinduet
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Dutch b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Dutch
new file mode 100644
index 0000000..eaa3062
--- /dev/null
+++ b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Dutch
@@ -0,0 +1,118 @@
+Settings=Instellingen
+Configure GMenu2X's options=Instellingen van GMenu2X
+Activate Usb on SD=Activeer USB op SD
+Activate Usb on Nand=Activeer USB op Nand
+Info about GMenu2X=Informatie over GMenu2X
+About=Over
+Add section=Groep toevoegen
+Rename section=Groep hernoemen
+Delete section=Groep verwijderen
+Scan for applications and games=Zoek applicaties en spellen
+applications=applicaties
+Edit link=Wijzig snelkoppeling
+Title=Naam
+Link title=Naam van de snelkoppeling
+Description=Omschrijving
+Link description=Omschrijving van de snelkoppeling
+Section=Groep
+The section this link belongs to=De groep waartoe deze snelkoppeling behoort
+Icon=Pictogram
+Select an icon for the link: $1=Selecteer een pictogram voor de snelkoppeling: $1
+Manual=Handleiding
+Select a graphic/textual manual or a readme=Selecteer een grafische/tekstuele handleiding of een readme/leesmij
+Cpu clock frequency to set when launching this link=Kloksnelheid voor het starten van deze snelkoppeling
+Volume to set for this link=Volume-instelling voor deze snelkoppeling
+Parameters=Parameters
+Parameters to pass to the application=Parameters om door te geven aan de applicatie
+Selector Directory=Kiezer map
+Directory to scan for the selector=Map met bestanden voor de kiezer
+Selector Browser=Kiezer navigatie
+Allow the selector to change directory=Wisselen van map mogelijk maken in kiezer
+Selector Filter=Kiezer filter
+Filter for the selector (Separate values with a comma)=Filter voor de kiezer (komma-gescheiden)
+Selector Screenshots=Kiezer schermafdrukken
+Directory of the screenshots for the selector=Map met schermafdrukken voor de kiezer
+Selector Aliases=Kiezer aliassen
+File containing a list of aliases for the selector=Bestand met aliassen voor de kiezer
+Explicitly relaunch GMenu2X after this link's execution ends=GMenu2X altijd herstarten na uitvoeren snelkoppeling
+Don't Leave=Niet verlaten
+Don't quit GMenu2X when launching this link=GMenu2X niet verlaten bij uitvoeren snelkoppeling
+Save last selection=Bewaar laatste keuze
+Save the last selected link and section on exit=Bewaar de laatst gekozen snelkoppeling en groep
+Clock for GMenu2X=Kloksnelheid voor GMenu2X
+Set the cpu working frequency when running GMenu2X=Stel de kloksnelheid in voor het draaien van GMenu2X
+Maximum overclock=Maximale overkloksnelheid
+Set the maximum overclock for launching links=Stel maximaal toegestane overkloksnelheid in
+Global Volume=Hoofdvolume
+Set the default volume for the gp2x soundcard=Stel het standaardvolume in voor de gp2x geluidskaart
+Output logs=Uitvoer naar logboek
+Logs the output of the links. Use the Log Viewer to read them.=Logt de uitvoer van snelkoppelingen. Gebruik de log viewer om de log te lezen.
+Number of columns=Aantal kolommen
+Set the number of columns of links to display on a page=Stel het aantal getoonde pictogrammen in (horizontaal)
+Number of rows=Aantal rijen
+Set the number of rows of links to display on a page=Stel het aantal getoonde pictogrammen in (verticaal)
+Top Bar Color=Kleur bovenste balk
+Color of the top bar=Kleur van de bovenste balk
+Bottom Bar Color=Kleur onderste balk
+Color of the bottom bar=Kleur van de onderste balk
+Selection Color=Kleur selectie
+Color of the selection and other interface details=Kleur van de selectie en andere interface details
+You should disable Usb Networking to do this.=Zet USB Netwerk uit om dit te gebruiken.
+Operation not permitted.=Handeling niet toegestaan.
+Language=Taal
+Set the language used by GMenu2X=Stel de taal van GMenu2X in
+Increase=Verhoog
+Decrease=Verlaag
+Change color component=Wijzig kleur component
+Increase value=Verhoog waarde
+Decrease value=Verlaag waarde
+Switch=Wissel
+Change value=Wijzig waarde
+Edit=Bewerk
+Clear=Wis
+Select a directory=Selecteer een map
+Select a file=Selecteer een bestand
+Clock (default: 200)=Kloksnelheid (standaard: 200)
+Volume (default: -1)=Volume (standaard: -1)
+Wrapper=Schil
+Enter folder=Open map
+Confirm=Bevestig
+Enter folder/Confirm=Open map/Bevestig
+Up one folder=Map omhoog
+Select an application=Selecteer een applicatie
+Space=Spatie
+Shift=Shift
+Cancel=Annuleer
+OK=OK
+Backspace=Backspace
+Skin=Skin
+Set the skin used by GMenu2X=Kies de skin voor GMenu2X
+Add link in $1=Voeg snelkoppeling toe in $1
+Edit $1=Wijzig $1
+Delete $1 link=Verwijder snelkoppeling $1
+Deleting $1=Verwijdert $1
+Are you sure?=Weet u het zeker?
+Insert a name for the new section=Geef een naam voor de nieuwe groep
+Insert a new name for this section=Geef een nieuwe naam voor deze groep
+Yes=Ja
+No=Nee
+You will lose all the links in this section.=Alle snelkoppelingen in deze groep worden gewist.
+Exit=Verlaat
+Link Scanner=Snelkoppeling-Scanner
+Scanning SD filesystem...=Scant het SD-bestandssysteem...
+Scanning NAND filesystem...=Scant het NAND-bestandssysteem...
+$1 files found.=$1 bestand(en) gevonden.
+Creating links...=Snelkoppelingen aanmaken...
+$1 links created.=$1 Snelkoppeling(en) aangemaakt.
+Version $1 (Build date: $2)=Versie $1 (Datum build: $2)
+Log Viewer=Logbestand lezer
+Displays last launched program's output=Toont de uitvoer van het laatst gestarte programma
+Do you want to delete the log file?=Wilt u het logbestand verwijderen?
+USB Enabled (SD)=USB geactiveerd (SD)
+USB Enabled (Nand)=USB geactiveerd (Nand)
+Turn off=Zet uit
+Launching $1=Start $1
+Change page=Wijzig pagina
+Page=Pagina
+Scroll=Scroll
+Untitled=Naamloos
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Finnish b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Finnish
new file mode 100644
index 0000000..9158178
--- /dev/null
+++ b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Finnish
@@ -0,0 +1,117 @@
+Settings=Asetukset
+Configure GMenu2X's options=Muuta GMenu2X:n asetuksia
+Activate Usb on SD=Aktivoi USB SD-kortille
+Activate Usb on Nand=Aktivoi USB NAND-muistille
+Info about GMenu2X=Tietoa GMenu2X:stä
+About=Tietoa
+Add section=Lisää välilehti
+Rename section=Nimeä välilehti uudelleen
+Delete section=Poista välilehti
+Scan for applications and games=Etsi ohjelmia ja pelejä
+applications=ohjelmat
+Edit link=Muokkaa linkkiä
+Title=Otsikko
+Link title=Linkin otsikko
+Description=Kuvaus
+Link description=Linkin kuvaus
+Section=Välilehti
+The section this link belongs to=Välilehti johon tämä linkki kuuluu
+Icon=Kuvake
+Select an icon for the link: $1=Valitse kuvake linkille: $1
+Manual=Ohjetiedosto
+Select a graphic/textual manual or a readme=Valitse graafinen/tekstipohjainen ohjetiedosto
+Cpu clock frequency to set when launching this link=CPU kellotaajuus tämän linkin käynnistyksessä
+Volume to set for this link=Äänenvoimakkuus tälle linkille
+Parameters=Parametrit
+Parameters to pass to the application=Ohjelmalle annettavat parametrit
+Selector Directory=Ohjelmanvalitsimen hakemisto
+Directory to scan for the selector=Hakemisto joka skannataan ohjelmanvalitsimelle
+Selector Browser=Ohjelmanvalitsimen selain
+Allow the selector to change directory=Anna ohjelmanvalitsimen vaihtaa hakemistoa
+Selector Filter=Ohjelmavalitsimen filtteri
+Filter for the selector (Separate values with a comma)=Filtteri ohjelmanvalitsimelle (Erota arvot pilkulla)
+Selector Screenshots=Kuvakaappaukset ohjelmanvalitsimesta
+Directory of the screenshots for the selector=Ohjelmanvalitsimen kuvakaappausten hakemisto
+Selector Aliases=Ohjelmanvalitsimen peitenimet
+File containing a list of aliases for the selector=Tiedosto, joka sisältää listan peitenimistä ohjelmavalitsimelle
+Explicitly relaunch GMenu2X after this link's execution ends=Käynnistä GMenu2X uudelleen kun linkin ajo on suoritettu
+Don't Leave=Älä poistu
+Don't quit GMenu2X when launching this link=Älä sulje GMenu2X:ää kun linkki käynnistetään
+Save last selection=Muista viimeisin valinta
+Save the last selected link and section on exit=Muista viimeisin valinta ja välilehti poistuttaessa
+Clock for GMenu2X=Kellotaajuus GMenu2X:lle
+Set the cpu working frequency when running GMenu2X=Säädä CPU kellotaajuutta GMenu2X:lle
+Maximum overclock=Ylikellotusrajoitus
+Set the maximum overclock for launching links=Säädä suurin mahdollinen ylikellotus käynnistettäessä linkkejä
+Global Volume=Yleinen äänenvoimakkuus
+Set the default volume for the gp2x soundcard=Säädä perusäänenvoimakkuus gp2x:n äänikortille
+Output logs=Tulosteloki
+Logs the output of the links. Use the Log Viewer to read them.=Kirjoita linkkien tuloste lokiin. Käytä lokilukijaa niiden lukemiseen.
+Number of columns=Sarakkeiden lukumäärä
+Set the number of columns of links to display on a page=Aseta linkkisarakkeiden lukumäärä sivulla
+Number of rows=Rivien lukumäärä
+Set the number of rows of links to display on a page=Aseta linkkirivien lukumäärä sivulla
+Top Bar Color=Väri yläpalkille
+Color of the top bar=Yläpalkin väri
+Color of the bottom bar=Alapalkin väri
+Selection Color=Valinnan väri
+Color of the selection and other interface details=Valinnan ja muiden yksityiskohtien väri
+You should disable Usb Networking to do this.=Usb Networking:in pitää olla poissa käytöstä jotta voit tehdä tämän.
+Operation not permitted.=Toiminto ei ole sallittu.
+Language=Kieli
+Set the language used by GMenu2X=Valitse GMenu2X:n käyttämä kieli
+Increase=Lisää
+Decrease=Vähennä
+Change color component=Vaihda värikomponenttia
+Increase value=Nosta arvoa
+Decrease value=Laske arvoa
+Switch=Vaihda
+Change value=Vaihda arvoa
+Edit=Muokkaa
+Clear=Tyhjennä
+Select a directory=Valitse hakemisto
+Select a file=Valitse tiedosto
+Clock (default: 200)=Kellotaajuus (oletusarvo: 200)
+Volume (default: -1)=Äänenvoimakkuus (oletusarvo: -1)
+Wrapper=Wrapperi
+Enter folder=Avaa kansio
+Confirm=Vahvista
+Enter folder/Confirm=Avaa kansio/Vahvista
+Up one folder=Yksi hakemisto ylöspäin
+Select an application=Valitse ohjelma
+Space=Välilyönti
+Shift=Vaihto
+Cancel=Peruuta
+OK=OK
+Backspace=Askelpalautin
+Skin=Teema
+Set the skin used by GMenu2X=Aseta GMenu2X:n käyttämä teema
+Add link in $1=Lisää linkki välilehteen $1
+Edit $1=Muokkaa välilehteä $1
+Delete $1 link=Poista välilehti $1
+Deleting $1=Poistetaan välilehteä $1
+Are you sure?=Oletko varma?
+Insert a name for the new section=Anna uuden välilehden nimi
+Insert a new name for this section=Anna uusi nimi tälle välilehdelle
+Yes=Kyllä
+No=Ei
+You will lose all the links in this section.=Menetät kaikki tässä välilehdessä olevat linkit.
+Exit=Poistu
+Link Scanner=Linkkiskanneri
+Scanning SD filesystem...=Skannataan SD-tiedostojärjestelmää...
+Scanning NAND filesystem...=Skannataan NAND-tiedostojärjestelmää...
+$1 files found.=$1 tiedosto(a) löydetty.
+Creating links...=Luodaan linkkejä...
+$1 links created.=$1 linkki(ä) luotu.
+Version $1 (Build date: $2)=Versio $1 (Kääntöpäivämäärä: $2)
+Log Viewer=Lokilukija
+Displays last launched program's output=Näyttää viimeksi käynnistetyn ohjelman tulosteen
+Do you want to delete the log file?=Haluatko poistaa logitiedoston?
+USB Enabled (SD)=USB Aktivoitu (SD)
+USB Enabled (Nand)=USB Aktivoitu (Nand)
+Turn off=Sammuta
+Launching $1=Käynnistetään $1
+Change page=Vaihda sivua
+Page=Sivu
+Scroll=Vieritä
+Untitled=Nimeämätön
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/French b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/French
new file mode 100644
index 0000000..f3d6081
--- /dev/null
+++ b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/French
@@ -0,0 +1,129 @@
+Settings=Configurations
+Configure GMenu2X's options=Configurer les options de GMenu2X
+Activate Usb on SD=Activer l'Usb sur la SD
+Activate Usb on Nand=Activer l'Usb sur la Nand
+Info about GMenu2X=Information sur GMenu2X
+About=A propos de
+Add section=Ajouter une section
+Rename section=Renommer une section
+Delete section=Supprimer une section
+Scan for applications and games=Rechercher des applications et des jeux
+applications=applications
+Edit link=Editer un lien
+Title=Titre
+Link title=Titre du lien
+Description=Description
+Link description=Description du lien
+Section=Section
+The section this link belongs to=La section à laquelle appartient ce lien
+Icon=Icône
+Select an icon for the link: $1=Sélectionner une icône pour le lien: $1
+Manual=Manuel
+Select a graphic/textual manual or a readme=Sélectionner un manuel graphique/textuel ou un readme
+Cpu clock frequency to set when launching this link=Fréquence d'horloge CPU à définir lorsqu'on lance ce lien
+Volume to set for this link=Volume à définir pour ce lien
+Parameters=Paramètres
+Parameters to pass to the application=Paramètres à donner à l'application
+Selector Directory=Répertoire de Selector
+Directory to scan for the selector=Répertoire à rechercher pour le selector
+Selector Browser=Explorateur de Selector
+Allow the selector to change directory=Permettre au Selector de changer de répertoire
+Selector Filter=Filtre de Selector
+Filter for the selector (Separate values with a comma)=Filtre pour le Selector (Séparer les valeurs avec une virgule)
+Selector Screenshots=Les captures d'écran de Selector
+Directory of the screenshots for the selector=Répertoire des captures d'écran pour le selector
+Selector Aliases=Aliases de Selector
+File containing a list of aliases for the selector=Fichier contenant une liste d'aliases pour selector
+Explicitly relaunch GMenu2X after this link's execution ends=Relancer explicitement GMenu2X après la fin de l'exécution de ce lien
+Don't Leave=Ne pas quitter
+Don't quit GMenu2X when launching this link=Ne pas quitter GMenu2X lorsqu'on lance ce lien
+Save last selection=Sauvegarder la dernière sélection
+Save the last selected link and section on exit=Sauvegarder le dernier lien sélectionné et section en sortant
+Clock for GMenu2X=Horloge pour GMenu2X
+Set the cpu working frequency when running GMenu2X=Définir la fréquence de fonctionnement du CPU lorsqu'on utilise GMenu2X
+Maximum overclock=Overclock maximum
+Set the maximum overclock for launching links=Définir l'overclock maximum pour lancer des liens
+Global Volume=Volume global
+Set the default volume for the gp2x soundcard=Définir le volume par défaut pour la carte sonore de la GP2x
+Output logs=Logs de sortie
+Logs the output of the links. Use the Log Viewer to read them.=Loguer la sortie des liens. Utiliser le lecteur de log pour les lire
+Number of columns=Nombre de colonnes
+Set the number of columns of links to display on a page=Définir le nombre de colonnes de liens à afficher sur une page
+Number of rows=Nombre de rangées
+Set the number of rows of links to display on a page=Définir le nombre de rangées de liens à afficher sur une page
+Top Bar Color=Couleur de la barre supérieure
+Color of the top bar=Couleur de la barre supérieure
+Bottom Bar Color=Couleur de la barre inférieure
+Color of the bottom bar=Couleur de la barre inférieure
+Selection Color=Couleur de sélection
+Color of the selection and other interface details=Couleur de la sélection et des autres détails de l'interface
+You should disable Usb Networking to do this.=Vous devez désactiver le réseau Usb pour faire ceci.
+Operation not permitted.=Opération non permise
+Language=Langue
+Set the language used by GMenu2X=Définir la langue utilisée par GMenu2X
+Increase=Augmenter
+Decrease=Diminuer
+Change color component=Changer la composante de couleur
+Increase value=Augmenter la valeur
+Decrease value=Diminuer la valeur
+Switch=Changer
+Change value=Changer la valeur
+Edit=Editer
+Clear=Effacer
+Select a directory=Sélectionner un répertoire
+Select a file=Sélectionner un fichier
+Clock (default: 200)=Fréquence (par défaut: 200)
+Volume (default: -1)=Volume (par défaut: -1)
+Wrapper=Wrapper
+Enter folder=Entrer dans le répertoire
+Confirm=Confirmer
+Enter folder/Confirm=Entrer un répertoire/Confirmer
+Up one folder=Remonter d'un répertoire
+Select an application=Sélectionner une application
+Space=Espace
+Shift=Majuscule
+Cancel=Annuler
+OK=OK
+Backspace=Backspace
+Skin=Skin
+Set the skin used by GMenu2X=Définir le skin utilisé par GMenu2X
+Add link in $1=Ajouter un lien dans $1
+Edit $1=Editer $1
+Delete $1 link=Effacer le lien $1
+Deleting $1=Effacement de $1
+Are you sure?=Êtes vous sûr?
+Insert a name for the new section=Saisir un nom pour cette nouvelle section
+Insert a new name for this section=Saisir un nouveau nom pour cette section
+Yes=Oui
+No=Non
+You will lose all the links in this section.=Vous perdrez tous les liens dans cette section.
+Exit=Quitter
+Link Scanner=Scanneur de lien
+Scanning SD filesystem...=Scan du système de fichiers de la SD...
+Scanning NAND filesystem...=Scan du système de fichiers de la NAND...
+$1 files found.=$1 fichiers trouvés.
+Creating links...=Création de liens...
+$1 links created.=$1 liens créés.
+Version $1 (Build date: $2)=Version $1 (Date de compilation: $2)
+Log Viewer=Visualisateur de log
+Displays last launched program's output=Afficher la sortie du dernier programme lancé
+Do you want to delete the log file?=Voulez vous effacer le fichier de log?
+USB Enabled (SD)=USB Activé (SD)
+USB Enabled (Nand)=USB Activé (Nand)
+Turn off=Désactiver
+Launching $1=Lancement de $1
+Change page=Changer de page
+Page=Page
+Scroll=Défilement
+Untitled=Sans titre
+Change GMenu2X wallpaper=Changer l'arrière-plan de GMenu2X
+Activate/deactivate tv-out=Activer/désactiver la sortie TV
+Select wallpaper=Sélectionner l'arrière-plan
+Gamma=Gamma
+Set gp2x gamma value (default: 10)=Définir la valeur de gamma de la gp2x (par défaut: 10)
+Tv-Out encoding=Encodage de la sortie TV
+Encoding of the tv-out signal=Encodage du signal de la sortie TV
+Tweak RAM Timings=Modification des timings de la RAM
+This usually speeds up the application at the cost of stability=Ceci accélère normalement l'application, mais au détriment de la stabilité
+Gamma (default: 0)=Gamma (par défaut: 0)
+Gamma value to set when launching this link=Valeur de gamma à définir lors du lancement de ce lien
\ No newline at end of file
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/German b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/German
new file mode 100644
index 0000000..c34ba94
--- /dev/null
+++ b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/German
@@ -0,0 +1,129 @@
+Settings=Einstellungen
+Configure GMenu2X's options=Optionen des GMenu2X konfigurieren
+Activate Usb on SD=Aktiviert USB für die SD-Karte
+Activate Usb on Nand=Aktiviert USB für den Nand-Speicher
+Info about GMenu2X=Informationen über GMenu2X
+About=Über
+Add section=Sektion hinzufügen
+Rename section=Sektion umbenennen
+Delete section=Sektion löschen
+Scan for applications and games=Nach Anwendungen und Spielen scannen
+applications=Anwendungen
+Edit link=Link bearbeiten
+Title=Titel
+Link title=Linktitel
+Description=Beschreibung
+Link description=Beschreibung für diesen Link
+Section=Sektion
+The section this link belongs to=Zum Link gehörende Sektion
+Icon=Icon
+Select an icon for the link: $1=Icon für diesen Link wählen: $1
+Manual=Anleitung
+Select a graphic/textual manual or a readme=Wähle eine Graphik/Text Anleitung oder Readme
+Cpu clock frequency to set when launching this link=CPU-Takt, mit welchem dieser Link gestartet wird
+Volume to set for this link=Lautstärke für diesen Link
+Parameters=Parameter
+Parameters to pass to the application=Parameter, die an die Anwendung übergeben werden
+Selector Directory=Selector Verzeichnis
+Directory to scan for the selector=Vom Selector zu scannendes Verzeichnis
+Selector Browser=Selector Browser
+Allow the selector to change directory=Erlaube dem Selector, das Verzeichnis zu wechseln
+Selector Filter=Selector Filter
+Filter for the selector (Separate values with a comma)=Filter für den Selector (Werte mit Komma trennen)
+Selector Screenshots=Selector Screenshots
+Directory of the screenshots for the selector=Screenshot-Verzeichnis für den Selector
+Selector Aliases=Selector Alternativnamen
+File containing a list of aliases for the selector=Datei mit Liste von Alternativnamen für den Selector
+Explicitly relaunch GMenu2X after this link's execution ends=GMenu2X nach Beenden der Anwendung umgehend Neustarten
+Don't Leave=Nicht verlassen
+Don't quit GMenu2X when launching this link=Beim Starten dieses Links GMenu2X nicht beenden
+Save last selection=Letzte Auswahl speichern
+Save the last selected link and section on exit=Speichert den zuletzt gewählten Link und die Sektion beim Beenden
+Clock for GMenu2X=Taktfrequenz für GMenu2X
+Set the cpu working frequency when running GMenu2X=Stellt den CPU Arbeitstakt für GMenu2X ein
+Maximum overclock=Maximale Übertaktung
+Set the maximum overclock for launching links=Einstellen der maximalen Taktfrequenz zum Starten von Links
+Global Volume=Allgemeine Lautstärke
+Set the default volume for the gp2x soundcard=Einstellen der Standardlautstärke des GP2X Soundchips
+Output logs=Ausgabeprotokolle
+Logs the output of the links. Use the Log Viewer to read them.=Protokolliert Ausgabe der Links. Benutze den Log Viewer zum Lesen.
+Number of columns=Anzahl der Spalten
+Set the number of columns of links to display on a page=Anzahl der Spalten mit Links, pro Seite
+Number of rows=Anzahl der Zeilen
+Set the number of rows of links to display on a page=Anzahl der Zeilen mit Links, pro Seite
+Top Bar Color=Farbe der Kopfleiste
+Color of the top bar=Stellt Farbe und Transparenz der oberen Menüleiste ein
+Bottom Bar Color=Farbe der Fußleiste
+Color of the bottom bar=Stellt Farbe und Transparenz der unteren Menüleiste ein
+Selection Color=Farbe der Auswahl
+Color of the selection and other interface details=Farbe der Auswahl-Hervorhebung und anderer Interface-Details
+You should disable Usb Networking to do this.=Du solltest USB Networking deaktivieren um dies zu tun.
+Operation not permitted.=Operation nicht gestattet.
+Language=Sprache
+Set the language used by GMenu2X=Einstellen der GMenu2X-Sprache
+Increase=Erhöhen
+Decrease=Verringern
+Change color component=Farbkomponente ändern
+Increase value=Wert erhöhen
+Decrease value=Wert verringern
+Switch=Wechseln
+Change value=Wert ändern
+Edit=Bearbeiten
+Clear=Leeren
+Select a directory=Verzeichnis wählen
+Select a file=Datei wählen
+Clock (default: 200)=Taktfrequenz (Standard: 200)
+Volume (default: -1)=Lautstärke (Standard: -1)
+Wrapper=Wrapper
+Enter folder=Ordner öffnen
+Confirm=Bestätigen
+Enter folder/Confirm=Ordner öffnen/Bestätigen
+Up one folder=Einen Ordner Aufwärts
+Select an application=Anwendung wählen
+Space=Leer
+Shift=Umschalt
+Cancel=Abbrechen
+OK=OK
+Backspace=Rücktaste
+Skin=Skin
+Set the skin used by GMenu2X=Skin für GMenu2X auswählen
+Add link in $1=Link in "$1" hinzufügen
+Edit $1="$1" bearbeiten
+Delete $1 link=Link "$1" löschen
+Deleting $1="$1" wird gelöscht
+Are you sure?=Sind Sie sicher?
+Insert a name for the new section=Name für die neue Sektion
+Insert a new name for this section=Neuer Name für diese Sektion
+Yes=Ja
+No=Nein
+You will lose all the links in this section.=Alle Links in dieser Sektion gehen verloren.
+Exit=Beenden
+Link Scanner=Links suchen
+Scanning SD filesystem...=SD-Karte wird durchsucht...
+Scanning NAND filesystem...=NAND-Speicher wird durchsucht...
+$1 files found.=$1 Datei(en) gefunden.
+Creating links...=Verknüpfungen werden erstellt.
+$1 links created.=$1 Verknüpfung(en) erstellt.
+Version $1 (Build date: $2)=Version $1 vom $2
+Log Viewer=Log-Viewer
+Displays last launched program's output=Zeigt die Ausgabe der zuletzt gestarteten Anwendung an
+Do you want to delete the log file?=Möchten Sie diese Log-Datei löschen?
+USB Enabled (SD)=USB aktiviert (SD)
+USB Enabled (Nand)=USB aktiviert (Nand)
+Turn off=Abschalten
+Launching $1=$1 wird gestartet
+Change page=Seite wechseln
+Page=Seite
+Scroll=Scrollen
+Untitled=Unbenannt
+Change GMenu2X wallpaper=Ändert das GMenu2X Hintergrundbild
+Activate/deactivate tv-out=Aktiviert/deaktiviert TV-Out
+Select wallpaper=Hintergrundbild wählen
+Gamma=Gamma
+Set gp2x gamma value (default: 10)=Setzt den GP2X Gamma-Wert (Standard: 10)
+Tv-Out encoding=TV-Out Signal
+Encoding of the tv-out signal=Einstellen der TV-Out Fernsehnorm
+Tweak RAM Timings=Schnellere RAM-Timings
+This usually speeds up the application at the cost of stability=Kann Anwendung beschleunigen, auf Kosten der Stabilität
+Gamma (default: 0)=Gamma (Standard: 0)
+Gamma value to set when launching this link=Beim Starten dieses Links benutzter Gamma-Wert
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Italian b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Italian
new file mode 100644
index 0000000..bad8d40
--- /dev/null
+++ b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Italian
@@ -0,0 +1,142 @@
+Settings=Impostazioni
+Configure GMenu2X's options=Configura le opzioni di GMenu2X
+Activate Usb on SD=Attiva USB sulla SD
+Activate Usb on Nand=Attiva USB sulla Nand
+Info about GMenu2X=Informazioni su GMenu2X
+About=Informazioni
+Add section=Aggiungi sezione
+Rename section=Rinomina sezione
+Delete section=Elimina sezione
+Scan for applications and games=Cerca applicazioni e giochi
+applications=applicazioni
+Edit link: $1=Modifica collegamento: $1
+Title=Titolo
+Link title=Titolo collegamento
+Description=Descrizione
+Link description=Descrizione collegamento
+Section=Sezione
+The section this link belongs to=La sezione alla quale appartiene questo collegamento
+Icon=Icona
+Select an icon for the link: $1=Seleziona un'icona per il collegamento: $1
+Manual=Manuale
+Select a graphic/textual manual or a readme=Seleziona un manuale grafico/testuale o un readme
+Cpu clock frequency to set when launching this link=Clock della cpu da impostare quando si lancia questo collegamento
+Volume to set for this link=Volume da impostare per questo collegamento
+Parameters=Parametri
+Parameters to pass to the application=Parametri da passare all'applicazione
+Selector Directory=Directory del selettore
+Directory to scan for the selector=Directory da utilizzare con il selettore
+Selector Browser=Browser del selettore
+Allow the selector to change directory=Permetti al selettore di cambiare directory
+Selector Filter=Filtro del selettore
+Filter for the selector (Separate values with a comma)=Filtro per il selettore (Separa i valori con virgola)
+Selector Screenshots=Screenshot del selettore
+Directory of the screenshots for the selector=Directory contenente gli screenshot per il selettore
+Selector Aliases=Alias del selettore
+File containing a list of aliases for the selector=File contenente una lista di alias per il selettore
+Explicitly relaunch GMenu2X after this link's execution ends=Rilancia esplicitamente GMenu2X dopo l'esecuzione del collegamento
+Don't Leave=Non lasciare
+Don't quit GMenu2X when launching this link=Non terminare GMenu2X quando viene lanciato questo collegamento
+Save last selection=Salva ultima selezione
+Save the last selected link and section on exit=Salva l'ultimo collegamento e sezione usati quando si esce
+Clock for GMenu2X=Clock per GMenu2X
+Set the cpu working frequency when running GMenu2X=Imposta la frequenza di lavoro per GMenu2X
+Maximum overclock=Overclock massimo
+Set the maximum overclock for launching links=Imposta la frequenza massima per i collegamenti
+Global Volume=Volume globale
+Set the default volume for the gp2x soundcard=Imposta il volume standard per la gp2x
+Output logs=Log dell'output
+Logs the output of the links. Use the Log Viewer to read them.=Tiene traccia dell'output dei collegamenti. Usa il Visualizzatore di Log per leggerlo.
+Number of columns=Numero di colonne
+Set the number of columns of links to display on a page=Imposta il numero di colonne di collegamenti da visualizzare in una pagina
+Number of rows=Numero di righe
+Set the number of rows of links to display on a page=Imposta il numero di righe di collegamenti da visualizzare in una pagina
+Top Bar Color=Colore barra superiore
+Color of the top bar=Colore della barra superiore
+Bottom Bar Color=Colore barra inferiore
+Color of the bottom bar=Colore della barra inferiore
+Selection Color=Colore selezione
+Color of the selection and other interface details=Colore della selezione e altri dettagli dell'interfaccia
+You should disable Usb Networking to do this.=Dovresti disattivare le impostazioni di rete per farlo.
+Operation not permitted.=Operazione non consentita.
+Language=Lingua
+Set the language used by GMenu2X=Imposta la lingua usata da GMenu2X
+Increase=Aumenta
+Decrease=Riduci
+Change color component=Cambia componente cromatico
+Increase value=Incrementa valore
+Decrease value=Decrementa valore
+Switch=Cambia
+Change value=Cambia valore
+Edit=Modifica
+Clear=Svuota
+Select a directory=Seleziona una directory
+Select a file=Seleziona un file
+Clock (default: 200)=Frequenza (predefinito: 200)
+Volume (default: -1)=Volume (predefinito: -1)
+Wrapper=Involucro
+Enter folder=Entra nella cartella
+Confirm=Conferma
+Enter folder/Confirm=Entra nella cartella/Conferma
+Up one folder=Sali di una cartella
+Select an application=Seleziona un'applicazione
+Space=Spazio
+Shift=Maiusc
+Cancel=Annulla
+OK=OK
+Backspace=Backspace
+Skin=Skin
+Set the skin used by GMenu2X=Imposta la skin usata da GMenu2X
+Add link in $1=Aggiungi collegamento in $1
+Edit $1=Modifica $1
+Delete $1 link=Elimina il collegamento $1
+Deleting $1=Rimozione di $1
+Are you sure?=Sei sicuro?
+Insert a name for the new section=Inserisci un nome per la nuova sezione
+Insert a new name for this section=Inserisci un nuovo nome per questa sezione
+Yes=Sì
+No=No
+You will lose all the links in this section.=Perderai tutti i collegamenti in questa sezione.
+Exit=Esci
+Link Scanner=Scanner di collegamenti
+Scanning SD filesystem...=Scansione del filesystem della SD...
+Scanning NAND filesystem...=Scansione del filesystem della NAND...
+$1 files found.=$1 file trovati.
+Creating links...=Creazione collegamenti...
+$1 links created.=$1 collegamenti creati.
+Version $1 (Build date: $2)=Versione $1 (Data compilazione: $2)
+Log Viewer=Visualizzatore di log
+Displays last launched program's output=Visualizza l'output dell'ultimo programma eseguito
+Do you want to delete the log file?=Vuoi eliminare il file di log?
+USB Enabled (SD)=USB Attivata (SD)
+USB Enabled (Nand)=USB Attivata (Nand)
+Turn off=Disattiva
+Launching $1=Esecuzione di $1
+Change page=Cambia pagina
+Page=Pagina
+Scroll=Scorri
+Untitled=Senza titolo
+Change GMenu2X wallpaper=Cambia lo sfondo di GMenu2X
+Activate/deactivate tv-out=Attiva/disattiva tv-out
+Select wallpaper=Seleziona sfondo
+Gamma=Gamma
+Set gp2x gamma value (default: 10)=Imposta il valore gamma della gp2x (predefinito: 10)
+Tv-Out encoding=Codifica uscita tv
+Encoding of the tv-out signal=Codifica del segnale dell'uscita tv
+Tweak RAM Timings=Modifica i timings della RAM
+This usually speeds up the application at the cost of stability=Comporta solitamente un miglioramento delle performance al costo di stabilità
+Gamma (default: 0)=Gamma (predefinito: 0)
+Gamma value to set when launching this link=Valore di gamma da impostare quando si lancia questo collegamento
+Wallpaper=Sfondo
+Configure skin=Configura skin
+Message Box Color=Colore Finestra Messaggi
+Message Box Border Color=Colore Bordo Finestra Messaggi
+Message Box Selection Color=Colore Selezione Finestra Messaggi
+Background color of the message box=Colore di sfondo della finestra dei messaggi
+Border color of the message box=Colore del bordo della finestra dei messaggi
+Color of the selection of the message box=Colore della selezione della finestra dei messaggi
+
+Show root=Mostra radice
+Show root folder in the file selection dialogs=Mostra la cartella radice nelle finestre di selezione di file
+Change keys=Cambia tasti
+Launch an application=Esegue un'applicazione
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Korean b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Korean
new file mode 100644
index 0000000..c11d508
--- /dev/null
+++ b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Korean
@@ -0,0 +1,133 @@
+Settings=설정
+Configure GMenu2X's options=GMenu2X의 옵션 구성하기
+Activate Usb on SD=SD Usb 활성화
+Activate Usb on Nand=Nand Usb 활성화
+Info about GMenu2X=GMenu2X에 대한 정보
+About=대하여
+Add section=섹션 추가
+Rename section=섹션 이름 변경
+Delete section=섹션 삭제
+Scan for applications and games=응용 프로그램 및 게임 스캔
+applications=응용 프로그램
+Edit link=링크 수정
+Title=제목
+Link title=링크 제목
+Description=설명
+Link description=링크 설명
+Section=섹션
+The section this link belongs to=이 링크가 속한 섹션
+Icon=아이콘
+Select an icon for the link: $1=링크 아이콘을 선택하십시오 : $1
+Manual=설명서
+Select a graphic/textual manual or a readme=그래픽 / 텍스트 설명서 또는 추가 정보 선택
+Cpu clock frequency to set when launching this link=이 링크를 시작할 때 설정할 CPU 클럭 주파수
+Volume to set for this link=이 링크에 설정할 볼륨
+Parameters=매개 변수
+Parameters to pass to the application=응용 프로그램에 전달할 매개 변수
+Selector Directory=선택기 디렉토리
+Directory to scan for the selector=선택기를 검색 할 디렉토리
+Selector Browser=선택기 브라우저
+Allow the selector to change directory=선택기에서 디렉토리를 변경할 수 있도록 허용
+Selector Filter=선택기 필터
+Filter for the selector (Separate values with a comma)=선택기 필터 (쉼표로 값 구분)
+Selector Screenshots=선택기 스크린 샷
+Directory of the screenshots for the selector=선택기의 스크린 샷 디렉토리
+Selector Aliases=선택자 별칭
+File containing a list of aliases for the selector=선택자에 대한 별칭 목록이 포함 된 파일
+Explicitly relaunch GMenu2X after this link's execution ends=이 링크 실행이 끝난 후 GMenu2X를 명시 적으로 다시 실행하십시오.
+Don't Leave=떠나지 마라.
+Don't quit GMenu2X when launching this link=이 링크를 실행할 때 GMenu2X를 종료하지 마십시오.
+Save last selection=마지막 선택 저장
+Save the last selected link and section on exit=종료시 마지막으로 선택한 링크 및 섹션 저장
+Clock for GMenu2X=GMenu2X 용 시계
+Set the cpu working frequency when running GMenu2X=GMenu2X를 실행할 때 CPU 작동 주파수 설정
+Maximum overclock=최대 오버 클럭
+Set the maximum overclock for launching links=링크 실행을위한 최대 오버 클럭 설정
+Global Volume=글로벌 볼륨
+Set the default volume for the gp2x soundcard=gp2x 사운드 카드의 기본 볼륨 설정
+Output logs=출력 로그
+Logs the output of the links. Use the Log Viewer to read them.=링크 출력을 기록합니다. Log Viewer를 사용하여 읽을 수 있습니다.
+Number of columns=열 수
+Set the number of columns of links to display on a page=페이지에 표시 할 링크 열 수 설정
+Number of rows=행 수
+Set the number of rows of links to display on a page=페이지에 표시 할 링크 행 수 설정
+Top Bar Color=탑 바 색상
+Color of the top bar=위쪽 막대의 색
+Bottom Bar Color=하단 바 색상
+Color of the bottom bar=하단 막대의 색상
+Selection Color=선택 색상
+Color of the selection and other interface details=선택 색상 및 기타 인터페이스 세부 정보
+You should disable Usb Networking to do this.=이를 위해서는 Usb 네트워킹을 비활성화해야합니다.
+Operation not permitted.=작동이 허용되지 않습니다.
+Language=언어
+Set the language used by GMenu2X=GMenu2X에서 사용하는 언어 설정
+Increase=증가
+Decrease=감소
+Change color component=색상 구성 요소 변경
+Increase value=값 증가
+Decrease value=값 감소
+Switch=스위치
+Change value=값 변경
+Edit=편집
+Clear=클리어
+Select a directory=디렉토리 선택
+Select a file=파일 선택
+Clock (default: 200)=시계 (기본값 : 200)
+Volume (default: -1)=볼륨 (기본값 : -1)
+Wrapper=레퍼
+Enter folder=폴더 열기
+Confirm=확인
+Enter folder/Confirm=폴더 열기 / 확인
+Up one folder=한 폴더 위로
+Select an application=응용 프로그램 선택
+Space=스페이스
+Shift=시프트
+Cancel=취소
+OK=승인
+Backspace=백스페이스
+Skin=스킨
+Set the skin used by GMenu2X=GMenu2X에서 사용하는 스킨 설정
+Add link in $1=$1에 링크 추가
+Edit $1=$1 수정
+Delete $1 link=$1 링크 삭제
+Deleting $1=$1 삭제 중
+Are you sure?=확실합니까?
+Insert a name for the new section=새 섹션의 이름 삽입
+Insert a new name for this section=이 섹션의 새 이름 삽입
+Yes=예
+No=아니오
+You will lose all the links in this section.=이 섹션의 모든 링크가 사라집니다.
+Exit=나가기
+Link Scanner=링크 스캐너
+Scanning SD filesystem...=SD 파일 시스템 스캔 중 ...
+Scanning NAND filesystem...=NAND 파일 시스템 스캔 중 ...
+$1 files found.=$1 개의 파일을 찾았습니다.
+Creating links...=링크 생성 중 ...
+$1 links created.=$1 링크가 생성되었습니다.
+Version $1 (Build date: $2)=버전 $1 (빌드 날짜 : $2)
+Log Viewer=로그 뷰어
+Displays last launched program's output=마지막으로 실행 된 프로그램의 출력을 표시합니다.
+Do you want to delete the log file?=로그 파일을 삭제 하시겠습니까?
+USB Enabled (SD)=USB 사용 (SD)
+USB Enabled (Nand)=USB 사용 가능 (Nand)
+Turn off=끄다
+Launching $1=$1 시작
+Change page=페이지 변경
+Page=페이지
+Scroll=스크롤
+Untitled=제목 없음
+Change GMenu2X wallpaper=GMenu2X 배경화면 변경
+Activate/deactivate tv-out=TV 출력 활성화 / 비활성화
+Select wallpaper=월페이퍼 선택
+Gamma=감마
+Set gp2x gamma value (default: 10)=gp2x 감마 값 설정 (기본값 : 10)
+Tv-Out encoding=TV 출력 인코딩
+Encoding of the tv-out signal=TV 출력 신호 인코딩
+Tweak RAM Timings=RAM 타이밍 조정
+This usually speeds up the application at the cost of stability=이것은 대개 안정성을 희생시키면서 애플리케이션 속도를 높입니다.
+Gamma (default: 0)=감마 (기본값 : 0)
+Gamma value to set when launching this link=이 링크를 시작할 때 설정할 감마 값
+ON=온
+OFF=오프
+File Browser=파일 탐색기
+Directory Browser=디렉토리 브라우저
\ No newline at end of file
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Norwegian b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Norwegian
new file mode 100644
index 0000000..ddb9c13
--- /dev/null
+++ b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Norwegian
@@ -0,0 +1,118 @@
+Settings=Innstillinger
+Configure GMenu2X's options=Konfigurer GMenu2X's innstillinger
+Activate Usb on SD=Aktiver USB på SD
+Activate Usb on Nand=Aktiver USB på Nand
+Info about GMenu2X=Info om GMenu2X
+About=Om
+Add section=Legg til avdeling
+Rename section=Gi nytt navn på avdeling
+Delete section=Slett avdeling
+Scan for applications and games=Skann etter applikasjoner
+applications=applikasjoner
+Edit link=Rediger link
+Title=Tittel
+Link title=Linktittel
+Description=Beskrivelse
+Link description=Linkbeskrivelse
+Section=Avdeling
+The section this link belongs to=Avdelingen som denne linken tilhører
+Icon=Ikon
+Select an icon for the link: $1=Velg et ikon til linken: $1
+Manual=Manual
+Select a graphic/textual manual or a readme=Velg en grafisk/tekstbasert manual eller en readme
+Cpu clock frequency to set when launching this link=CPU-klokk som skal settes når denne linken startes
+Volume to set for this link=Volum som skal settes til denne linken
+Parameters=Parametrer
+Parameters to pass to the application=Parametrer som skal settes til applikasjonen
+Selector Directory=Selectormappe
+Directory to scan for the selector=Mappe som selector skal skanne
+Selector Browser=Selectorutforsker
+Allow the selector to change directory=Tillat selector å bytte mappe
+Selector Filter=Selectorfilter
+Filter for the selector (Separate values with a comma)=Filter for selector (separer verdiene med et komma)
+Selector Screenshots=Screenshots for Selector
+Directory of the screenshots for the selector=Mappe med screenshots til selector
+Selector Aliases=Selectoralias
+File containing a list of aliases for the selector=Fil som inneholder en liste med alias for selector
+Explicitly relaunch GMenu2X after this link's execution ends=Tving GMenu2X i å starte når denne linken er ferdigkjørt
+Don't Leave=Forlat ikke
+Don't quit GMenu2X when launching this link=Ikke avslutt GMenu2X når denne linken kjøres
+Save last selection=Husk siste markering
+Save the last selected link and section on exit=Husk siste link og avdeling ved avslutning
+Clock for GMenu2X=Klokkefrekvens for GMenu2X
+Set the cpu working frequency when running GMenu2X=Sett CPU-frekvensen for GMenu2X
+Maximum overclock=Maks overklokk
+Set the maximum overclock for launching links=Sett maks overklokk for linker
+Global Volume=Mastervolum
+Set the default volume fo the gp2x soundcard=Sett overall mastervolum
+Output logs=Skriv logg
+Logs the output of the links. Use the Log Viewer to read them.=Skriver ut logg for linkene. Bruk loggleseren for å lese dem.
+Number of columns=Antall spalter
+Set the number of columns of links to display on a page=Velg antall spalter med linker som skal vises per side
+Number of rows=Antall rader
+Set the number of rows of links to display on a page=Velg antall rader med linker som skal vises per side
+Top Bar Color=Øverste felts farge
+Color of the top bar=Farge på det øverste feltet
+Bottom Bar Color=Nederste felts farge
+Color of the bottom bar=Farge på det nederste feltet
+Selection Color=Markørfarge
+Color of the selection and other interface details=Farge på markøren og andre deler av grensesnittet
+You should disable Usb Networking to do this.=Du bør slå av USB-nettverket når du gjør dette.
+Operation not permitted.=Utillat operasjon.  
+Language=Språk
+Set the language used by GMenu2X=Still inn språk for GMenu2X
+Increase=Øk
+Decrease=Minsk
+Change color component=Endre fargekomponent
+Increase value=Øk verdi
+Decrease value=Minsk verdi
+Switch=Endre
+Change value=Endre verdi
+Edit=Rediger
+Clear=Rens
+Select a directory=Velg en mappe
+Select a file=Velg en fil
+Clock (default: 200)=Klokkefrekvens (standard: 200)
+Volume (default: -1)=Volum (standard: -1)
+Wrapper=Wrapper
+Enter folder=Åpne mappe
+Confirm=Bekreft
+Enter folder/Confirm=Åpne mappe/Bekrefte
+Up one folder=Opp en mappe
+Select an application=Velg en applikasjon
+Space=Mellomrom
+Shift=Skift
+Cancel=Avbryt
+OK=OK
+Backspace=Backspace
+Skin=Skin
+Set the skin used by GMenu2X=Velg hvilket skin GMenu2X skal bruke
+Add link in $1=Lag link i $1
+Edit $1=Rediger $1
+Delete $1 link=Slett $1
+Deleting $1=Sletter $1
+Are you sure?=Er du sikker?
+Insert a name for the new section=Sett navn for den nye avdelingen
+Insert a new name for this section=Sett navn for denne avdelingen
+Yes=Ja
+No=Nei
+You will lose all the links in this section.=Du vil miste alle linkene i denne avdelingen.
+Exit=Avslutt
+Link Scanner=Linkskanner
+Scanning SD filesystem...=Skanner igjennom minnekortet...
+Scanning NAND filesystem...=Skanner igjennom NAND...
+$1 files found.=$1 fil(er) funnet.
+Creating links...=Lager linker...
+$1 links created.=$1 link(er) er lagd.
+Version $1 (Build date: $2)=Versjon $1 (Bygd den: $2)
+Log Viewer=Loggleser
+Displays last launched program's output=Vis utdata fra siste startede program
+Do you want to delete the log file?=Vil du slette loggen?
+USB Enabled (SD)=USB Aktivert (SD)
+USB Enabled (Nand)=USB Aktivert (NAND)
+Turn off=Slå av
+Launching $1=Starter $1
+Change page=Bytt side
+Page=Side
+Scroll=Rull
+Untitled=Uten navn
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Portuguese (Brazil) b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Portuguese (Brazil)
new file mode 100644
index 0000000..5af1487
--- /dev/null
+++ b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Portuguese (Brazil)	
@@ -0,0 +1,119 @@
+Settings=Configurações
+Configure GMenu2X's options=Configurar opções do GMenu2X
+Activate Usb on SD=Ativar USB para SD
+Activate Usb on Nand=Ativar USB para Nand
+Info about GMenu2X=Informação sobre GMenu2X
+About=Sobre
+Add section=Adicionar Seção
+Rename section=Renomear Seção
+Delete section=Excluir Seção
+Scan for applications and games=Procurar aplicativos e jogos 
+applications=aplicativos
+Edit link=Editar Link
+Title=Título
+Link title=Título do Link 
+Description=Descrição
+Link description=Descrição do Link
+Section=Seção
+The section this link belongs to=Seção a qual pertence este link 
+Icon=Ícone
+Select an icon for the link: $1=Selecione um ícone para o link: $1
+Manual=Manual
+Select a graphic/textual manual or a readme=Selecionar um manual gráfico e/ou de texto
+Cpu clock frequency to set when launching this link=Frequência de relógio do CPU ao executar este link 
+Volume to set for this link=Ajustar o volume para este link
+Parameters=Parâmetros
+Parameters to pass to the application=Adicionar parâmetros ao aplicativo
+Selector Directory=Diretório do Seletor
+Directory to scan for the selector=Diretório a explorar com o Seletor
+Selector Browser=Navegar Diretórios
+Allow the selector to change directory=Permitir ao Seletor que mude de diretório
+Selector Filter=Filtro de Arquivos
+Filter for the selector (Separate values with a comma)=Filtro do Seletor (Separar valores com vírgulas)
+Selector Screenshots=Capturas de tela
+Directory of the screenshots for the selector=Diretório das capturas de tela do Seletor
+Selector Aliases=Alias do Seletor
+File containing a list of aliases for the selector=Arquivo que contém a lista de alias para o Seletor 
+Backdrop=Plano de fundo
+Select an image backdrop=Escolha a imagem de fundo
+Explicitly relaunch GMenu2X after this link's execution ends=Executar novamente o GMenu2x após término deste programa
+Don't Leave=Não Sair
+Don't quit GMenu2X when launching this link=Não sair do GMenu2X ao executar este link
+Save last selection=Lembrar a última seleção
+Save the last selected link and section on exit=Lembrar link e seção selecionadas ao sair
+Clock for GMenu2X=Relógio no GMenu2X
+Set the cpu working frequency when running GMenu2X=Define a frequência da CPU durante a execução do GMenu2X  
+Maximum overclock=Overclock máximo
+Set the maximum overclock for launching links=Define o overclock máximo ao executar um link
+Global Volume=Volume global
+Set the default volume for the soundcard=Define o volume padrão
+Output logs=Logs de Saída
+Logs the output of the links. Use the Log Viewer to read them.=Registra a saída dos links. Usar o Leitor de Logs para consultar.
+Number of columns=Número de colunas
+Set the number of columns of links to display on a page=Define o número de colunas de links por página 
+Number of rows=Número de linhas
+Set the number of rows of links to display on a page=Define o número de linhas de links por página 
+Top Bar=Barra superior
+Color of the top bar=Cor da barra superior
+Bottom Bar=Barra inferior
+Color of the bottom bar=Cor da barra inferior
+Selection=Seleção
+Color of the selection and other interface details=Cor da seleção e outros detalhes da interface 
+You should disable Usb Networking to do this.=A função de Rede via USB deve ser desativada para executar este comando.
+Operation not permitted.=Operação não permitida.
+Language=Idioma
+Set the language used by GMenu2X=Define o idioma usado no GMenu2X 
+Increase=Aumentar
+Decrease=Diminuir
+Change color component=Alterar componente da cor
+Increase value=Aumentar valor
+Decrease value=Diminuir Valor
+Switch=Trocar
+Change value=Mudar Valor
+Edit=Editar
+Clear=Limpar
+Select a directory=Selecionar diretório
+Select a file=Selecionar arquivo
+Clock (default: 200)=Frequência (padrão: 200)
+Volume (default: -1)=Volume (padrão: -1)
+Wrapper=Invólucro
+Enter folder=Entrar na Pasta
+Confirm=Confirmar
+Enter folder/Confirm=Entrar na Pasta/Confirmar
+Up one folder=Subir uma pasta
+Select an application=Selecionar aplicativo
+Space=Espaço
+Shift=Shift
+Cancel=Cancelar
+OK=OK
+Backspace=Backspace
+Skin=Skin
+Set the skin used by GMenu2X=Selecionar a Skin a usar pelo GMenu2X
+Add link=Adicionar link
+Edit $1=Editar $1
+Delete $1=Excluir $1
+Deleting $1=Excluindo $1
+Are you sure?=Tem certeza? 
+Insert a name for the new section=Insira o nome da nova seção
+Insert a new name for this section=Insira um novo nome para esta seção
+Yes=Sim
+No=Não
+You will lose all the links in this section.=Todos os links desta seção serão removidos.
+Exit=Sair
+Scanning SD filesystem...=Buscando arquivos no SD...
+Scanning NAND filesystem...=Buscando arquivos na NAND...
+$1 files found.=$1 arquivos encontrados.
+Creating links...=Criando Links...
+$1 links created.=$1 Links criados.
+Version $1 (Build date: $2)=Versão $1 (Data de compilação: $2)
+Log Viewer=Leitor de Logs
+Displays last launched program's output=Mostra a saída do último programa executado
+Do you want to delete the log file?=Deseja excluir o arquivo de log?
+USB Enabled (SD)=USB Ativado (SD)
+USB Enabled (Nand)=USB Ativado (Nand)
+Turn off=Desligar
+Launching $1=Executando $1
+Change page=Mudar página
+Page=Página
+Scroll=Rolar
+Untitled=Sem título
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Portuguese (Portugal) b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Portuguese (Portugal)
new file mode 100644
index 0000000..70c9084
--- /dev/null
+++ b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Portuguese (Portugal)	
@@ -0,0 +1,118 @@
+Settings= Configurações
+Configure GMenu2X's options=Configurar opções do GMenu2X
+Activate Usb on SD=Activar USB para SD
+Activate Usb on Nand=Activar USB para Nand
+Info about GMenu2X=Informação sobre GMenu2X
+About=Sobre
+Add section=Adicionar Secção
+Rename section= Renomear Secção
+Delete section= Eliminar Secção
+Scan for applications and games=Procurar aplicações e jogos 
+applications=aplicações
+Edit link=Editar Link
+Title=Título
+Link title=Título do Link 
+Description=Descrição
+Link description=Descrição do Link
+Section=Secção
+The section this link belongs to=A secção a que pertence este link 
+Icon=Ícone
+Select an icon for the link: $1=Seleccionar um ícone para o link: $1
+Manual=Manual
+Select a graphic/textual manual or a readme=Seleccionar um manual gráfico e/ou de texto
+Cpu clock frequency to set when launching this link=Frequência de relógio do CPU ao lançar este link 
+Volume to set for this link=Ajustar o volume para este link
+Parameters=Parâmetros
+Parameters to pass to the application=Parâmetros a enviar para a aplicação
+Selector Directory=Directório do Selector
+Directory to scan for the selector=Directório a explorar com o selector
+Selector Browser=Explorador do selector
+Allow the selector to change directory=Permitir ao selector que mude de directório
+Selector Filter=Filtro do selector
+Filter for the selector (Separate values with a comma)=Filtro do selector (Separar valores com vírgulas)
+Selector Screenshots=Capturas de ecrã do selector
+Directory of the screenshots for the selector=Directório das capturas de ecrã do selector
+Selector Aliases=Alias do selector
+File containing a list of aliases for the selector=Arquivo que contém a lista de alias para o selector 
+Explicitly relaunch GMenu2X after this link's execution ends=Forçar relançamento do GMenu2x após fim da execução deste link
+Don't Leave=Não Sair
+Don't quit GMenu2X when launching this link=Não sair do GMenu2X ao lançar este link
+Save last selection=Gravar a última selecção
+Save the last selected link and section on exit=Recordar link e secção seleccionadas ao sair
+Clock for GMenu2X=Relógio no GMenu2X
+Set the cpu working frequency when running GMenu2X=Ajustar a frequência do CPU durante a execução do GMenu2X  
+Maximum overclock=Overclock máximo
+Set the maximum overclock for launching links=Ajustar o overclock máximo ao lançar um link
+Global Volume=Volume global
+Set the default volume fo the gp2x soundcard=Ajustar o volume por defeito da gp2x 
+Output logs=Logs de Output
+Logs the output of the links. Use the Log Viewer to read them.=Regista o output dos links. Usar o Leitor de Logs para consultar.
+Number of columns=Número de colunas
+Set the number of columns of links to display on a page=Ajustar o número de colunas (de links) por página 
+Number of rows=Número de filas
+Set the number of rows of links to display on a page=Ajustar o número de filas (de links) por página 
+Top Bar Color=Cor da barra superior
+Color of the top bar= Cor da barra superior
+Bottom Bar Color= Cor da barra inferior
+Color of the bottom bar= Cor da barra inferior
+Selection Color=Cor da selecção
+Color of the selection and other interface details=Cor da selecção e outros detalhes do interface 
+You should disable Usb Networking to do this.=Deve desactivar a função Networking por USB para executar este comando.
+Operation not permitted.=Operação não permitida.
+Language=Idioma
+Set the language used by GMenu2X=Ajustar o idioma usado no GMenu2X 
+Increase=Aumentar
+Decrease=Reduzir
+Change color component=Alterar componente da cor
+Increase value=Aumentar valor
+Decrease value=Reduzir Valor
+Switch=Mudar
+Change value=Mudar Valor
+Edit=Editar
+Clear=Eliminar
+Select a directory=Seleccionar directório
+Select a file=Seleccionar ficheiro
+Clock (default: 200)=Frequência (predefinido: 200)
+Volume (default: -1)=Volume (predefinido: -1)
+Wrapper=Invólucro
+Enter folder=Entrar na Pasta
+Confirm=Confirmar
+Enter folder/Confirm=Entrar na Pasta/Confirmar
+Up one folder=Subir uma pasta
+Select an application=Seleccionar uma aplicação
+Space=Espaço
+Shift=Shift
+Cancel=Cancelar
+OK=OK
+Backspace=Backspace
+Skin=Skin
+Set the skin used by GMenu2X=Seleccionar a Skin a usar pelo GMenu2X
+Add link in $1=Adicionar link em $1
+Edit $1=Modificar $1
+Delete $1 link=Eliminar o link $1
+Deleting $1=Removendo $1
+Are you sure?=Tem a certeza? 
+Insert a name for the new section=Insira o nome da nova secção
+Insert a new name for this section=Insira um novo nome para esta secção
+Yes=Sim
+No=Não
+You will lose all the links in this section.=Perderá todos os links desta secção.
+Exit=Sair
+Link Scanner=Pesquisador de links
+Scanning SD filesystem...=A pesquisar o filesystem do SD...
+Scanning NAND filesystem...=A pesquisar o filesystem do NAND...
+$1 files found.=$1 ficheiros encontrados.
+Creating links...=A criar Links...
+$1 links created.=$1 Links criados.
+Version $1 (Build date: $2)=Versão $1 (Data de compilação: $2)
+Log Viewer=Leitor de Logs
+Displays last launched program's output=Mostra o output do último programa executado
+Do you want to delete the log file?=Deseja eliminar o ficheiro de log?
+USB Enabled (SD)=USB Activado (SD)
+USB Enabled (Nand)=USB Activado (Nand)
+Turn off=Desligar
+Launching $1= A Executar $1
+Change page=Mudar página
+Page=Página
+Scroll=Scroll
+Untitled=Sem título
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Russian b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Russian
new file mode 100644
index 0000000..d53bd4f
--- /dev/null
+++ b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Russian
@@ -0,0 +1,132 @@
+Settings=Настройки
+Configure GMenu2X's options=Изменить настройки GMenu2X
+Activate Usb on SD=Активировать SD через USB
+Activate Usb on Nand=Активировать NAND через USB
+Info about GMenu2X=Информация о GMenu2X
+About=Информация
+Add section=Добавить секцию
+Rename section=Переименовать секцию
+Delete section=Удалить секцию
+Scan for applications and games= Поиск игр и приложений
+applications=Приложения
+Edit link=Изменить ссылку
+Title=Заголовок
+Link title=Имя ссылки
+Description=Описание 
+Link description=Описание ссылки
+Section=Секция
+The section this link belongs to=Секция, которой принадлежит ссылка
+Icon=Иконка
+Select an icon for the link: $1=Выберите иконку к ссылке: $1
+Manual=Инструкция
+Select a graphic/textual manual or a readme=Выберите текстовую инструкцию
+Cpu clock frequency to set when launching this link=Частота CPU при запуске данной ссылки
+Volume to set for this link=Громкость установленная для этой ссылки
+Parameters=Параметры
+Parameters to pass to the application=Параметры для передачи приложению
+Selector Directory=Папка проводника
+Directory to scan for the selector=Папка для сканирования проводником
+Selector Browser=Выбрать браузером
+Allow the selector to change directory=Разрешить проводнику выбирать папку
+Selector Filter=Выбрать фильтр
+Filter for the selector (Separate values with a comma)=Фильтр для проводника
+Selector Screenshots=Обзор скриншотов
+Directory of the screenshots for the selector=Папка со скриншотами для проводника
+Selector Aliases=Обзор списков с именами
+File containing a list of aliases for the selector=Файл, содержащий список имён-псевдонимов
+Explicitly relaunch GMenu2X after this link's execution ends=Перезапуск GMenu2X после завершения выполнения ссылки
+Don't Leave=Не покидать
+Don't quit GMenu2X when launching this link=Не выключать Gmenu2X когда запускается эта ссылка
+Save last selection=Сохранять последний выбор
+Save the last selected link and section on exit=Сохранение последней выбранной ссылки и секции при выключении
+Clock for GMenu2X=Частота CPU для Gmenu2X
+Set the cpu working frequency when running GMenu2X=Устанавливает частоту CPU пока запущен Gmenu2X
+Maximum overclock=Максимальная частота CPU
+Set the maximum overclock for launching links=Максимальная частота CPU для запуска ссылок
+Global Volume=Громкость
+Set the default volume for the gp2x soundcard=Устанавливает громкость для звуковой карты GP2X
+Output logs=Отчёты
+Logs the output of the links. Use the Log Viewer to read them.=Создавать отчёты ссылок
+Number of columns=Количество столбцов
+Set the number of columns of links to display on a page=Установите количество столбцов для отображения на странице
+Number of rows=Количество строк
+Set the number of rows of links to display on a page=Установите количество строк для отображения на странице
+Top Bar Color=Цвет панели сверху
+Color of the top bar=Выберите цвет панели сверху
+Bottom Bar Color=Цвет панели внизу 
+Color of the bottom bar= Выберите цвет панели внизу
+Selection Color=Цвет панели выбора
+Color of the selection and other interface details=Выберите цвет панели выбора
+You should disable Usb Networking to do this.=Вы должны выключить USB Networking чтобы сделать это.
+Operation not permitted.=Операция не разрешена.
+Language=Язык
+Set the language used by GMenu2X=Выберите язык интерфейса
+Increase=Прибавить 
+Decrease=Убавить
+Change color component=Изменить компонент цвета 
+Increase value=Увеличить значение 
+Decrease value=Уменьшить значение 
+Switch=Переключить
+Change value=Изменить значение 
+Edit=Изменить
+Clear=Очистить 
+Select a directory=Выбрать папку 
+Select a file=Выбрать файл 
+Clock (default: 200)=Частота CPU (Стандарт: 200) 
+Volume (default: -1)=Громкость (Стандарт: -1) 
+Wrapper=Перезапуск 
+Enter folder=Открыть папку 
+Confirm=Подтвердить  
+Enter folder/Confirm=Выбрать папку/подтвердить
+Up one folder=Назад на одну папку 
+Select an application=Выберите приложение 
+Space=Пробел
+Shift=Shift
+Cancel=Выход 
+OK=OK 
+Backspace=Стереть
+Skin=Скин
+Set the skin used by GMenu2X=Выберите скин для GMenu2X
+Add link in $1=Добавить ссылку в $1
+Edit $1=Изменить $1
+Delete $1 link=Удалить ссылку $1
+Deleting $1=Удаление $1
+Are you sure?=Вы уверены? 
+Insert a name for the new section=Впишите имя для новой секции 
+Insert a new name for this section=Впишите новое имя для этой секции 
+Yes=Да 
+No=Нет 
+You will lose all the links in this section.=Вы потеряете все ссылки в этой секции 
+Exit=Выход 
+Link Scanner=Поиск ссылок 
+Scanning SD filesystem...=Сканирование SD... 
+Scanning NAND filesystem...=Сканирование NAND... 
+$1 files found.=$1 Файлов найдено. 
+Creating links...=Создание ссылок... 
+$1 links created.=$1 ссылок создано. 
+Version $1 (Build date: $2)=Версия $1 (Дата сборки: $2) 
+Log Viewer=Отчёты
+Displays last launched program's output=Отображает последний запущенный отчёт программы 
+Do you want to delete the log file?=Вы хотите удалить этот отчёт? 
+USB Enabled (SD)=USB включен (SD) 
+USB Enabled (Nand)=USB включен (Nand) 
+Turn off=Выключить 
+Launching $1=Запуск $1...
+Change page=Изменить страницу 
+Page=Страница 
+Scroll=Прокрутка
+Untitled=Не названный
+
+
+Change GMenu2X wallpaper=Изменить обои Gmenu2X
+Activate/deactivate tv-out=Активировать/деактивировать ТВ-выход
+Select wallpaper=Выберите обои
+Gamma=Гамма
+Set gp2x gamma value (default: 10)=Значение гаммы экрана GP2X (стандарт: 10)
+Tv-Out encoding=Технология вывода на ТВ
+Encoding of the tv-out signal=Кодировка ТВ-сигнала
+Tweak RAM Timings=Изменение параметров RAM
+This usually speeds up the application at the cost of stability=Это обычно убыстряет приложение
+Gamma (default: 0)=Гамма (стандарт: 0)
+Gamma value to set when launching this link=Значение гаммы при запуске этой ссылки
+
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Slovak b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Slovak
new file mode 100644
index 0000000..80da006
--- /dev/null
+++ b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Slovak
@@ -0,0 +1,137 @@
+Settings=Nastavenia
+Configure GMenu2X's options=Nastaviť voľby pre GMenu2X
+Activate Usb on SD=Aktivovať USB pre SD kartu
+Activate Usb on Nand=Aktivovať USB pre pamäť Nand
+Info about GMenu2X=Informácie o GMenu2X
+About=O programe
+Add section=Pridať sekciu
+Rename section=Premenovať sekciu
+Delete section=Vymazať sekciu
+Scan for applications and games= Hľadať aplikácie a hry
+applications=aplikácie
+Edit link=Upraviť odkaz
+Title=Názov
+Link title=Názov odkazu
+Description=Popis
+Link description=Popis pre odkaz
+Section=Sekcia
+The section this link belongs to=Sekcia, do ktorej patrí tento odkaz
+Icon=Ikona
+Select an icon for the link=Vyberte ikonu pre tento odkaz
+Manual=Návod
+Select a graphic/textual manual or a readme=Vyberte grafický/textový návod alebo readme
+Cpu clock frequency to set when launching this link=Taktovacia frekvencia procesora, s ktorou bude spustený odkaz
+Volume to set for this link=Nastavenie hlasitosti pre tento odkaz
+Parameters=Parametre
+Parameters to pass to the application=Parametre, ktoré majú byť predané aplikácii
+Selector Directory=Adresár selektora
+Directory to scan for the selector=Adresár, v ktorom má byť hľadaný selektor
+Selector Browser=Prehliadač selektora
+Allow the selector to change directory=Povolí selektorovi zmeniť adresár
+Selector Filter=Filter pre selektor
+Filter for the selector (Separate values with a comma)=Filter pre selektor (hodnoty oddeľujte čiarkou)
+Selector Screenshots=Snímky obrazovky selektora
+Directory of the screenshots for the selector=Adresár so snímkami obrazovky selektora
+Selector Aliases=Aliasy selektora
+File containing a list of aliases for the selector=Súbor obsahujúci zoznam aliasov pre selektor
+Explicitly relaunch GMenu2X after this link's execution ends=Explicitne opätovne spustiť GMenu2X po ukončení spustenia tohto odkazu
+Don't Leave=Neopúšťať
+Don't quit GMenu2X when launching this link=Neukončovať GMenu2X pri spúšťaní tohto odkazu
+Save last selection=Ulož posledný výber
+Save the last selected link and section on exit=Ulož naposledy vybraný odkaz a sekciu pri ukončení
+Clock for GMenu2X=Takt. frekvencia pre GMenu2X
+Set the cpu working frequency when running GMenu2X=Nastavte frekvenciu cpu počas behu GMenu2X
+Maximum overclock=Maximálne pretaktovanie
+Set the maximum overclock for launching links=Nastavte maximálne pretaktovanie pre spúšťanie odkazov
+Global Volume=Globálna hlasitosť
+Set the default volume for the gp2x soundcard=Nastavte východziu hlasitosť pre zvukovú kartu gp2x
+Output logs=Výstupné logy
+Logs the output of the links. Use the Log Viewer to read them.=Loguje výstup odkazov. Na prezretie použite Log Viewer.
+Number of columns=Počet stĺpcov
+Set the number of columns of links to display on a page=Nastavte počet stĺpcov pre odkazy zobrazené na stránke
+Number of rows=Počet riadkov
+Set the number of rows of links to display on a page=Počet riadkov odkazov zobrazených na stránke
+Top Bar Color=Farba hornej lišty
+Color of the top bar= Farba hornej lišty
+Bottom Bar Color=Farba spodnej lišty
+Color of the bottom bar=Farba spodnej lišty
+Selection Color=Farba výberu
+Color of the selection and other interface details=Farba výberu a iných detailov interfacu
+You should disable Usb Networking to do this.=Pre vykonanie tejto operácie by ste mali deaktivovať Usb sieťovanie.
+Operation not permitted.=Operácia nepovolená.
+Language=Jazyk
+Set the language used by GMenu2X=Nastavte jazyk pre GMenu2X
+Increase=Zvýšiť 
+Decrease=Znížiť 
+Change color component=Zmeniť farebnú zložku
+Increase value=Zvýšiť hodnotu
+Decrease value=Znížiť hodnotu
+Switch=Prepnúť 
+Change value=Zmeniť hodnotu
+Edit=Upraviť 
+Clear=Vyčistiť 
+Select a directory=Vyberte adresár
+Select a file=Vyberte súbor
+Clock (default: 200)=Takt (štandardne: 200) 
+Volume (default: -1)=Hlasitosť (štandardne: -1) 
+Wrapper=Obaľovač 
+Enter folder=Zadajte priečinok
+Confirm=Potvrdiť 
+Enter folder/Confirm=Zadajte priečinok/Potvrdiť
+Up one folder=O jeden priečinok vyššie
+Select an application=Vyberte aplikáciu
+Space=Medzera
+Shift=Shift 
+Cancel=Zrušiť 
+OK=OK 
+Backspace=Backspace
+Skin=Skin 
+Set the skin used by GMenu2X=Nastavte skin pre GMenu2X
+Add link in $1=Pridať odkaz do $1
+Edit $1=Upraviť $1
+Delete $1 link=Vymazať odkaz na $1
+Deleting $1=Mažem $1
+Are you sure?=Ste si istý? 
+Insert a name for the new section=Zadajte názov novej sekcie
+Insert a new name for this section=Zadajte nový názov pre túto sekciu
+Yes=Áno 
+No=Nie 
+You will lose all the links in this section.=Stratíte všetky odkazy v tejto sekcii. 
+Exit=Ukončiť 
+Link Scanner=Vyhľadávač odkazov
+Scanning SD filesystem...=Prehľadávam súborový systém na SD karte...
+Scanning NAND filesystem...=Prehľadávam súborový systém na pamäti NAND...
+$1 files found.=$1 súbor(ov) nájdených. 
+Creating links...=Vytváram odkazy...
+$1 links created.=$1 odkazov vytvorených.
+Version $1 (Build date: $2)=Verzia $1 (dátum zostavenia: $2)
+Log Viewer=Prehliadač log súborov
+Displays last launched program's output=Zobrazuje výstup naposledy spusteného súboru
+Do you want to delete the log file?=Želáte si vymazať log súbor?
+USB Enabled (SD)=USB aktivované (SD) 
+USB Enabled (Nand)=USB aktivované (Nand) 
+Turn off=Vypnúť 
+Launching $1=Spúšťam $1
+Change page=Zmeniť stránku 
+Page=Stránka 
+Scroll=Skrolovať 
+Untitled=Bez mena
+Change GMenu2X wallpaper=Zmeniť pozadie GMenu2X
+Activate/deactivate tv-out=Aktivovať/deaktivovať výstup na TV
+Select wallpaper=Vyberte pozadie
+Gamma=Gamma
+Set gp2x gamma value (default: 10)=Nastavte hodnotu gamma (implic: 10)
+Tv-Out encoding=Kódovanie výstupu na TV
+Encoding of the tv-out signal=Kódovanie televízneho signálu
+Tweak RAM Timings=Upraviť časovanie RAM
+This usually speeds up the application at the cost of stability=Toto nastavenie zvyčajne zrýchli aplikáciu na úkor stability
+Gamma (default: 0)=Gamma (implic: 0)
+Gamma value to set when launching this link=Hodnota gamma pri spúšťaní tohto odkazu
+Wallpaper=Pozadie
+Configure skin=Nastaviť skin
+Message Box Color=Farba textového okna
+Message Box Border Color=Farba okraja textového okna
+Message Box Selection Color=Farba výberu textového okna
+Background color of the message box=Farba pozadia textového okna
+Border color of the message box=Farba okraja textového okna
+Color of the selection of the message box=Farba výberu textového okna
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Spanish b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Spanish
new file mode 100644
index 0000000..74f5a8e
--- /dev/null
+++ b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Spanish
@@ -0,0 +1,129 @@
+Settings=Ajustes
+Configure GMenu2X's options=Configura las opciones de GMenu2X
+Activate Usb on SD=Activa USB para SD
+Activate Usb on Nand=Activa USB para Nand
+Info about GMenu2X=Información sobre GMenu2X
+About=Información
+Add section=Añadir sección
+Rename section=Renombrar sección
+Delete section=Eliminar sección
+Scan for applications and games=Buscar aplicaciones y juegos
+applications=aplicaciones
+Edit link=Editar enlace
+Title=Título
+Link title=Título de enlace
+Description=Descripción
+Link description=Descripción de enlace
+Section=Sección
+The section this link belongs to=La sección a la que pertenece el enlace
+Icon=Icono
+Select an icon for the link: $1=Seleccione un icono para el enlace: $1
+Manual=Manual
+Select a graphic/textual manual or a readme=Seleccione un manual gráfico/texto o un leeme
+Cpu clock frequency to set when launching this link=Ajuste del reloj de la cpu para cargar este enlace
+Volume to set for this link=Ajuste del volumen para este enlace
+Parameters=Parámetros
+Parameters to pass to the application=Parámetros que enviar a la aplicación
+Selector Directory=Directorio del Selector
+Directory to scan for the selector=Directorio que explorar con el selector
+Selector Browser=Explorador del selector
+Allow the selector to change directory=Permitir al selector cambiar de directorio
+Selector Filter=Filtro del selector
+Filter for the selector (Separate values with a comma)=Filtro para el selector (Separar valores con comas)
+Selector Screenshots=Capturas de pantalla del selector
+Directory of the screenshots for the selector=Directorio de capturas de pantalla para el selector
+Selector Aliases=Alias del selector
+File containing a list of aliases for the selector=Archivo contenedor de la lista de alias para el selector
+Don't Leave=No salir
+Don't quit GMenu2X when launching this link=No terminar GMenu2X al cargar este enlace
+Save last selection=Recordar última selección
+Save the last selected link and section on exit=Recordar última sección y enlace seleccionado al salir
+Clock for GMenu2X=Reloj para GMenu2X
+Set the cpu working frequency when running GMenu2X=Ajuste la frecuencia de trabajo de cpu al ejecutar GMenu2X
+Maximum overclock=Overclock máximo
+Set the maximum overclock for launching links=Ajuste el máximo overclock para cargar enlaces
+Global Volume=Volumen global
+Set the default volume for the gp2x soundcard=Ajuste el volumen por defecto del sonido en gp2x
+Output logs=Archivos de registro de sucesos
+Logs the output of the links. Use the Log Viewer to read them.=Registra los sucesos de los enlaces. Usa el lector de registros para leerlos.
+Number of columns=Número de columnas
+Set the number of columns of links to display on a page=Ajuste el número de columnas de enlaces que mostrar por página
+Number of rows=Número de líneas
+Set the number of rows of links to display on a page=Ajuste el número de líneas de enlaces que mostrar por página
+Top Bar Color=Color de barra superior
+Color of the top bar=Color de la barra superior
+Bottom Bar Color=Color de barra inferior
+Color of the bottom bar=Color de la barra inferior
+Selection Color=Color de selección
+Color of the selection and other interface details=Color de la selección y otros detalles del interfaz
+You should disable Usb Networking to do this.=Debe desactivar Red por USB para hacer esto.
+Operation not permitted.=Operación no permitida.
+Language=Idioma
+Set the language used by GMenu2X=Ajuste el idioma usado en GMenu2X
+Increase=Aumentar
+Decrease=Reducir
+Change color component=Cambiar componente cromático
+Increase value=Incrementar valor
+Decrease value=Reducir valor
+Switch=Cambiar
+Change value=Cambiar valor
+Edit=Modificar
+Clear=Limpiar
+Select a directory=Selecciona un directorio
+Select a file=Selecciona un archivo
+Clock (default: 200)=Frecuencia (predefinida: 200)
+Volume (default: -1)=Volumen (predefinida: -1)
+Enter folder=Entrar en carpeta
+Confirm=Confirmar
+Enter folder/Confirm=Entrar en carpeta/Confirmar
+Up one folder=Subir una carpeta
+Select an application=Seleccionar un programa
+Space=Espacio
+Shift=Mayúsculas
+Cancel=Cancelar
+OK=Aceptar
+Backspace=Retroceso
+Skin=Máscara
+Set the skin used by GMenu2X=Seleccione la máscara usada en GMenu2X
+Add link in $1=Añadir enlace en $1
+Edit $1=Modificar $1
+Delete $1 link=Eliminar el enlace de $1
+Deleting $1=Eliminando $1
+Are you sure?=¿Estás seguro?
+Insert a name for the new section=Insertar nombre para la nueva sección
+Insert a new name for this section=Insertar nuevo nombre para la sección
+Yes=Sí
+No=No
+You will lose all the links in this section.=Se perderán todos los enlaces de esta sección.
+Exit=Salir
+Link Scanner=Buscador de enlaces
+Scanning SD filesystem...=Explorando sistema de archivos de SD...
+Scanning NAND filesystem...=Explorando sistema de archivos de NAND...
+$1 files found.=$1 archivo(s) encontrado(s).
+Creating links...=Creando enlaces...
+$1 links created.=$1 enlace(s) creado(s).
+Version $1 (Build date: $2)=Versión $1 (Compilación: $2)
+Log Viewer=Lector de Registro de Sucesos
+Displays last launched program's output=Muestra la salida del último programa ejecutado
+Do you want to delete the log file?=¿Desea eliminar el archivo de registro de sucesos?
+USB Enabled (SD)=USB Activado (SD)
+USB Enabled (Nand)=USB Activado (Nand)
+Turn off=Desactivar
+Launching $1=Ejecutando $1
+Change page=Cambiar página
+Page=Página
+Scroll=Desplazamiento
+Untitled=Sin título
+Change GMenu2X wallpaper=Cambia el fondo de GMenu2X
+Activate/deactivate tv-out=Activa/desactiva salida de tv
+Select wallpaper=Selecciona imagen de fondo
+Gamma=Gama
+Set gp2x gamma value (default: 10)=Ajusta el valor gama de la gp2x (predefinido: 10)
+Tv-Out encoding=Codificación de salida de tv
+Encoding of the tv-out signal=Codificación de la señal de salida de tv
+Tweak RAM Timings=Modifica la sincronización de RAM
+This usually speeds up the application at the cost of stability=Normalmente acelera la aplicación a costa de la estabilidad
+Gamma (default: 0)=Gama (predefinido: 0)
+Gamma value to set when launching this link=Valor de gama que usar al lanzar este enlace
+Wrapper=Retornar
+Explicitly relaunch GMenu2X after this link's execution ends=Forzar recarga de GMenu2X al terminar ejecución del enlace
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Swedish b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Swedish
new file mode 100644
index 0000000..5501464
--- /dev/null
+++ b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Swedish
@@ -0,0 +1,119 @@
+Settings=Inställningar
+Configure GMenu2X's options=Konfigurera GMenu2X's inställningar
+Activate Usb on SD=Aktivera Usb på SD
+Activate Usb on Nand=Aktivera Usb på Nand
+Info about GMenu2X=Info om GMenu2X
+About=Om
+Add section=Lägg till avdelning
+Rename section=Byt namn på avdelning
+Delete section=Ta bort avdelning
+Scan for applications and games=Scanna efter applikationer
+applications=applikationer
+Edit link=Redigera länk
+Title=Titel
+Link title=Länktitel
+Description=Beskrivning
+Link description=Länkbeskrivning
+Section=Avdelning
+The section this link belongs to=Avdelningen som den här länken tillhör
+Icon=Ikon
+Select an icon for the link: $1=Välj en ikon till länken: $1
+Manual=Manual
+Select a graphic/textual manual or a readme=Välj en grafisk/textbaserad manual eller en readme
+Cpu clock frequency to set when launching this link=Cpu-klockfrekvens att ändra till när denna länk körs
+Volume to set for this link=Volymen som skall sättas för den här länken
+Parameters=Parametrar
+Parameters to pass to the application=Parametrar som skall skickas till applikationen
+Selector Directory=Selectorkatalog
+Directory to scan for the selector=Katalog som selector skall scanna
+Selector Browser=Selectorutforskare
+Allow the selector to change directory=Tillåt selector att byta katalog
+Selector Filter=Selectorfilter
+Filter for the selector (Separate values with a comma)=Filter för selector (separera värdena med kommatecken)
+Selector Screenshots=Skärmdumpar till Selector
+Directory of the screenshots for the selector=Katalog med skärmdumpar till selector
+Selector Aliases=Selectoralias
+File containing a list of aliases for the selector=Fil som innehåller en lista med alias för selector
+Explicitly relaunch GMenu2X after this link's execution ends=Tvinga GMenu2X att starta om när denna länk körts klart
+Don't Leave=Lämna inte
+Don't quit GMenu2X when launching this link=Avsluta inte GMenu2X när denna länk körs
+Save last selection=Spara senaste markeringen
+Save the last selected link and section on exit=Spara senaste länk och avdelning vid avslut
+Clock for GMenu2X=Klockfrekvens för GMenu2X
+Set the cpu working frequency when running GMenu2X=Ställ in cpu-klockfrekvensen för GMenu2X
+Maximum overclock=Maximal överklockning
+Set the maximum overclock for launching links=Sätt maximal överklockning vid start av länk
+Global Volume=Global volym
+Set the default volume for the gp2x soundcard=Sätt förinställd volym på gp2x ljudkortet
+Output logs=logg utskrift
+Logs the output of the links. Use the Log Viewer to read them.=Skriver ut loggar för länkarna. Använd loggläsaren för att läsa dem.
+Number of columns=Antal spalter
+Set the number of columns of links to display on a page=Välj antal spalter med länkar som skall visas per sida
+Number of rows=Antal rader
+Set the number of rows of links to display on a page=Välj antal rader med länkar som skall visas per sida
+Top Bar Color=Översta fältets färg
+Color of the top bar=Färg på det översta fältet
+Bottom Bar Color=Nedersta fältets färg
+Color of the bottom bar=Färg på det nedersta fältet
+Selection Color=Markörfärg
+Color of the selection and other interface details=Färg på markören och andra delar av gränssnittet
+You should disable Usb Networking to do this.=Du bör slå av usb-nätverket när du gör detta.
+Operation not permitted.=Otillåten användning.
+Language=Språk
+Set the language used by GMenu2X=Ställ in språk för GMenu2X
+Increase=Öka
+Decrease=Minska
+Change color component=Ändra färgkomponent
+Increase value=Öka värde
+Decrease value=Minska värde
+Switch=Ändra
+Change value=Ändra värde
+Edit=Redigera
+Clear=Rensa
+Select a directory=Välj en katalog
+Select a file=Välj en fil
+Clock (default: 200)=Klockfrekvens (förinställd: 200)
+Volume (default: -1)=Volym (förinställd: -1)
+Wrapper=Wrapper
+Enter folder=Öppna katalog
+Confirm=Bekräfta
+Enter folder/Confirm=Öppna katalog/Bekräfta
+Up one folder=Upp en katalog
+Select an application=Välj en applikation
+Space=Mellanslag
+Shift=Skift
+Cancel=Avbryt
+OK=OK
+Backspace=Backsteg
+Skin=Skin
+Set the skin used by GMenu2X=Välj vilket skin GMenu2X ska använda
+Add link in $1=Skapa länk i $1
+Edit $1=Redigera $1
+Delete $1 link=Ta bort $1
+Deleting $1=Tar bort $1
+Are you sure?=Är du säker?
+Insert a name for the new section=Skriv in namn för den nya avdelningen
+Insert a new name for this section=Skriv in namn för den här avdelningen
+Yes=Ja
+No=Nej
+You will lose all the links in this section.=Du kommer att förlora alla länkar i den här avdelningen.
+
+Exit=Avsluta
+Link Scanner=Länkscanner
+Scanning SD filesystem...=Scannar SD filsystem...
+Scanning NAND filesystem...=Scannar NAND filsystem...
+$1 files found.=$1 fil(er) funna.
+Creating links...=Skapar länkar...
+$1 links created.=$1 länk(ar) skapade.
+Version $1 (Build date: $2)=Version $1 (Skapad den: $2)
+Log Viewer=Loggläsare
+Displays last launched program's output=Visa utdata från senast körda program
+Do you want to delete the log file?=Vill du ta bort loggfilen?
+USB Enabled (SD)=USB Aktiverad (SD)
+USB Enabled (Nand)=USB Aktiverad (Nand)
+Turn off=Stäng av
+Launching $1=Startar $1
+Change page=Byt sida
+Page=Sida
+Scroll=Rulla
+Untitled=Obetitlad
diff --git a/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Turkish b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Turkish
new file mode 100644
index 0000000..7554ced
--- /dev/null
+++ b/FunKey/board/funkey/rootfs-overlay/usr/share/gmenu2x/translations/Turkish
@@ -0,0 +1,133 @@
+Settings=Ayarlar
+Configure GMenu2X's options=GMenu2X'in ayarlarini degistir
+Activate Usb on SD=SD Karti için USB baglantisini aktive et
+Activate Usb on Nand=Nand Bellegi için USB baglantisini aktive et
+Info about GMenu2X=GMenu2X hakkinda bilgi
+About=Hakkinda
+Add section=Bölüm ekle
+Rename section=Bölümü yeniden adlandir
+Delete section=Bölümü sil
+Scan for applications and games=Oyun ve program taramasi
+applications=programlar
+Edit link=Link'i düzenle
+Title=Baslik
+Link title=Link basligi
+Description=Açiklama
+Link description=Link açiklamasi
+Section=Bölüm
+The section this link belongs to=Bu linkin ait oldugu bölüm
+Icon=Icon
+Select an icon for the link: $1=$1 Linki için bir simge seçin:
+Manual=Kullanim kilavuzu
+Select a graphic/textual manual or a readme=Grafik/Text bir kullanim kilavuzu veya readme dosyasi seçin
+Cpu clock frequency to set when launching this link=Bu linki baslatirken kullanilacak CPU hizi
+Volume to set for this link=Bu linkin ses seviyesi
+Parameters=Parametreler
+Parameters to pass to the application=Programa baslatirken aktarilacak parametreler
+Selector Directory=Seçim klasörü
+Directory to scan for the selector=Seçim için taranacak klasör
+Selector Browser=Dosya seçimi
+Allow the selector to change directory=Dosya seçiminin klasör degistirmesine izin ver
+Selector Filter=Seçim filtresi
+Filter for the selector (Separate values with a comma)=Seçim için filtre (Verileri virgül ile ayirin)
+Selector Screenshots=Seçim Ekran görüntüleri
+Directory of the screenshots for the selector=Seçim için ekran görüntülerinin klasörü
+Selector Aliases=Seçim için alternatif isimler (alias)
+File containing a list of aliases for the selector=Seçim için alternatif isimleri içeren dosya adi
+Explicitly relaunch GMenu2X after this link's execution ends=GMenu2X'i bu linkin çalismasi bittiginde özellikle tekrar baslat
+Don't Leave=Çikma
+Don't quit GMenu2X when launching this link=Bu linki baslatirken GMenu2X'i durdurma
+Save last selection=Son seçimi sakla
+Save the last selected link and section on exit=Son seçilen link ve seçimi çikista sakla
+Clock for GMenu2X=GMenu2X için CPU hizi
+Set the cpu working frequency when running GMenu2X=GMenu2X'in çalistigi CPU hizi
+Maximum overclock=Maksimum overclock hizi
+Set the maximum overclock for launching links=Linkler baslatilirken kullanilacak en yüksek CPU hizini belirleyin
+Global Volume=Genel ses seviyesi
+Set the default volume for the gp2x soundcard=GP2X ses kartinin standart ses seviyesini belirleyin
+Output logs=Çikti kayitlari
+Logs the output of the links. Use the Log Viewer to read them.=Linklerin çiktilarini kaydeder. Kayit gösterici ile okuyabilirsiniz.
+Number of columns=Sütun sayisi
+Set the number of columns of links to display on a page=Bir sayfada gösterilecek sütun sayisini belirleyin
+Number of rows=Satir sayisi
+Set the number of rows of links to display on a page=Bir sayfada gösterilecek satir sayisini belirleyin
+Top Bar Color=Baslik çubugunun rengi
+Color of the top bar=Baslik çubugunun rengini ve saydamligini belirler
+Bottom Bar Color=Statü çubugunun rengi
+Color of the bottom bar=Statü çubugunun rengini ve saydamligini belirler
+Selection Color=Seçim rengi
+Color of the selection and other interface details=Seçim rengi ve baska arabirim detaylarinin rengi
+You should disable Usb Networking to do this.=Bunu yapmadan önce USB-Ag destegini kapatmalisiniz.
+Operation not permitted.=Isleme izin verilmedi.
+Language=Dil
+Set the language used by GMenu2X=GMenu2X'in kullanacagi dili seçin
+Increase=Artir
+Decrease=Azalt
+Change color component=Renk ögesini degistirin
+Increase value=Degeri artir
+Decrease value=Degeri azalt
+Switch=Degistir
+Change value=Degeri degistir
+Edit=Düzenle
+Clear=Temizle
+Select a directory=Bir klasör seç
+Select a file=Bir dosya seç
+Clock (default: 200)=Hiz (Standart: 200)
+Volume (default: -1)=Ses seviyesi (Standart: -1)
+Wrapper=GMenu2X'i Tekrar baslat
+Enter folder=Klasöre gir
+Confirm=Onayla
+Enter folder/Confirm=Klasöre gir / Onayla
+Up one folder=Bir klasör yukari
+Select an application=Bir program seçin
+Space=Bosluk
+Shift=Shift
+Cancel=Vazgeç
+OK=OK
+Backspace=Geri
+Skin=Skin
+Set the skin used by GMenu2X=GMenu2X'in kullanacagi Skin'i seçin
+Add link in $1=$1 bölümüne link ekle
+Edit $1=$1 linkini düzenle
+Delete $1 link=$1 linkini sil
+Deleting $1=$1 linki silinecek
+Are you sure?=Emin misiniz?
+Insert a name for the new section=Yeni bölüm için bir isim girin
+Insert a new name for this section=Bu bölüm için yeni isim girin
+Yes=Evet
+No=Hayir
+You will lose all the links in this section.=Bu bölümdeki tüm linkleri kaybedeceksiniz.
+Exit=Çikis
+Link Scanner=Link taramasi
+Scanning SD filesystem...=SD karti taraniyor...
+Scanning NAND filesystem...=NAND bellegi taraniyor...
+$1 files found.=$1 dosya bulundu.
+Creating links...=Linkler yaratiliyor.
+$1 links created.=$1 link yaratildi.
+Version $1 (Build date: $2)=Version $1 - Derleme: $2
+Log Viewer=Kayit Gösterici
+Displays last launched program's output=Son çalistirilan programin çiktisini gösterir
+Do you want to delete the log file?=Kayit dosyasini silmek istiyor musunuz?
+USB Enabled (SD)=USB aktive edildi (SD)
+USB Enabled (Nand)=USB aktive edildi (Nand)
+Turn off=Kapat
+Launching $1=$1 baslatiliyor
+Change page=Sayfa degistir
+Page=Sayfa
+Scroll=Kaydir
+Untitled=Basliksiz
+Change GMenu2X wallpaper=GMenu2X arka plan resmini degistir
+Activate/deactivate tv-out=TV-Out aktive/deaktive et
+Select wallpaper=Arka plan resmi seç
+Gamma=Gamma
+Set gp2x gamma value (default: 10)=gp2x gamma degerini degistir (Standart: 10)
+Tv-Out encoding=TV-Out Sinyali
+Encoding of the tv-out signal=TV-Out formati ayari
+Tweak RAM Timings=Hafiza hizlandirmasi
+This usually speeds up the application at the cost of stability=Programlari hizlandirir ancak stabilitelerini düsürür
+Gamma (default: 0)=Gamma (Standart: 0)
+Gamma value to set when launching this link=Bu linki çalistirirken kullanilacak gamma degeri
+ON=Açik
+OFF=Kapali
+File Browser=Dosya seçici
+Directory Browser=Klasör seçici
diff --git a/FunKey/board/funkey/sw-description b/FunKey/board/funkey/sw-description
index dd72e7c..d411479 100644
--- a/FunKey/board/funkey/sw-description
+++ b/FunKey/board/funkey/sw-description
@@ -1,6 +1,6 @@
 software =
 {
-        version = "1.1.0";
+        version = "2.0.0";
 
         hardware-compatibility = [ "#RE:^Rev\.[D-E]$" ];
 
diff --git a/FunKey/board/funkey/update_partition b/FunKey/board/funkey/update_partition
index bbbc887..82f46b1 100755
--- a/FunKey/board/funkey/update_partition
+++ b/FunKey/board/funkey/update_partition
@@ -9,20 +9,37 @@ root_mount=/tmp/rootfs
 
 do_preinst()
 {
-    notif " 1/3 EXTRACT FIRMWARE UPDATE..^DO NOT TURN OFF THE CONSOLE"
+    notif " 1/4 EXTRACT FIRMWARE UPDATE..^DO NOT TURN OFF THE CONSOLE"
     exit 0
 }
 
 do_postinst()
 {
-    notif " 2/3 RESIZE ROOT FILESYSTEM^DO NOT TURN OFF THE CONSOLE"
+    notif " 2/4 RESIZE ROOT FILESYSTEM^DO NOT TURN OFF THE CONSOLE"
     resize2fs ${root_part}
     if [ $? -ne 0 ]; then
-    	notif " CANNOT RESIZE ROOT ^FILESYSTEM"
+    	notif " CANNOT RESIZE ROOT^FILESYSTEM"
     	exit 1
     fi
+    notif " 3/4 COPY OPKS TO USB MOUNT^DO NOT TURN OFF THE CONSOLE"
+    folder_opks_emulators=/mnt/Emulators
+    if [ ! -d "$folder_opks_emulators" ]; then
+    	mkdir -p "$folder_opks_emulators"
+    	mkdir -p ${root_mount}
+    	mount -t ext4 ${root_part} ${root_mount}
+    	if [ $? -ne 0 ]; then
+    	    notif "CANNOT MOUNT ROOT^FILESYSTEM"
+    	    exit 1
+    	fi
+    	cp -f ${root_mount}/usr/games/opk/*.opk "$folder_opks_emulators"
+    	umount ${root_mount}
+    	if [ $? -ne 0 ]; then
+    	    notif "CANNOT UNMOUNT ROOT^FILESYSTEM"
+    	    exit 1
+    	fi
+    fi
     for file in $(ls /mnt/FunKey-*.fwu); do
-    	notif " 3/3 ERASE UPDATE FILE^DO NOT TURN OFF THE CONSOLE"
+    	notif " 4/4 ERASE UPDATE FILE^DO NOT TURN OFF THE CONSOLE"
     	rm -f "${file}"
     done
     exit 0
diff --git a/FunKey/configs/funkey_defconfig b/FunKey/configs/funkey_defconfig
index 49ed628..f35b796 100644
--- a/FunKey/configs/funkey_defconfig
+++ b/FunKey/configs/funkey_defconfig
@@ -1,10 +1,20 @@
 BR2_arm=y
 BR2_cortex_a7=y
+BR2_ARM_FPU_VFPV4=y
 BR2_DL_DIR="../download"
 BR2_CCACHE=y
+BR2_OPTIMIZE_FAST=y
+BR2_SHARED_STATIC_LIBS=y
 BR2_GLOBAL_PATCH_DIR="$(BR2_EXTERNAL_FUNKEY_PATH)/board/funkey/patches"
 BR2_TOOLCHAIN_EXTERNAL=y
+BR2_TOOLCHAIN_EXTERNAL_CUSTOM=y
+BR2_TOOLCHAIN_EXTERNAL_DOWNLOAD=y
+BR2_TOOLCHAIN_EXTERNAL_URL="https://github.com/FunKey-Project/FunKey-OS/releases/download/FunKey-OS-2.0.0/FunKey-sdk-2.0.0.tar.gz"
+BR2_TOOLCHAIN_EXTERNAL_HEADERS_4_14=y
+BR2_TOOLCHAIN_EXTERNAL_CUSTOM_MUSL=y
+BR2_TOOLCHAIN_EXTERNAL_CXX=y
 BR2_TOOLCHAIN_EXTERNAL_GDB_SERVER_COPY=y
+BR2_TARGET_OPTIMIZATION="-fno-PIC -march=armv7-a+neon-vfpv4 -mtune=cortex-a7 -mfpu=neon-vfpv4 -mvectorize-with-neon-quad -D__ARM_NEON__"
 BR2_TARGET_GENERIC_HOSTNAME="FunKey"
 BR2_TARGET_GENERIC_ISSUE="Welcome to Buildroot for the FunKey"
 BR2_ROOTFS_DEVICE_TABLE="$(BR2_EXTERNAL_FUNKEY_PATH)/board/funkey/device_table.txt"
@@ -42,6 +52,7 @@ BR2_PACKAGE_ALSA_UTILS_BAT=y
 BR2_PACKAGE_ALSA_UTILS_IECSET=y
 BR2_PACKAGE_ALSA_UTILS_SPEAKER_TEST=y
 BR2_PACKAGE_AUMIX=y
+BR2_PACKAGE_MPG123=y
 BR2_PACKAGE_DOSFSTOOLS=y
 BR2_PACKAGE_DOSFSTOOLS_FATLABEL=y
 BR2_PACKAGE_DOSFSTOOLS_FSCK_FAT=y
@@ -49,10 +60,11 @@ BR2_PACKAGE_DOSFSTOOLS_MKFS_FAT=y
 BR2_PACKAGE_E2FSPROGS=y
 # BR2_PACKAGE_E2FSPROGS_FSCK is not set
 BR2_PACKAGE_E2FSPROGS_RESIZE2FS=y
-BR2_PACKAGE_SDL_GFX=y
+BR2_PACKAGE_FBGRAB=y
 BR2_PACKAGE_SDL_IMAGE_GIF=y
 BR2_PACKAGE_SDL_IMAGE_JPEG=y
 BR2_PACKAGE_SDL_IMAGE_PNG=y
+BR2_PACKAGE_SDL_NET=y
 BR2_PACKAGE_SDL_SOUND=y
 BR2_PACKAGE_SDL_SOUND_PLAYSOUND=y
 BR2_PACKAGE_GPTFDISK=y
@@ -62,14 +74,25 @@ BR2_PACKAGE_PARTED=y
 BR2_PACKAGE_SYSSTAT=y
 BR2_PACKAGE_UBOOT_TOOLS=y
 BR2_PACKAGE_UBOOT_TOOLS_MKENVIMAGE=y
+BR2_PACKAGE_LUA=y
+BR2_PACKAGE_LIBSAMPLERATE=y
+BR2_PACKAGE_LIBSNDFILE=y
+BR2_PACKAGE_OPENAL=y
+BR2_PACKAGE_TINYALSA=y
+BR2_PACKAGE_TREMOR=y
 BR2_PACKAGE_LIBARCHIVE=y
 BR2_PACKAGE_LIBARCHIVE_BSDTAR=y
 BR2_PACKAGE_LIBARCHIVE_BSDCPIO=y
 BR2_PACKAGE_LIBARCHIVE_BSDCAT=y
+BR2_PACKAGE_LIBCONFIG=y
 BR2_PACKAGE_LIBQRENCODE=y
 BR2_PACKAGE_LIBQRENCODE_TOOLS=y
+BR2_PACKAGE_PIXMAN=y
+BR2_PACKAGE_TINYXML2=y
 BR2_PACKAGE_LIBNL=y
 BR2_PACKAGE_LIBRSYNC=y
+BR2_PACKAGE_FMT=y
+BR2_PACKAGE_ICU=y
 BR2_PACKAGE_DHCPCD=y
 BR2_PACKAGE_DROPBEAR=y
 BR2_PACKAGE_PROCPS_NG=y
@@ -80,9 +103,16 @@ BR2_PACKAGE_UTIL_LINUX_BINARIES=y
 BR2_PACKAGE_NANO=y
 BR2_TARGET_ROOTFS_EXT2=y
 BR2_TARGET_ROOTFS_EXT2_4=y
-BR2_TARGET_ROOTFS_EXT2_SIZE="100M"
+BR2_TARGET_ROOTFS_EXT2_SIZE="160M"
 # BR2_TARGET_ROOTFS_TAR is not set
 BR2_PACKAGE_HOST_DOSFSTOOLS=y
 BR2_PACKAGE_HOST_DTC=y
 BR2_PACKAGE_HOST_MTOOLS=y
+BR2_PACKAGE_HOST_SQUASHFS=y
 BR2_PACKAGE_HOST_UBOOT_TOOLS=y
+BR2_PACKAGE_LIBOPK=y
+BR2_PACKAGE_LIBXDGMIME=y
+BR2_PACKAGE_GMENU2X=y
+BR2_PACKAGE_AGG=y
+BR2_PACKAGE_FLUIDLITE=y
+BR2_PACKAGE_LIBMIKMOD=y
diff --git a/FunKey/package/FCEUX/FCEUX.mk b/FunKey/package/FCEUX/FCEUX.mk
index 0522bf3..132e462 100644
--- a/FunKey/package/FCEUX/FCEUX.mk
+++ b/FunKey/package/FCEUX/FCEUX.mk
@@ -4,7 +4,7 @@
 #
 ################################################################################
 
-FCEUX_VERSION = fceux-FunKey-1.00
+FCEUX_VERSION = fceux-FunKey-1.1.0
 FCEUX_SITE_METHOD = git
 FCEUX_SITE = https://github.com/FunKey-Project/fceux.git
 FCEUX_LICENSE = GPL-2.0
@@ -12,25 +12,7 @@ FCEUX_LICENSE_FILES = COPYING
 
 FCEUX_DEPENDENCIES = sdl sdl_image sdl_mixer sdl_ttf zlib
 
-FCEUX_CFLAGS = $(TARGET_CFLAGS)
-
-ifeq ($(BR2_ARM_CPU_ARMV7A),y)
-FCEUX_CFLAGS += -march=armv7-a
-endif
-
-ifeq ($(BR2_GCC_TARGET_CPU),"cortex-a7")
-FCEUX_CFLAGS += -mtune=cortex-a7
-endif
-
-ifeq ($(BR2_GCC_TARGET_FLOAT_ABI),"hard")
-FCEUX_CFLAGS += -mfloat-abi=hard -ffast-math -funsafe-math-optimizations
-else ifeq ($(BR2_GCC_TARGET_FLOAT_ABI),"soft")
-FCEUX_CFLAGS += -mfloat-abi=soft -ffast-math -funsafe-math-optimizations
-endif
-
-ifeq ($(BR2_ARM_CPU_HAS_NEON),y)
-FCEUX_CFLAGS += -D__ARM_NEON__ -mfpu=neon -mvectorize-with-neon-quad
-endif
+FCEUX_CFLAGS = $(TARGET_CFLAGS) $(subst $\",,$(BR2_TARGET_OPTIMIZATION)) -mfloat-abi=hard -ffast-math -funsafe-math-optimizations
 
 FCEUX_CFLAGS += -ggdb -O3
 FCEUX_CFLAGS += -Wno-write-strings -Wno-sign-compare
@@ -65,5 +47,10 @@ define FCEUX_INSTALL_TARGET_CMDS
 	$(INSTALL) -m 0755 $(@D)/fceux/fceux $(TARGET_DIR)/usr/games/fceux
 endef
 
+define FCEUX_CREATE_OPK
+	$(INSTALL) -d -m 0755 $(TARGET_DIR)/usr/games/opk
+	$(HOST_DIR)/usr/bin/mksquashfs $(FCEUX_PKGDIR)/opk/nes $(TARGET_DIR)/usr/games/opk/nes_fceux_funkey-s.opk -all-root -noappend -no-exports -no-xattrs
+endef
+FCEUX_POST_INSTALL_TARGET_HOOKS += FCEUX_CREATE_OPK
 
 $(eval $(generic-package))
diff --git a/FunKey/package/FCEUX/opk/nes/nes.funkey-s.desktop b/FunKey/package/FCEUX/opk/nes/nes.funkey-s.desktop
new file mode 100644
index 0000000..191ad6d
--- /dev/null
+++ b/FunKey/package/FCEUX/opk/nes/nes.funkey-s.desktop
@@ -0,0 +1,12 @@
+[Desktop Entry]
+Type=Application
+Name=NES
+Comment=FC/NES Emulator
+Icon=nes
+Exec=/usr/games/launchers/nes_launch.sh %f
+Terminal=false
+X-OD-NeedsDownscaling=true
+Categories=emulators;
+selectorbrowser=true
+SelectorDir=/mnt/NES
+SelectorFilter=nes,NES
diff --git a/FunKey/package/FCEUX/opk/nes/nes.png b/FunKey/package/FCEUX/opk/nes/nes.png
new file mode 100644
index 0000000..04ea8bb
Binary files /dev/null and b/FunKey/package/FCEUX/opk/nes/nes.png differ
diff --git a/FunKey/package/FunKey-GPIO-Mapping/FunKey-GPIO-Mapping.mk b/FunKey/package/FunKey-GPIO-Mapping/FunKey-GPIO-Mapping.mk
index af98396..4e85222 100644
--- a/FunKey/package/FunKey-GPIO-Mapping/FunKey-GPIO-Mapping.mk
+++ b/FunKey/package/FunKey-GPIO-Mapping/FunKey-GPIO-Mapping.mk
@@ -4,7 +4,7 @@
 #
 ################################################################################
 
-FUNKEY_GPIO_MAPPING_VERSION = FunKey-GPIO-Mapping-FunKey-1.00
+FUNKEY_GPIO_MAPPING_VERSION = FunKey-GPIO-Mapping-FunKey-1.1.0
 FUNKEY_GPIO_MAPPING_SITE_METHOD = git
 FUNKEY_GPIO_MAPPING_SITE = https://github.com/FunKey-Project/FunKey-GPIO-Mapping.git
 FUNKEY_GPIO_MAPPING_SITE_LICENSE = GPL-2.1+
diff --git a/FunKey/package/PCSX-ReARMed/PCSX-ReARMed.mk b/FunKey/package/PCSX-ReARMed/PCSX-ReARMed.mk
index 6bf4d61..46d70ae 100644
--- a/FunKey/package/PCSX-ReARMed/PCSX-ReARMed.mk
+++ b/FunKey/package/PCSX-ReARMed/PCSX-ReARMed.mk
@@ -4,7 +4,7 @@
 #
 ################################################################################
 
-PCSX_REARMED_VERSION = pcsx_rearmed-FunKey-1.00
+PCSX_REARMED_VERSION = pcsx_rearmed-FunKey-1.1.0
 PCSX_REARMED_SITE_METHOD = git
 PCSX_REARMED_SITE = https://github.com/FunKey-Project/pcsx_rearmed.git
 PCSX_REARMED_LICENSE = GPL-2.0
@@ -12,25 +12,10 @@ PCSX_REARMED_LICENSE_FILES = COPYING
 
 PCSX_REARMED_DEPENDENCIES = sdl sdl_image sdl_mixer sdl_ttf zlib
 
-PCSX_REARMED_CFLAGS = $(TARGET_CFLAGS)
-
-ifeq ($(BR2_ARM_CPU_ARMV7A),y)
-PCSX_REARMED_CFLAGS += -march=armv7-a
-endif
-
-ifeq ($(BR2_GCC_TARGET_CPU),"cortex-a7")
-PCSX_REARMED_CFLAGS += -mtune=cortex-a7
-endif
-
-ifeq ($(BR2_GCC_TARGET_FLOAT_ABI),"hard")
-PCSX_REARMED_CFLAGS += -mfloat-abi=hard -ffast-math -funsafe-math-optimizations
-else ifeq ($(BR2_GCC_TARGET_FLOAT_ABI),"soft")
-PCSX_REARMED_CFLAGS += -mfloat-abi=soft -ffast-math -funsafe-math-optimizations
-endif
+PCSX_REARMED_CFLAGS = $(TARGET_CFLAGS) $(subst $\",,$(BR2_TARGET_OPTIMIZATION)) -mfloat-abi=hard -ffast-math -funsafe-math-optimizations
 
 ifeq ($(BR2_ARM_CPU_HAS_NEON),y)
-PCSX_REARMED_CONF_OPTS += --enable-neon --gpu=neon
-PCSX_REARMED_CFLAGS += -D__ARM_NEON__ -mfpu=neon -mvectorize-with-neon-quad
+    PCSX_REARMED_CONF_OPTS += --enable-neon --gpu=neon
 endif
 
 PCSX_REARMED_CONF_OPTS += --sound-drivers=sdl
@@ -62,4 +47,10 @@ define PCSX_REARMED_INSTALL_TARGET_CMDS
 	$(INSTALL) -m 0755 $(@D)/pcsx $(TARGET_DIR)/usr/games/pcsx
 endef
 
+define PCSX_REARMED_CREATE_OPK
+	$(INSTALL) -d -m 0755 $(TARGET_DIR)/usr/games/opk
+	$(HOST_DIR)/usr/bin/mksquashfs $(PCSX_REARMED_PKGDIR)/opk/ps1 $(TARGET_DIR)/usr/games/opk/ps1_pcsx_funkey-s.opk -all-root -noappend -no-exports -no-xattrs
+endef
+PCSX_REARMED_POST_INSTALL_TARGET_HOOKS += PCSX_REARMED_CREATE_OPK
+
 $(eval $(generic-package))
diff --git a/FunKey/package/PCSX-ReARMed/opk/ps1/ps1.funkey-s.desktop b/FunKey/package/PCSX-ReARMed/opk/ps1/ps1.funkey-s.desktop
new file mode 100644
index 0000000..c2d3861
--- /dev/null
+++ b/FunKey/package/PCSX-ReARMed/opk/ps1/ps1.funkey-s.desktop
@@ -0,0 +1,12 @@
+[Desktop Entry]
+Type=Application
+Name=PS1
+Comment=PS1 Emulator
+Icon=ps1
+Exec=/usr/games/launchers/psone_launch_pcsx.sh %f
+Terminal=false
+X-OD-NeedsDownscaling=true
+Categories=emulators;
+selectorbrowser=true
+SelectorDir=/mnt/PS1
+SelectorFilter=bin,BIN,cue,CUE,pbp,PBP
diff --git a/FunKey/package/PCSX-ReARMed/opk/ps1/ps1.png b/FunKey/package/PCSX-ReARMed/opk/ps1/ps1.png
new file mode 100644
index 0000000..66181c7
Binary files /dev/null and b/FunKey/package/PCSX-ReARMed/opk/ps1/ps1.png differ
diff --git a/FunKey/package/PocketSNES/PocketSNES.mk b/FunKey/package/PocketSNES/PocketSNES.mk
index 7753f9d..265d486 100644
--- a/FunKey/package/PocketSNES/PocketSNES.mk
+++ b/FunKey/package/PocketSNES/PocketSNES.mk
@@ -4,7 +4,7 @@
 #
 ################################################################################
 
-POCKETSNES_VERSION = PocketSNES-FunKey-1.00
+POCKETSNES_VERSION = PocketSNES-FunKey-1.1.0
 POCKETSNES_SITE_METHOD = git
 POCKETSNES_SITE = https://github.com/FunKey-Project/PocketSNES.git
 POCKETSNES_LICENSE = GPL-2.0
@@ -12,25 +12,7 @@ POCKETSNES_LICENSE_FILES = COPYING
 
 POCKETSNES_DEPENDENCIES = sdl sdl_image sdl_mixer sdl_ttf zlib
 
-POCKETSNES_CFLAGS = $(TARGET_CFLAGS)
-
-ifeq ($(BR2_ARM_CPU_ARMV7A),y)
-POCKETSNES_CFLAGS += -march=armv7-a
-endif
-
-ifeq ($(BR2_GCC_TARGET_CPU),"cortex-a7")
-POCKETSNES_CFLAGS += -mtune=cortex-a7
-endif
-
-ifeq ($(BR2_GCC_TARGET_FLOAT_ABI),"hard")
-POCKETSNES_CFLAGS += -mfloat-abi=hard -ffast-math -funsafe-math-optimizations
-else ifeq ($(BR2_GCC_TARGET_FLOAT_ABI),"soft")
-POCKETSNES_CFLAGS += -mfloat-abi=soft -ffast-math -funsafe-math-optimizations
-endif
-
-ifeq ($(BR2_ARM_CPU_HAS_NEON),y)
-POCKETSNES_CFLAGS += -D__ARM_NEON__ -mfpu=neon -mvectorize-with-neon-quad
-endif
+POCKETSNES_CFLAGS = $(TARGET_CFLAGS) $(subst $\",,$(BR2_TARGET_OPTIMIZATION)) -mfloat-abi=hard -ffast-math -funsafe-math-optimizations
 
 POCKETSNES_SDL_CFLAGS += $(shell $(STAGING_DIR)/usr/bin/sdl-config --cflags)
 POCKETSNES_SDL_LIBS += $(shell $(STAGING_DIR)/usr/bin/sdl-config --libs)
@@ -71,4 +53,10 @@ define POCKETSNES_INSTALL_TARGET_CMDS
 	$(INSTALL) -m 0755 $(@D)/psnes $(TARGET_DIR)/usr/games/psnes
 endef
 
+define POCKETSNES_CREATE_OPK
+	$(INSTALL) -d -m 0755 $(TARGET_DIR)/usr/games/opk
+	$(HOST_DIR)/usr/bin/mksquashfs $(POCKETSNES_PKGDIR)/opk/snes $(TARGET_DIR)/usr/games/opk/snes_pocketsnes_funkey-s.opk -all-root -noappend -no-exports -no-xattrs
+endef
+POCKETSNES_POST_INSTALL_TARGET_HOOKS += POCKETSNES_CREATE_OPK
+
 $(eval $(generic-package))
diff --git a/FunKey/package/PocketSNES/opk/snes/snes.funkey-s.desktop b/FunKey/package/PocketSNES/opk/snes/snes.funkey-s.desktop
new file mode 100644
index 0000000..f7668dd
--- /dev/null
+++ b/FunKey/package/PocketSNES/opk/snes/snes.funkey-s.desktop
@@ -0,0 +1,12 @@
+[Desktop Entry]
+Type=Application
+Name=SNES
+Comment=SNES Emulator
+Icon=snes
+Exec=/usr/games/launchers/snes_launch.sh %f
+Terminal=false
+X-OD-NeedsDownscaling=true
+Categories=emulators;
+selectorbrowser=true
+SelectorDir=/mnt/SNES
+SelectorFilter=sfc,SFC,smc,SMC
diff --git a/FunKey/package/PocketSNES/opk/snes/snes.png b/FunKey/package/PocketSNES/opk/snes/snes.png
new file mode 100644
index 0000000..d3f327a
Binary files /dev/null and b/FunKey/package/PocketSNES/opk/snes/snes.png differ
diff --git a/FunKey/package/ProdScreens/ProdScreens.mk b/FunKey/package/ProdScreens/ProdScreens.mk
index e14548f..0037a14 100644
--- a/FunKey/package/ProdScreens/ProdScreens.mk
+++ b/FunKey/package/ProdScreens/ProdScreens.mk
@@ -4,7 +4,7 @@
 #
 ################################################################################
 
-PRODSCREENS_VERSION = FunKey-ProdScreens-FunKey-1.00
+PRODSCREENS_VERSION = FunKey-ProdScreens-FunKey-1.1.0
 PRODSCREENS_SITE_METHOD = git
 PRODSCREENS_SITE = https://github.com/FunKey-Project/FunKey-ProdScreens.git
 PRODSCREENS_SITE_LICENSE = GPL-2.1+
diff --git a/FunKey/package/agg/0001-Fix-non-terminating-loop-conditions-when-len-1.patch b/FunKey/package/agg/0001-Fix-non-terminating-loop-conditions-when-len-1.patch
new file mode 100644
index 0000000..eaf0467
--- /dev/null
+++ b/FunKey/package/agg/0001-Fix-non-terminating-loop-conditions-when-len-1.patch
@@ -0,0 +1,81 @@
+From efd33aad5e69f36ab343b1f28839a55db4538104 Mon Sep 17 00:00:00 2001
+From: Tom Hughes <tom@compton.nu>
+Date: Sun, 19 May 2013 10:55:37 +0100
+Subject: [PATCH 01/15] Fix non-terminating loop conditions when len=1
+
+-   while(abs(sx - lp.x1) + abs(sy - lp.y1) > lp2.len)
++   while(abs(sx - lp.x1) + abs(sy - lp.y1) > 1 + lp2.len)
+    {
+        sx = (lp.x1 + sx) >> 1;
+        sy = (lp.y1 + sy) >> 1;
+    }
+---
+ include/agg_renderer_outline_aa.h    | 8 ++++----
+ include/agg_renderer_outline_image.h | 4 ++--
+ 2 files changed, 6 insertions(+), 6 deletions(-)
+
+diff --git a/include/agg_renderer_outline_aa.h b/include/agg_renderer_outline_aa.h
+index ce25a2e..cb2aa00 100644
+--- a/include/agg_renderer_outline_aa.h
++++ b/include/agg_renderer_outline_aa.h
+@@ -1659,7 +1659,7 @@ namespace agg
+                         }
+                         else
+                         {
+-                            while(abs(sx - lp.x1) + abs(sy - lp.y1) > lp2.len)
++                            while(abs(sx - lp.x1) + abs(sy - lp.y1) > 1 + lp2.len)
+                             {
+                                 sx = (lp.x1 + sx) >> 1;
+                                 sy = (lp.y1 + sy) >> 1;
+@@ -1726,7 +1726,7 @@ namespace agg
+                         }
+                         else
+                         {
+-                            while(abs(ex - lp.x2) + abs(ey - lp.y2) > lp2.len)
++                            while(abs(ex - lp.x2) + abs(ey - lp.y2) > 1 + lp2.len)
+                             {
+                                 ex = (lp.x2 + ex) >> 1;
+                                 ey = (lp.y2 + ey) >> 1;
+@@ -1798,7 +1798,7 @@ namespace agg
+                         }
+                         else
+                         {
+-                            while(abs(sx - lp.x1) + abs(sy - lp.y1) > lp2.len)
++                            while(abs(sx - lp.x1) + abs(sy - lp.y1) > 1 + lp2.len)
+                             {
+                                 sx = (lp.x1 + sx) >> 1;
+                                 sy = (lp.y1 + sy) >> 1;
+@@ -1811,7 +1811,7 @@ namespace agg
+                         }
+                         else
+                         {
+-                            while(abs(ex - lp.x2) + abs(ey - lp.y2) > lp2.len)
++                            while(abs(ex - lp.x2) + abs(ey - lp.y2) > 1 + lp2.len)
+                             {
+                                 ex = (lp.x2 + ex) >> 1;
+                                 ey = (lp.y2 + ey) >> 1;
+diff --git a/include/agg_renderer_outline_image.h b/include/agg_renderer_outline_image.h
+index fbfac10..66d2b9a 100644
+--- a/include/agg_renderer_outline_image.h
++++ b/include/agg_renderer_outline_image.h
+@@ -969,7 +969,7 @@ namespace agg
+                         }
+                         else
+                         {
+-                            while(abs(sx - lp.x1) + abs(sy - lp.y1) > lp2.len)
++                            while(abs(sx - lp.x1) + abs(sy - lp.y1) > 1 + lp2.len)
+                             {
+                                 sx = (lp.x1 + sx) >> 1;
+                                 sy = (lp.y1 + sy) >> 1;
+@@ -982,7 +982,7 @@ namespace agg
+                         }
+                         else
+                         {
+-                            while(abs(ex - lp.x2) + abs(ey - lp.y2) > lp2.len)
++                            while(abs(ex - lp.x2) + abs(ey - lp.y2) > 1 + lp2.len)
+                             {
+                                 ex = (lp.x2 + ex) >> 1;
+                                 ey = (lp.y2 + ey) >> 1;
+-- 
+1.8.1.4
+
diff --git a/FunKey/package/agg/0001-autogen.patch b/FunKey/package/agg/0001-autogen.patch
new file mode 100644
index 0000000..b773f12
--- /dev/null
+++ b/FunKey/package/agg/0001-autogen.patch
@@ -0,0 +1,15 @@
+Author: Andrea Veri <and@debian.org>
+Description: Skip running configure from autogen.sh when NOCONFIGURE is set.
+
+Index: agg-2.5+dfsg1/autogen.sh
+===================================================================
+--- a/autogen.sh	2007-10-11 00:06:16.000000000 +0200
++++ b/autogen.sh	2012-05-01 16:57:37.916862783 +0200
+@@ -18,6 +18,6 @@
+ automake --foreign --add-missing --ignore-deps
+ 
+ # and finally invoke our new configure
+-./configure $*
++[ -n "$NOCONFIGURE" ] || ./configure $*
+ 
+ # end
diff --git a/FunKey/package/agg/0002-Cure-recursion-by-aborting-if-the-co-ordinates-are-t.patch b/FunKey/package/agg/0002-Cure-recursion-by-aborting-if-the-co-ordinates-are-t.patch
new file mode 100644
index 0000000..4fe7434
--- /dev/null
+++ b/FunKey/package/agg/0002-Cure-recursion-by-aborting-if-the-co-ordinates-are-t.patch
@@ -0,0 +1,40 @@
+From e269fe9b62af6fe314cebe0ee7a6d6d1a4a84d1c Mon Sep 17 00:00:00 2001
+From: Tom Hughes <tom@compton.nu>
+Date: Sun, 19 May 2013 11:03:26 +0100
+Subject: [PATCH 02/15] Cure recursion by aborting if the co-ordinates are too
+ big to handle
+
+---
+ include/agg_rasterizer_cells_aa.h | 9 ++++++++-
+ 1 file changed, 8 insertions(+), 1 deletion(-)
+
+diff --git a/include/agg_rasterizer_cells_aa.h b/include/agg_rasterizer_cells_aa.h
+index d3bb138..3a616d9 100644
+--- a/include/agg_rasterizer_cells_aa.h
++++ b/include/agg_rasterizer_cells_aa.h
+@@ -40,7 +40,8 @@
+ #define AGG_RASTERIZER_CELLS_AA_INCLUDED
+ 
+ #include <string.h>
+-#include <math.h>
++#include <cstdlib>
++#include <limits>
+ #include "agg_math.h"
+ #include "agg_array.h"
+ 
+@@ -333,6 +334,12 @@ namespace agg
+         {
+             int cx = (x1 + x2) >> 1;
+             int cy = (y1 + y2) >> 1;
++
++            // Bail if values are so large they are likely to wrap
++            if ((std::abs(x1) >= std::numeric_limits<int>::max()/2) || (std::abs(y1) >= std::numeric_limits<int>::max()/2) ||
++                (std::abs(x2) >= std::numeric_limits<int>::max()/2) || (std::abs(y2) >= std::numeric_limits<int>::max()/2))
++                    return;
++
+             line(x1, y1, cx, cy);
+             line(cx, cy, x2, y2);
+         }
+-- 
+1.8.1.4
+
diff --git a/FunKey/package/agg/0002-caca.patch b/FunKey/package/agg/0002-caca.patch
new file mode 100644
index 0000000..f98a573
--- /dev/null
+++ b/FunKey/package/agg/0002-caca.patch
@@ -0,0 +1,34 @@
+--- a/configure.in
++++ b/configure.in
+@@ -39,7 +39,7 @@
+ # used as platform library in examples:
+ # todo, make the PREFERED_PLATFORM selectable, after the set of possible 
+ # Platforms to link the examples have been evaluated.
+-PREFERED_PLATFORM=X11
++PREFERED_PLATFORM=sdl
+ case "$host" in
+   *darwin* )
+     OSX_LIBS="-framework Carbon -framework QuickTime"
+@@ -120,9 +120,7 @@
+ if test "$no_x" = "yes"; then
+   AC_MSG_WARN([*** X11 not found! Omitting X11 layer.])
+ fi
+-AM_CONDITIONAL(ENABLE_X11,[test x$no_x = x -a xno != x$enable_platform -a x$win32_host != xyes])
+-AC_SUBST(x_includes)
+-AC_SUBST(x_libraries)
++AM_CONDITIONAL(ENABLE_X11,0)
+ dnl ###############################################
+ 
+ dnl Settung up library version
+
+--- a/include/agg_renderer_outline_aa.h
++++ b/include/agg_renderer_outline_aa.h
+@@ -1375,7 +1375,7 @@
+         //---------------------------------------------------------------------
+         void profile(const line_profile_aa& prof) { m_profile = &prof; }
+         const line_profile_aa& profile() const { return *m_profile; }
+-        line_profile_aa& profile() { return *m_profile; }
++//        line_profile_aa& profile() { return *m_profile; }
+ 
+         //---------------------------------------------------------------------
+         int subpixel_width() const { return m_profile->subpixel_width(); }
diff --git a/FunKey/package/agg/0003-Get-coordinates-from-previous-vertex-if-last-command.patch b/FunKey/package/agg/0003-Get-coordinates-from-previous-vertex-if-last-command.patch
new file mode 100644
index 0000000..b12684d
--- /dev/null
+++ b/FunKey/package/agg/0003-Get-coordinates-from-previous-vertex-if-last-command.patch
@@ -0,0 +1,30 @@
+From 032d5342430f4c5dfbc34a2817d67386a14fd51b Mon Sep 17 00:00:00 2001
+From: Tom Hughes <tom@compton.nu>
+Date: Sun, 19 May 2013 11:40:49 +0100
+Subject: [PATCH 03/15] Get coordinates from previous vertex if last command is
+ path_cmd_end_poly
+
+---
+ include/agg_path_storage.h | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+diff --git a/include/agg_path_storage.h b/include/agg_path_storage.h
+index 7be7393..8922fc8 100644
+--- a/include/agg_path_storage.h
++++ b/include/agg_path_storage.h
+@@ -878,6 +878,12 @@ namespace agg
+                 *x += x2;
+                 *y += y2;
+             }
++            else if (!is_stop(m_vertices.last_command()) &&
++                     is_vertex(m_vertices.prev_vertex(&x2, &y2)))
++            {
++                *x += x2;
++                *y += y2;
++            }
+         }
+     }
+ 
+-- 
+1.8.1.4
+
diff --git a/FunKey/package/agg/0004-Make-rasterizer_outline_aa-ignore-close_polygon-when.patch b/FunKey/package/agg/0004-Make-rasterizer_outline_aa-ignore-close_polygon-when.patch
new file mode 100644
index 0000000..0cecaf7
--- /dev/null
+++ b/FunKey/package/agg/0004-Make-rasterizer_outline_aa-ignore-close_polygon-when.patch
@@ -0,0 +1,138 @@
+From b9c4b1c72b4ad6b24c37f402d3eec39ef393b0eb Mon Sep 17 00:00:00 2001
+From: Tom Hughes <tom@compton.nu>
+Date: Sun, 19 May 2013 14:17:43 +0100
+Subject: [PATCH 04/15] Make rasterizer_outline_aa ignore close_polygon when
+ vertex count < 3
+
+---
+ include/agg_rasterizer_outline_aa.h | 107 ++++++++++++++++++------------------
+ 1 file changed, 52 insertions(+), 55 deletions(-)
+
+diff --git a/include/agg_rasterizer_outline_aa.h b/include/agg_rasterizer_outline_aa.h
+index 4d6dd57..24301d5 100644
+--- a/include/agg_rasterizer_outline_aa.h
++++ b/include/agg_rasterizer_outline_aa.h
+@@ -333,68 +333,65 @@ namespace agg
+         int y2;
+         int lprev;
+ 
+-        if(close_polygon)
++        if(close_polygon && (m_src_vertices.size() >= 3))
+         {
+-            if(m_src_vertices.size() >= 3)
++            dv.idx = 2;
++
++            v     = &m_src_vertices[m_src_vertices.size() - 1];
++            x1    = v->x;
++            y1    = v->y;
++            lprev = v->len;
++
++            v  = &m_src_vertices[0];
++            x2 = v->x;
++            y2 = v->y;
++            dv.lcurr = v->len;
++            line_parameters prev(x1, y1, x2, y2, lprev);
++
++            v = &m_src_vertices[1];
++            dv.x1    = v->x;
++            dv.y1    = v->y;
++            dv.lnext = v->len;
++            dv.curr = line_parameters(x2, y2, dv.x1, dv.y1, dv.lcurr);
++
++            v = &m_src_vertices[dv.idx];
++            dv.x2 = v->x;
++            dv.y2 = v->y;
++            dv.next = line_parameters(dv.x1, dv.y1, dv.x2, dv.y2, dv.lnext);
++
++            dv.xb1 = 0;
++            dv.yb1 = 0;
++            dv.xb2 = 0;
++            dv.yb2 = 0;
++
++            switch(m_line_join)
+             {
+-                dv.idx = 2;
+-
+-                v     = &m_src_vertices[m_src_vertices.size() - 1];
+-                x1    = v->x;
+-                y1    = v->y;
+-                lprev = v->len;
+-
+-                v  = &m_src_vertices[0];
+-                x2 = v->x;
+-                y2 = v->y;
+-                dv.lcurr = v->len;
+-                line_parameters prev(x1, y1, x2, y2, lprev);
+-
+-                v = &m_src_vertices[1];
+-                dv.x1    = v->x;
+-                dv.y1    = v->y;
+-                dv.lnext = v->len;
+-                dv.curr = line_parameters(x2, y2, dv.x1, dv.y1, dv.lcurr);
+-
+-                v = &m_src_vertices[dv.idx];
+-                dv.x2 = v->x;
+-                dv.y2 = v->y;
+-                dv.next = line_parameters(dv.x1, dv.y1, dv.x2, dv.y2, dv.lnext);
+-
+-                dv.xb1 = 0;
+-                dv.yb1 = 0;
+-                dv.xb2 = 0;
+-                dv.yb2 = 0;
+-
+-                switch(m_line_join)
+-                {
+-                case outline_no_join:
+-                    dv.flags = 3;
+-                    break;
++            case outline_no_join:
++                dv.flags = 3;
++                break;
+ 
+-                case outline_miter_join:
+-                case outline_round_join:
+-                    dv.flags = 
+-                            (prev.diagonal_quadrant() == dv.curr.diagonal_quadrant()) |
+-                        ((dv.curr.diagonal_quadrant() == dv.next.diagonal_quadrant()) << 1);
+-                    break;
++            case outline_miter_join:
++            case outline_round_join:
++                dv.flags = 
++                        (prev.diagonal_quadrant() == dv.curr.diagonal_quadrant()) |
++                    ((dv.curr.diagonal_quadrant() == dv.next.diagonal_quadrant()) << 1);
++                break;
+ 
+-                case outline_miter_accurate_join:
+-                    dv.flags = 0;
+-                    break;
+-                }
++            case outline_miter_accurate_join:
++                dv.flags = 0;
++                break;
++            }
+ 
+-                if((dv.flags & 1) == 0 && m_line_join != outline_round_join)
+-                {
+-                    bisectrix(prev, dv.curr, &dv.xb1, &dv.yb1);
+-                }
++            if((dv.flags & 1) == 0 && m_line_join != outline_round_join)
++            {
++                bisectrix(prev, dv.curr, &dv.xb1, &dv.yb1);
++            }
+ 
+-                if((dv.flags & 2) == 0 && m_line_join != outline_round_join)
+-                {
+-                    bisectrix(dv.curr, dv.next, &dv.xb2, &dv.yb2);
+-                }
+-                draw(dv, 0, m_src_vertices.size());
++            if((dv.flags & 2) == 0 && m_line_join != outline_round_join)
++            {
++                bisectrix(dv.curr, dv.next, &dv.xb2, &dv.yb2);
+             }
++            draw(dv, 0, m_src_vertices.size());
+         }
+         else
+         {
+-- 
+1.8.1.4
+
diff --git a/FunKey/package/agg/0005-Remove-VC-6-workaround.patch b/FunKey/package/agg/0005-Remove-VC-6-workaround.patch
new file mode 100644
index 0000000..f38f7c4
--- /dev/null
+++ b/FunKey/package/agg/0005-Remove-VC-6-workaround.patch
@@ -0,0 +1,52 @@
+From b8c43fb0ba13af0cc2b1050f48f81d76d2fdf0c7 Mon Sep 17 00:00:00 2001
+From: Tom Hughes <tom@compton.nu>
+Date: Sun, 19 May 2013 15:04:05 +0100
+Subject: [PATCH 05/15] Remove VC++ 6 workaround
+
+---
+ include/agg_renderer_scanline.h | 29 +----------------------------
+ 1 file changed, 1 insertion(+), 28 deletions(-)
+
+diff --git a/include/agg_renderer_scanline.h b/include/agg_renderer_scanline.h
+index c3bb6f0..c27ca60 100644
+--- a/include/agg_renderer_scanline.h
++++ b/include/agg_renderer_scanline.h
+@@ -79,34 +79,7 @@ namespace agg
+             sl.reset(ras.min_x(), ras.max_x());
+             while(ras.sweep_scanline(sl))
+             {
+-                //render_scanline_aa_solid(sl, ren, ren_color);
+-
+-                // This code is equivalent to the above call (copy/paste). 
+-                // It's just a "manual" optimization for old compilers,
+-                // like Microsoft Visual C++ v6.0
+-                //-------------------------------
+-                int y = sl.y();
+-                unsigned num_spans = sl.num_spans();
+-                typename Scanline::const_iterator span = sl.begin();
+-
+-                for(;;)
+-                {
+-                    int x = span->x;
+-                    if(span->len > 0)
+-                    {
+-                        ren.blend_solid_hspan(x, y, (unsigned)span->len, 
+-                                              ren_color, 
+-                                              span->covers);
+-                    }
+-                    else
+-                    {
+-                        ren.blend_hline(x, y, (unsigned)(x - span->len - 1), 
+-                                        ren_color, 
+-                                        *(span->covers));
+-                    }
+-                    if(--num_spans == 0) break;
+-                    ++span;
+-                }
++                render_scanline_aa_solid(sl, ren, ren_color);
+             }
+         }
+     }
+-- 
+1.8.1.4
+
diff --git a/FunKey/package/agg/0006-Implement-grain-merge-blending-mode-GIMP.patch b/FunKey/package/agg/0006-Implement-grain-merge-blending-mode-GIMP.patch
new file mode 100644
index 0000000..f1e465b
--- /dev/null
+++ b/FunKey/package/agg/0006-Implement-grain-merge-blending-mode-GIMP.patch
@@ -0,0 +1,85 @@
+From 9422570f4e099a834fc43619f7b2a7eb6b442e25 Mon Sep 17 00:00:00 2001
+From: Tom Hughes <tom@compton.nu>
+Date: Sun, 19 May 2013 15:31:01 +0100
+Subject: [PATCH 06/15] Implement grain-merge blending mode (GIMP)
+
+---
+ include/agg_pixfmt_rgba.h | 42 ++++++++++++++++++++++++++++++++++++++++--
+ 1 file changed, 40 insertions(+), 2 deletions(-)
+
+diff --git a/include/agg_pixfmt_rgba.h b/include/agg_pixfmt_rgba.h
+index 79d10dc..f576ce4 100644
+--- a/include/agg_pixfmt_rgba.h
++++ b/include/agg_pixfmt_rgba.h
+@@ -1401,9 +1401,46 @@ namespace agg
+         }
+     };
+ 
++    //================================================comp_op_rgba_grain_merge
++    template <typename ColorT, typename Order> struct comp_op_rgba_grain_merge
++    {
++        typedef ColorT color_type;
++        typedef Order order_type;
++        typedef typename color_type::value_type value_type;
++        typedef typename color_type::calc_type calc_type;
++        typedef typename color_type::long_type long_type;
++        enum base_scale_e
++        {
++            base_shift = color_type::base_shift,
++            base_mask  = color_type::base_mask
++        };
+ 
++        // E = I + M - 128
++        static AGG_INLINE void blend_pix(value_type* p,
++                                         unsigned sr, unsigned sg, unsigned sb,
++                                         unsigned sa, unsigned cover)
++        {
+ 
+-
++            if(cover < 255)
++            {
++                sr = (sr * cover + 255) >> 8;
++                sg = (sg * cover + 255) >> 8;
++                sb = (sb * cover + 255) >> 8;
++                sa = (sa * cover + 255) >> 8;
++            }
++            if(sa)
++            {
++                calc_type da = p[Order::A];
++                int dr = sr + p[Order::R] - 128;
++                int dg = sg + p[Order::G] - 128;
++                int db = sb + p[Order::B] - 128;
++                p[Order::R] = (value_type)(dr < 0 ? 0 : (dr > 255 ? 255 : dr));
++                p[Order::G] = (value_type)(dg < 0 ? 0 : (dg > 255 ? 255 : dg));
++                p[Order::B] = (value_type)(db < 0 ? 0 : (db > 255 ? 255 : db));
++                p[Order::A] = (value_type)(sa + da - ((sa * da + base_mask) >> base_shift));
++            }
++        }
++    };
+ 
+     //======================================================comp_op_table_rgba
+     template<class ColorT, class Order> struct comp_op_table_rgba
+@@ -1451,6 +1488,7 @@ namespace agg
+         comp_op_rgba_contrast   <ColorT,Order>::blend_pix,
+         comp_op_rgba_invert     <ColorT,Order>::blend_pix,
+         comp_op_rgba_invert_rgb <ColorT,Order>::blend_pix,
++        comp_op_rgba_grain_merge<ColorT,Order>::blend_pix,
+         0
+     };
+ 
+@@ -1486,6 +1524,7 @@ namespace agg
+         comp_op_contrast,      //----comp_op_contrast
+         comp_op_invert,        //----comp_op_invert
+         comp_op_invert_rgb,    //----comp_op_invert_rgb
++        comp_op_grain_merge,   //----comp_op_grain_merge
+ 
+         end_of_comp_op_e
+     };
+@@ -2908,4 +2947,3 @@ namespace agg
+ }
+ 
+ #endif
+-
+-- 
+1.8.1.4
+
diff --git a/FunKey/package/agg/0007-Implement-grain-extract-blending-mode-GIMP.patch b/FunKey/package/agg/0007-Implement-grain-extract-blending-mode-GIMP.patch
new file mode 100644
index 0000000..cafb36e
--- /dev/null
+++ b/FunKey/package/agg/0007-Implement-grain-extract-blending-mode-GIMP.patch
@@ -0,0 +1,85 @@
+From abd440342e166a90d08610bf5b31d2a8357eafbe Mon Sep 17 00:00:00 2001
+From: Tom Hughes <tom@compton.nu>
+Date: Sun, 19 May 2013 15:43:18 +0100
+Subject: [PATCH 07/15] Implement grain-extract blending mode (GIMP)
+
+---
+ include/agg_pixfmt_rgba.h | 48 +++++++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 48 insertions(+)
+
+diff --git a/include/agg_pixfmt_rgba.h b/include/agg_pixfmt_rgba.h
+index f576ce4..42f0a05 100644
+--- a/include/agg_pixfmt_rgba.h
++++ b/include/agg_pixfmt_rgba.h
+@@ -1442,6 +1442,52 @@ namespace agg
+         }
+     };
+ 
++    //==============================================comp_op_rgba_grain_extract
++    template <typename ColorT, typename Order> struct comp_op_rgba_grain_extract
++    {
++        typedef ColorT color_type;
++        typedef Order order_type;
++        typedef typename color_type::value_type value_type;
++        typedef typename color_type::calc_type calc_type;
++        typedef typename color_type::long_type long_type;
++        enum base_scale_e
++        {
++            base_shift = color_type::base_shift,
++            base_mask  = color_type::base_mask
++        };
++
++        // E = I - M + 128
++        static AGG_INLINE void blend_pix(value_type* p,
++                                         unsigned sr, unsigned sg, unsigned sb,
++                                         unsigned sa, unsigned cover)
++        {
++            calc_type da = (p[Order::A] * sa + 255) >> 8;
++
++            int dr = p[Order::R] - sr + 128;
++            int dg = p[Order::G] - sg + 128;
++            int db = p[Order::B] - sb + 128;
++
++            dr = dr < 0 ? 0 : (dr > 255 ? 255 : dr);
++            dg = dg < 0 ? 0 : (dg > 255 ? 255 : dg);
++            db = db < 0 ? 0 : (db > 255 ? 255 : db);
++
++            p[Order::A] = da;
++
++            if(da < 255)
++            {
++                p[Order::R] = (dr * da + 255) >> 8;
++                p[Order::G] = (dg * da + 255) >> 8;
++                p[Order::B] = (db * da + 255) >> 8;
++            }
++            else
++            {
++                p[Order::R] = dr;
++                p[Order::G] = dg;
++                p[Order::B] = db;
++            }
++        }
++    };
++
+     //======================================================comp_op_table_rgba
+     template<class ColorT, class Order> struct comp_op_table_rgba
+     {
+@@ -1489,6 +1535,7 @@ namespace agg
+         comp_op_rgba_invert     <ColorT,Order>::blend_pix,
+         comp_op_rgba_invert_rgb <ColorT,Order>::blend_pix,
+         comp_op_rgba_grain_merge<ColorT,Order>::blend_pix,
++        comp_op_rgba_grain_extract<ColorT,Order>::blend_pix,
+         0
+     };
+ 
+@@ -1525,6 +1572,7 @@ namespace agg
+         comp_op_invert,        //----comp_op_invert
+         comp_op_invert_rgb,    //----comp_op_invert_rgb
+         comp_op_grain_merge,   //----comp_op_grain_merge
++        comp_op_grain_extract, //----comp_op_grain_extract
+ 
+         end_of_comp_op_e
+     };
+-- 
+1.8.1.4
+
diff --git a/FunKey/package/agg/0008-Declare-multiplication-and-division-operators-as-con.patch b/FunKey/package/agg/0008-Declare-multiplication-and-division-operators-as-con.patch
new file mode 100644
index 0000000..0ed92ee
--- /dev/null
+++ b/FunKey/package/agg/0008-Declare-multiplication-and-division-operators-as-con.patch
@@ -0,0 +1,36 @@
+From 2688af280836b95908d3cfd6915510d55de673b8 Mon Sep 17 00:00:00 2001
+From: Tom Hughes <tom@compton.nu>
+Date: Sun, 19 May 2013 16:15:01 +0100
+Subject: [PATCH 08/15] Declare multiplication and division operators as const
+
+---
+ include/agg_trans_affine.h | 8 ++++----
+ 1 file changed, 4 insertions(+), 4 deletions(-)
+
+diff --git a/include/agg_trans_affine.h b/include/agg_trans_affine.h
+index a662099..2f602a0 100644
+--- a/include/agg_trans_affine.h
++++ b/include/agg_trans_affine.h
+@@ -216,15 +216,15 @@ namespace agg
+         }
+ 
+         // Multiply the matrix by another one and return
+-        // the result in a separete matrix.
+-        trans_affine operator * (const trans_affine& m)
++        // the result in a separate matrix.
++        trans_affine operator * (const trans_affine& m) const
+         {
+             return trans_affine(*this).multiply(m);
+         }
+ 
+         // Multiply the matrix by inverse of another one 
+-        // and return the result in a separete matrix.
+-        trans_affine operator / (const trans_affine& m)
++        // and return the result in a separate matrix.
++        trans_affine operator / (const trans_affine& m) const
+         {
+             return trans_affine(*this).multiply_inv(m);
+         }
+-- 
+1.8.1.4
+
diff --git a/FunKey/package/agg/0009-Add-a-static-identity-transformation.patch b/FunKey/package/agg/0009-Add-a-static-identity-transformation.patch
new file mode 100644
index 0000000..01555cb
--- /dev/null
+++ b/FunKey/package/agg/0009-Add-a-static-identity-transformation.patch
@@ -0,0 +1,37 @@
+From be9ed90897bc43b4547a3a1f8046827caaf13b4c Mon Sep 17 00:00:00 2001
+From: Tom Hughes <tom@compton.nu>
+Date: Sun, 19 May 2013 16:15:36 +0100
+Subject: [PATCH 09/15] Add a static identity transformation
+
+---
+ include/agg_trans_affine.h | 1 +
+ src/agg_trans_affine.cpp   | 1 +
+ 2 files changed, 2 insertions(+)
+
+diff --git a/include/agg_trans_affine.h b/include/agg_trans_affine.h
+index 2f602a0..67fe5ca 100644
+--- a/include/agg_trans_affine.h
++++ b/include/agg_trans_affine.h
+@@ -92,6 +92,7 @@ namespace agg
+     //----------------------------------------------------------------------
+     struct trans_affine
+     {
++        static const trans_affine identity;
+         double sx, shy, shx, sy, tx, ty;
+ 
+         //------------------------------------------ Construction
+diff --git a/src/agg_trans_affine.cpp b/src/agg_trans_affine.cpp
+index aca18c2..b3d9bc0 100644
+--- a/src/agg_trans_affine.cpp
++++ b/src/agg_trans_affine.cpp
+@@ -28,6 +28,7 @@
+ 
+ namespace agg
+ {
++    const trans_affine trans_affine::identity;
+ 
+     //------------------------------------------------------------------------
+     const trans_affine& trans_affine::parl_to_parl(const double* src, 
+-- 
+1.8.1.4
+
diff --git a/FunKey/package/agg/0010-Add-renderer_scanline_aa_alpha.patch b/FunKey/package/agg/0010-Add-renderer_scanline_aa_alpha.patch
new file mode 100644
index 0000000..b0be258
--- /dev/null
+++ b/FunKey/package/agg/0010-Add-renderer_scanline_aa_alpha.patch
@@ -0,0 +1,193 @@
+From 749c8cd11e9e6f81e93ae5ce19258431722b6bdf Mon Sep 17 00:00:00 2001
+From: Tom Hughes <tom@compton.nu>
+Date: Sun, 19 May 2013 16:43:25 +0100
+Subject: [PATCH 10/15] Add renderer_scanline_aa_alpha
+
+---
+ include/agg_pixfmt_rgba.h       | 24 +++++++++++++-
+ include/agg_renderer_base.h     | 28 ++++++++++++++++
+ include/agg_renderer_scanline.h | 71 +++++++++++++++++++++++++++++++++++++++++
+ 3 files changed, 122 insertions(+), 1 deletion(-)
+
+diff --git a/include/agg_pixfmt_rgba.h b/include/agg_pixfmt_rgba.h
+index 42f0a05..6c4bc37 100644
+--- a/include/agg_pixfmt_rgba.h
++++ b/include/agg_pixfmt_rgba.h
+@@ -2247,7 +2247,6 @@ namespace agg
+         }
+ 
+ 
+-
+         //--------------------------------------------------------------------
+         void blend_color_vspan(int x, int y,
+                                unsigned len, 
+@@ -2751,6 +2750,29 @@ namespace agg
+         }
+ 
+         //--------------------------------------------------------------------
++        void blend_color_hspan_alpha(int x, int y, unsigned len,
++                                     const color_type* colors,
++                                     value_type alpha,
++                                     const int8u* covers,
++                                     int8u cover)
++        {
++            value_type* p = (value_type*)m_rbuf->row_ptr(x, y, len) + (x << 2);
++            do
++            {
++                blender_type::blend_pix(m_comp_op,
++                                        p,
++                                        (colors->r * alpha + 255) >> 8,
++                                        (colors->g * alpha + 255) >> 8,
++                                        (colors->b * alpha + 255) >> 8,
++                                        (colors->a * alpha + 255) >> 8,
++                                        covers ? *covers++ : cover);
++                p += 4;
++                ++colors;
++            }
++            while(--len);
++        }
++
++        //--------------------------------------------------------------------
+         void blend_color_vspan(int x, int y, unsigned len, 
+                                const color_type* colors, 
+                                const int8u* covers,
+diff --git a/include/agg_renderer_base.h b/include/agg_renderer_base.h
+index 1808944..25f07c3 100644
+--- a/include/agg_renderer_base.h
++++ b/include/agg_renderer_base.h
+@@ -37,6 +37,7 @@ namespace agg
+     public:
+         typedef PixelFormat pixfmt_type;
+         typedef typename pixfmt_type::color_type color_type;
++        typedef typename pixfmt_type::color_type::value_type value_type;
+         typedef typename pixfmt_type::row_data row_data;
+ 
+         //--------------------------------------------------------------------
+@@ -383,6 +384,33 @@ namespace agg
+         }
+ 
+         //--------------------------------------------------------------------
++        void blend_color_hspan_alpha(int x, int y, int len,
++                               const color_type* colors,
++                               value_type alpha,
++                               const cover_type* covers,
++                               cover_type cover = agg::cover_full)
++        {
++            if(y > ymax()) return;
++            if(y < ymin()) return;
++
++            if(x < xmin())
++            {
++                int d = xmin() - x;
++                len -= d;
++                if(len <= 0) return;
++                if(covers) covers += d;
++                colors += d;
++                x = xmin();
++            }
++            if(x + len > xmax())
++            {
++                len = xmax() - x + 1;
++                if(len <= 0) return;
++            }
++            m_ren->blend_color_hspan_alpha(x, y, len, colors, alpha,  covers, cover);
++        }
++
++        //--------------------------------------------------------------------
+         void blend_color_vspan(int x, int y, int len, 
+                                const color_type* colors, 
+                                const cover_type* covers,
+diff --git a/include/agg_renderer_scanline.h b/include/agg_renderer_scanline.h
+index c27ca60..4fcb557 100644
+--- a/include/agg_renderer_scanline.h
++++ b/include/agg_renderer_scanline.h
+@@ -156,6 +156,35 @@ namespace agg
+         }
+     }
+ 
++    //================================================render_scanline_aa_alpha
++    template<class Scanline, class BaseRenderer,
++             class SpanAllocator, class SpanGenerator>
++    void render_scanline_aa_alpha(const Scanline& sl, BaseRenderer& ren,
++                                  SpanAllocator& alloc, SpanGenerator& span_gen,
++                                  unsigned alpha)
++    {
++        int y = sl.y();
++
++        unsigned num_spans = sl.num_spans();
++        typename Scanline::const_iterator span = sl.begin();
++        for(;;)
++        {
++            int x = span->x;
++            int len = span->len;
++            const typename Scanline::cover_type* covers = span->covers;
++
++            if(len < 0) len = -len;
++            typename BaseRenderer::color_type* colors = alloc.allocate(len);
++            span_gen.generate(colors, x, y, len);
++            ren.blend_color_hspan_alpha(x, y, len, colors, alpha,
++                                  (span->len < 0) ? 0 : covers, *covers);
++
++            if(--num_spans == 0) break;
++            ++span;
++        }
++    }
++
++
+     //=====================================================render_scanlines_aa
+     template<class Rasterizer, class Scanline, class BaseRenderer, 
+              class SpanAllocator, class SpanGenerator>
+@@ -216,8 +245,50 @@ namespace agg
+     };
+ 
+ 
++    //==============================================renderer_scanline_aa_alpha
++    template<class BaseRenderer, class SpanAllocator, class SpanGenerator>
++    class renderer_scanline_aa_alpha
++    {
++    public:
++        typedef BaseRenderer  base_ren_type;
++        typedef SpanAllocator alloc_type;
++        typedef SpanGenerator span_gen_type;
+ 
++        //--------------------------------------------------------------------
++        renderer_scanline_aa_alpha() : m_ren(0), m_alloc(0), m_span_gen(0), m_alpha(1.0) {}
++        renderer_scanline_aa_alpha(base_ren_type& ren,
++                             alloc_type& alloc,
++                             span_gen_type& span_gen,
++                             unsigned alpha) :
++            m_ren(&ren),
++            m_alloc(&alloc),
++            m_span_gen(&span_gen),
++            m_alpha(alpha)
++        {}
++        void attach(base_ren_type& ren,
++                    alloc_type& alloc,
++                    span_gen_type& span_gen)
++        {
++            m_ren = &ren;
++            m_alloc = &alloc;
++            m_span_gen = &span_gen;
++        }
+ 
++        //--------------------------------------------------------------------
++        void prepare() { m_span_gen->prepare(); }
++
++        //--------------------------------------------------------------------
++        template<class Scanline> void render(const Scanline& sl)
++        {
++            render_scanline_aa_alpha(sl, *m_ren, *m_alloc, *m_span_gen, m_alpha);
++        }
++
++    private:
++        base_ren_type* m_ren;
++        alloc_type*    m_alloc;
++        span_gen_type* m_span_gen;
++        unsigned       m_alpha;
++    };
+ 
+ 
+     //===============================================render_scanline_bin_solid
+-- 
+1.8.1.4
+
diff --git a/FunKey/package/agg/0011-Avoid-division-by-zero-in-color-burn-mode.patch b/FunKey/package/agg/0011-Avoid-division-by-zero-in-color-burn-mode.patch
new file mode 100644
index 0000000..2a0d198
--- /dev/null
+++ b/FunKey/package/agg/0011-Avoid-division-by-zero-in-color-burn-mode.patch
@@ -0,0 +1,58 @@
+From 0ec68d7f5695403eccac75025ba7f6f7ecf1814e Mon Sep 17 00:00:00 2001
+From: Tom Hughes <tom@compton.nu>
+Date: Sun, 19 May 2013 16:49:08 +0100
+Subject: [PATCH 11/15] Avoid division by zero in color-burn mode
+
+FIXME: re-work using latest math from http://www.w3.org/TR/SVGCompositing/
+---
+ include/agg_pixfmt_rgba.h | 21 ++++++++++++++++++---
+ 1 file changed, 18 insertions(+), 3 deletions(-)
+
+diff --git a/include/agg_pixfmt_rgba.h b/include/agg_pixfmt_rgba.h
+index 6c4bc37..5d6b511 100644
+--- a/include/agg_pixfmt_rgba.h
++++ b/include/agg_pixfmt_rgba.h
+@@ -1027,6 +1027,21 @@ namespace agg
+         //   Dca' = Sa.(Sca.Da + Dca.Sa - Sa.Da)/Sca + Sca.(1 - Da) + Dca.(1 - Sa)
+         // 
+         // Da'  = Sa + Da - Sa.Da 
++
++
++        // http://www.w3.org/TR/SVGCompositing/
++        // if Sca == 0 and Dca == Da
++        //   Dca' = Sa × Da + Sca × (1 - Da) + Dca × (1 - Sa)
++        //        = Sa × Da + Dca × (1 - Sa)
++        //        = Da = Dca
++        // otherwise if Sca == 0
++        //   Dca' = Sca × (1 - Da) + Dca × (1 - Sa)
++        //        = Dca × (1 - Sa)
++        // otherwise if Sca > 0
++        //   Dca' = Sa × Da - Sa × Da × min(1, (1 - Dca/Da) × Sa/Sca) + Sca × (1 - Da) + Dca × (1 - Sa)
++        //        = Sa × Da × (1 - min(1, (1 - Dca/Da) × Sa/Sca)) + Sca × (1 - Da) + Dca × (1 - Sa)
++
++        //   sa * da * (255 - std::min(255, (255 - p[0]/da)*(sa/(sc*sa)) +
+         static AGG_INLINE void blend_pix(value_type* p, 
+                                          unsigned sr, unsigned sg, unsigned sb, 
+                                          unsigned sa, unsigned cover)
+@@ -1056,15 +1071,15 @@ namespace agg
+ 
+                 p[Order::R] = (value_type)(((srda + drsa <= sada) ? 
+                     sr * d1a + dr * s1a :
+-                    sa * (srda + drsa - sada) / sr + sr * d1a + dr * s1a + base_mask) >> base_shift);
++                   (sr > 0 ? sa * (srda + drsa - sada) / sr + sr * d1a + dr * s1a + base_mask : 0)) >> base_shift);
+ 
+                 p[Order::G] = (value_type)(((sgda + dgsa <= sada) ? 
+                     sg * d1a + dg * s1a :
+-                    sa * (sgda + dgsa - sada) / sg + sg * d1a + dg * s1a + base_mask) >> base_shift);
++                   (sg > 0 ? sa * (sgda + dgsa - sada) / sg + sg * d1a + dg * s1a + base_mask : 0)) >> base_shift);
+ 
+                 p[Order::B] = (value_type)(((sbda + dbsa <= sada) ? 
+                     sb * d1a + db * s1a :
+-                    sa * (sbda + dbsa - sada) / sb + sb * d1a + db * s1a + base_mask) >> base_shift);
++                   (sb > 0 ? sa * (sbda + dbsa - sada) / sb + sb * d1a + db * s1a + base_mask : 0)) >> base_shift);
+ 
+                 p[Order::A] = (value_type)(sa + da - ((sa * da + base_mask) >> base_shift));
+             }
+-- 
+1.8.1.4
+
diff --git a/FunKey/package/agg/0012-Avoid-pixel-artifacts-when-compositing.patch b/FunKey/package/agg/0012-Avoid-pixel-artifacts-when-compositing.patch
new file mode 100644
index 0000000..b3e641e
--- /dev/null
+++ b/FunKey/package/agg/0012-Avoid-pixel-artifacts-when-compositing.patch
@@ -0,0 +1,26 @@
+From bf0e0b71360cfbc690a29f4abe15d7b9b61b8479 Mon Sep 17 00:00:00 2001
+From: Tom Hughes <tom@compton.nu>
+Date: Sat, 22 Jun 2013 12:11:54 +0100
+Subject: [PATCH 12/15] Avoid pixel artifacts when compositing
+
+Change src_over alpha to avoid pixel artifacts by reordering computations.
+---
+ include/agg_pixfmt_rgba.h | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/include/agg_pixfmt_rgba.h b/include/agg_pixfmt_rgba.h
+index 5d6b511..bb255cd 100644
+--- a/include/agg_pixfmt_rgba.h
++++ b/include/agg_pixfmt_rgba.h
+@@ -346,7 +346,7 @@ namespace agg
+             p[Order::R] = (value_type)(sr + ((p[Order::R] * s1a + base_mask) >> base_shift));
+             p[Order::G] = (value_type)(sg + ((p[Order::G] * s1a + base_mask) >> base_shift));
+             p[Order::B] = (value_type)(sb + ((p[Order::B] * s1a + base_mask) >> base_shift));
+-            p[Order::A] = (value_type)(sa + p[Order::A] - ((sa * p[Order::A] + base_mask) >> base_shift));
++            p[Order::A] = (value_type)(sa + ((p[Order::A] * s1a + base_mask) >> base_shift));
+         }
+     };
+ 
+-- 
+1.8.1.4
+
diff --git a/FunKey/package/agg/0013-Modify-agg-conv-classes-to-allow-access-to-the-origi.patch b/FunKey/package/agg/0013-Modify-agg-conv-classes-to-allow-access-to-the-origi.patch
new file mode 100644
index 0000000..9deb904
--- /dev/null
+++ b/FunKey/package/agg/0013-Modify-agg-conv-classes-to-allow-access-to-the-origi.patch
@@ -0,0 +1,93 @@
+From 6f1ab5f4b470bcf4e7e72aac6e2f7f6ee3e7b424 Mon Sep 17 00:00:00 2001
+From: Tom Hughes <tom@compton.nu>
+Date: Sat, 22 Jun 2013 12:16:42 +0100
+Subject: [PATCH 13/15] Modify agg conv classes to allow access to the original
+ geometry type
+
+---
+ include/agg_conv_adaptor_vcgen.h | 2 ++
+ include/agg_conv_adaptor_vpgen.h | 1 +
+ include/agg_conv_clip_polygon.h  | 1 +
+ include/agg_conv_clip_polyline.h | 1 +
+ include/agg_conv_smooth_poly1.h  | 2 ++
+ 5 files changed, 7 insertions(+)
+
+diff --git a/include/agg_conv_adaptor_vcgen.h b/include/agg_conv_adaptor_vcgen.h
+index 7bd9b07..fef4579 100644
+--- a/include/agg_conv_adaptor_vcgen.h
++++ b/include/agg_conv_adaptor_vcgen.h
+@@ -38,6 +38,7 @@ namespace agg
+ 
+         void rewind(unsigned) {}
+         unsigned vertex(double*, double*) { return path_cmd_stop; }
++        unsigned type() const { return 0; }
+     };
+ 
+ 
+@@ -73,6 +74,7 @@ namespace agg
+         }
+ 
+         unsigned vertex(double* x, double* y);
++        unsigned type() const { return m_source->type(); }
+ 
+     private:
+         // Prohibit copying
+diff --git a/include/agg_conv_adaptor_vpgen.h b/include/agg_conv_adaptor_vpgen.h
+index dca9415..a39102d 100644
+--- a/include/agg_conv_adaptor_vpgen.h
++++ b/include/agg_conv_adaptor_vpgen.h
+@@ -42,6 +42,7 @@ namespace agg
+ 
+         void rewind(unsigned path_id);
+         unsigned vertex(double* x, double* y);
++        unsigned type() const { return m_source->type(); }
+ 
+     private:
+         conv_adaptor_vpgen(const conv_adaptor_vpgen<VertexSource, VPGen>&);
+diff --git a/include/agg_conv_clip_polygon.h b/include/agg_conv_clip_polygon.h
+index 3c34590..e417a7d 100644
+--- a/include/agg_conv_clip_polygon.h
++++ b/include/agg_conv_clip_polygon.h
+@@ -60,6 +60,7 @@ namespace agg
+         double y1() const { return base_type::vpgen().y1(); }
+         double x2() const { return base_type::vpgen().x2(); }
+         double y2() const { return base_type::vpgen().y2(); }
++        unsigned type() const { return base_type::type(); }
+ 
+     private:
+         conv_clip_polygon(const conv_clip_polygon<VertexSource>&);
+diff --git a/include/agg_conv_clip_polyline.h b/include/agg_conv_clip_polyline.h
+index d45067f..0de4b57 100644
+--- a/include/agg_conv_clip_polyline.h
++++ b/include/agg_conv_clip_polyline.h
+@@ -60,6 +60,7 @@ namespace agg
+         double y1() const { return base_type::vpgen().y1(); }
+         double x2() const { return base_type::vpgen().x2(); }
+         double y2() const { return base_type::vpgen().y2(); }
++        unsigned type() const { return base_type::type(); }
+ 
+     private:
+         conv_clip_polyline(const conv_clip_polyline<VertexSource>&);
+diff --git a/include/agg_conv_smooth_poly1.h b/include/agg_conv_smooth_poly1.h
+index 15f7f8d..0956c4e 100644
+--- a/include/agg_conv_smooth_poly1.h
++++ b/include/agg_conv_smooth_poly1.h
+@@ -48,6 +48,7 @@ namespace agg
+ 
+         void   smooth_value(double v) { base_type::generator().smooth_value(v); }
+         double smooth_value() const { return base_type::generator().smooth_value(); }
++        unsigned type() const { return base_type::type(); }
+ 
+     private:
+         conv_smooth_poly1(const conv_smooth_poly1<VertexSource>&);
+@@ -70,6 +71,7 @@ namespace agg
+ 
+         void   smooth_value(double v) { m_smooth.generator().smooth_value(v); }
+         double smooth_value() const { return m_smooth.generator().smooth_value(); }
++        unsigned type() const { return m_smooth.type(); }
+ 
+     private:
+         conv_smooth_poly1_curve(const conv_smooth_poly1_curve<VertexSource>&);
+-- 
+1.8.1.4
+
diff --git a/FunKey/package/agg/0014-Avoid-potential-zero-division-resulting-in-nan-in-ag.patch b/FunKey/package/agg/0014-Avoid-potential-zero-division-resulting-in-nan-in-ag.patch
new file mode 100644
index 0000000..547b0d2
--- /dev/null
+++ b/FunKey/package/agg/0014-Avoid-potential-zero-division-resulting-in-nan-in-ag.patch
@@ -0,0 +1,30 @@
+From 6433a64f4cd41e88499386b0b7c7ae05d30683b8 Mon Sep 17 00:00:00 2001
+From: Tom Hughes <tom@compton.nu>
+Date: Sat, 22 Jun 2013 12:33:32 +0100
+Subject: [PATCH 14/15] Avoid potential zero division resulting in nan in
+ agg::gamma_linear
+
+---
+ include/agg_gamma_functions.h | 6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+diff --git a/include/agg_gamma_functions.h b/include/agg_gamma_functions.h
+index fa38a45..beb0c04 100644
+--- a/include/agg_gamma_functions.h
++++ b/include/agg_gamma_functions.h
+@@ -94,7 +94,11 @@ namespace agg
+         {
+             if(x < m_start) return 0.0;
+             if(x > m_end) return 1.0;
+-            return (x - m_start) / (m_end - m_start);
++            double delta = m_end - m_start;
++            // avoid nan from potential zero division
++            // https://github.com/mapnik/mapnik/issues/761
++            if (delta <= 0.0) return 0.0;
++            return (x - m_start) / delta;
+         }
+ 
+     private:
+-- 
+1.8.1.4
+
diff --git a/FunKey/package/agg/0015-Ensure-first-value-in-the-gamma-table-is-always-zero.patch b/FunKey/package/agg/0015-Ensure-first-value-in-the-gamma-table-is-always-zero.patch
new file mode 100644
index 0000000..6214bd6
--- /dev/null
+++ b/FunKey/package/agg/0015-Ensure-first-value-in-the-gamma-table-is-always-zero.patch
@@ -0,0 +1,24 @@
+From ca818d4dcd428c5560fc3c341fbaf427a7485e32 Mon Sep 17 00:00:00 2001
+From: Tom Hughes <tom@compton.nu>
+Date: Sat, 22 Jun 2013 12:34:37 +0100
+Subject: [PATCH 15/15] Ensure first value in the gamma table is always zero
+
+---
+ include/agg_gamma_functions.h | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/include/agg_gamma_functions.h b/include/agg_gamma_functions.h
+index beb0c04..b8eda52 100644
+--- a/include/agg_gamma_functions.h
++++ b/include/agg_gamma_functions.h
+@@ -49,6 +49,7 @@ namespace agg
+ 
+         double operator() (double x) const
+         {
++            if (x == 0.0) return 0.0;
+             return pow(x, m_gamma);
+         }
+ 
+-- 
+1.8.1.4
+
diff --git a/FunKey/package/agg/CREATE_FILES.patch b/FunKey/package/agg/CREATE_FILES.patch
new file mode 100644
index 0000000..1a78125
--- /dev/null
+++ b/FunKey/package/agg/CREATE_FILES.patch
@@ -0,0 +1,14 @@
+--- a/README.orig	2007-01-07 13:58:28.000000000 +0000
++++ b/README	2007-01-07 14:02:40.000000000 +0000
+@@ -0,0 +1 @@
++cacac
+
+--- a/NEWS.orig	2007-01-07 13:58:28.000000000 +0000
++++ b/NEWS	2007-01-07 14:02:40.000000000 +0000
+@@ -0,0 +1 @@
++cacac
+
+--- a/AUTHORS.orig	2007-01-07 13:58:28.000000000 +0000
++++ b/AUTHORS	2007-01-07 14:02:40.000000000 +0000
+@@ -0,0 +1 @@
++cacac
diff --git a/FunKey/package/agg/Config.in b/FunKey/package/agg/Config.in
new file mode 100644
index 0000000..a842098
--- /dev/null
+++ b/FunKey/package/agg/Config.in
@@ -0,0 +1,13 @@
+config BR2_PACKAGE_AGG
+	bool "agg"
+	depends on BR2_INSTALL_LIBSTDCPP
+	select BR2_PACKAGE_SDL
+	help
+	  The Anti-Grain Geometry project. A High Quality 2D Graphics Rendering
+	  Engine for C++.
+	  We select the SDL backend by default.
+
+	  http://www.antigrain.com/index.html
+
+comment "agg needs a toolchain with C++ support"
+	depends on !BR2_INSTALL_LIBSTDCPP
diff --git a/FunKey/package/agg/agg-2.4-depends.patch b/FunKey/package/agg/agg-2.4-depends.patch
new file mode 100644
index 0000000..f5506e2
--- /dev/null
+++ b/FunKey/package/agg/agg-2.4-depends.patch
@@ -0,0 +1,36 @@
+--- agg-2.4.orig/font_freetype/Makefile.am	2005-10-18 11:45:40.000000000 +0100
++++ agg-2.4/font_freetype/Makefile.am	2006-07-10 15:11:55.000000000 +0100
+@@ -4,8 +4,9 @@
+ agginclude_HEADERS = agg_font_freetype.h
+ lib_LTLIBRARIES = libaggfontfreetype.la
+ 
+-libaggfontfreetype_la_LDFLAGS = -version-info @AGG_LIB_VERSION@ @FREETYPE_LIBS@ 
++libaggfontfreetype_la_LDFLAGS = -version-info @AGG_LIB_VERSION@
+ libaggfontfreetype_la_SOURCES = agg_font_freetype.cpp
+ libaggfontfreetype_la_CXXFLAGS = -I$(top_srcdir)/include @FREETYPE_CFLAGS@ 
++libaggfontfreetype_la_LIBADD = ../src/libagg.la @FREETYPE_LIBS@
+ endif
+ 
+--- agg-2.4.orig/src/platform/sdl/Makefile.am	2005-10-17 23:49:35.000000000 +0100
++++ agg-2.4/src/platform/sdl/Makefile.am	2006-07-10 15:11:55.000000000 +0100
+@@ -5,6 +5,6 @@
+ libaggplatformsdl_la_LDFLAGS = -version-info @AGG_LIB_VERSION@
+ libaggplatformsdl_la_SOURCES = agg_platform_support.cpp
+ libaggplatformsdl_la_CXXFLAGS = -I$(top_srcdir)/include @SDL_CFLAGS@
+-libaggplatformsdl_la_LIBADD = @SDL_LIBS@
++libaggplatformsdl_la_LIBADD = ../../libagg.la @SDL_LIBS@
+ endif
+ 
+--- agg-2.5.orig/src/platform/X11/Makefile.am	2006-12-11 00:59:45.000000000 +0000
++++ agg-2.5/src/platform/X11/Makefile.am	2007-01-07 14:07:39.000000000 +0000
+@@ -1,8 +1,8 @@
+ if ENABLE_X11
+ lib_LTLIBRARIES = libaggplatformX11.la
+ 
+-libaggplatformX11_la_LDFLAGS = -version-info @AGG_LIB_VERSION@  -L@x_libraries@
++libaggplatformX11_la_LDFLAGS = -version-info @AGG_LIB_VERSION@ @X_LDFLAGS@
+ libaggplatformX11_la_SOURCES = agg_platform_support.cpp
+ libaggplatformX11_la_CXXFLAGS =  -I$(top_srcdir)/include -I@x_includes@
+-libaggplatformX11_la_LIBADD = -lX11
++libaggplatformX11_la_LIBADD = ../../libagg.la -lX11
+ endif
diff --git a/FunKey/package/agg/agg-2.5-autotools.patch b/FunKey/package/agg/agg-2.5-autotools.patch
new file mode 100644
index 0000000..1272b65
--- /dev/null
+++ b/FunKey/package/agg/agg-2.5-autotools.patch
@@ -0,0 +1,11 @@
+--- a/configure.in	2013-02-22 09:30:00.000000000 -0600
++++ b/configure.in	2013-02-22 09:30:49.030777571 -0600
+@@ -8,7 +8,7 @@
+ AC_PROG_CC
+ AC_PROG_CXX
+ AC_ISC_POSIX
+-AM_C_PROTOTYPES
++#AM_C_PROTOTYPES
+ if test "x$U" != "x"; then
+   AC_MSG_ERROR(Compiler not ANSI compliant)
+ fi
diff --git a/FunKey/package/agg/agg-2.5-pkgconfig.patch b/FunKey/package/agg/agg-2.5-pkgconfig.patch
new file mode 100644
index 0000000..a303bfb
--- /dev/null
+++ b/FunKey/package/agg/agg-2.5-pkgconfig.patch
@@ -0,0 +1,10 @@
+--- agg-2.5/libagg.pc.in.orig	2007-01-07 13:58:28.000000000 +0000
++++ agg-2.5/libagg.pc.in	2007-01-07 14:02:40.000000000 +0000
+@@ -6,5 +6,6 @@
+ Name: libagg
+ Description: Anti Grain Geometry - A High Quality Rendering Engine for C++
+ Version: @VERSION@
+-Libs: -L${libdir} -Wl,-rpath,${exec_prefix}/lib -lagg
++Requires.private: freetype2
++Libs: -L${libdir} -lagg
+ Cflags: -I${includedir}
diff --git a/FunKey/package/agg/agg.mk b/FunKey/package/agg/agg.mk
new file mode 100644
index 0000000..ecf5749
--- /dev/null
+++ b/FunKey/package/agg/agg.mk
@@ -0,0 +1,32 @@
+###############################################################################
+#
+# agg
+#
+###############################################################################
+
+AGG_VERSION = 2.5
+AGG_SOURCE = agg-$(AGG_VERSION).tar.gz
+AGG_SITE = https://ftp.osuosl.org/pub/blfs/8.0/a
+AGG_LICENSE = GPLv3+
+AGG_LICENSE_FILES = COPYING
+AGG_INSTALL_STAGING = YES
+AGG_AUTORECONF = YES
+
+AGG_DEPENDENCIES = host-pkgconf sdl
+
+# SDL comes from the staging tree; X11, examples and gpc are disabled.
+AGG_CONF_OPTS = \
+	--with-sdl-prefix=$(STAGING_DIR)/usr \
+	--disable-sdltest \
+	--with-x=NO \
+	--disable-examples \
+	--disable-gpc
+
+ifeq ($(BR2_PACKAGE_FREETYPE),y)
+AGG_DEPENDENCIES += freetype
+AGG_CONF_OPTS += --enable-freetype
+else
+AGG_CONF_OPTS += --disable-freetype
+endif
+
+$(eval $(autotools-package))
diff --git a/FunKey/package/fluidlite/0001-fluidlite.patch b/FunKey/package/fluidlite/0001-fluidlite.patch
new file mode 100644
index 0000000..9dc01c4
--- /dev/null
+++ b/FunKey/package/fluidlite/0001-fluidlite.patch
@@ -0,0 +1,11 @@
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -125,7 +125,7 @@
+     endif()
+ endif()
+ 
+-option(FLUIDLITE_BUILD_SHARED "Build shared library" TRUE)
++option(FLUIDLITE_BUILD_SHARED "Build shared library" FALSE)
+ if(FLUIDLITE_BUILD_SHARED)
+     add_library(${PROJECT_NAME} SHARED ${SOURCES})
+ 
diff --git a/FunKey/package/fluidlite/Config.in b/FunKey/package/fluidlite/Config.in
new file mode 100644
index 0000000..bdfb7e0
--- /dev/null
+++ b/FunKey/package/fluidlite/Config.in
@@ -0,0 +1,5 @@
+config BR2_PACKAGE_FLUIDLITE
+	bool "FluidLite"
+	depends on BR2_USE_MMU
+	help
+	  FluidLite is a very light version of FluidSynth.
diff --git a/FunKey/package/fluidlite/fluidsynth.hash b/FunKey/package/fluidlite/fluidsynth.hash
new file mode 100644
index 0000000..d39b0e8
--- /dev/null
+++ b/FunKey/package/fluidlite/fluidsynth.hash
@@ -0,0 +1,3 @@
+# Locally computed
+sha256  ef4d008f9fe2fa9a48135505d42dd7e8e9cc4d7494a4b13d6caa13adb5c61ff8  1.0.tar.gz
+sha256  a5564e99fd7f49e1344152a8c5bc1d420d5f973b30e010946764db0b5b9e668c  LICENSE
diff --git a/FunKey/package/fluidlite/fluidsynth.mk b/FunKey/package/fluidlite/fluidsynth.mk
new file mode 100644
index 0000000..5cd24b7
--- /dev/null
+++ b/FunKey/package/fluidlite/fluidsynth.mk
@@ -0,0 +1,19 @@
+################################################################################
+#
+# FLUIDLITE
+#
+################################################################################
+
+FLUIDLITE_VERSION = fdd05bad03cdb24d1f78b5fe3453842890c1b0e8
+FLUIDLITE_SITE = $(call github,gcw0,FluidLite,$(FLUIDLITE_VERSION))
+FLUIDLITE_LICENSE = LGPL-2.1+
+FLUIDLITE_LICENSE_FILES = LICENSE
+FLUIDLITE_INSTALL_STAGING = YES
+FLUIDLITE_DEPENDENCIES = 
+
+# Static-only build: switch off the FLUIDLITE_BUILD_SHARED CMake option
+ifeq ($(BR2_STATIC_LIBS),y)
+FLUIDLITE_CONF_OPTS += -DFLUIDLITE_BUILD_SHARED=FALSE
+endif
+
+$(eval $(cmake-package))
diff --git a/FunKey/package/fonts-droid/Config.in b/FunKey/package/fonts-droid/Config.in
new file mode 100644
index 0000000..a964265
--- /dev/null
+++ b/FunKey/package/fonts-droid/Config.in
@@ -0,0 +1,4 @@
+config BR2_PACKAGE_FONTS_DROID
+	bool "fonts-droid"
+	help
+	  Android (AOSP) "droid" family of fonts
diff --git a/FunKey/package/fonts-droid/fonts-droid.mk b/FunKey/package/fonts-droid/fonts-droid.mk
new file mode 100644
index 0000000..630f438
--- /dev/null
+++ b/FunKey/package/fonts-droid/fonts-droid.mk
@@ -0,0 +1,28 @@
+################################################################################
+#
+# fonts-droid
+#
+################################################################################
+
+FONTS_DROID_VERSION = 074990596701553b8b51ff22290453de522f0d15
+FONTS_DROID_SITE = https://android.googlesource.com/platform/frameworks/base/+archive/$(FONTS_DROID_VERSION)/data
+FONTS_DROID_SOURCE = fonts.tar.gz
+FONTS_DROID_LICENSE = Apache-2.0
+
+FONTS_DROID_STRIP_COMPONENTS = 0
+
+# We cannot verify the hash because googlesource.com produces an archive
+# with a different hash on every request.
+#
+# This still issues a warning.
+BR_NO_CHECK_HASH_FOR += $(FONTS_DROID_SOURCE)
+
+define FONTS_DROID_INSTALL_TARGET_CMDS
+	mkdir -p $(TARGET_DIR)/usr/share/fonts/droid/
+	$(INSTALL) -m 0644 \
+	  $(@D)/NOTICE \
+	  $(@D)/DroidSansFallback.ttf $(@D)/DroidSansFallbackFull.ttf \
+	  $(TARGET_DIR)/usr/share/fonts/droid/
+endef
+
+$(eval $(generic-package))
diff --git a/FunKey/package/gmenu2x/Config.in b/FunKey/package/gmenu2x/Config.in
new file mode 100644
index 0000000..833c465
--- /dev/null
+++ b/FunKey/package/gmenu2x/Config.in
@@ -0,0 +1,25 @@
+config BR2_PACKAGE_GMENU2X
+	bool "gmenu2x"
+	select BR2_PACKAGE_LIBPNG
+	select BR2_PACKAGE_SDL
+	select BR2_PACKAGE_SDL_TTF
+	select BR2_PACKAGE_SDL_GFX
+	select BR2_PACKAGE_DEJAVU
+	select BR2_PACKAGE_FONTS_DROID
+	help
+	  Application menu for devices with a 320x240 screen and buttons.
+
+	  This is a forked version with cleanups, targeted at the Ben NanoNote
+	  and the Dingoo A320/A330.
+
+	  http://projects.qi-hardware.com/index.php/p/gmenu2x
+
+if BR2_PACKAGE_GMENU2X
+
+config BR2_PACKAGE_GMENU2X_SHOW_CLOCK
+	bool "Show clock"
+
+config BR2_PACKAGE_GMENU2X_CPUFREQ
+	bool "Support CPU frequency control"
+
+endif
diff --git a/FunKey/package/gmenu2x/gmenu2x.mk b/FunKey/package/gmenu2x/gmenu2x.mk
new file mode 100644
index 0000000..f73e8c0
--- /dev/null
+++ b/FunKey/package/gmenu2x/gmenu2x.mk
@@ -0,0 +1,37 @@
+#############################################################
+#
+# gmenu2x
+#
+#############################################################
+GMENU2X_VERSION = gmenu2x-FunKey-1.0.0
+GMENU2X_SITE_METHOD = git
+GMENU2X_SITE = https://github.com/FunKey-Project/gmenu2x.git
+GMENU2X_LICENSE = GPL-2.0
+
+GMENU2X_DEPENDENCIES = sdl sdl_ttf sdl_gfx dejavu libpng fonts-droid
+
+GMENU2X_CONF_OPTS = -DBIND_CONSOLE=ON
+
+ifeq ($(BR2_PACKAGE_GMENU2X_SHOW_CLOCK),y)
+GMENU2X_CONF_OPTS += -DCLOCK=ON
+else
+GMENU2X_CONF_OPTS += -DCLOCK=OFF
+endif
+
+ifeq ($(BR2_PACKAGE_GMENU2X_CPUFREQ),y)
+GMENU2X_CONF_OPTS += -DCPUFREQ=ON
+else
+GMENU2X_CONF_OPTS += -DCPUFREQ=OFF
+endif
+
+GMENU2X_CONF_OPTS += -DSCREEN_WIDTH=240 -DSCREEN_HEIGHT=240 -DSCREEN_DEPTH=16
+
+ifeq ($(BR2_PACKAGE_LIBOPK),y)
+GMENU2X_DEPENDENCIES += libopk
+endif
+
+ifeq ($(BR2_PACKAGE_LIBXDGMIME),y)
+GMENU2X_DEPENDENCIES += libxdgmime
+endif
+
+$(eval $(cmake-package))
diff --git a/FunKey/package/gnuboy/gnuboy.mk b/FunKey/package/gnuboy/gnuboy.mk
index 830f056..dfc4e71 100644
--- a/FunKey/package/gnuboy/gnuboy.mk
+++ b/FunKey/package/gnuboy/gnuboy.mk
@@ -4,7 +4,7 @@
 #
 ################################################################################
 
-GNUBOY_VERSION = gnuboy-FunKey-1.00
+GNUBOY_VERSION = gnuboy-FunKey-1.1.0
 GNUBOY_SITE_METHOD = git
 GNUBOY_SITE = https://github.com/FunKey-Project/gnuboy.git
 GNUBOY_LICENSE = GPL-2.0
@@ -12,25 +12,7 @@ GNUBOY_LICENSE_FILES = COPYING
 
 GNUBOY_DEPENDENCIES = sdl sdl_image sdl_mixer sdl_ttf zlib
 
-GNUBOY_CFLAGS = $(TARGET_CFLAGS)
-
-ifeq ($(BR2_ARM_CPU_ARMV7A),y)
-GNUBOY_CFLAGS += -march=armv7-a
-endif
-
-ifeq ($(BR2_GCC_TARGET_CPU),"cortex-a7")
-GNUBOY_CFLAGS += -mtune=cortex-a7
-endif
-
-ifeq ($(BR2_GCC_TARGET_FLOAT_ABI),"hard")
-GNUBOY_CFLAGS += -mfloat-abi=hard -ffast-math -funsafe-math-optimizations
-else ifeq ($(BR2_GCC_TARGET_FLOAT_ABI),"soft")
-GNUBOY_CFLAGS += -mfloat-abi=soft -ffast-math -funsafe-math-optimizations
-endif
-
-ifeq ($(BR2_ARM_CPU_HAS_NEON),y)
-GNUBOY_CFLAGS += -D__ARM_NEON__ -mfpu=neon -mvectorize-with-neon-quad
-endif
+GNUBOY_CFLAGS = $(TARGET_CFLAGS) $(subst $\",,$(BR2_TARGET_OPTIMIZATION)) -mfloat-abi=hard -ffast-math -funsafe-math-optimizations
 
 GNUBOY_CFLAGS += -ggdb -O3
 
@@ -43,4 +25,11 @@ GNUBOY_CONF_OPTS += --without-fb \
 
 GNUBOY_CONF_ENV += SDL_CONFIG="$(STAGING_DIR)/usr/bin/sdl-config"
 
+define GNUBOY_CREATE_OPK
+	$(INSTALL) -d -m 0755 $(TARGET_DIR)/usr/games/opk
+	$(HOST_DIR)/usr/bin/mksquashfs $(GNUBOY_PKGDIR)/opk/gb $(TARGET_DIR)/usr/games/opk/gb_gnuboy_funkey-s.opk -all-root -noappend -no-exports -no-xattrs
+	$(HOST_DIR)/usr/bin/mksquashfs $(GNUBOY_PKGDIR)/opk/gbc $(TARGET_DIR)/usr/games/opk/gbc_gnuboy_funkey-s.opk -all-root -noappend -no-exports -no-xattrs
+endef
+GNUBOY_POST_INSTALL_TARGET_HOOKS += GNUBOY_CREATE_OPK
+
 $(eval $(autotools-package))
diff --git a/FunKey/package/gnuboy/opk/gb/gb.funkey-s.desktop b/FunKey/package/gnuboy/opk/gb/gb.funkey-s.desktop
new file mode 100644
index 0000000..a6e56d7
--- /dev/null
+++ b/FunKey/package/gnuboy/opk/gb/gb.funkey-s.desktop
@@ -0,0 +1,12 @@
+[Desktop Entry]
+Type=Application
+Name=GameBoy
+Comment=Game Boy Emulator
+Icon=gb
+Exec=/usr/games/launchers/gb_launch.sh %f
+Terminal=false
+X-OD-NeedsDownscaling=true
+Categories=emulators;
+selectorbrowser=true
+SelectorDir=/mnt/Game Boy
+SelectorFilter=gb,GB
diff --git a/FunKey/package/gnuboy/opk/gb/gb.png b/FunKey/package/gnuboy/opk/gb/gb.png
new file mode 100644
index 0000000..7b7d823
Binary files /dev/null and b/FunKey/package/gnuboy/opk/gb/gb.png differ
diff --git a/FunKey/package/gnuboy/opk/gbc/gbc.funkey-s.desktop b/FunKey/package/gnuboy/opk/gbc/gbc.funkey-s.desktop
new file mode 100644
index 0000000..30f3c65
--- /dev/null
+++ b/FunKey/package/gnuboy/opk/gbc/gbc.funkey-s.desktop
@@ -0,0 +1,12 @@
+[Desktop Entry]
+Type=Application
+Name=GameBoy Col
+Comment=Game Boy Color Emulator
+Icon=gbc
+Exec=/usr/games/launchers/gb_launch.sh %f
+Terminal=false
+X-OD-NeedsDownscaling=true
+Categories=emulators;
+selectorbrowser=true
+SelectorDir=/mnt/Game Boy Color
+SelectorFilter=gbc,GBC
diff --git a/FunKey/package/gnuboy/opk/gbc/gbc.png b/FunKey/package/gnuboy/opk/gbc/gbc.png
new file mode 100644
index 0000000..24eea74
Binary files /dev/null and b/FunKey/package/gnuboy/opk/gbc/gbc.png differ
diff --git a/FunKey/package/gpsp/gpsp.mk b/FunKey/package/gpsp/gpsp.mk
index 743d032..19e2721 100644
--- a/FunKey/package/gpsp/gpsp.mk
+++ b/FunKey/package/gpsp/gpsp.mk
@@ -4,7 +4,7 @@
 #
 ################################################################################
 
-GPSP_VERSION = gpsp-FunKey-1.00
+GPSP_VERSION = gpsp-FunKey-1.1.0
 GPSP_SITE_METHOD = git
 GPSP_SITE = https://github.com/FunKey-Project/gpsp.git
 GPSP_LICENSE = GPL-2.0
@@ -12,25 +12,7 @@ GPSP_LICENSE_FILES = COPYING.DOC
 
 GPSP_DEPENDENCIES = sdl sdl_image sdl_mixer sdl_ttf zlib
 
-GPSP_CFLAGS = $(TARGET_CFLAGS)
-
-ifeq ($(BR2_ARM_CPU_ARMV7A),y)
-GPSP_CFLAGS += -march=armv7-a
-endif
-
-ifeq ($(BR2_GCC_TARGET_CPU),"cortex-a7")
-GPSP_CFLAGS += -mtune=cortex-a7
-endif
-
-ifeq ($(BR2_GCC_TARGET_FLOAT_ABI),"hard")
-GPSP_CFLAGS += -mfloat-abi=hard -ffast-math -funsafe-math-optimizations
-else ifeq ($(BR2_GCC_TARGET_FLOAT_ABI),"soft")
-GPSP_CFLAGS += -mfloat-abi=soft -ffast-math -funsafe-math-optimizations
-endif
-
-ifeq ($(BR2_ARM_CPU_HAS_NEON),y)
-GPSP_CFLAGS += -D__ARM_NEON__ -mfpu=neon -mvectorize-with-neon-quad
-endif
+GPSP_CFLAGS = $(TARGET_CFLAGS) $(subst $\",,$(BR2_TARGET_OPTIMIZATION)) -mfloat-abi=hard -ffast-math -funsafe-math-optimizations
 
 GPSP_SDL_CFLAGS += $(shell $(STAGING_DIR)/usr/bin/sdl-config --cflags)
 GPSP_SDL_LIBS += $(shell $(STAGING_DIR)/usr/bin/sdl-config --libs)
@@ -58,4 +40,10 @@ define GPSP_INSTALL_TARGET_CMDS
 	$(INSTALL) -m 0644 $(@D)/game_config.txt $(TARGET_DIR)/usr/games/game_config.txt
 endef
 
+define GPSP_CREATE_OPK
+	$(INSTALL) -d -m 0755 $(TARGET_DIR)/usr/games/opk
+	$(HOST_DIR)/usr/bin/mksquashfs $(GPSP_PKGDIR)/opk/gba $(TARGET_DIR)/usr/games/opk/gba_gpsp_funkey-s.opk -all-root -noappend -no-exports -no-xattrs
+endef
+GPSP_POST_INSTALL_TARGET_HOOKS += GPSP_CREATE_OPK
+
 $(eval $(generic-package))
diff --git a/FunKey/package/gpsp/opk/gba/gba.funkey-s.desktop b/FunKey/package/gpsp/opk/gba/gba.funkey-s.desktop
new file mode 100644
index 0000000..44ec461
--- /dev/null
+++ b/FunKey/package/gpsp/opk/gba/gba.funkey-s.desktop
@@ -0,0 +1,12 @@
+[Desktop Entry]
+Type=Application
+Name=GameBoy Adv
+Comment=Game Boy Advance Emulator
+Icon=gba
+Exec=/usr/games/launchers/gba_launch_gpsp.sh %f
+Terminal=false
+X-OD-NeedsDownscaling=true
+Categories=emulators;
+selectorbrowser=true
+SelectorDir=/mnt/Game Boy Advance
+SelectorFilter=gba,GBA
diff --git a/FunKey/package/gpsp/opk/gba/gba.png b/FunKey/package/gpsp/opk/gba/gba.png
new file mode 100644
index 0000000..77d3eab
Binary files /dev/null and b/FunKey/package/gpsp/opk/gba/gba.png differ
diff --git a/FunKey/package/libini/Config.in b/FunKey/package/libini/Config.in
new file mode 100644
index 0000000..a42967b
--- /dev/null
+++ b/FunKey/package/libini/Config.in
@@ -0,0 +1,6 @@
+config BR2_PACKAGE_LIBINI
+	bool "libini"
+	help
+	  Tiny library to help reading INI files.
+
+	  https://github.com/pcercuei/libini
diff --git a/FunKey/package/libini/libini.mk b/FunKey/package/libini/libini.mk
new file mode 100644
index 0000000..33c1736
--- /dev/null
+++ b/FunKey/package/libini/libini.mk
@@ -0,0 +1,13 @@
+#############################################################
+#
+# libini
+#
+#############################################################
+LIBINI_VERSION = libini-FunKey-1.0.0
+LIBINI_SITE_METHOD = git
+LIBINI_SITE = https://github.com/FunKey-Project/libini.git
+LIBINI_LICENSE = LGPL-2.1
+
+LIBINI_INSTALL_STAGING = YES
+
+$(eval $(cmake-package))
diff --git a/FunKey/package/libmikmod/Config.in b/FunKey/package/libmikmod/Config.in
new file mode 100644
index 0000000..909cc00
--- /dev/null
+++ b/FunKey/package/libmikmod/Config.in
@@ -0,0 +1,7 @@
+config BR2_PACKAGE_LIBMIKMOD
+	bool "libmikmod"
+	help
+	  Mikmod is a module player and library supporting many
+	  tracker formats, including mod, s3m, it, and xm.
+
+	  http://mikmod.shlomifish.org/
diff --git a/FunKey/package/libmikmod/libmikmod.mk b/FunKey/package/libmikmod/libmikmod.mk
new file mode 100644
index 0000000..cce5190
--- /dev/null
+++ b/FunKey/package/libmikmod/libmikmod.mk
@@ -0,0 +1,21 @@
+#############################################################
+#
+# libmikmod
+#
+#############################################################
+LIBMIKMOD_VERSION = 3.3.11.1
+LIBMIKMOD_SITE = http://sourceforge.net/projects/mikmod/files/libmikmod/$(LIBMIKMOD_VERSION)
+LIBMIKMOD_INSTALL_STAGING = YES
+LIBMIKMOD_LIBTOOL_PATCH = NO
+LIBMIKMOD_CONFIG_SCRIPTS = libmikmod-config
+
+LIBMIKMOD_CONF_OPTS = --localstatedir=/var
+
+# Post-install hook: move libmikmod-config out of the target tree and
+# into the host tools directory.
+define LIBMIKMOD_REMOVE_LIBMIKMOD_CONFIG
+	mv $(TARGET_DIR)/usr/bin/libmikmod-config $(HOST_DIR)/bin/
+endef
+LIBMIKMOD_POST_INSTALL_TARGET_HOOKS += LIBMIKMOD_REMOVE_LIBMIKMOD_CONFIG
+
+$(eval $(autotools-package))
diff --git a/FunKey/package/libopk/Config.in b/FunKey/package/libopk/Config.in
new file mode 100644
index 0000000..db3bce9
--- /dev/null
+++ b/FunKey/package/libopk/Config.in
@@ -0,0 +1,8 @@
+config BR2_PACKAGE_LIBOPK
+	bool "libopk"
+	select BR2_PACKAGE_ZLIB
+	select BR2_PACKAGE_LIBINI
+	help
+	  Library to handle OPK packages.
+
+	  https://github.com/pcercuei/libopk
diff --git a/FunKey/package/libopk/libopk.mk b/FunKey/package/libopk/libopk.mk
new file mode 100644
index 0000000..7f767a7
--- /dev/null
+++ b/FunKey/package/libopk/libopk.mk
@@ -0,0 +1,14 @@
+#############################################################
+#
+# libopk
+#
+#############################################################
+LIBOPK_VERSION = libopk-FunKey-1.0.0
+LIBOPK_SITE_METHOD = git
+LIBOPK_SITE = https://github.com/FunKey-Project/libopk.git
+
+LIBOPK_DEPENDENCIES = libini zlib
+
+LIBOPK_INSTALL_STAGING = YES
+
+$(eval $(cmake-package))
diff --git a/FunKey/package/libxdgmime/Config.in b/FunKey/package/libxdgmime/Config.in
new file mode 100644
index 0000000..f1d4fc0
--- /dev/null
+++ b/FunKey/package/libxdgmime/Config.in
@@ -0,0 +1,12 @@
+config BR2_PACKAGE_LIBXDGMIME
+	bool "libxdgmime"
+	depends on BR2_USE_WCHAR # shared-mime-info
+	select BR2_PACKAGE_SHARED_MIME_INFO
+	help
+	  Simple library that parses the proposed MIME spec
+	  listed at http://freedesktop.org/.
+
+	  https://github.com/pcercuei/libxdgmime
+
+comment "libxdgmime requires a toolchain with WCHAR support"
+	depends on !BR2_USE_WCHAR
diff --git a/FunKey/package/libxdgmime/libxdgmime.mk b/FunKey/package/libxdgmime/libxdgmime.mk
new file mode 100644
index 0000000..509703d
--- /dev/null
+++ b/FunKey/package/libxdgmime/libxdgmime.mk
@@ -0,0 +1,30 @@
+#############################################################
+#
+# libxdgmime
+#
+#############################################################
+LIBXDGMIME_VERSION = libxdgmime-FunKey-1.0.0
+LIBXDGMIME_SITE_METHOD = git
+LIBXDGMIME_SITE = https://github.com/FunKey-Project/libxdgmime.git
+LIBXDGMIME_DEPENDENCIES = shared-mime-info
+LIBXDGMIME_LICENSE = LGPL-2.1+ or AFL-2.1
+
+LIBXDGMIME_INSTALL_STAGING = YES
+
+LIBXDGMIME_MAKE_ENV = CFLAGS="$(TARGET_CFLAGS)" LDFLAGS="$(TARGET_LDFLAGS)" \
+				  CROSS_COMPILE="$(TARGET_CROSS)" PREFIX=/usr \
+				  PLATFORM="$(BR2_VENDOR)"
+
+define LIBXDGMIME_BUILD_CMDS
+	$(LIBXDGMIME_MAKE_ENV) $(MAKE) -C $(@D)
+endef
+
+define LIBXDGMIME_INSTALL_STAGING_CMDS
+	$(LIBXDGMIME_MAKE_ENV) DESTDIR="$(STAGING_DIR)" $(MAKE) -C $(@D) install
+endef
+
+define LIBXDGMIME_INSTALL_TARGET_CMDS
+	$(LIBXDGMIME_MAKE_ENV) DESTDIR="$(TARGET_DIR)" $(MAKE) -C $(@D) install-lib
+endef
+
+$(eval $(generic-package))
diff --git a/FunKey/package/mednafen/mednafen.mk b/FunKey/package/mednafen/mednafen.mk
index 991044f..0ce2272 100644
--- a/FunKey/package/mednafen/mednafen.mk
+++ b/FunKey/package/mednafen/mednafen.mk
@@ -4,7 +4,7 @@
 #
 ################################################################################
 
-MEDNAFEN_VERSION = mednafen-git-FunKey-1.00
+MEDNAFEN_VERSION = mednafen-git-FunKey-1.1.0
 MEDNAFEN_SITE_METHOD = git
 MEDNAFEN_SITE = https://github.com/FunKey-Project/mednafen-git.git
 MEDNAFEN_LICENSE = GPL-2.0+
@@ -12,36 +12,36 @@ MEDNAFEN_LICENSE_FILES = COPYING
 
 MEDNAFEN_DEPENDENCIES = sdl sdl_image sdl_mixer sdl_ttf zlib
 
-MEDNAFEN_CFLAGS = $(TARGET_CFLAGS)
-
-ifeq ($(BR2_ARM_CPU_ARMV7A),y)
-MEDNAFEN_CFLAGS += -march=armv7-a
-endif
-
-ifeq ($(BR2_GCC_TARGET_CPU),"cortex-a7")
-MEDNAFEN_CFLAGS += -mtune=cortex-a7
-endif
-
-ifeq ($(BR2_GCC_TARGET_FLOAT_ABI),"hard")
-MEDNAFEN_CFLAGS += -mfloat-abi=hard -ffast-math -funsafe-math-optimizations
-else ifeq ($(BR2_GCC_TARGET_FLOAT_ABI),"soft")
-MEDNAFEN_CFLAGS += -mfloat-abi=soft -ffast-math -funsafe-math-optimizations
-endif
-
-ifeq ($(BR2_ARM_CPU_HAS_NEON),y)
-MEDNAFEN_CFLAGS += -D__ARM_NEON__ -mfpu=neon -mvectorize-with-neon-quad
-endif
+MEDNAFEN_CFLAGS = $(TARGET_CFLAGS) $(subst $\",,$(BR2_TARGET_OPTIMIZATION)) -mfloat-abi=hard
 
 #MEDNAFEN_AUTORECONF = YES
 
 MEDNAFEN_CFLAGS += -ggdb -O3
+MEDNAFEN_CFLAGS += -DFUNKEY_FAST_BLIT
 
 #MEDNAFEN_LDFLAGS +=  -lSDL_ttf -lSDL_image
 
-MEDNAFEN_CONF_OPTS += CFLAGS="$(MEDNAFEN_CFLAGS)"
+MEDNAFEN_CONF_OPTS += CXXFLAGS="$(MEDNAFEN_CFLAGS)"
 #MEDNAFEN_CONF_OPTS += LDFLAGS="$(MEDNAFEN_LDFLAGS)"
 MEDNAFEN_CONF_OPTS += --prefix=/usr/local --bindir=/usr/games --without-libsndfile
+MEDNAFEN_CONF_OPTS += --disable-ss --disable-ssfplay --disable-fancy-scalers
+#MEDNAFEN_CONF_OPTS += --disable-nes --disable-gba --disable-psx --disable-snes
 
 MEDNAFEN_CONF_ENV += SDL_CONFIG="$(STAGING_DIR)/usr/bin/sdl-config"
 
+# The OPK hook below runs the host mksquashfs, so make sure it is built first
+MEDNAFEN_DEPENDENCIES += host-squashfs
+
+# Pack each opk/<system> directory of this package into a squashfs image
+# (.opk) under /usr/games/opk on the target, one per emulated system
+define MEDNAFEN_CREATE_OPK
+	$(INSTALL) -d -m 0755 $(TARGET_DIR)/usr/games/opk
+	$(HOST_DIR)/usr/bin/mksquashfs $(MEDNAFEN_PKGDIR)/opk/gamegear $(TARGET_DIR)/usr/games/opk/gamegear_mednafen_funkey-s.opk -all-root -noappend -no-exports -no-xattrs
+	$(HOST_DIR)/usr/bin/mksquashfs $(MEDNAFEN_PKGDIR)/opk/lynx $(TARGET_DIR)/usr/games/opk/lynx_mednafen_funkey-s.opk -all-root -noappend -no-exports -no-xattrs
+	$(HOST_DIR)/usr/bin/mksquashfs $(MEDNAFEN_PKGDIR)/opk/ngp $(TARGET_DIR)/usr/games/opk/ngp_mednafen_funkey-s.opk -all-root -noappend -no-exports -no-xattrs
+	$(HOST_DIR)/usr/bin/mksquashfs $(MEDNAFEN_PKGDIR)/opk/pce $(TARGET_DIR)/usr/games/opk/pce_mednafen_funkey-s.opk -all-root -noappend -no-exports -no-xattrs
+	$(HOST_DIR)/usr/bin/mksquashfs $(MEDNAFEN_PKGDIR)/opk/wonderswan $(TARGET_DIR)/usr/games/opk/wonderswan_mednafen_funkey-s.opk -all-root -noappend -no-exports -no-xattrs
+endef
+MEDNAFEN_POST_INSTALL_TARGET_HOOKS += MEDNAFEN_CREATE_OPK
+
 $(eval $(autotools-package))
diff --git a/FunKey/package/mednafen/opk/gamegear/gamegear.funkey-s.desktop b/FunKey/package/mednafen/opk/gamegear/gamegear.funkey-s.desktop
new file mode 100644
index 0000000..bbc5a28
--- /dev/null
+++ b/FunKey/package/mednafen/opk/gamegear/gamegear.funkey-s.desktop
@@ -0,0 +1,12 @@
+[Desktop Entry]
+Type=Application
+Name=Game Gear
+Comment=Game Gear Emulator
+Icon=gamegear
+Exec=/usr/games/launchers/gamegear_launch.sh %f
+Terminal=false
+X-OD-NeedsDownscaling=true
+Categories=emulators;
+selectorbrowser=true
+SelectorDir=/mnt/Game Gear
+SelectorFilter=zip,ZIP,gg,GG
diff --git a/FunKey/package/mednafen/opk/gamegear/gamegear.png b/FunKey/package/mednafen/opk/gamegear/gamegear.png
new file mode 100644
index 0000000..c6a0f24
Binary files /dev/null and b/FunKey/package/mednafen/opk/gamegear/gamegear.png differ
diff --git a/FunKey/package/mednafen/opk/lynx/lynx.funkey-s.desktop b/FunKey/package/mednafen/opk/lynx/lynx.funkey-s.desktop
new file mode 100644
index 0000000..3216c4f
--- /dev/null
+++ b/FunKey/package/mednafen/opk/lynx/lynx.funkey-s.desktop
@@ -0,0 +1,12 @@
+[Desktop Entry]
+Type=Application
+Name=Lynx
+Comment=Atari Lynx Emulator
+Icon=lynx
+Exec=/usr/games/launchers/lynx_launch.sh %f
+Terminal=false
+X-OD-NeedsDownscaling=true
+Categories=emulators;
+selectorbrowser=true
+SelectorDir=/mnt/Atari lynx
+SelectorFilter=zip,ZIP,lnx,LNX
diff --git a/FunKey/package/mednafen/opk/lynx/lynx.png b/FunKey/package/mednafen/opk/lynx/lynx.png
new file mode 100644
index 0000000..bb144e9
Binary files /dev/null and b/FunKey/package/mednafen/opk/lynx/lynx.png differ
diff --git a/FunKey/package/mednafen/opk/ngp/ngp.funkey-s.desktop b/FunKey/package/mednafen/opk/ngp/ngp.funkey-s.desktop
new file mode 100644
index 0000000..76365aa
--- /dev/null
+++ b/FunKey/package/mednafen/opk/ngp/ngp.funkey-s.desktop
@@ -0,0 +1,12 @@
+[Desktop Entry]
+Type=Application
+Name=Neo Geo
+Comment=Neo Geo Pocket Emulator
+Icon=ngp
+Exec=/usr/games/launchers/ngp_launch.sh %f
+Terminal=false
+X-OD-NeedsDownscaling=true
+Categories=emulators;
+selectorbrowser=true
+SelectorDir=/mnt/Neo Geo Pocket
+SelectorFilter=zip,ZIP,ngp,NGP
diff --git a/FunKey/package/mednafen/opk/ngp/ngp.png b/FunKey/package/mednafen/opk/ngp/ngp.png
new file mode 100644
index 0000000..29d9c0e
Binary files /dev/null and b/FunKey/package/mednafen/opk/ngp/ngp.png differ
diff --git a/FunKey/package/mednafen/opk/pce/pce.funkey-s.desktop b/FunKey/package/mednafen/opk/pce/pce.funkey-s.desktop
new file mode 100644
index 0000000..355dcd9
--- /dev/null
+++ b/FunKey/package/mednafen/opk/pce/pce.funkey-s.desktop
@@ -0,0 +1,12 @@
+[Desktop Entry]
+Type=Application
+Name=PC Engine
+Comment=PCE-TurboGrafx Emulator
+Icon=pce
+Exec=/usr/games/launchers/pce_launch.sh %f
+Terminal=false
+X-OD-NeedsDownscaling=true
+Categories=emulators;
+selectorbrowser=true
+SelectorDir=/mnt/PCE-TurboGrafx
+SelectorFilter=zip,ZIP,pce,PCE,sgx,SGX
diff --git a/FunKey/package/mednafen/opk/pce/pce.png b/FunKey/package/mednafen/opk/pce/pce.png
new file mode 100644
index 0000000..e1a0f8c
Binary files /dev/null and b/FunKey/package/mednafen/opk/pce/pce.png differ
diff --git a/FunKey/package/mednafen/opk/wonderswan/wonderswan.funkey-s.desktop b/FunKey/package/mednafen/opk/wonderswan/wonderswan.funkey-s.desktop
new file mode 100644
index 0000000..ec4280b
--- /dev/null
+++ b/FunKey/package/mednafen/opk/wonderswan/wonderswan.funkey-s.desktop
@@ -0,0 +1,12 @@
+[Desktop Entry]
+Type=Application
+Name=WonderSwan
+Comment=WonderSwan/WonderSwan Color Emulator
+Icon=wonderswan
+Exec=/usr/games/launchers/wonderswan_launch.sh %f
+Terminal=false
+X-OD-NeedsDownscaling=true
+Categories=emulators;
+selectorbrowser=true
+SelectorDir=/mnt/WonderSwan
+SelectorFilter=zip,ZIP,wsc,WSC
diff --git a/FunKey/package/mednafen/opk/wonderswan/wonderswan.png b/FunKey/package/mednafen/opk/wonderswan/wonderswan.png
new file mode 100644
index 0000000..265f1c0
Binary files /dev/null and b/FunKey/package/mednafen/opk/wonderswan/wonderswan.png differ
diff --git a/FunKey/package/picodrive/opk/megadrive/megadrive.funkey-s.desktop b/FunKey/package/picodrive/opk/megadrive/megadrive.funkey-s.desktop
new file mode 100644
index 0000000..0f2f544
--- /dev/null
+++ b/FunKey/package/picodrive/opk/megadrive/megadrive.funkey-s.desktop
@@ -0,0 +1,12 @@
+[Desktop Entry]
+Type=Application
+Name=MegaDrive
+Comment=Sega Genesis/MegaDrive Emulator
+Icon=megadrive
+Exec=/usr/games/launchers/megadrive_launch.sh %f
+Terminal=false
+X-OD-NeedsDownscaling=true
+Categories=emulators;
+selectorbrowser=true
+SelectorDir=/mnt/Sega Genesis
+SelectorFilter=zip,ZIP,md,MD
diff --git a/FunKey/package/picodrive/opk/megadrive/megadrive.png b/FunKey/package/picodrive/opk/megadrive/megadrive.png
new file mode 100644
index 0000000..585dd22
Binary files /dev/null and b/FunKey/package/picodrive/opk/megadrive/megadrive.png differ
diff --git a/FunKey/package/picodrive/picodrive.mk b/FunKey/package/picodrive/picodrive.mk
index f6b7070..1b17d4f 100644
--- a/FunKey/package/picodrive/picodrive.mk
+++ b/FunKey/package/picodrive/picodrive.mk
@@ -4,7 +4,7 @@
 #
 ################################################################################
 
-PICODRIVE_VERSION = picodrive-FunKey-1.00
+PICODRIVE_VERSION = picodrive-FunKey-1.1.0
 PICODRIVE_SITE_METHOD = git
 PICODRIVE_SITE = https://github.com/FunKey-Project/picodrive.git
 PICODRIVE_LICENSE = MAME
@@ -12,25 +12,7 @@ PICODRIVE_LICENSE_FILES = COPYING
 
 PICODRIVE_DEPENDENCIES = sdl sdl_image sdl_mixer sdl_ttf zlib
 
-PICODRIVE_CFLAGS = $(TARGET_CFLAGS)
-
-ifeq ($(BR2_ARM_CPU_ARMV7A),y)
-PICODRIVE_CFLAGS += -march=armv7-a
-endif
-
-ifeq ($(BR2_GCC_TARGET_CPU),"cortex-a7")
-PICODRIVE_CFLAGS += -mtune=cortex-a7
-endif
-
-ifeq ($(BR2_GCC_TARGET_FLOAT_ABI),"hard")
-PICODRIVE_CFLAGS += -mfloat-abi=hard -ffast-math -funsafe-math-optimizations
-else ifeq ($(BR2_GCC_TARGET_FLOAT_ABI),"soft")
-PICODRIVE_CFLAGS += -mfloat-abi=soft -ffast-math -funsafe-math-optimizations
-endif
-
-ifeq ($(BR2_ARM_CPU_HAS_NEON),y)
-PICODRIVE_CFLAGS += -D__ARM_NEON__ -mfpu=neon -mvectorize-with-neon-quad
-endif
+PICODRIVE_CFLAGS = $(TARGET_CFLAGS) $(subst $\",,$(BR2_TARGET_OPTIMIZATION)) -mfloat-abi=hard -ffast-math -funsafe-math-optimizations
 
 PICODRIVE_CONF_OPTS += --platform=generic --sound-drivers=sdl
 PICODRIVE_CFLAGS += -ggdb -O3
@@ -62,5 +44,14 @@ define PICODRIVE_INSTALL_TARGET_CMDS
 	$(INSTALL) -m 0755 $(@D)/PicoDrive $(TARGET_DIR)/usr/games/
 endef
 
+# The OPK hook below runs the host mksquashfs, so make sure it is built first
+PICODRIVE_DEPENDENCIES += host-squashfs
+
+# Pack opk/megadrive into a squashfs image (.opk) under /usr/games/opk on the target
+define PICODRIVE_CREATE_OPK
+	$(INSTALL) -d -m 0755 $(TARGET_DIR)/usr/games/opk
+	$(HOST_DIR)/usr/bin/mksquashfs $(PICODRIVE_PKGDIR)/opk/megadrive $(TARGET_DIR)/usr/games/opk/megadrive_picodrive_funkey-s.opk -all-root -noappend -no-exports -no-xattrs
+endef
+PICODRIVE_POST_INSTALL_TARGET_HOOKS += PICODRIVE_CREATE_OPK
 
 $(eval $(generic-package))
diff --git a/FunKey/package/retrofe/retrofe.mk b/FunKey/package/retrofe/retrofe.mk
index ab89168..82a9f51 100644
--- a/FunKey/package/retrofe/retrofe.mk
+++ b/FunKey/package/retrofe/retrofe.mk
@@ -4,7 +4,7 @@
 #
 ################################################################################
 
-RETROFE_VERSION = RetroFE-FunKey-1.00
+RETROFE_VERSION = RetroFE-FunKey-1.1.0
 RETROFE_SITE_METHOD = git
 RETROFE_SITE = https://github.com/FunKey-Project/RetroFE.git
 RETROFE_DEPENDENCIES = gstreamer1 gst1-plugins-base sdl sdl_image sdl_mixer sdl_sound sdl_ttf libglib2 sqlite zlib
@@ -15,9 +15,19 @@ RETROFE_SUBDIR = RetroFE/Source
 RETROFE_SUPPORTS_IN_SOURCE_BUILD = NO
 RETROFE_CONF_OPTS += -DVERSION_MAJOR=0 -DVERSION_MINOR=0 -DVERSION_BUILD=0
 
+# Optional libmikmod support, only when that package is selected
+ifeq ($(BR2_PACKAGE_LIBMIKMOD),y)
+RETROFE_DEPENDENCIES += libmikmod
+RETROFE_CONF_OPTS += -DLIBMIKMOD=1
+endif
+
 define RETROFE_INSTALL_TARGET_CMDS
 	$(INSTALL) -d -m 0755 $(TARGET_DIR)/usr/games
 	$(INSTALL) -m 0755 $(@D)/RetroFE/Build/retrofe $(TARGET_DIR)/usr/games/retrofe
 endef
 
+# Build RetroFE with -O3. The flag is passed through this package's own CMake
+# options: TARGET_CFLAGS is shared by every package and must not be modified here.
+RETROFE_CONF_OPTS += -DCMAKE_C_FLAGS="$(TARGET_CFLAGS) -O3" -DCMAKE_CXX_FLAGS="$(TARGET_CXXFLAGS) -O3"
+
 $(eval $(cmake-package))
diff --git a/Makefile b/Makefile
index 83cf861..86a6c02 100644
--- a/Makefile
+++ b/Makefile
@@ -57,6 +57,26 @@ fun: buildroot Recovery/output/.config FunKey/output/.config
 	@$(call MESSAGE,"Making fun in FunKey")
 	@$(BRMAKE) BR2_EXTERNAL=../FunKey O=../FunKey/output
 
+# Build the SDK host tree, relocate its text files to /opt/FunKey-sdk-<version>, archive it in images/
+sdk: buildroot SDK/output/.config
+	@$(call MESSAGE,"Making FunKey SDK")
+	@$(BRMAKE) BR2_EXTERNAL=../SDK O=../SDK/output prepare-sdk
+	@$(call MESSAGE,"Generating SDK tarball")
+	set -e; export LC_ALL=C; \
+	SDK=FunKey-sdk-$(shell cat FunKey/board/funkey/rootfs-overlay/etc/sw-versions | cut -f 2); \
+	grep -lr "$(CURDIR)/SDK/output/host" SDK/output/host | while read -r FILE ; do \
+		if file -b --mime-type "$${FILE}" | grep -q '^text/'; then \
+			sed -i "s|$(CURDIR)/SDK/output/host|/opt/$${SDK}|g" "$${FILE}"; \
+		fi; \
+	done; \
+	mkdir -p images; \
+	tar czf "images/$${SDK}.tar.gz" \
+		--owner=0 --group=0 --numeric-owner \
+		--transform="s#^$(patsubst /%,%,$(CURDIR))/SDK/output/host#$${SDK}#" \
+		-C / "$(patsubst /%,%,$(CURDIR))/SDK/output/host"; \
+	mkdir -p download/toolchain-external-custom; \
+	ln -sf "../../images/$${SDK}.tar.gz" download/toolchain-external-custom/
+
 FunKey/%: FunKey/output/.config
 	@$(call MESSAGE,"Making $(notdir $@) in $(subst /,,$(dir $@))")
 	@$(BR) BR2_EXTERNAL=../FunKey O=../FunKey/output $(notdir $@)
@@ -65,12 +85,17 @@ Recovery/%: Recovery/output/.config
 	@$(call MESSAGE,"Making $(notdir $@) in $(subst /,,$(dir $@))")
 	@$(BR) BR2_EXTERNAL=../Recovery O=../Recovery/output $(notdir $@)
 
+SDK/%: SDK/output/.config
+	@$(call MESSAGE,"Making $(notdir $@) in $(subst /,,$(dir $@))")
+	@$(BR) BR2_EXTERNAL=../SDK O=../SDK/output $(notdir $@)
+
 #%: FunKey/output/.config
 #	@$(call MESSAGE,"Making $@ in FunKey")
 #	@$(BR) BR2_EXTERNAL=../FunKey O=../FunKey/output $@
 
 source:
 	@$(call MESSAGE,"Getting sources")
+	@$(BR) BR2_EXTERNAL=../SDK O=../SDK/output source
 	@$(BR) BR2_EXTERNAL=../Recovery O=../Recovery/output source
 	@$(BR) BR2_EXTERNAL=../FunKey O=../FunKey/output source
 
@@ -108,6 +133,8 @@ update: fun
 
 defconfig:
 	@$(call MESSAGE,"Updating default configs")
+	@$(call MESSAGE,"Updating default configs in SDK")
+	@$(BR) BR2_EXTERNAL=../SDK O=../SDK/output savedefconfig
 	@$(call MESSAGE,"Updating default configs in Recovery")
 	@$(BR) BR2_EXTERNAL=../Recovery O=../Recovery/output savedefconfig linux-update-defconfig uboot-update-defconfig busybox-update-config
 	@$(call MESSAGE,"Updating default configs in FunKey")
@@ -115,6 +142,7 @@ defconfig:
 
 clean:
 	@$(call MESSAGE,"Clean everything")
+	@$(BR) BR2_EXTERNAL=../SDK O=../SDK/output distclean
 	@$(BR) BR2_EXTERNAL=../Recovery O=../Recovery/output distclean
 	@$(BR) BR2_EXTERNAL=../FunKey O=../FunKey/output distclean
 	@rm -f br.log
@@ -132,3 +160,8 @@ Recovery/output/.config:
 	@$(call MESSAGE,"Configure Recovery")
 	@mkdir -p Recovery/board/funkey/patches
 	@$(BR) BR2_EXTERNAL=../Recovery O=../Recovery/output recovery_defconfig
+
+SDK/output/.config:
+	@$(call MESSAGE,"Configure SDK")
+	@mkdir -p SDK/board/funkey/patches
+	@$(BR) BR2_EXTERNAL=../SDK O=../SDK/output funkey_defconfig
diff --git a/Recovery/board/funkey/busybox.config b/Recovery/board/funkey/busybox.config
index 6d28ca2..bc03308 100644
--- a/Recovery/board/funkey/busybox.config
+++ b/Recovery/board/funkey/busybox.config
@@ -1,7 +1,7 @@
 #
 # Automatically generated make config: don't edit
-# Busybox version: 1.31.1
-# Sun Nov  8 23:55:55 2020
+# Busybox version: 1.32.0
+# Fri Jan 15 10:35:44 2021
 #
 CONFIG_HAVE_DOT_CONFIG=y
 
@@ -76,6 +76,7 @@ CONFIG_PREFIX="./_install"
 # CONFIG_DEBUG_SANITIZE is not set
 # CONFIG_UNIT_TEST is not set
 # CONFIG_WERROR is not set
+# CONFIG_WARN_SIMPLE_MSG is not set
 CONFIG_NO_DEBUG_LIB=y
 # CONFIG_DMALLOC is not set
 # CONFIG_EFENCE is not set
@@ -462,6 +463,7 @@ CONFIG_FEATURE_FIND_SIZE=y
 CONFIG_FEATURE_FIND_PRUNE=y
 CONFIG_FEATURE_FIND_QUIT=y
 # CONFIG_FEATURE_FIND_DELETE is not set
+CONFIG_FEATURE_FIND_EMPTY=y
 CONFIG_FEATURE_FIND_PATH=y
 CONFIG_FEATURE_FIND_REGEX=y
 # CONFIG_FEATURE_FIND_CONTEXT is not set
@@ -692,6 +694,7 @@ CONFIG_FEATURE_SETPRIV_CAPABILITY_NAMES=y
 CONFIG_SWITCH_ROOT=y
 # CONFIG_TASKSET is not set
 # CONFIG_FEATURE_TASKSET_FANCY is not set
+# CONFIG_FEATURE_TASKSET_CPULIST is not set
 CONFIG_UEVENT=y
 CONFIG_UMOUNT=y
 CONFIG_FEATURE_UMOUNT_ALL=y
@@ -805,6 +808,7 @@ CONFIG_MAKEDEVS=y
 CONFIG_FEATURE_MAKEDEVS_TABLE=y
 # CONFIG_MAN is not set
 CONFIG_MICROCOM=y
+CONFIG_MIM=y
 CONFIG_MT=y
 # CONFIG_NANDWRITE is not set
 # CONFIG_NANDDUMP is not set
@@ -1095,6 +1099,7 @@ CONFIG_SH_IS_ASH=y
 # CONFIG_BASH_IS_ASH is not set
 # CONFIG_BASH_IS_HUSH is not set
 CONFIG_BASH_IS_NONE=y
+CONFIG_SHELL_ASH=y
 CONFIG_ASH=y
 CONFIG_ASH_OPTIMIZE_FOR_SIZE=y
 CONFIG_ASH_INTERNAL_GLOB=y
@@ -1115,6 +1120,7 @@ CONFIG_ASH_GETOPTS=y
 CONFIG_ASH_CMDCMD=y
 # CONFIG_CTTYHACK is not set
 # CONFIG_HUSH is not set
+# CONFIG_SHELL_HUSH is not set
 # CONFIG_HUSH_BASH_COMPAT is not set
 # CONFIG_HUSH_BRACE_EXPANSION is not set
 # CONFIG_HUSH_LINENO_VAR is not set
@@ -1177,6 +1183,7 @@ CONFIG_FEATURE_ROTATE_LOGFILE=y
 CONFIG_FEATURE_REMOTE_LOG=y
 # CONFIG_FEATURE_SYSLOGD_DUP is not set
 # CONFIG_FEATURE_SYSLOGD_CFG is not set
+# CONFIG_FEATURE_SYSLOGD_PRECISE_TIMESTAMPS is not set
 CONFIG_FEATURE_SYSLOGD_READ_BUFFER_SIZE=256
 # CONFIG_FEATURE_IPC_SYSLOG is not set
 CONFIG_FEATURE_IPC_SYSLOG_BUFFER_SIZE=0
diff --git a/Recovery/board/funkey/linux.config b/Recovery/board/funkey/linux.config
index 20f59df..8993f06 100644
--- a/Recovery/board/funkey/linux.config
+++ b/Recovery/board/funkey/linux.config
@@ -25,6 +25,7 @@ CONFIG_CPU_FREQ=y
 CONFIG_CPUFREQ_DT=y
 CONFIG_VFP=y
 CONFIG_NEON=y
+CONFIG_KERNEL_MODE_NEON=y
 # CONFIG_COREDUMP is not set
 CONFIG_HIBERNATION=y
 CONFIG_PM_STD_PARTITION="/dev/mmcblk0p2"
@@ -153,7 +154,7 @@ CONFIG_AUTOFS4_FS=y
 CONFIG_VFAT_FS=y
 CONFIG_TMPFS=y
 CONFIG_CONFIGFS_FS=y
-# CONFIG_MISC_FILESYSTEMS is not set
+CONFIG_SQUASHFS=y
 # CONFIG_NETWORK_FILESYSTEMS is not set
 CONFIG_NLS_CODEPAGE_437=y
 CONFIG_NLS_ISO8859_1=y
diff --git a/Recovery/board/funkey/rootfs-overlay/etc/funkey_gpio_mapping.conf b/Recovery/board/funkey/rootfs-overlay/etc/funkey_gpio_mapping.conf
index ecb81ba..e5cbd8d 100644
--- a/Recovery/board/funkey/rootfs-overlay/etc/funkey_gpio_mapping.conf
+++ b/Recovery/board/funkey/rootfs-overlay/etc/funkey_gpio_mapping.conf
@@ -49,6 +49,7 @@
 13, KEYBOARD, KEY_Y, KEY_Y, Y
 11, KEYBOARD, KEY_X, KEY_X, X
 
+7+6, SHELL_COMMAND, snap, Fn+Start, Snapshot
 7+12, SHELL_COMMAND, quick_action_volume_up, Fn+Y, Volume++
 7+13, SHELL_COMMAND, quick_action_volume_down, Fn+A, Volume--
 7+11, SHELL_COMMAND, quick_action_bright_up, Fn+B, Brightness++
diff --git a/Recovery/board/funkey/rootfs-overlay/etc/issue b/Recovery/board/funkey/rootfs-overlay/etc/issue
index afbd169..62edbca 100644
--- a/Recovery/board/funkey/rootfs-overlay/etc/issue
+++ b/Recovery/board/funkey/rootfs-overlay/etc/issue
@@ -5,6 +5,6 @@
 |___|    |_____|__|__||__|\__||_____|___  |
          FUN ON A KEYCHAIN          |_____|
  -----------------------------------------------------
- Version 1.1.0 (Recovery)
+ Version 2.0.0 (Recovery)
  -----------------------------------------------------
 
diff --git a/Recovery/board/funkey/rootfs-overlay/etc/os-release b/Recovery/board/funkey/rootfs-overlay/etc/os-release
index 2483f0b..8a1fa48 100644
--- a/Recovery/board/funkey/rootfs-overlay/etc/os-release
+++ b/Recovery/board/funkey/rootfs-overlay/etc/os-release
@@ -1,12 +1,12 @@
 NAME="FunKey-OS Recovery"
-VERSION="1.1.0 (Quacking Quagga)"
+VERSION="2.0.0 (Rowdy Rabbit)"
 ID=funkey-recovery
 ID_LIKE=buildroot
-PRETTY_NAME="FunKey-OS Recovery 1.1.0"
-VERSION_ID="1.1.0"
+PRETTY_NAME="FunKey-OS Recovery 2.0.0"
+VERSION_ID="2.0.0"
 HOME_URL="https://www.funkey-project.com/"
 SUPPORT_URL="https://www.funkey-project.com/"
 BUG_REPORT_URL="https://www.funkey-project.com/"
 PRIVACY_POLICY_URL="https://www.funkey-project.com"
-VERSION_CODENAME=Quacking
-UBUNTU_CODENAME=Quacking
+VERSION_CODENAME=Rowdy
+UBUNTU_CODENAME=Rowdy
diff --git a/Recovery/board/funkey/rootfs-overlay/etc/sw-versions b/Recovery/board/funkey/rootfs-overlay/etc/sw-versions
index 6019c23..4455861 100644
--- a/Recovery/board/funkey/rootfs-overlay/etc/sw-versions
+++ b/Recovery/board/funkey/rootfs-overlay/etc/sw-versions
@@ -1 +1 @@
-Recovery	1.1.0
+Recovery	2.0.0
diff --git a/Recovery/board/funkey/rootfs-overlay/media b/Recovery/board/funkey/rootfs-overlay/media
new file mode 120000
index 0000000..35ec3b9
--- /dev/null
+++ b/Recovery/board/funkey/rootfs-overlay/media
@@ -0,0 +1 @@
+/
\ No newline at end of file
diff --git a/Recovery/board/funkey/rootfs-overlay/opk/.empty b/Recovery/board/funkey/rootfs-overlay/opk/.empty
new file mode 100644
index 0000000..e69de29
diff --git a/Recovery/board/funkey/rootfs-overlay/usr/local/sbin/menu b/Recovery/board/funkey/rootfs-overlay/usr/local/sbin/menu
index 1048ec7..88b8b89 100755
--- a/Recovery/board/funkey/rootfs-overlay/usr/local/sbin/menu
+++ b/Recovery/board/funkey/rootfs-overlay/usr/local/sbin/menu
@@ -16,8 +16,8 @@ menu_display () {
     case ${entry} in
 	0)
 
-	    # Version
-	    message=" VERSION"
+	    # Information
+	    message=" INFO"
 	    ;;
 
 	1)
@@ -61,20 +61,32 @@ menu_display () {
 	6)
 
 	    # Factory tests enable/disable
-	    if [ -e /mnt/.assembly_tests ]; then
-			message=" FACTORY TESTS DISABLE"
+	    RUN_ENV_VAR=$(fw_printenv -n assembly_tests 2>/dev/null)
+	    if [ "x${RUN_ENV_VAR}" == "x1" ]; then
+		message=" FACTORY TESTS DISABLE"
 	    else
-			message=" FACTORY TESTS ENABLE"
+		message=" FACTORY TESTS ENABLE"
 	    fi
 	    ;;
 
 	7)
 
+	    # First boot enable/disable, depending on the first_boot_ok variable
+	    FIRST_BOOT_OK_ENV_VAR=$(fw_printenv -n first_boot_ok 2>/dev/null)
+	    if [ "x${FIRST_BOOT_OK_ENV_VAR}" == "x1" ]; then
+		message=" FIRST BOOT DISABLE"
+	    else
+		message=" FIRST BOOT ENABLE"
+	    fi
+	    ;;
+
+	8)
+
 	    # Exit Recovery
 	    message=" EXIT RECOVERY"
 	    ;;
 
-	8)
+	9)
 
 	    # Shutdown
 	    message=" SHUTDOWN"
@@ -90,7 +102,7 @@ menu_run () {
     case ${entry} in
 	0)
 
-	    # Version
+	    # Information
 	    rootfs_mount=/tmp/rootfs
 	    mkdir -p ${rootfs_mount}
 	    mount -t ext4 -o ro /dev/mmcblk0p2 ${rootfs_mount}
@@ -101,7 +113,8 @@ menu_run () {
 	    fi
 	    umount ${rootfs_mount}
 	    version_recovery=$(grep Recovery /etc/sw-versions | cut -f 2)
-	    notif "${message}^^     Recovery: ${version_recovery}^     rootfs  : ${version_rootfs}"
+	    ip_addr=$(ifconfig usb0 | grep "inet " | awk -F'[: ]+' '{ print $4 }')
+	    notif "${message}^^    Recovery: ${version_recovery}^    rootfs  : ${version_rootfs}^    IP addr : ${ip_addr}"
 	    ;;
 
 	1)
@@ -224,22 +237,36 @@ menu_run () {
 
 	    # Factory tests enable/disable
 	    RUN_ENV_VAR=$(fw_printenv -n assembly_tests 2>/dev/null)
-		if [ "x${RUN_ENV_VAR}" == "x1" ]; then
-		    fw_setenv assembly_tests 0
-		    message=" FACTORY TESTS ENABLE"
-		else
-		    fw_setenv assembly_tests 1
-		    message=" FACTORY TESTS DISABLE"
-		fi
-		notif "${message}"
+	    if [ "x${RUN_ENV_VAR}" == "x1" ]; then
+		fw_setenv assembly_tests 0
+		message=" FACTORY TESTS ENABLE"
+	    else
+		fw_setenv assembly_tests 1
+		message=" FACTORY TESTS DISABLE"
+	    fi
+	    notif "${message}"
 	    ;;
 
 	7)
+
+	    # First boot enable/disable
+	    FIRST_BOOT_OK_ENV_VAR=$(fw_printenv -n first_boot_ok 2>/dev/null)
+	    if [ "x${FIRST_BOOT_OK_ENV_VAR}" == "x1" ]; then
+		fw_setenv first_boot_ok
+		message=" FIRST BOOT ENABLE"
+	    else
+		fw_setenv first_boot_ok 1
+		message=" FIRST BOOT DISABLE"
+	    fi
+	    notif "${message}"
+	    ;;
+
+	8)
 	    notif "^^^^^^^^         RESTARTING...^^^^^^^^"
 	    normal_mode
 	    ;;
 
-	8)
+	9)
 	    notif "^^^^^^^^          SHUTDOWN...^^^^^^^^"
 	    poweroff
 	    ;;
diff --git a/Recovery/board/funkey/rootfs-overlay/usr/local/sbin/snap b/Recovery/board/funkey/rootfs-overlay/usr/local/sbin/snap
new file mode 100755
index 0000000..7a246a5
--- /dev/null
+++ b/Recovery/board/funkey/rootfs-overlay/usr/local/sbin/snap
@@ -0,0 +1,39 @@
+#!/bin/sh
+
+# Save a screenshot of the framebuffer as the next IMG_NNNN.PNG in ${HOME}/snapshots
+
+# Uncomment the following line to get debug info
+#set -x
+
+# Check args
+if [ ${#} -ne 0 ]; then
+    echo "Usage: $(basename "${0}")"
+    exit 1
+fi
+
+# Lock file (necessary since fbgrab must run in bg not to block the buttons while gaming)
+LOCK_FILE="/var/lock/snap.lock"
+if [ -f "${LOCK_FILE}" ]; then
+    echo "${LOCK_FILE} already exists"
+    exit 1
+fi
+touch "${LOCK_FILE}"
+
+# Increment name and save snapshot
+SNAPSHOT_EXT=PNG
+SNAPSHOT_DIR="${HOME}/snapshots"
+mkdir -p "${SNAPSHOT_DIR}"
+last=$(cd "${SNAPSHOT_DIR}" && ls IMG_*.${SNAPSHOT_EXT} 2> /dev/null | tail -1 | sed 's/^IMG_0*\([0-9]\+\)\.'${SNAPSHOT_EXT}'$/\1/')
+last=$(( ${last:-0} + 1 ))
+snapshot_file=$(printf "IMG_%04d.${SNAPSHOT_EXT}" "${last}")
+notif_set 2 "   SCREENSHOT ${snapshot_file}"
+
+# Grab in the background and remove the lock file only once fbgrab is done,
+# so that the lock really covers the capture. stdout/stderr are redirected so
+# that the background job does not keep this script's output streams open.
+(
+    fbgrab "${SNAPSHOT_DIR}/${snapshot_file}"
+    rm -f "${LOCK_FILE}"
+) >/dev/null 2>&1 &
+
+exit 0
diff --git a/Recovery/board/funkey/rootfs-overlay/usr/local/sbin/system_stats b/Recovery/board/funkey/rootfs-overlay/usr/local/sbin/system_stats
index a6a091a..4e6ad97 100755
--- a/Recovery/board/funkey/rootfs-overlay/usr/local/sbin/system_stats
+++ b/Recovery/board/funkey/rootfs-overlay/usr/local/sbin/system_stats
@@ -20,16 +20,21 @@ while true; do
     if [ ${perform} -eq 1 ]; then
 
 	# Compute stats
-	cpu=$(printf "%.1f\n" $(mpstat -P ALL $UPDATE_PERIOD 1 | tail -1 | awk '{print 100-$12}'))
-	ram_mem=$(printf "%.1f\n" $(free | grep Mem | awk '{print $3/$2 * 100.0}'))
-	ram_swap=$(printf "%.1f\n" $(free | grep Swap | awk '{print $3/$2 * 100.0}'))
+	cpu=$(printf "%.0f\n" $(mpstat -P ALL $UPDATE_PERIOD 1 | tail -1 | awk '{print 100-$12}'))
+	ram_mem=$(printf "%.0f\n" $(free | grep Mem | awk '{print $3/$2 * 100.0}'))
+	ram_swap=$(printf "%.0f\n" $(free | grep Swap | awk '{print $3/$2 * 100.0}'))
+	ip_addr=$(ifconfig usb0 | grep "inet " | awk -F'[: ]+' '{ print $4 }')
 
 	# Notif
 	if [ ${notif_dirty} -eq 1 ]; then
 	    notif_clear
 	    notif_dirty=0
 	else
-	    notif_set 0 "CPU:${cpu}%% RAM:${ram_mem}%% SWAP:${ram_swap}%%"
+	    if [ "x${ip_addr}" != "x" ]; then
+		notif_set 0 " CPU:${cpu}%% RAM:${ram_mem}%% SWAP:${ram_swap}%%^IP:${ip_addr}"
+	    else
+		notif_set 0 " CPU:${cpu}%% RAM:${ram_mem}%% SWAP:${ram_swap}%%"
+	    fi
 	fi
     else
 	sleep ${UPDATE_PERIOD}
diff --git a/Recovery/configs/recovery_defconfig b/Recovery/configs/recovery_defconfig
index 7570988..ee12e1c 100644
--- a/Recovery/configs/recovery_defconfig
+++ b/Recovery/configs/recovery_defconfig
@@ -1,10 +1,19 @@
 BR2_arm=y
 BR2_cortex_a7=y
+BR2_ARM_FPU_VFPV4=y
 BR2_DL_DIR="../download"
 BR2_CCACHE=y
+BR2_OPTIMIZE_FAST=y
+BR2_SHARED_STATIC_LIBS=y
 BR2_GLOBAL_PATCH_DIR="$(BR2_EXTERNAL_RECOVERY_PATH)/board/funkey/patches"
 BR2_TOOLCHAIN_EXTERNAL=y
-BR2_TOOLCHAIN_EXTERNAL_GDB_SERVER_COPY=y
+BR2_TOOLCHAIN_EXTERNAL_CUSTOM=y
+BR2_TOOLCHAIN_EXTERNAL_DOWNLOAD=y
+BR2_TOOLCHAIN_EXTERNAL_URL="https://github.com/FunKey-Project/FunKey-OS/releases/download/FunKey-OS-2.0.0/FunKey-sdk-2.0.0.tar.gz"
+BR2_TOOLCHAIN_EXTERNAL_HEADERS_4_14=y
+BR2_TOOLCHAIN_EXTERNAL_CUSTOM_MUSL=y
+BR2_TOOLCHAIN_EXTERNAL_CXX=y
+BR2_TARGET_OPTIMIZATION="-fno-PIC -march=armv7-a+neon-vfpv4 -mtune=cortex-a7 -mfpu=neon-vfpv4"
 BR2_TARGET_GENERIC_HOSTNAME="FunKey"
 BR2_TARGET_GENERIC_ISSUE="Welcome to Recovery Buildroot for the FunKey"
 BR2_ROOTFS_DEVICE_TABLE="$(BR2_EXTERNAL_RECOVERY_PATH)/board/funkey/device_table.txt"
diff --git a/Recovery/package/FunKey-GPIO-Mapping/FunKey-GPIO-Mapping.mk b/Recovery/package/FunKey-GPIO-Mapping/FunKey-GPIO-Mapping.mk
index af98396..36958ed 100644
--- a/Recovery/package/FunKey-GPIO-Mapping/FunKey-GPIO-Mapping.mk
+++ b/Recovery/package/FunKey-GPIO-Mapping/FunKey-GPIO-Mapping.mk
@@ -4,7 +4,7 @@
 #
 ################################################################################
 
-FUNKEY_GPIO_MAPPING_VERSION = FunKey-GPIO-Mapping-FunKey-1.00
+FUNKEY_GPIO_MAPPING_VERSION = e2b637f
 FUNKEY_GPIO_MAPPING_SITE_METHOD = git
 FUNKEY_GPIO_MAPPING_SITE = https://github.com/FunKey-Project/FunKey-GPIO-Mapping.git
 FUNKEY_GPIO_MAPPING_SITE_LICENSE = GPL-2.1+
diff --git a/SDK/Config.in b/SDK/Config.in
new file mode 100644
index 0000000..984b74f
--- /dev/null
+++ b/SDK/Config.in
@@ -0,0 +1,7 @@
+source "$BR2_EXTERNAL_SDK_PATH/package/dmtx-utils/Config.in"
+source "$BR2_EXTERNAL_SDK_PATH/package/libini/Config.in"
+source "$BR2_EXTERNAL_SDK_PATH/package/libopk/Config.in"
+source "$BR2_EXTERNAL_SDK_PATH/package/libxdgmime/Config.in"
+source "$BR2_EXTERNAL_SDK_PATH/package/agg/Config.in"
+source "$BR2_EXTERNAL_SDK_PATH/package/fluidlite/Config.in"
+source "$BR2_EXTERNAL_SDK_PATH/package/libmikmod/Config.in"
diff --git a/SDK/board/funkey/patches/pixman/0002-aarch64NEON_asm.patch b/SDK/board/funkey/patches/pixman/0002-aarch64NEON_asm.patch
new file mode 100644
index 0000000..1f27440
--- /dev/null
+++ b/SDK/board/funkey/patches/pixman/0002-aarch64NEON_asm.patch
@@ -0,0 +1,6526 @@
+From 7b128ae8c56b1055a93573004148a98465d79857 Mon Sep 17 00:00:00 2001
+From: Mizuki Asakura <ed6e117f@gmail.com>
+Date: Sun, 17 Apr 2016 20:16:12 +0900
+Subject: [PATCH] [mod] added aarch64 bilinear implementations (ver.4.1)
+
+Since aarch64 has different neon syntax from aarch32 and has no
+support for (older) arm-simd,
+there are no SIMD accelerations for pixman on aarch64.
+
+We need new implementations.
+
+This patch also contains Ben Avison's series of patches for aarch32
+and now the benchmark results are fine to aarch64.
+
+Please find the result at the below ticket.
+
+Added: https://bugs.freedesktop.org/show_bug.cgi?id=94758
+Signed-off-by: Mizuki Asakura <ed6e117f@gmail.com>
+---
+ configure.ac                             |   36 +-
+ pixman/Makefile.am                       |   17 +-
+ pixman/pixman-arm-neon.c                 |   23 +-
+ pixman/pixman-arm.c                      |    6 +
+ pixman/pixman-arma64-neon-asm-bilinear.S | 1275 ++++++++++
+ pixman/pixman-arma64-neon-asm.S          | 3704 ++++++++++++++++++++++++++++++
+ pixman/pixman-arma64-neon-asm.h          | 1310 +++++++++++
+ pixman/pixman-private.h                  |    7 +-
+ 8 files changed, 6374 insertions(+), 4 deletions(-)
+ create mode 100644 pixman/pixman-arma64-neon-asm-bilinear.S
+ create mode 100644 pixman/pixman-arma64-neon-asm.S
+ create mode 100644 pixman/pixman-arma64-neon-asm.h
+
+diff --git a/configure.ac b/configure.ac
+index 6b2134e..26203a8
+--- a/configure.ac
++++ b/configure.ac
+@@ -667,6 +667,40 @@ if test $enable_arm_neon = yes && test $have_arm_neon = no ; then
+    AC_MSG_ERROR([ARM NEON intrinsics not detected])
+ fi
+ 
++dnl ==========================================================================
++dnl Check if assembler is gas compatible and supports ARM-a64 NEON instructions
++have_arm_a64_neon=no
++AC_MSG_CHECKING(whether to use ARM A64 NEON assembler)
++xserver_save_CFLAGS=$CFLAGS
++CFLAGS="-x assembler-with-cpp $CFLAGS"
++AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
++.text
++.arch armv8-a
++.altmacro
++prfm pldl2strm, [x0]
++xtn v0.8b, v0.8h]])], have_arm_a64_neon=yes)
++CFLAGS=$xserver_save_CFLAGS
++
++AC_ARG_ENABLE(arm-a64-neon,
++   [AC_HELP_STRING([--disable-arm-a64-neon],
++                   [disable ARM A64 NEON fast paths])],
++   [enable_arm_a64_neon=$enableval], [enable_arm_a64_neon=auto])
++
++if test $enable_arm_a64_neon = no ; then
++   have_arm_a64_neon=disabled
++fi
++
++if test $have_arm_a64_neon = yes ; then
++   AC_DEFINE(USE_ARM_A64_NEON, 1, [use ARM A64_NEON assembly optimizations])
++fi
++
++AM_CONDITIONAL(USE_ARM_A64_NEON, test $have_arm_a64_neon = yes)
++
++AC_MSG_RESULT($have_arm_a64_neon)
++if test $enable_arm_a64_neon = yes && test $have_arm_a64_neon = no ; then
++   AC_MSG_ERROR([ARM A64 NEON intrinsics not detected])
++fi
++
+ dnl ===========================================================================
+ dnl Check for IWMMXT
+ 
+diff --git a/pixman/Makefile.am b/pixman/Makefile.am
+index 581b6f6..f1afa27
+--- a/pixman/Makefile.am
++++ b/pixman/Makefile.am
+@@ -94,6 +94,21 @@ libpixman_1_la_LIBADD += libpixman-arm-neon.la
+ ASM_CFLAGS_arm_neon=
+ endif
+ 
++# arm a64 neon code
++if USE_ARM_A64_NEON
++noinst_LTLIBRARIES += libpixman-arma64-neon.la
++libpixman_arma64_neon_la_SOURCES = \
++        pixman-arm-neon.c        \
++        pixman-arm-common.h      \
++        pixman-arma64-neon-asm.S \
++        pixman-arma64-neon-asm-bilinear.S \
++        pixman-arm-asm.h         \
++        pixman-arma64-neon-asm.h
++libpixman_1_la_LIBADD += libpixman-arma64-neon.la
++
++ASM_CFLAGS_arm_neon=
++endif
++
+ # iwmmxt code
+ if USE_ARM_IWMMXT
+ libpixman_iwmmxt_la_SOURCES = pixman-mmx.c
+diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
+index be761c9..62c9442 100644
+--- a/pixman/pixman-arm-neon.c
++++ b/pixman/pixman-arm-neon.c
+@@ -194,7 +194,7 @@ arm_neon_fill (pixman_implementation_t *imp,
+ 	       uint32_t                 _xor)
+ {
+     /* stride is always multiple of 32bit units in pixman */
+-    uint32_t byte_stride = stride * sizeof(uint32_t);
++    int32_t byte_stride = stride * sizeof(uint32_t);
+ 
+     switch (bpp)
+     {
+@@ -331,6 +331,7 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
+     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8,       b5g6r5,   neon_composite_over_8888_8_0565),
+     PIXMAN_STD_FAST_PATH (OVER, r5g6b5,   a8,       r5g6b5,   neon_composite_over_0565_8_0565),
+     PIXMAN_STD_FAST_PATH (OVER, b5g6r5,   a8,       b5g6r5,   neon_composite_over_0565_8_0565),
++    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, x8r8g8b8, neon_composite_over_8888_8888_8888),
+     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, neon_composite_over_8888_8888_8888),
+     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null,     r5g6b5,   neon_composite_over_8888_0565),
+     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null,     b5g6r5,   neon_composite_over_8888_0565),
+@@ -341,17 +342,33 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
+     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null,     a8r8g8b8, neon_composite_src_x888_8888),
+     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null,     a8b8g8r8, neon_composite_src_x888_8888),
+     PIXMAN_STD_FAST_PATH (ADD,  solid,    a8,       a8,       neon_composite_add_n_8_8),
++    PIXMAN_STD_FAST_PATH (ADD,  solid,    a8,       x8r8g8b8, neon_composite_add_n_8_8888),
+     PIXMAN_STD_FAST_PATH (ADD,  solid,    a8,       a8r8g8b8, neon_composite_add_n_8_8888),
++    PIXMAN_STD_FAST_PATH (ADD,  solid,    a8,       x8b8g8r8, neon_composite_add_n_8_8888),
+     PIXMAN_STD_FAST_PATH (ADD,  solid,    a8,       a8b8g8r8, neon_composite_add_n_8_8888),
+     PIXMAN_STD_FAST_PATH (ADD,  a8,       a8,       a8,       neon_composite_add_8_8_8),
+     PIXMAN_STD_FAST_PATH (ADD,  r5g6b5,   a8,       r5g6b5,   neon_composite_add_0565_8_0565),
+     PIXMAN_STD_FAST_PATH (ADD,  b5g6r5,   a8,       b5g6r5,   neon_composite_add_0565_8_0565),
++    PIXMAN_STD_FAST_PATH (ADD,  x8r8g8b8, a8,       x8r8g8b8, neon_composite_add_8888_8_8888),
++    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, a8,       x8r8g8b8, neon_composite_add_8888_8_8888),
++    PIXMAN_STD_FAST_PATH (ADD,  x8b8g8r8, a8,       x8b8g8r8, neon_composite_add_8888_8_8888),
++    PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, a8,       x8b8g8r8, neon_composite_add_8888_8_8888),
+     PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, a8,       a8r8g8b8, neon_composite_add_8888_8_8888),
+     PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, a8,       a8b8g8r8, neon_composite_add_8888_8_8888),
++    PIXMAN_STD_FAST_PATH (ADD,  x8r8g8b8, a8r8g8b8, x8r8g8b8, neon_composite_add_8888_8888_8888),
++    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, a8r8g8b8, x8r8g8b8, neon_composite_add_8888_8888_8888),
+     PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, a8r8g8b8, a8r8g8b8, neon_composite_add_8888_8888_8888),
++    PIXMAN_STD_FAST_PATH (ADD,  x8r8g8b8, solid,    x8r8g8b8, neon_composite_add_8888_n_8888),
++    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, solid,    x8r8g8b8, neon_composite_add_8888_n_8888),
++    PIXMAN_STD_FAST_PATH (ADD,  x8b8g8r8, solid,    x8b8g8r8, neon_composite_add_8888_n_8888),
++    PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, solid,    x8b8g8r8, neon_composite_add_8888_n_8888),
+     PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, solid,    a8r8g8b8, neon_composite_add_8888_n_8888),
+     PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, solid,    a8b8g8r8, neon_composite_add_8888_n_8888),
+     PIXMAN_STD_FAST_PATH (ADD,  a8,       null,     a8,       neon_composite_add_8_8),
++    PIXMAN_STD_FAST_PATH (ADD,  x8r8g8b8, null,     x8r8g8b8, neon_composite_add_8888_8888),
++    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, null,     x8r8g8b8, neon_composite_add_8888_8888),
++    PIXMAN_STD_FAST_PATH (ADD,  x8b8g8r8, null,     x8b8g8r8, neon_composite_add_8888_8888),
++    PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, null,     x8b8g8r8, neon_composite_add_8888_8888),
+     PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, null,     a8r8g8b8, neon_composite_add_8888_8888),
+     PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, null,     a8b8g8r8, neon_composite_add_8888_8888),
+     PIXMAN_STD_FAST_PATH (IN,   solid,    null,     a8,       neon_composite_in_n_8),
+@@ -359,7 +376,9 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
+     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, neon_composite_over_reverse_n_8888),
+     PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, r5g6b5,   neon_composite_out_reverse_8_0565),
+     PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, b5g6r5,   neon_composite_out_reverse_8_0565),
++    PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, x8r8g8b8, neon_composite_out_reverse_8_8888),
+     PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, a8r8g8b8, neon_composite_out_reverse_8_8888),
++    PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, x8b8g8r8, neon_composite_out_reverse_8_8888),
+     PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, a8b8g8r8, neon_composite_out_reverse_8_8888),
+ 
+     SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8888),
+@@ -404,6 +423,7 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
+ 
+     SIMPLE_BILINEAR_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, neon_8888_8888),
+     SIMPLE_BILINEAR_FAST_PATH (ADD, a8r8g8b8, x8r8g8b8, neon_8888_8888),
++    SIMPLE_BILINEAR_FAST_PATH (ADD, x8r8g8b8, x8r8g8b8, neon_8888_8888),
+ 
+     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, neon_8888_8_8888),
+     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, neon_8888_8_8888),
+@@ -420,6 +440,7 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
+ 
+     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, neon_8888_8_8888),
+     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, x8r8g8b8, neon_8888_8_8888),
++    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, x8r8g8b8, x8r8g8b8, neon_8888_8_8888),
+ 
+     { PIXMAN_OP_NONE },
+ };
+diff --git a/pixman/pixman-arm.c b/pixman/pixman-arm.c
+index 23374e4..734cbea 100644
+--- a/pixman/pixman-arm.c
++++ b/pixman/pixman-arm.c
+@@ -221,5 +221,11 @@ _pixman_arm_get_implementations (pixman_implementation_t *imp)
+ 	imp = _pixman_implementation_create_arm_neon (imp);
+ #endif
+ 
++#ifdef USE_ARM_A64_NEON
++    /* neon is a part of aarch64 */
++    if (!_pixman_disabled ("arm-neon"))
++        imp = _pixman_implementation_create_arm_neon (imp);
++#endif
++
+     return imp;
+ }
+diff --git a/pixman/pixman-arma64-neon-asm-bilinear.S b/pixman/pixman-arma64-neon-asm-bilinear.S
+new file mode 100644
+index 0000000..aaa4a83
+--- /dev/null
++++ b/pixman/pixman-arma64-neon-asm-bilinear.S
+@@ -0,0 +1,1275 @@
++/*
++ * Copyright © 2011 SCore Corporation
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the "Software"),
++ * to deal in the Software without restriction, including without limitation
++ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
++ * and/or sell copies of the Software, and to permit persons to whom the
++ * Software is furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice (including the next
++ * paragraph) shall be included in all copies or substantial portions of the
++ * Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ *
++ * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
++ * Author:  Taekyun Kim (tkq.kim@samsung.com)
++ */
++
++/*
++ * This file contains scaled bilinear scanline functions implemented
++ * using older siarhei's bilinear macro template.
++ *
++ * << General scanline function procedures >>
++ *  1. bilinear interpolate source pixels
++ *  2. load mask pixels
++ *  3. load destination pixels
++ *  4. duplicate mask to fill whole register
++ *  5. interleave source & destination pixels
++ *  6. apply mask to source pixels
++ *  7. combine source & destination pixels
++ *  8, Deinterleave final result
++ *  9. store destination pixels
++ *
++ * All registers with single number (i.e. src0, tmp0) are 64-bits registers.
++ * Registers with double numbers(src01, dst01) are 128-bits registers.
++ * All temp registers can be used freely outside the code block.
++ * Assume that symbol(register .req) OUT and MASK are defined at caller of these macro blocks.
++ *
++ * Remarks
++ *  There can be lots of pipeline stalls inside code block and between code blocks.
++ *  Further optimizations will be done by new macro templates using head/tail_head/tail scheme.
++ */
++
++/* Prevent the stack from becoming executable for no reason... */
++#if defined(__linux__) && defined (__ELF__)
++.section .note.GNU-stack,"",%progbits
++#endif
++
++.text
++.arch armv8-a
++.altmacro
++.p2align 2
++
++#include "pixman-private.h"
++#include "pixman-arm-asm.h"
++#include "pixman-arma64-neon-asm.h"
++
++/*
++ * Bilinear macros from pixman-arm-neon-asm.S
++ */
++
++/*
++ * Bilinear scaling support code which tries to provide pixel fetching, color
++ * format conversion, and interpolation as separate macros which can be used
++ * as the basic building blocks for constructing bilinear scanline functions.
++ */
++
++.macro bilinear_load_8888 reg1, reg2, tmp
++    asr       WTMP1, X, #16
++    add       X, X, UX
++    add       TMP1, TOP, TMP1, lsl #2
++    ld1       {&reg1&.2s}, [TMP1], STRIDE
++    ld1       {&reg2&.2s}, [TMP1]
++.endm
++
++.macro bilinear_load_0565 reg1, reg2, tmp
++    asr       WTMP1, X, #16
++    add       X, X, UX
++    add       TMP1, TOP, TMP1, lsl #1
++    ld1       {&reg2&.s}[0], [TMP1], STRIDE
++    ld1       {&reg2&.s}[1], [TMP1]
++    convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
++.endm
++
++.macro bilinear_load_and_vertical_interpolate_two_8888 \
++                    acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
++
++    bilinear_load_8888 reg1, reg2, tmp1
++    umull     &acc1&.8h, &reg1&.8b, v28.8b
++    umlal     &acc1&.8h, &reg2&.8b, v29.8b
++    bilinear_load_8888 reg3, reg4, tmp2
++    umull     &acc2&.8h, &reg3&.8b, v28.8b
++    umlal     &acc2&.8h, &reg4&.8b, v29.8b
++.endm
++
++.macro bilinear_load_and_vertical_interpolate_four_8888 \
++                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
++                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
++
++    bilinear_load_and_vertical_interpolate_two_8888 \
++                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
++    bilinear_load_and_vertical_interpolate_two_8888 \
++                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
++.endm
++
++.macro vzip reg1, reg2               /* in-place zip of reg1/reg2; clobbers v24 */
++    zip1      v24.8b, reg1, reg2     /* zip1 result parked in v24 so reg1 stays intact for zip2 */
++    zip2      reg2,   reg1, reg2     /* reg2 = zip2 of the original reg1, reg2 */
++    mov       reg1,   v24.8b         /* reg1 = zip1 result saved above */
++.endm
++
++.macro vuzp reg1, reg2               /* in-place unzip of reg1/reg2; clobbers v24 */
++    uzp1     v24.8b, reg1, reg2      /* uzp1 result parked in v24 so reg1 stays intact for uzp2 */
++    uzp2     reg2,   reg1, reg2      /* reg2 = uzp2 of the original reg1, reg2 */
++    mov      reg1,   v24.8b          /* reg1 = uzp1 result saved above */
++.endm
++
++.macro bilinear_load_and_vertical_interpolate_two_0565 \
++                acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
++    asr       WTMP1, X, #16
++    add       X, X, UX
++    add       TMP1, TOP, TMP1, lsl #1
++    asr       WTMP2, X, #16
++    add       X, X, UX
++    add       TMP2, TOP, TMP2, lsl #1
++    ld1       {&acc2&.s}[0], [TMP1], STRIDE
++    ld1       {&acc2&.s}[2], [TMP2], STRIDE
++    ld1       {&acc2&.s}[1], [TMP1]
++    ld1       {&acc2&.s}[3], [TMP2]
++    convert_0565_to_x888 acc2, reg3, reg2, reg1
++    vzip      &reg1&.8b, &reg3&.8b
++    vzip      &reg2&.8b, &reg4&.8b
++    vzip      &reg3&.8b, &reg4&.8b
++    vzip      &reg1&.8b, &reg2&.8b
++    umull     &acc1&.8h, &reg1&.8b, v28.8b
++    umlal     &acc1&.8h, &reg2&.8b, v29.8b
++    umull     &acc2&.8h, &reg3&.8b, v28.8b
++    umlal     &acc2&.8h, &reg4&.8b, v29.8b
++.endm
++
++.macro bilinear_load_and_vertical_interpolate_four_0565 \
++                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
++                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
++
++    asr       WTMP1, X, #16
++    add       X, X, UX
++    add       TMP1, TOP, TMP1, lsl #1
++    asr       WTMP2, X, #16
++    add       X, X, UX
++    add       TMP2, TOP, TMP2, lsl #1
++    ld1       {&xacc2&.s}[0], [TMP1], STRIDE
++    ld1       {&xacc2&.s}[2], [TMP2], STRIDE
++    ld1       {&xacc2&.s}[1], [TMP1]
++    ld1       {&xacc2&.s}[3], [TMP2]
++    convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
++    asr       WTMP1, X, #16
++    add       X, X, UX
++    add       TMP1, TOP, TMP1, lsl #1
++    asr       WTMP2, X, #16
++    add       X, X, UX
++    add       TMP2, TOP, TMP2, lsl #1
++    ld1       {&yacc2&.s}[0], [TMP1], STRIDE
++    vzip      &xreg1&.8b, &xreg3&.8b
++    ld1       {&yacc2&.s}[2], [TMP2], STRIDE
++    vzip      &xreg2&.8b, &xreg4&.8b
++    ld1       {&yacc2&.s}[1], [TMP1]
++    vzip      &xreg3&.8b, &xreg4&.8b
++    ld1       {&yacc2&.s}[3], [TMP2]
++    vzip      &xreg1&.8b, &xreg2&.8b
++    convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
++    umull     &xacc1&.8h, &xreg1&.8b, v28.8b
++    vzip      &yreg1&.8b, &yreg3&.8b
++    umlal     &xacc1&.8h, &xreg2&.8b, v29.8b
++    vzip      &yreg2&.8b, &yreg4&.8b
++    umull     &xacc2&.8h, &xreg3&.8b, v28.8b
++    vzip      &yreg3&.8b, &yreg4&.8b
++    umlal     &xacc2&.8h, &xreg4&.8b, v29.8b
++    vzip      &yreg1&.8b, &yreg2&.8b
++    umull     &yacc1&.8h, &yreg1&.8b, v28.8b
++    umlal     &yacc1&.8h, &yreg2&.8b, v29.8b
++    umull     &yacc2&.8h, &yreg3&.8b, v28.8b
++    umlal     &yacc2&.8h, &yreg4&.8b, v29.8b
++.endm
++
++.macro bilinear_store_8888 numpix, tmp1, tmp2
++.if numpix == 4
++    st1       {v0.2s, v1.2s}, [OUT], #16
++.elseif numpix == 2
++    st1       {v0.2s}, [OUT], #8
++.elseif numpix == 1
++    st1       {v0.s}[0], [OUT], #4
++.else
++    .error bilinear_store_8888 numpix is unsupported
++.endif
++.endm
++
++.macro bilinear_store_0565 numpix, tmp1, tmp2
++    vuzp    v0.8b, v1.8b
++    vuzp    v2.8b, v3.8b
++    vuzp    v1.8b, v3.8b
++    vuzp    v0.8b, v2.8b
++    convert_8888_to_0565 v2, v1, v0, v1, tmp1, tmp2
++.if numpix == 4
++    st1       {v1.4h}, [OUT], #8
++.elseif numpix == 2
++    st1       {v1.s}[0], [OUT], #4
++.elseif numpix == 1
++    st1       {v1.h}[0], [OUT], #2
++.else
++    .error bilinear_store_0565 numpix is unsupported
++.endif
++.endm
++
++
++/*
++ * Macros for loading mask pixels into register 'mask'.
++ * dup must be done somewhere else.
++ */
++.macro bilinear_load_mask_x numpix, mask
++.endm
++
++.macro bilinear_load_mask_8 numpix, mask
++.if numpix == 4
++    ld1         {&mask&.s}[0], [MASK], #4
++.elseif numpix == 2
++    ld1         {&mask&.h}[0], [MASK], #2
++.elseif numpix == 1
++    ld1         {&mask&.b}[0], [MASK], #1
++.else
++    .error bilinear_load_mask_8 numpix is unsupported
++.endif
++    prfm        PREFETCH_MODE, [MASK, #prefetch_offset]
++.endm
++
++.macro bilinear_load_mask mask_fmt, numpix, mask
++    bilinear_load_mask_&mask_fmt numpix, mask
++.endm
++
++
++/*
++ * Macros for loading destination pixels into register 'dst0' and 'dst1'.
++ * Interleave should be done somewhere else.
++ */
++.macro bilinear_load_dst_0565_src numpix, dst0, dst1, dst01
++.endm
++
++.macro bilinear_load_dst_8888_src numpix, dst0, dst1, dst01
++.endm
++
++.macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01 /* load numpix pixels from OUT; OUT is not advanced */
++.if numpix == 4
++    ld1         {&dst0&.2s, &dst1&.2s}, [OUT]            /* 16 bytes: both dst0 and dst1 are filled */
++.elseif numpix == 2
++    ld1         {&dst0&.2s}, [OUT]                       /* 8 bytes: dst0 only, dst1 is not loaded */
++.elseif numpix == 1
++    ld1         {&dst0&.s}[0], [OUT]                     /* 4 bytes into lane 0 of dst0, dst1 is not loaded */
++.else
++    .error bilinear_load_dst_8888 numpix is unsupported
++.endif
++    mov         &dst01&.d[0], &dst0&.d[0]                /* low half of dst01 = dst0 */
++    mov         &dst01&.d[1], &dst1&.d[0]                /* high half = dst1, including its prior contents when numpix < 4 */
++    prfm        PREFETCH_MODE, [OUT, #(prefetch_offset * 4)]
++.endm
++
++.macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01
++    bilinear_load_dst_8888 numpix, dst0, dst1, dst01
++.endm
++
++.macro bilinear_load_dst_8888_add numpix, dst0, dst1, dst01
++    bilinear_load_dst_8888 numpix, dst0, dst1, dst01
++.endm
++
++.macro bilinear_load_dst dst_fmt, op, numpix, dst0, dst1, dst01
++    bilinear_load_dst_&dst_fmt&_&op numpix, dst0, dst1, dst01
++.endm
++
++/*
++ * Macros for duplicating partially loaded mask to fill entire register.
++ * We will apply mask to interleaved source pixels, that is
++ *  (r0, r1, r2, r3, g0, g1, g2, g3) x (m0, m1, m2, m3, m0, m1, m2, m3)
++ *  (b0, b1, b2, b3, a0, a1, a2, a3) x (m0, m1, m2, m3, m0, m1, m2, m3)
++ * So, we need to duplicate loaded mask into whole register.
++ *
++ * For two pixel case
++ *  (r0, r1, x, x, g0, g1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
++ *  (b0, b1, x, x, a0, a1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
++ * We can do some optimizations for this including last pixel cases.
++ */
++.macro bilinear_duplicate_mask_x numpix, mask
++.endm
++
++.macro bilinear_duplicate_mask_8 numpix, mask
++.if numpix == 4
++    dup         &mask&.2s, &mask&.s[0]
++.elseif numpix == 2
++    dup         &mask&.4h, &mask&.h[0]
++.elseif numpix == 1
++    dup         &mask&.8b, &mask&.b[0]
++.else
++    .error bilinear_duplicate_mask_8 is unsupported
++.endif
++.endm
++
++.macro bilinear_duplicate_mask mask_fmt, numpix, mask
++    bilinear_duplicate_mask_&mask_fmt numpix, mask
++.endm
++
++/*
++ * Macros for interleaving src and dst pixels to rrrr gggg bbbb aaaa form.
++ * Interleave should be done when mask is enabled or operator is 'over'.
++ */
++.macro bilinear_interleave src0, src1, src01, dst0, dst1, dst01
++    vuzp       &src0&.8b, &src1&.8b
++    vuzp       &dst0&.8b, &dst1&.8b
++    vuzp       &src0&.8b, &src1&.8b
++    vuzp       &dst0&.8b, &dst1&.8b
++    mov        &src01&.d[1], &src1&.d[0]
++    mov        &src01&.d[0], &src0&.d[0]
++    mov        &dst01&.d[1], &dst1&.d[0]
++    mov        &dst01&.d[0], &dst0&.d[0]
++.endm
++
++.macro bilinear_interleave_src_dst_x_src \
++                numpix, src0, src1, src01, dst0, dst1, dst01
++.endm
++
++.macro bilinear_interleave_src_dst_x_over \
++                numpix, src0, src1, src01, dst0, dst1, dst01
++
++    bilinear_interleave src0, src1, src01, dst0, dst1, dst01
++.endm
++
++.macro bilinear_interleave_src_dst_x_add \
++                numpix, src0, src1, src01, dst0, dst1, dst01
++    bilinear_interleave src0, src1, src01, dst0, dst1, dst01
++.endm
++
++.macro bilinear_interleave_src_dst_8_src \
++                numpix, src0, src1, src01, dst0, dst1, dst01
++
++    bilinear_interleave src0, src1, src01, dst0, dst1, dst01
++.endm
++
++.macro bilinear_interleave_src_dst_8_over \
++                numpix, src0, src1, src01, dst0, dst1, dst01
++
++    bilinear_interleave src0, src1, src01, dst0, dst1, dst01
++.endm
++
++.macro bilinear_interleave_src_dst_8_add \
++                numpix, src0, src1, src01, dst0, dst1, dst01
++
++    bilinear_interleave src0, src1, src01, dst0, dst1, dst01
++.endm
++
++.macro bilinear_interleave_src_dst \
++                mask_fmt, op, numpix, src0, src1, src01, dst0, dst1, dst01
++
++    bilinear_interleave_src_dst_&mask_fmt&_&op \
++                numpix, src0, src1, src01, dst0, dst1, dst01
++.endm
++
++
++/*
++ * Macros for applying masks to src pixels. (see combine_mask_u() function)
++ * src, dst should be in interleaved form.
++ * mask register should be in form (m0, m1, m2, m3).
++ */
++.macro bilinear_apply_mask_to_src_x \
++                numpix, src0, src1, src01, mask, \
++                tmp01, tmp23, tmp45, tmp67
++.endm
++
++.macro bilinear_apply_mask_to_src_8 \
++                numpix, src0, src1, src01, mask, \
++                tmp01, tmp23, tmp45, tmp67
++
++    umull           &tmp01&.8h, &src0&.8b, &mask&.8b
++    umull           &tmp23&.8h, &src1&.8b, &mask&.8b
++    /* bubbles */
++    urshr           &tmp45&.8h, &tmp01&.8h, #8
++    urshr           &tmp67&.8h, &tmp23&.8h, #8
++    /* bubbles */
++    raddhn          &src0&.8b, &tmp45&.8h, &tmp01&.8h
++    raddhn          &src1&.8b, &tmp67&.8h, &tmp23&.8h
++    mov             &src01&.d[0], &src0&.d[0]
++    mov             &src01&.d[1], &src1&.d[0]
++.endm
++
++.macro bilinear_apply_mask_to_src \
++                mask_fmt, numpix, src0, src1, src01, mask, \
++                tmp01, tmp23, tmp45, tmp67
++
++    bilinear_apply_mask_to_src_&mask_fmt \
++                numpix, src0, src1, src01, mask, \
++                tmp01, tmp23, tmp45, tmp67
++.endm
++
++
++/*
++ * Macros for combining src and destination pixels.
++ * Interleave or not is depending on operator 'op'.
++ */
++.macro bilinear_combine_src \
++                numpix, src0, src1, src01, dst0, dst1, dst01, \
++                tmp01, tmp23, tmp45, tmp67, tmp8
++.endm
++
++.macro bilinear_combine_over \
++                numpix, src0, src1, src01, dst0, dst1, dst01, \
++                tmp01, tmp23, tmp45, tmp67, tmp8
++
++    dup         &tmp8&.2s, &src1&.s[1]
++    /* bubbles */
++    mvn         &tmp8&.8b, &tmp8&.8b
++    /* bubbles */
++    umull       &tmp01&.8h, &dst0&.8b, &tmp8&.8b
++    /* bubbles */
++    umull       &tmp23&.8h, &dst1&.8b, &tmp8&.8b
++    /* bubbles */
++    urshr       &tmp45&.8h, &tmp01&.8h, #8
++    urshr       &tmp67&.8h, &tmp23&.8h, #8
++    /* bubbles */
++    raddhn      &dst0&.8b, &tmp45&.8h, &tmp01&.8h
++    raddhn      &dst1&.8b, &tmp67&.8h, &tmp23&.8h
++    mov         &dst01&.d[0], &dst0&.d[0]
++    mov         &dst01&.d[1], &dst1&.d[0]
++    /* bubbles */
++    uqadd       &src0&.8b, &dst0&.8b, &src0&.8b
++    uqadd       &src1&.8b, &dst1&.8b, &src1&.8b
++    mov         &src01&.d[0], &src0&.d[0]
++    mov         &src01&.d[1], &src1&.d[0]
++.endm
++
++.macro bilinear_combine_add \
++                numpix, src0, src1, src01, dst0, dst1, dst01, \
++                tmp01, tmp23, tmp45, tmp67, tmp8
++
++    uqadd       &src0&.8b, &dst0&.8b, &src0&.8b
++    uqadd       &src1&.8b, &dst1&.8b, &src1&.8b
++    mov         &src01&.d[0], &src0&.d[0]
++    mov         &src01&.d[1], &src1&.d[0]
++.endm
++
++.macro bilinear_combine \
++                op, numpix, src0, src1, src01, dst0, dst1, dst01, \
++                tmp01, tmp23, tmp45, tmp67, tmp8
++
++    bilinear_combine_&op \
++                numpix, src0, src1, src01, dst0, dst1, dst01, \
++                tmp01, tmp23, tmp45, tmp67, tmp8
++.endm
++
++/*
++ * Macros for final deinterleaving of destination pixels if needed.
++ */
++.macro bilinear_deinterleave numpix, dst0, dst1, dst01
++    vuzp       &dst0&.8b, &dst1&.8b
++    /* bubbles */
++    vuzp       &dst0&.8b, &dst1&.8b
++    mov        &dst01&.d[0], &dst0&.d[0]
++    mov        &dst01&.d[1], &dst1&.d[0]
++.endm
++
++.macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01
++.endm
++
++.macro bilinear_deinterleave_dst_x_over numpix, dst0, dst1, dst01
++    bilinear_deinterleave numpix, dst0, dst1, dst01
++.endm
++
++.macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01
++    bilinear_deinterleave numpix, dst0, dst1, dst01
++.endm
++
++.macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01
++    bilinear_deinterleave numpix, dst0, dst1, dst01
++.endm
++
++.macro bilinear_deinterleave_dst_8_over numpix, dst0, dst1, dst01
++    bilinear_deinterleave numpix, dst0, dst1, dst01
++.endm
++
++.macro bilinear_deinterleave_dst_8_add numpix, dst0, dst1, dst01
++    bilinear_deinterleave numpix, dst0, dst1, dst01
++.endm
++
++.macro bilinear_deinterleave_dst mask_fmt, op, numpix, dst0, dst1, dst01
++    bilinear_deinterleave_dst_&mask_fmt&_&op numpix, dst0, dst1, dst01
++.endm
++
++
++.macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op
++    bilinear_load_&src_fmt v0, v1, v2
++    bilinear_load_mask mask_fmt, 1, v4
++    bilinear_load_dst dst_fmt, op, 1, v18, v19, v9
++    umull     v2.8h, v0.8b, v28.8b
++    umlal     v2.8h, v1.8b, v29.8b
++    /* 5 cycles bubble */
++    ushll     v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS
++    umlsl     v0.4s, v2.4h, v15.h[0]
++    umlal2    v0.4s, v2.8h, v15.h[0]
++    /* 5 cycles bubble */
++    bilinear_duplicate_mask mask_fmt, 1, v4
++    shrn      v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++    /* 3 cycles bubble */
++    xtn       v0.8b, v0.8h
++    /* 1 cycle bubble */
++    bilinear_interleave_src_dst \
++                mask_fmt, op, 1, v0, v1, v0, v18, v19, v9
++    bilinear_apply_mask_to_src \
++                mask_fmt, 1, v0, v1, v0, v4, \
++                v3, v8, v10, v11
++    bilinear_combine \
++                op, 1, v0, v1, v0, v18, v19, v9, \
++                v3, v8, v10, v11, v5
++    bilinear_deinterleave_dst mask_fmt, op, 1, v0, v1, v0
++    bilinear_store_&dst_fmt 1, v17, v18
++.endm
++
++.macro bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op
++    bilinear_load_and_vertical_interpolate_two_&src_fmt \
++                v1, v11, v18, v19, v20, v21, v22, v23
++    bilinear_load_mask mask_fmt, 2, v4
++    bilinear_load_dst dst_fmt, op, 2, v18, v19, v9
++    ushll     v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS
++    umlsl     v0.4s, v1.4h, v15.h[0]
++    umlal2    v0.4s, v1.8h, v15.h[0]
++    ushll     v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
++    umlsl     v10.4s, v11.4h, v15.h[4]
++    umlal2    v10.4s, v11.8h, v15.h[4]
++    shrn      v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++    shrn2     v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++    bilinear_duplicate_mask mask_fmt, 2, v4
++    ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
++    add       v12.8h, v12.8h, v13.8h
++    xtn       v0.8b, v0.8h
++    bilinear_interleave_src_dst \
++                mask_fmt, op, 2, v0, v1, v0, v18, v19, v9
++    bilinear_apply_mask_to_src \
++                mask_fmt, 2, v0, v1, v0, v4, \
++                v3, v8, v10, v11
++    bilinear_combine \
++                op, 2, v0, v1, v0, v18, v19, v9, \
++                v3, v8, v10, v11, v5
++    bilinear_deinterleave_dst mask_fmt, op, 2, v0, v1, v0
++    bilinear_store_&dst_fmt 2, v16, v17
++.endm
++
++.macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op /* interpolate, mask, combine and store 4 pixels */
++    bilinear_load_and_vertical_interpolate_four_&src_fmt \
++                v1, v11, v4,  v5,  v6,  v7,  v22, v23 \
++                v3, v9,  v16, v17, v20, v21, v18, v19
++    prfm      PREFETCH_MODE, [TMP1, PF_OFFS]   /* prefetch at TMP1 + PF_OFFS (TMP1 as left by the load macro) */
++    sub       TMP1, TMP1, STRIDE               /* move TMP1 back by one STRIDE */
++    prfm      PREFETCH_MODE, [TMP1, PF_OFFS]   /* and prefetch again at that address */
++    ushll     v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS   /* pixel 0: low half of v1 scaled up by BITS */
++    umlsl     v0.4s, v1.4h, v15.h[0]           /* minus low half times weight v15.h[0] */
++    umlal2    v0.4s, v1.8h, v15.h[0]           /* plus high half times the same weight */
++    ushll     v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS /* pixel 1: same blend applied to v11 */
++    umlsl     v10.4s, v11.4h, v15.h[4]         /* pixel 1 takes its weight from lane v15.h[4] */
++    umlal2    v10.4s, v11.8h, v15.h[4]
++    ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) /* next weights: top BITS of each v12 lane */
++    ushll     v2.4s, v3.4h, #BILINEAR_INTERPOLATION_BITS   /* pixel 2: blend low/high halves of v3 */
++    umlsl     v2.4s, v3.4h, v15.h[0]
++    umlal2    v2.4s, v3.8h, v15.h[0]
++    ushll     v8.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS   /* pixel 3: blend low/high halves of v9 */
++    umlsl     v8.4s, v9.4h, v15.h[4]
++    umlal2    v8.4s, v9.8h, v15.h[4]
++    add       v12.8h, v12.8h, v13.8h           /* advance the accumulators in v12 by the step in v13 */
++    shrn      v0.4h,  v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)   /* drop 2*BITS low bits: pixel 0 */
++    shrn2     v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)   /* pixel 1 into the upper half of v0 */
++    shrn      v2.4h,  v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)   /* pixel 2 */
++    shrn2     v2.8h,  v8.4s, #(2 * BILINEAR_INTERPOLATION_BITS)   /* pixel 3 into the upper half of v2 */
++    bilinear_load_mask mask_fmt, 4, v4
++    bilinear_duplicate_mask mask_fmt, 4, v4
++    ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) /* refresh weights for whatever runs next */
++    xtn       v0.8b, v0.8h                     /* narrow to bytes: pixels 0 and 1 in v0 */
++    xtn       v1.8b, v2.8h                     /* pixels 2 and 3 in v1 */
++    add       v12.8h, v12.8h, v13.8h           /* second advance of v12 within this macro */
++    bilinear_load_dst dst_fmt, op, 4, v2, v3, v21
++    bilinear_interleave_src_dst \
++                mask_fmt, op, 4, v0, v1, v0, v2, v3, v11
++    bilinear_apply_mask_to_src \
++                mask_fmt, 4, v0, v1, v0, v4, \
++                v6, v8, v9, v10
++    bilinear_combine \
++                op, 4, v0, v1, v0, v2, v3, v1, \
++                v6, v8, v9, v10, v23
++    bilinear_deinterleave_dst mask_fmt, op, 4, v0, v1, v0
++    bilinear_store_&dst_fmt 4, v6, v7
++.endm
++
++.set BILINEAR_FLAG_USE_MASK,        1 /* generated function takes a MASK pointer argument */
++.set BILINEAR_FLAG_USE_ALL_NEON_REGS,    2 /* NOTE(review): not tested by the generator macro below - confirm use */
++
++/*
++ * Main template macro for generating NEON optimized bilinear scanline functions.
++ *
++ * The bilinear scanline generator macro takes the following arguments:
++ *  fname                  - name of the function to generate
++ *  src_fmt                - source color format (8888 or 0565)
++ *  dst_fmt                - destination color format (8888 or 0565)
++ *  src/dst_bpp_shift      - (1 << bpp_shift) is the size of src/dst pixel in bytes
++ *  bilinear_process_*     - names of the code-block macros listed below:
++ *    last_pixel           - interpolates one pixel, does not update horizontal weight
++ *    two_pixels           - interpolates two pixels and updates horizontal weight
++ *    four_pixels          - interpolates four pixels and updates horizontal weight
++ *    pixblock_head        - head part of middle loop
++ *    pixblock_tail        - tail part of middle loop
++ *    pixblock_tail_head   - tail_head of middle loop
++ *  pixblock_size          - number of pixels processed in a single middle loop
++ *                           iteration (4 or 8; anything else is rejected)
++ *  prefetch_distance      - prefetch in the source image by that many pixels ahead
++ *  flags                  - BILINEAR_FLAG_* bits; USE_MASK selects the masked ABI
++ */
++
++.macro generate_bilinear_scanline_func \
++    fname, \
++    src_fmt, dst_fmt, src_bpp_shift, dst_bpp_shift, \
++    bilinear_process_last_pixel, \
++    bilinear_process_two_pixels, \
++    bilinear_process_four_pixels, \
++    bilinear_process_pixblock_head, \
++    bilinear_process_pixblock_tail, \
++    bilinear_process_pixblock_tail_head, \
++    pixblock_size, \
++    prefetch_distance, \
++    flags
++
++pixman_asm_function fname
++.if pixblock_size == 8
++.elseif pixblock_size == 4
++.else
++    .error unsupported pixblock size
++.endif
++
++.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0 /* unmasked variant: all arguments arrive in x0-x7 */
++    OUT       .req    x0      /* destination pointer */
++    TOP       .req    x1      /* top source scanline */
++    BOTTOM    .req    x2      /* bottom source scanline, only used to derive STRIDE */
++    WT        .req    x3      /* vertical weight applied to the top line */
++    WWT       .req    w3
++    WB        .req    x4      /* vertical weight applied to the bottom line */
++    WWB       .req    w4
++    X         .req    w5      /* source x position; the head macros index pixels with X asr 16 */
++    UX        .req    w6      /* increment added to X per destination pixel */
++    WIDTH     .req    x7      /* number of destination pixels */
++    TMP1      .req    x10
++    WTMP1     .req    w10
++    TMP2      .req    x11
++    WTMP2     .req    w11
++    PF_OFFS   .req    x12     /* prefetch offset, see the umull/asr below */
++    TMP3      .req    x13
++    WTMP3     .req    w13
++    TMP4      .req    x14
++    WTMP4     .req    w14
++    STRIDE    .req    x15     /* BOTTOM - TOP */
++    DUMMY     .req    x30
++
++    stp       x29, x30, [sp, -16]!              /* push frame record */
++    mov       x29, sp
++    sub       sp, sp, 112                       /* reserve the save area before writing to it */
++    sub       x29, x29, 64
++    st1       {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32   /* save low 64 bits of v8-v11 */
++    st1       {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 /* save low 64 bits of v12-v15; x29 is back at the frame */
++    stp       x10, x11, [x29, -80]
++    stp       x12, x13, [x29, -96]
++    stp       x14, x15, [x29, -112]
++.else /* masked variant: MASK is argument 1 and WIDTH is passed on the stack */
++    OUT       .req      x0    /* destination pointer */
++    MASK      .req      x1    /* mask pointer */
++    TOP       .req      x2    /* top source scanline */
++    BOTTOM    .req      x3    /* bottom source scanline, only used to derive STRIDE */
++    WT        .req      x4    /* vertical weight applied to the top line */
++    WWT       .req      w4
++    WB        .req      x5    /* vertical weight applied to the bottom line */
++    WWB       .req      w5
++    X         .req      w6    /* source x position */
++    UX        .req      w7    /* increment added to X per destination pixel */
++    WIDTH     .req      x8    /* loaded from the first stack argument below */
++    TMP1      .req      x10
++    WTMP1     .req      w10
++    TMP2      .req      x11
++    WTMP2     .req      w11
++    PF_OFFS   .req      x12
++    TMP3      .req      x13
++    WTMP3     .req      w13
++    TMP4      .req      x14
++    WTMP4     .req      w14
++    STRIDE    .req      x15
++    DUMMY     .req      x30
++
++    stp      x29, x30, [sp, -16]!               /* push frame record */
++    mov      x29, sp
++    sub      sp, sp, 128                        /* reserve the save area first (no red zone); 128 keeps sp 16-byte aligned */
++    sub      x29, x29, 64
++    st1      {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32    /* save low 64 bits of v8-v11 */
++    st1      {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32  /* save low 64 bits of v12-v15; x29 is back at the frame */
++    stp      x10, x11, [x29, -80]
++    stp      x12, x13, [x29, -96]
++    stp      x14, x15, [x29, -112]
++    str      x8, [x29, -120]                    /* lowest slot used: 120 bytes below the frame */
++    ldr      w8, [x29, 16]                      /* WIDTH: word just above the pushed frame record */
++.endif
++
++    /* set for both variants: the unmasked OVER 8888_8888 tail also uses prefetch_offset */
++    .set prefetch_offset, prefetch_distance
++    mov      WTMP1, #prefetch_distance
++    umull    PF_OFFS, WTMP1, UX                 /* PF_OFFS = prefetch_distance * UX */
++
++    sub      STRIDE, BOTTOM, TOP                /* distance from the top line to the bottom line */
++    .unreq   BOTTOM
++
++    cmp      WIDTH, #0
++    ble      300f                               /* nothing to do */
++
++    dup      v12.8h, X                          /* low 16 bits of X in every lane */
++    dup      v13.8h, UX                         /* low 16 bits of UX in every lane */
++    dup      v28.8b, WWT                        /* top-line weight as bytes */
++    dup      v29.8b, WWB                        /* bottom-line weight as bytes */
++    mov      v25.d[0], v12.d[1]
++    mov      v26.d[0], v13.d[0]
++    add      v25.4h, v25.4h, v26.4h
++    mov      v12.d[1], v25.d[0]                 /* upper four lanes of v12 now hold X + UX */
++
++    /* ensure good destination alignment  */
++    cmp       WIDTH, #1
++    blt       100f
++    tst       OUT, #(1 << dst_bpp_shift)        /* OUT not aligned to two pixels? */
++    beq       100f
++    ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) /* weights for the single pixel */
++    add       v12.8h, v12.8h, v13.8h            /* advance by one pixel here: last_pixel does not */
++    bilinear_process_last_pixel
++    sub       WIDTH, WIDTH, #1
++100:
++    add       v13.8h, v13.8h, v13.8h            /* from here on the step in v13 is 2 * UX */
++    ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
++    add       v12.8h, v12.8h, v13.8h
++
++    cmp       WIDTH, #2
++    blt       100f
++    tst       OUT, #(1 << (dst_bpp_shift + 1))  /* OUT not aligned to four pixels? */
++    beq       100f
++    bilinear_process_two_pixels
++    sub       WIDTH, WIDTH, #2
++100:
++.if pixblock_size == 8
++    cmp       WIDTH, #4
++    blt       100f
++    tst       OUT, #(1 << (dst_bpp_shift + 2))  /* OUT not aligned to eight pixels? */
++    beq       100f
++    bilinear_process_four_pixels
++    sub       WIDTH, WIDTH, #4
++100:
++.endif
++    subs      WIDTH, WIDTH, #pixblock_size
++    blt       100f                              /* fewer than one full block left */
++    asr       PF_OFFS, PF_OFFS, #(16 - src_bpp_shift) /* drop 16 fraction bits, scale by source pixel size */
++    bilinear_process_pixblock_head
++    subs      WIDTH, WIDTH, #pixblock_size
++    blt       500f                              /* exactly one block: skip the pipelined loop */
++0:
++    bilinear_process_pixblock_tail_head
++    subs      WIDTH, WIDTH, #pixblock_size
++    bge       0b
++500:
++    bilinear_process_pixblock_tail
++100:
++.if pixblock_size == 8
++    tst       WIDTH, #4                         /* WIDTH is negative here; its low bits still count the leftovers */
++    beq       200f
++    bilinear_process_four_pixels
++200:
++.endif
++    /* handle the remaining trailing pixels */
++    tst       WIDTH, #2
++    beq       200f
++    bilinear_process_two_pixels
++200:
++    tst       WIDTH, #1
++    beq       300f
++    bilinear_process_last_pixel
++300:
++
++.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
++    sub       x29, x29, 64                      /* mirror of the prologue: restore v8-v15 */
++    ld1       {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
++    ld1       {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
++    ldp       x10, x11, [x29, -80]
++    ldp       x12, x13, [x29, -96]
++    ldp       x14, x15, [x29, -112]
++    mov       sp, x29                           /* release the save area whatever its size */
++    ldp       x29, x30, [sp], 16
++.else
++    sub       x29, x29, 64                      /* mirror of the prologue: restore v8-v15 */
++    ld1       {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
++    ld1       {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
++    ldp       x10, x11, [x29, -80]
++    ldp       x12, x13, [x29, -96]
++    ldp       x14, x15, [x29, -112]
++    ldr       x8, [x29, -120]
++    mov       sp, x29                           /* release the save area whatever its size */
++    ldp       x29, x30, [sp], 16
++.endif
++    ret
++
++    .unreq    OUT
++    .unreq    TOP
++    .unreq    WT
++    .unreq    WWT
++    .unreq    WB
++    .unreq    WWB
++    .unreq    X
++    .unreq    UX
++    .unreq    WIDTH
++    .unreq    TMP1
++    .unreq    WTMP1
++    .unreq    TMP2
++    .unreq    PF_OFFS
++    .unreq    TMP3
++    .unreq    TMP4
++    .unreq    STRIDE
++.if ((flags) & BILINEAR_FLAG_USE_MASK) != 0
++    .unreq    MASK
++.endif
++
++.endfunc
++
++.endm
++
++/* src_8888_8_8888: op src, src_fmt 8888, mask_fmt 8, dst_fmt 8888 */
++.macro bilinear_src_8888_8_8888_process_last_pixel
++    bilinear_interpolate_last_pixel 8888, 8, 8888, src
++.endm
++
++.macro bilinear_src_8888_8_8888_process_two_pixels
++    bilinear_interpolate_two_pixels 8888, 8, 8888, src
++.endm
++
++.macro bilinear_src_8888_8_8888_process_four_pixels
++    bilinear_interpolate_four_pixels 8888, 8, 8888, src
++.endm
++
++.macro bilinear_src_8888_8_8888_process_pixblock_head /* the whole 4-pixel block is done in head */
++    bilinear_src_8888_8_8888_process_four_pixels
++.endm
++
++.macro bilinear_src_8888_8_8888_process_pixblock_tail /* empty: head leaves nothing pending */
++.endm
++
++.macro bilinear_src_8888_8_8888_process_pixblock_tail_head /* tail (empty) followed by head */
++    bilinear_src_8888_8_8888_process_pixblock_tail
++    bilinear_src_8888_8_8888_process_pixblock_head
++.endm
++
++/* src_8888_8_0565: op src, src_fmt 8888, mask_fmt 8, dst_fmt 0565 */
++.macro bilinear_src_8888_8_0565_process_last_pixel
++    bilinear_interpolate_last_pixel 8888, 8, 0565, src
++.endm
++
++.macro bilinear_src_8888_8_0565_process_two_pixels
++    bilinear_interpolate_two_pixels 8888, 8, 0565, src
++.endm
++
++.macro bilinear_src_8888_8_0565_process_four_pixels
++    bilinear_interpolate_four_pixels 8888, 8, 0565, src
++.endm
++
++.macro bilinear_src_8888_8_0565_process_pixblock_head /* the whole 4-pixel block is done in head */
++    bilinear_src_8888_8_0565_process_four_pixels
++.endm
++
++.macro bilinear_src_8888_8_0565_process_pixblock_tail /* empty: head leaves nothing pending */
++.endm
++
++.macro bilinear_src_8888_8_0565_process_pixblock_tail_head /* tail (empty) followed by head */
++    bilinear_src_8888_8_0565_process_pixblock_tail
++    bilinear_src_8888_8_0565_process_pixblock_head
++.endm
++
++/* src_0565_8_x888: op src, src_fmt 0565, mask_fmt 8; the x888 destination is handled as dst_fmt 8888 */
++.macro bilinear_src_0565_8_x888_process_last_pixel
++    bilinear_interpolate_last_pixel 0565, 8, 8888, src
++.endm
++
++.macro bilinear_src_0565_8_x888_process_two_pixels
++    bilinear_interpolate_two_pixels 0565, 8, 8888, src
++.endm
++
++.macro bilinear_src_0565_8_x888_process_four_pixels
++    bilinear_interpolate_four_pixels 0565, 8, 8888, src
++.endm
++
++.macro bilinear_src_0565_8_x888_process_pixblock_head /* the whole 4-pixel block is done in head */
++    bilinear_src_0565_8_x888_process_four_pixels
++.endm
++
++.macro bilinear_src_0565_8_x888_process_pixblock_tail /* empty: head leaves nothing pending */
++.endm
++
++.macro bilinear_src_0565_8_x888_process_pixblock_tail_head /* tail (empty) followed by head */
++    bilinear_src_0565_8_x888_process_pixblock_tail
++    bilinear_src_0565_8_x888_process_pixblock_head
++.endm
++
++/* src_0565_8_0565: op src, src_fmt 0565, mask_fmt 8, dst_fmt 0565 */
++.macro bilinear_src_0565_8_0565_process_last_pixel
++    bilinear_interpolate_last_pixel 0565, 8, 0565, src
++.endm
++
++.macro bilinear_src_0565_8_0565_process_two_pixels
++    bilinear_interpolate_two_pixels 0565, 8, 0565, src
++.endm
++
++.macro bilinear_src_0565_8_0565_process_four_pixels
++    bilinear_interpolate_four_pixels 0565, 8, 0565, src
++.endm
++
++.macro bilinear_src_0565_8_0565_process_pixblock_head /* the whole 4-pixel block is done in head */
++    bilinear_src_0565_8_0565_process_four_pixels
++.endm
++
++.macro bilinear_src_0565_8_0565_process_pixblock_tail /* empty: head leaves nothing pending */
++.endm
++
++.macro bilinear_src_0565_8_0565_process_pixblock_tail_head /* tail (empty) followed by head */
++    bilinear_src_0565_8_0565_process_pixblock_tail
++    bilinear_src_0565_8_0565_process_pixblock_head
++.endm
++
++/* over_8888_8888: op over, src_fmt 8888, mask_fmt x (presumably "no mask" - confirm), dst_fmt 8888 */
++.macro bilinear_over_8888_8888_process_last_pixel
++    bilinear_interpolate_last_pixel 8888, x, 8888, over
++.endm
++
++.macro bilinear_over_8888_8888_process_two_pixels
++    bilinear_interpolate_two_pixels 8888, x, 8888, over
++.endm
++
++.macro bilinear_over_8888_8888_process_four_pixels /* generic path, used for the leading/trailing pixels */
++    bilinear_interpolate_four_pixels 8888, x, 8888, over
++.endm
++
++.macro bilinear_over_8888_8888_process_pixblock_head /* load 4 pixel pairs, blend vertically, start the horizontal blend */
++    asr         WTMP1, X, #16                   /* source pixel index of pixel 0 */
++    add         X, X, UX
++    add         TMP1, TOP, TMP1, lsl #2         /* its address in the top line, 4 bytes per pixel */
++    asr         WTMP2, X, #16                   /* same for pixel 1 */
++    add         X, X, UX
++    add         TMP2, TOP, TMP2, lsl #2
++
++    ld1         {v22.2s}, [TMP1], STRIDE        /* two adjacent top-line pixels, then step by STRIDE */
++    ld1         {v23.2s}, [TMP1]                /* the two pixels one STRIDE further (bottom line) */
++    asr         WTMP3, X, #16                   /* pixel 2 */
++    add         X, X, UX
++    add         TMP3, TOP, TMP3, lsl #2
++    umull       v8.8h, v22.8b, v28.8b           /* pixel 0: top * v28 ... */
++    umlal       v8.8h, v23.8b, v29.8b           /* ... + bottom * v29 */
++
++    ld1         {v22.2s}, [TMP2], STRIDE
++    ld1         {v23.2s}, [TMP2]
++    asr         WTMP4, X, #16                   /* pixel 3 */
++    add         X, X, UX
++    add         TMP4, TOP, TMP4, lsl #2
++    umull       v9.8h, v22.8b, v28.8b           /* pixel 1 vertical blend */
++    umlal       v9.8h, v23.8b, v29.8b
++
++    ld1         {v22.2s}, [TMP3], STRIDE
++    ld1         {v23.2s}, [TMP3]
++    umull       v10.8h, v22.8b, v28.8b          /* pixel 2 vertical blend, finished in the tail */
++    umlal       v10.8h, v23.8b, v29.8b
++
++    ushll       v0.4s, v8.4h, #BILINEAR_INTERPOLATION_BITS  /* pixel 0 horizontal blend with weight v15.h[0] */
++    umlsl       v0.4s, v8.4h, v15.h[0]
++    umlal2      v0.4s, v8.8h, v15.h[0]
++
++    prfm        PREFETCH_MODE, [TMP4, PF_OFFS]  /* prefetch ahead in the top line */
++    ld1         {v16.2s}, [TMP4], STRIDE
++    ld1         {v17.2s}, [TMP4]
++    prfm        PREFETCH_MODE, [TMP4, PF_OFFS]  /* prefetch ahead one STRIDE further */
++    umull       v11.8h, v16.8b, v28.8b          /* pixel 3 vertical blend, finished in the tail */
++    umlal       v11.8h, v17.8b, v29.8b
++
++    ushll       v1.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS  /* pixel 1 horizontal blend with weight v15.h[4] */
++    umlsl       v1.4s, v9.4h, v15.h[4]
++    umlal2      v1.4s, v9.8h, v15.h[4]
++    ushr        v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) /* weights for pixels 2 and 3 */
++    add         v12.8h, v12.8h, v13.8h          /* advance the accumulators in v12 */
++.endm
++
++.macro bilinear_over_8888_8888_process_pixblock_tail /* finish pixels 2 and 3, blend over destination, store 4 pixels */
++    ushll       v2.4s, v10.4h, #BILINEAR_INTERPOLATION_BITS /* pixel 2 horizontal blend */
++    umlsl       v2.4s, v10.4h, v15.h[0]
++    umlal2      v2.4s, v10.8h, v15.h[0]
++    ushll       v3.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS /* pixel 3 horizontal blend */
++    umlsl       v3.4s, v11.4h, v15.h[4]
++    umlal2      v3.4s, v11.8h, v15.h[4]
++    shrn        v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) /* drop 2*BITS low bits of all four results */
++    shrn2       v0.8h, v1.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++    shrn        v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++    ushr        v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) /* weights for the next pixels */
++    shrn2       v2.8h, v3.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++    xtn         v6.8b, v0.8h                    /* source pixels 0,1 as bytes */
++    xtn         v7.8b, v2.8h                    /* source pixels 2,3 as bytes */
++    ld1         {v2.2s, v3.2s}, [OUT]           /* four destination pixels */
++    prfm        PREFETCH_MODE, [OUT, #(prefetch_offset * 4)]
++    vuzp        v6.8b, v7.8b                    /* two unzip passes on source and destination */
++    vuzp        v2.8b, v3.8b
++    vuzp        v6.8b, v7.8b
++    vuzp        v2.8b, v3.8b
++    dup         v4.2s, v7.s[1]                  /* presumably the four source alpha bytes - confirm against vuzp */
++    mvn         v4.8b, v4.8b                    /* invert them */
++    umull       v11.8h, v2.8b, v4.8b            /* destination * inverted value */
++    umull       v2.8h,  v3.8b, v4.8b
++    urshr       v1.8h, v11.8h, #8               /* urshr + raddhn: rounded scale back to 8 bits */
++    urshr       v10.8h, v2.8h, #8
++    raddhn      v3.8b, v10.8h, v2.8h
++    raddhn      v2.8b, v1.8h, v11.8h
++    uqadd       v6.8b, v2.8b,  v6.8b            /* saturating add of source and scaled destination */
++    uqadd       v7.8b, v3.8b,  v7.8b
++    vuzp        v6.8b, v7.8b                    /* two more unzip passes before the store */
++    vuzp        v6.8b, v7.8b
++    add         v12.8h, v12.8h, v13.8h          /* advance the accumulators in v12 */
++    st1         {v6.2s, v7.2s}, [OUT], #16      /* store 4 pixels and advance OUT by 16 bytes */
++.endm
++
++.macro bilinear_over_8888_8888_process_pixblock_tail_head /* deep-indented lines: tail of the previous block; others: head of the next */
++                                            ushll       v2.4s, v10.4h, #BILINEAR_INTERPOLATION_BITS
++    asr         WTMP1, X, #16
++    add         X, X, UX
++    add         TMP1, TOP, TMP1, lsl #2
++                                            umlsl       v2.4s, v10.4h, v15.h[0]
++    asr         WTMP2, X, #16
++    add         X, X, UX
++    add         TMP2, TOP, TMP2, lsl #2
++                                            umlal2      v2.4s, v10.8h, v15.h[0]
++                                            ushll       v3.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
++    ld1         {v20.2s}, [TMP1], STRIDE        /* head uses v20/v21 here where the plain head uses v22/v23 */
++                                            umlsl       v3.4s, v11.4h, v15.h[4]
++                                            umlal2      v3.4s, v11.8h, v15.h[4]
++    ld1         {v21.2s}, [TMP1]
++    umull       v8.8h, v20.8b, v28.8b
++    umlal       v8.8h, v21.8b, v29.8b
++                                            shrn        v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++                                            shrn2       v0.8h, v1.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++                                            shrn        v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++                                            ushr        v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
++    ld1         {v22.2s}, [TMP2], STRIDE
++                                            shrn2       v2.8h, v3.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++                                            xtn         v6.8b, v0.8h
++    ld1         {v23.2s}, [TMP2]
++    umull       v9.8h, v22.8b, v28.8b
++    asr         WTMP3, X, #16
++    add         X, X, UX
++    add         TMP3, TOP, TMP3, lsl #2
++    asr         WTMP4, X, #16
++    add         X, X, UX
++    add         TMP4, TOP, TMP4, lsl #2
++    umlal       v9.8h, v23.8b, v29.8b
++                                            xtn         v7.8b, v2.8h
++                                            ld1         {v2.2s, v3.2s}, [OUT]
++                                            prfm        PREFETCH_MODE, [OUT, PF_OFFS] /* the plain tail uses an immediate offset here */
++    ld1         {v22.2s}, [TMP3], STRIDE
++                                            vuzp        v6.8b, v7.8b
++                                            vuzp        v2.8b, v3.8b
++                                            vuzp        v6.8b, v7.8b
++                                            vuzp        v2.8b, v3.8b
++                                            dup         v4.2s, v7.s[1]
++    ld1         {v23.2s}, [TMP3]
++                                            mvn         v4.8b, v4.8b
++    umull       v10.8h, v22.8b, v28.8b          /* v10 now belongs to the next block */
++    umlal       v10.8h, v23.8b, v29.8b
++                                            umull       v11.8h, v2.8b, v4.8b
++                                            umull        v2.8h, v3.8b, v4.8b
++    ushll       v0.4s, v8.4h, #BILINEAR_INTERPOLATION_BITS
++    umlsl       v0.4s, v8.4h, v15.h[0]
++                                            urshr       v1.8h, v11.8h, #8
++    umlal2      v0.4s, v8.8h, v15.h[0]          /* last read of v8 for the next block */
++                                            urshr       v8.8h, v2.8h, #8 /* v8 as scratch; the plain tail uses v10 */
++                                            raddhn      v3.8b, v8.8h, v2.8h
++                                            raddhn      v2.8b, v1.8h, v11.8h
++    prfm        PREFETCH_MODE, [TMP4, PF_OFFS]
++    ld1         {v16.2s}, [TMP4], STRIDE
++                                            uqadd       v6.8b, v2.8b, v6.8b
++                                            uqadd       v7.8b, v3.8b, v7.8b
++    ld1         {v17.2s}, [TMP4]
++    prfm        PREFETCH_MODE, [TMP4, PF_OFFS]
++    umull       v11.8h, v16.8b, v28.8b          /* v11 is reloaded only after the tail has consumed it above */
++    umlal       v11.8h, v17.8b, v29.8b
++                                            vuzp        v6.8b, v7.8b
++    ushll       v1.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS
++                                            vuzp        v6.8b, v7.8b
++    umlsl       v1.4s, v9.4h, v15.h[4]
++                                            add         v12.8h, v12.8h, v13.8h
++    umlal2      v1.4s, v9.8h, v15.h[4]
++    ushr        v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
++    add         v12.8h, v12.8h, v13.8h
++                                            st1         {v6.2s, v7.2s}, [OUT], #16 /* store the previous block, advance OUT by 16 bytes */
++.endm
++
++/* over_8888_8_8888: op over, src_fmt 8888, mask_fmt 8, dst_fmt 8888 */
++.macro bilinear_over_8888_8_8888_process_last_pixel
++    bilinear_interpolate_last_pixel 8888, 8, 8888, over
++.endm
++
++.macro bilinear_over_8888_8_8888_process_two_pixels
++    bilinear_interpolate_two_pixels 8888, 8, 8888, over
++.endm
++
++.macro bilinear_over_8888_8_8888_process_four_pixels /* two 2-pixel steps, not bilinear_interpolate_four_pixels */
++    bilinear_interpolate_two_pixels 8888, 8, 8888, over
++    bilinear_interpolate_two_pixels 8888, 8, 8888, over
++.endm
++
++.macro bilinear_over_8888_8_8888_process_pixblock_head /* the whole 4-pixel block is done in head */
++    bilinear_over_8888_8_8888_process_four_pixels
++.endm
++
++.macro bilinear_over_8888_8_8888_process_pixblock_tail /* empty: head leaves nothing pending */
++.endm
++
++.macro bilinear_over_8888_8_8888_process_pixblock_tail_head /* tail (empty) followed by head */
++     bilinear_over_8888_8_8888_process_pixblock_tail
++     bilinear_over_8888_8_8888_process_pixblock_head
++.endm
++
++/* add_8888_8888: op add, src_fmt 8888, mask_fmt x (presumably "no mask" - confirm), dst_fmt 8888 */
++.macro bilinear_add_8888_8888_process_last_pixel
++    bilinear_interpolate_last_pixel 8888, x, 8888, add
++.endm
++
++.macro bilinear_add_8888_8888_process_two_pixels
++    bilinear_interpolate_two_pixels 8888, x, 8888, add
++.endm
++
++.macro bilinear_add_8888_8888_process_four_pixels /* two 2-pixel steps, not bilinear_interpolate_four_pixels */
++    bilinear_interpolate_two_pixels 8888, x, 8888, add
++    bilinear_interpolate_two_pixels 8888, x, 8888, add
++.endm
++
++.macro bilinear_add_8888_8888_process_pixblock_head /* the whole 4-pixel block is done in head */
++    bilinear_add_8888_8888_process_four_pixels
++.endm
++
++.macro bilinear_add_8888_8888_process_pixblock_tail /* empty: head leaves nothing pending */
++.endm
++
++.macro bilinear_add_8888_8888_process_pixblock_tail_head /* tail (empty) followed by head */
++    bilinear_add_8888_8888_process_pixblock_tail
++    bilinear_add_8888_8888_process_pixblock_head
++.endm
++
++/* add_8888_8_8888: op add, src_fmt 8888, mask_fmt 8, dst_fmt 8888 */
++.macro bilinear_add_8888_8_8888_process_last_pixel
++    bilinear_interpolate_last_pixel 8888, 8, 8888, add
++.endm
++
++.macro bilinear_add_8888_8_8888_process_two_pixels
++    bilinear_interpolate_two_pixels 8888, 8, 8888, add
++.endm
++
++.macro bilinear_add_8888_8_8888_process_four_pixels
++    bilinear_interpolate_four_pixels 8888, 8, 8888, add
++.endm
++
++.macro bilinear_add_8888_8_8888_process_pixblock_head /* the whole 4-pixel block is done in head */
++    bilinear_add_8888_8_8888_process_four_pixels
++.endm
++
++.macro bilinear_add_8888_8_8888_process_pixblock_tail /* empty: head leaves nothing pending */
++.endm
++
++.macro bilinear_add_8888_8_8888_process_pixblock_tail_head /* tail (empty) followed by head */
++    bilinear_add_8888_8_8888_process_pixblock_tail
++    bilinear_add_8888_8_8888_process_pixblock_head
++.endm
++
++
++/* Bilinear scanline functions: one instantiation per operator/format set; last line is pixblock_size, prefetch_distance, flags */
++generate_bilinear_scanline_func \
++    pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_neon, \
++    8888, 8888, 2, 2, \
++    bilinear_src_8888_8_8888_process_last_pixel, \
++    bilinear_src_8888_8_8888_process_two_pixels, \
++    bilinear_src_8888_8_8888_process_four_pixels, \
++    bilinear_src_8888_8_8888_process_pixblock_head, \
++    bilinear_src_8888_8_8888_process_pixblock_tail, \
++    bilinear_src_8888_8_8888_process_pixblock_tail_head, \
++    4, 28, BILINEAR_FLAG_USE_MASK /* masked variant */
++
++generate_bilinear_scanline_func \
++    pixman_scaled_bilinear_scanline_8888_8_0565_SRC_asm_neon, \
++    8888, 0565, 2, 1, \
++    bilinear_src_8888_8_0565_process_last_pixel, \
++    bilinear_src_8888_8_0565_process_two_pixels, \
++    bilinear_src_8888_8_0565_process_four_pixels, \
++    bilinear_src_8888_8_0565_process_pixblock_head, \
++    bilinear_src_8888_8_0565_process_pixblock_tail, \
++    bilinear_src_8888_8_0565_process_pixblock_tail_head, \
++    4, 28, BILINEAR_FLAG_USE_MASK /* masked variant */
++
++generate_bilinear_scanline_func \
++    pixman_scaled_bilinear_scanline_0565_8_x888_SRC_asm_neon, \
++    0565, 8888, 1, 2, \
++    bilinear_src_0565_8_x888_process_last_pixel, \
++    bilinear_src_0565_8_x888_process_two_pixels, \
++    bilinear_src_0565_8_x888_process_four_pixels, \
++    bilinear_src_0565_8_x888_process_pixblock_head, \
++    bilinear_src_0565_8_x888_process_pixblock_tail, \
++    bilinear_src_0565_8_x888_process_pixblock_tail_head, \
++    4, 28, BILINEAR_FLAG_USE_MASK /* masked variant */
++
++generate_bilinear_scanline_func \
++    pixman_scaled_bilinear_scanline_0565_8_0565_SRC_asm_neon, \
++    0565, 0565, 1, 1, \
++    bilinear_src_0565_8_0565_process_last_pixel, \
++    bilinear_src_0565_8_0565_process_two_pixels, \
++    bilinear_src_0565_8_0565_process_four_pixels, \
++    bilinear_src_0565_8_0565_process_pixblock_head, \
++    bilinear_src_0565_8_0565_process_pixblock_tail, \
++    bilinear_src_0565_8_0565_process_pixblock_tail_head, \
++    4, 28, BILINEAR_FLAG_USE_MASK /* masked variant */
++
++generate_bilinear_scanline_func \
++    pixman_scaled_bilinear_scanline_8888_8888_OVER_asm_neon, \
++    8888, 8888, 2, 2, \
++    bilinear_over_8888_8888_process_last_pixel, \
++    bilinear_over_8888_8888_process_two_pixels, \
++    bilinear_over_8888_8888_process_four_pixels, \
++    bilinear_over_8888_8888_process_pixblock_head, \
++    bilinear_over_8888_8888_process_pixblock_tail, \
++    bilinear_over_8888_8888_process_pixblock_tail_head, \
++    4, 28, 0 /* flags 0: unmasked variant */
++
++generate_bilinear_scanline_func \
++    pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_neon, \
++    8888, 8888, 2, 2, \
++    bilinear_over_8888_8_8888_process_last_pixel, \
++    bilinear_over_8888_8_8888_process_two_pixels, \
++    bilinear_over_8888_8_8888_process_four_pixels, \
++    bilinear_over_8888_8_8888_process_pixblock_head, \
++    bilinear_over_8888_8_8888_process_pixblock_tail, \
++    bilinear_over_8888_8_8888_process_pixblock_tail_head, \
++    4, 28, BILINEAR_FLAG_USE_MASK /* masked variant */
++
++generate_bilinear_scanline_func \
++    pixman_scaled_bilinear_scanline_8888_8888_ADD_asm_neon, \
++    8888, 8888, 2, 2, \
++    bilinear_add_8888_8888_process_last_pixel, \
++    bilinear_add_8888_8888_process_two_pixels, \
++    bilinear_add_8888_8888_process_four_pixels, \
++    bilinear_add_8888_8888_process_pixblock_head, \
++    bilinear_add_8888_8888_process_pixblock_tail, \
++    bilinear_add_8888_8888_process_pixblock_tail_head, \
++    4, 28, 0 /* flags 0: unmasked variant */
++
++generate_bilinear_scanline_func \
++    pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_neon, \
++    8888, 8888, 2, 2, \
++    bilinear_add_8888_8_8888_process_last_pixel, \
++    bilinear_add_8888_8_8888_process_two_pixels, \
++    bilinear_add_8888_8_8888_process_four_pixels, \
++    bilinear_add_8888_8_8888_process_pixblock_head, \
++    bilinear_add_8888_8_8888_process_pixblock_tail, \
++    bilinear_add_8888_8_8888_process_pixblock_tail_head, \
++    4, 28, BILINEAR_FLAG_USE_MASK /* masked variant */
+diff --git a/pixman/pixman-arma64-neon-asm.S b/pixman/pixman-arma64-neon-asm.S
+new file mode 100644
+index 0000000..18ace0e
+--- /dev/null
++++ b/pixman/pixman-arma64-neon-asm.S
+@@ -0,0 +1,3704 @@
++/*
++ * Copyright © 2009 Nokia Corporation
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the "Software"),
++ * to deal in the Software without restriction, including without limitation
++ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
++ * and/or sell copies of the Software, and to permit persons to whom the
++ * Software is furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice (including the next
++ * paragraph) shall be included in all copies or substantial portions of the
++ * Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ *
++ * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
++ */
++
++/*
++ * This file contains implementations of NEON optimized pixel processing
++ * functions. There is no full and detailed tutorial, but some functions
++ * (those which are exposing some new or interesting features) are
++ * extensively commented and can be used as examples.
++ *
++ * You may want to have a look at the comments for following functions:
++ *  - pixman_composite_over_8888_0565_asm_neon
++ *  - pixman_composite_over_n_8_0565_asm_neon
++ */
++
++/* Prevent the stack from becoming executable: emit an empty .note.GNU-stack section on Linux/ELF */
++#if defined(__linux__) && defined(__ELF__)
++.section .note.GNU-stack,"",%progbits
++#endif
++
++.text
++.arch armv8-a
++
++.altmacro /* switch the assembler to alternate macro syntax for the rest of the file */
++.p2align 2 /* align to a 4-byte boundary */
++
++#include "pixman-private.h"
++#include "pixman-arm-asm.h"
++#include "pixman-arma64-neon-asm.h"
++
++/* Global configuration options and preferences */
++
++/*
++ * The code can optionally make use of unaligned memory accesses to improve
++ * performance of handling leading/trailing pixels for each scanline.
++ * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0, for
++ * example on Linux, if unaligned memory accesses are not configured to
++ * generate exceptions.
++ */
++.set RESPECT_STRICT_ALIGNMENT, 1
++
++/*
++ * Set default prefetch type. There is a choice between the following options:
++ *
++ * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
++ * as NOP to work around some HW bugs or for whatever other reason)
++ *
++ * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
++ * advanced prefetch introduces heavy overhead)
++ *
++ * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
++ * which can run ARM and NEON instructions simultaneously so that extra ARM
++ * instructions do not add (many) extra cycles, but improve prefetch efficiency)
++ *
++ * Note: some types of function can't support advanced prefetch and fall back
++ *       to the simple one (those which handle 24bpp pixels)
++ */
++.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
++
++/* Prefetch distance, in pixels, used by the simple prefetch type (PREFETCH_TYPE_SIMPLE) */
++.set PREFETCH_DISTANCE_SIMPLE, 64
++
++/*
++ * Implementation of pixman_composite_over_8888_0565_asm_neon
++ *
++ * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and
++ * performs OVER compositing operation. Function fast_composite_over_8888_0565
++ * from pixman-fast-path.c does the same in C and can be used as a reference.
++ *
++ * First we need to have some NEON assembly code which can do the actual
++ * operation on the pixels and provide it to the template macro.
++ *
++ * Template macro quite conveniently takes care of emitting all the necessary
++ * code for memory reading and writing (including quite tricky cases of
++ * handling unaligned leading/trailing pixels), so we only need to deal with
++ * the data in NEON registers.
++ *
++ * NEON register allocation in general is recommended to be the following:
++ * v0,  v1,  v2,  v3  - contain loaded source pixel data
++ * v4,  v5,  v6,  v7  - contain loaded destination pixels (if they are needed)
++ * v24, v25, v26, v27 - contain loading mask pixel data (if mask is used)
++ * v28, v29, v30, v31 - place for storing the result (destination pixels)
++ *
++ * As can be seen above, four 64-bit NEON registers are used for keeping
++ * intermediate pixel data and up to 8 pixels can be processed in one step
++ * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
++ *
++ * This particular function uses the following registers allocation:
++ * v0,  v1,  v2,  v3  - contain loaded source pixel data
++ * v4,  v5            - contain loaded destination pixels (they are needed)
++ * v28, v29           - place for storing the result (destination pixels)
++ */
++
++/*
++ * Step one. We need to have some code to do some arithmetic on pixel data.
++ * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
++ * back-to-back, they take pixel data from {v0, v1, v2, v3} and {v4, v5},
++ * perform all the needed calculations and write the result to {v28, v29}.
++ * The rationale for having two macros and not just one will be explained
++ * later. In practice, any single monolithic function which does the work can
++ * be split into two parts in any arbitrary way without affecting correctness.
++ *
++ * There is one special trick here too. Common template macro can optionally
++ * make our life a bit easier by doing R, G, B, A color components
++ * deinterleaving for 32bpp pixel formats (and this feature is used in
++ * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that
++ * instead of having 8 packed pixels in {v0, v1, v2, v3} registers, we
++ * actually use v0 register for blue channel (a vector of eight 8-bit
++ * values), v1 register for green, v2 for red and v3 for alpha. This
++ * simple conversion can be also done with a few NEON instructions:
++ *
++ * Packed to planar conversion: // vuzp8 is a wrapper macro
++ *  vuzp8 v0, v1
++ *  vuzp8 v2, v3
++ *  vuzp8 v1, v3
++ *  vuzp8 v0, v2
++ *
++ * Planar to packed conversion: // vzip8 is a wrapper macro
++ *  vzip8 v0, v2
++ *  vzip8 v1, v3
++ *  vzip8 v2, v3
++ *  vzip8 v0, v1
++ *
++ * But pixel can be loaded directly in planar format using LD4 / b NEON
++ * instruction. It is 1 cycle slower than LD1 / s, so this is not always
++ * desirable, that's why deinterleaving is optional.
++ *
++ * But anyway, here is the code:
++ */
++
++.macro pixman_composite_over_8888_0565_process_pixblock_head
++    /* convert 8 r5g6b5 pixel data from {v4, v5} to planar 8-bit format
++       and put data into v6 - red, v7 - green, v30 - blue */
++    mov         v4.d[1], v5.d[0]  /* merge v5 into the upper half of v4: 8 pixels in v4.8h */
++    shrn        v6.8b, v4.8h, #8  /* bits 15..8 of each pixel: red in the top 5 bits */
++    shrn        v7.8b, v4.8h, #3  /* bits 10..3 of each pixel: green in the top 6 bits */
++    sli         v4.8h, v4.8h, #5  /* shift left by 5 keeping the low 5 bits: blue now at bits 9..5 */
++    sri         v6.8b, v6.8b, #5  /* fill the low 3 bits of red with its top 3 bits */
++    mvn         v3.8b, v3.8b      /* invert source alpha */
++    sri         v7.8b, v7.8b, #6  /* fill the low 2 bits of green with its top 2 bits */
++    shrn        v30.8b, v4.8h, #2 /* bits 9..2: blue expanded to 8 bits */
++    /* now do alpha blending, storing results in 8-bit planar format
++       into v20 - red, v23 - green, v22 - blue */
++    umull       v10.8h, v3.8b, v6.8b   /* inverted alpha * destination red */
++    umull       v11.8h, v3.8b, v7.8b   /* inverted alpha * destination green */
++    umull       v12.8h, v3.8b, v30.8b  /* inverted alpha * destination blue */
++    urshr       v17.8h, v10.8h, #8     /* urshr + raddhn: rounded scale back to 8 bits */
++    urshr       v18.8h, v11.8h, #8
++    urshr       v19.8h, v12.8h, #8
++    raddhn      v20.8b, v10.8h, v17.8h
++    raddhn      v23.8b, v11.8h, v18.8h
++    raddhn      v22.8b, v12.8h, v19.8h
++.endm
++
++.macro pixman_composite_over_8888_0565_process_pixblock_tail
++    /*
++     * Second half of the OVER blend: add the source planes (v2 red,
++     * v1 green, v0 blue) to the scaled destination produced by '*_head'
++     * and repack the sums as 8 r5g6b5 pixels in v14, which is then split
++     * across v28/v29.
++     */
++    /* ... continue alpha blending */
++    uqadd       v17.8b, v2.8b, v20.8b
++    uqadd       v18.8b, v0.8b, v22.8b
++    uqadd       v19.8b, v1.8b, v23.8b
++    /* convert the result to r5g6b5 and store it into {v14} */
++    /* each 'ushll #7' + 'sli #1' pair widens a channel to 16 bits and
++       leaves it shifted left by 8 in total */
++    ushll       v14.8h, v17.8b, #7
++    sli         v14.8h, v14.8h, #1
++    ushll       v8.8h, v19.8b, #7
++    sli         v8.8h, v8.8h, #1
++    ushll       v9.8h, v18.8b, #7
++    sli         v9.8h, v9.8h, #1
++    /* insert green below the top 5 (red) bits, then blue in the low 5 bits */
++    sri         v14.8h, v8.8h, #5
++    sri         v14.8h, v9.8h, #11
++    /* split the 128-bit result into the low halves of v28 and v29 */
++    mov         v28.d[0], v14.d[0]
++    mov         v29.d[0], v14.d[1]
++.endm
++
++/*
++ * OK, now we got almost everything that we need. Using the above two
++ * macros, the work can be done right. But now we want to optimize
++ * it a bit. ARM Cortex-A8 is an in-order core, and benefits really
++ * a lot from good code scheduling and software pipelining.
++ *
++ * Let's construct some code, which will run in the core main loop.
++ * Some pseudo-code of the main loop will look like this:
++ *   head
++ *   while (...) {
++ *     tail
++ *     head
++ *   }
++ *   tail
++ *
++ * It may look a bit weird, but this setup allows to hide instruction
++ * latencies better and also utilize dual-issue capability more
++ * efficiently (make pairs of load-store and ALU instructions).
++ *
++ * So what we need now is a '*_tail_head' macro, which will be used
++ * in the core main loop. A trivial straightforward implementation
++ * of this macro would look like this:
++ *
++ *   pixman_composite_over_8888_0565_process_pixblock_tail
++ *   st1         {v28.4h, v29.4h}, [DST_W], #16
++ *   ld1         {v4.4h, v5.4h}, [DST_R], #16
++ *   ld4         {v0.2s, v1.2s, v2.2s, v3.2s}, [SRC], #32
++ *   pixman_composite_over_8888_0565_process_pixblock_head
++ *   cache_preload 8, 8
++ *
++ * Now it also got some LD/ST instructions. We simply can't move from
++ * processing one block of pixels to the other one with just arithmetics.
++ * The previously processed data needs to be written to memory and new
++ * data needs to be fetched. Fortunately, this main loop does not deal
++ * with partial leading/trailing pixels and can load/store a full block
++ * of pixels in a bulk. Additionally, destination buffer is already
++ * 16 bytes aligned here (which is good for performance).
++ *
++ * New things here are DST_R, DST_W, SRC and MASK identifiers. These
++ * are the aliases for ARM registers which are used as pointers for
++ * accessing data. We maintain separate pointers for reading and writing
++ * destination buffer (DST_R and DST_W).
++ *
++ * Another new thing is 'cache_preload' macro. It is used for prefetching
++ * data into CPU L2 cache and improve performance when dealing with large
++ * images which are far larger than cache size. It uses one argument
++ * (actually two, but they need to be the same here) - number of pixels
++ * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some
++ * details about this macro. Moreover, if good performance is needed
++ * the code from this macro needs to be copied into '*_tail_head' macro
++ * and mixed with the rest of code for optimal instructions scheduling.
++ * We are actually doing it below.
++ *
++ * Now after all the explanations, here is the optimized code.
++ * Different instruction streams (originating from '*_head', '*_tail'
++ * and 'cache_preload' macro) use different indentation levels for
++ * better readability. Actually taking the code from one of these
++ * indentation levels and ignoring a few LD/ST instructions would
++ * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
++ * macro!
++ */
++
++#if 1
++
++.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
++    /*
++     * Software-pipelined main-loop body: finishes the previous block
++     * ('*_tail' stream, deepest NEON indentation), starts the next one
++     * ('*_head' stream, shallow indentation) and interleaves the prefetch
++     * logic (far-right 'PF' lines). It also loads the next 8 destination
++     * pixels and the next source block, and stores the finished 8 pixels
++     * from v14.
++     *
++     * Prefetch stream, as written below: PF_X is advanced by one block
++     * (twice while the low 4 bits of PF_CTL are non-zero, decrementing
++     * PF_CTL), source and destination are prefetched at PF_X, and when
++     * PF_X exceeds ORIG_W it is wrapped, 0x10 is subtracted from PF_CTL
++     * and, while the result stays positive, one byte is loaded at a
++     * stride offset from PF_SRC / PF_DST before each pointer is bumped.
++     */
++        uqadd       v17.8b, v2.8b, v20.8b
++    ld1         {v4.4h, v5.4h}, [DST_R], #16
++    mov         v4.d[1], v5.d[0]
++        uqadd       v18.8b, v0.8b, v22.8b
++        uqadd       v19.8b, v1.8b, v23.8b
++    shrn        v6.8b, v4.8h, #8
++    /* v0-v2 of the previous block have been consumed by the uqadds above,
++       so the source registers can be reloaded now */
++    fetch_src_pixblock
++    shrn        v7.8b, v4.8h, #3
++    sli         v4.8h, v4.8h, #5
++        ushll       v14.8h, v17.8b, #7
++        sli         v14.8h, v14.8h, #1
++                                    PF add PF_X, PF_X, #8
++        ushll       v8.8h, v19.8b, #7
++        sli         v8.8h, v8.8h,  #1
++                                    PF tst PF_CTL, #0xF
++    sri         v6.8b, v6.8b, #5
++                                    PF beq 10f
++                                    PF add PF_X, PF_X, #8
++10:
++    mvn         v3.8b, v3.8b
++                                    PF beq 10f
++                                    PF sub PF_CTL, PF_CTL, #1
++10:
++    sri         v7.8b, v7.8b, #6
++    shrn        v30.8b, v4.8h, #2
++    umull       v10.8h, v3.8b, v6.8b
++                                    PF lsl DUMMY, PF_X, #src_bpp_shift
++                                    PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
++    umull       v11.8h, v3.8b, v7.8b
++    umull       v12.8h, v3.8b, v30.8b
++                                    PF lsl DUMMY, PF_X, #dst_bpp_shift
++                                    PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
++        sri         v14.8h, v8.8h, #5
++                                    PF cmp PF_X, ORIG_W
++        ushll       v9.8h, v18.8b, #7
++        sli         v9.8h, v9.8h, #1
++    urshr       v17.8h, v10.8h, #8
++                                    PF ble 10f
++                                    PF sub PF_X, PF_X, ORIG_W
++10:
++    urshr       v19.8h, v11.8h, #8
++    urshr       v18.8h, v12.8h, #8
++                                    PF ble 10f
++                                    PF subs PF_CTL, PF_CTL, #0x10
++10:
++        sri         v14.8h, v9.8h, #11
++        mov         v28.d[0], v14.d[0]
++        mov         v29.d[0], v14.d[1]
++                                    PF ble 10f
++                                    PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
++                                    PF ldrsb DUMMY, [PF_SRC, DUMMY]
++                                    PF add PF_SRC, PF_SRC, #1
++10:
++    raddhn      v20.8b, v10.8h, v17.8h
++    raddhn      v23.8b, v11.8h, v19.8h
++                                    PF ble 10f
++                                    PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
++                                    PF ldrsb DUMMY, [PF_DST, DUMMY]
++                                    /* advance PF_DST from itself; basing it
++                                       on PF_SRC would redirect destination
++                                       prefetches into the source image */
++                                    PF add PF_DST, PF_DST, #1
++10:
++    raddhn      v22.8b, v12.8h, v18.8h
++        st1         {v14.8h}, [DST_W], #16
++.endm
++
++#else
++
++/* If we did not care much about the performance, we would just use this... */
++.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
++    /*
++     * Unscheduled reference version: finish the previous block, store its
++     * 8 r5g6b5 pixels from v14, load the next 8 destination pixels into
++     * {v4, v5} (the registers '*_head' reads), fetch the next source block,
++     * start the next block and run the generic prefetch macro.
++     */
++    pixman_composite_over_8888_0565_process_pixblock_tail
++    st1         {v14.8h}, [DST_W], #16
++    ld1         {v4.4h, v5.4h}, [DST_R], #16
++    fetch_src_pixblock
++    pixman_composite_over_8888_0565_process_pixblock_head
++    cache_preload 8, 8
++.endm
++
++#endif
++
++/*
++ * And now the final part. We are using 'generate_composite_function' macro
++ * to put all the stuff together. We are specifying the name of the function
++ * which we want to get, number of bits per pixel for the source, mask and
++ * destination (0 if unused, like mask in this case). Next come some bit
++ * flags:
++ *   FLAG_DST_READWRITE      - tells that the destination buffer is both read
++ *                             and written, for write-only buffer we would use
++ *                             FLAG_DST_WRITEONLY flag instead
++ *   FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
++ *                             and separate color channels for 32bpp format.
++ * The next things are:
++ *  - the number of pixels processed per iteration (8 in this case, because
++ *    that's the maximum what can fit into four 64-bit NEON registers).
++ *  - prefetch distance, measured in pixel blocks. In this case it is 5 times
++ *    by 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal
++ *    prefetch distance can be selected by running some benchmarks.
++ *
++ * After that we specify some macros, these are 'default_init',
++ * 'default_cleanup' here which are empty (but it is possible to have custom
++ * init/cleanup macros to be able to save/restore some extra NEON registers
++ * like d8-d15 or do anything else) followed by
++ * 'pixman_composite_over_8888_0565_process_pixblock_head',
++ * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
++ * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
++ * which we got implemented above.
++ *
++ * The last part is the NEON registers allocation scheme.
++ */
++/* Instantiate the OVER a8r8g8b8 -> r5g6b5 function: 32bpp source, no mask,
++   16bpp destination, built from the three pixblock macros defined above. */
++generate_composite_function \
++    pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    default_init, \
++    default_cleanup, \
++    pixman_composite_over_8888_0565_process_pixblock_head, \
++    pixman_composite_over_8888_0565_process_pixblock_tail, \
++    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    4,  /* dst_r_basereg */ \
++    0,  /* src_basereg   */ \
++    24  /* mask_basereg  */
++
++/******************************************************************************/
++
++.macro pixman_composite_over_n_0565_process_pixblock_head
++    /*
++     * First half of OVER with a solid source for 8 pixels: unpack the
++     * r5g6b5 destination from {v4, v5} and multiply each plane by v3.
++     * Unlike the 8888 variant there is no 'mvn' here: v3 is expected to
++     * already hold the inverted alpha (the matching '*_init' macro inverts
++     * it once). Uses v13-v15 as temporaries.
++     */
++    /* convert 8 r5g6b5 pixel data from {v4} to planar 8-bit format
++       and put data into v6 - red, v7 - green, v30 - blue */
++    mov         v4.d[1], v5.d[0]
++    shrn        v6.8b, v4.8h, #8
++    shrn        v7.8b, v4.8h, #3
++    sli         v4.8h, v4.8h, #5
++    /* fill the low bits of each channel with a copy of its top bits */
++    sri         v6.8b, v6.8b, #5
++    sri         v7.8b, v7.8b, #6
++    shrn        v30.8b, v4.8h, #2
++    /* now do alpha blending, storing results in 8-bit planar format
++       into v20 - red, v23 - green, v22 - blue */
++    umull       v10.8h, v3.8b, v6.8b
++    umull       v11.8h, v3.8b, v7.8b
++    umull       v12.8h, v3.8b, v30.8b
++    /* rounded division by 255: (x + ((x + 128) >> 8) + 128) >> 8 */
++    urshr       v13.8h, v10.8h, #8
++    urshr       v14.8h, v11.8h, #8
++    urshr       v15.8h, v12.8h, #8
++    raddhn      v20.8b, v10.8h, v13.8h
++    raddhn      v23.8b, v11.8h, v14.8h
++    raddhn      v22.8b, v12.8h, v15.8h
++.endm
++
++.macro pixman_composite_over_n_0565_process_pixblock_tail
++    /*
++     * Second half of OVER with a solid source: add the colour planes held
++     * in v0-v2 to the scaled destination (v20, v22, v23) with saturation
++     * and repack the sums as 8 r5g6b5 pixels in v14 / {v28, v29}.
++     */
++    /* ... continue alpha blending */
++    uqadd       v17.8b, v2.8b, v20.8b
++    uqadd       v18.8b, v0.8b, v22.8b
++    uqadd       v19.8b, v1.8b, v23.8b
++    /* convert the result to r5g6b5 and store it into {v14} */
++    /* 'ushll #7' + 'sli #1' leaves each channel widened and shifted by 8 */
++    ushll       v14.8h, v17.8b, #7
++    sli         v14.8h, v14.8h, #1
++    ushll       v8.8h, v19.8b, #7
++    sli         v8.8h, v8.8h, #1
++    ushll       v9.8h, v18.8b, #7
++    sli         v9.8h, v9.8h, #1
++    sri         v14.8h, v8.8h, #5
++    sri         v14.8h, v9.8h, #11
++    /* split the packed result across the low halves of v28 and v29 */
++    mov         v28.d[0], v14.d[0]
++    mov         v29.d[0], v14.d[1]
++.endm
++
++/* TODO: expand macros and do better instructions scheduling */
++.macro pixman_composite_over_n_0565_process_pixblock_tail_head
++    /*
++     * Unscheduled loop body: finish the previous block, load the next 8
++     * destination pixels, store the finished 8 pixels, then start the next
++     * block. The store of v14 must stay before '*_head', which reuses v14
++     * as a temporary.
++     */
++    pixman_composite_over_n_0565_process_pixblock_tail
++    ld1         {v4.4h, v5.4h}, [DST_R], #16
++    st1         {v14.8h}, [DST_W], #16
++    pixman_composite_over_n_0565_process_pixblock_head
++    cache_preload 8, 8
++.endm
++
++.macro pixman_composite_over_n_0565_init
++    /*
++     * One-time setup: take the 32-bit value in w4 (presumably the solid
++     * source colour argument - confirm against the function template) and
++     * broadcast its four bytes into v0, v1, v2 and v3, one byte per plane.
++     */
++    mov         v3.s[0], w4
++    dup         v0.8b, v3.b[0]
++    dup         v1.8b, v3.b[1]
++    dup         v2.8b, v3.b[2]
++    /* v3 is overwritten last because it is the source of the dups above */
++    dup         v3.8b, v3.b[3]
++    mvn         v3.8b, v3.8b      /* invert source alpha */
++.endm
++
++/* Instantiate OVER solid -> r5g6b5: source and mask bpp are both 0 (unused),
++   16bpp destination; the custom init macro prepares v0-v3 from w4. */
++generate_composite_function \
++    pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
++    FLAG_DST_READWRITE, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    pixman_composite_over_n_0565_init, \
++    default_cleanup, \
++    pixman_composite_over_n_0565_process_pixblock_head, \
++    pixman_composite_over_n_0565_process_pixblock_tail, \
++    pixman_composite_over_n_0565_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    4,  /* dst_r_basereg */ \
++    0,  /* src_basereg   */ \
++    24  /* mask_basereg  */
++
++/******************************************************************************/
++
++.macro pixman_composite_src_8888_0565_process_pixblock_head
++    /*
++     * Widen three 8-bit source planes to 16 bits, each shifted left by 8
++     * in total ('ushll #7' followed by 'sli #1'): v1 -> v8, v2 -> v14,
++     * v0 -> v9. The '*_tail' macro merges them into r5g6b5 with v14 in
++     * the top field, v8 in the middle and v9 in the low bits.
++     */
++    ushll       v8.8h,  v1.8b,  #7
++    sli         v8.8h,  v8.8h,  #1
++    ushll       v14.8h, v2.8b,  #7
++    sli         v14.8h, v14.8h, #1
++    ushll       v9.8h,  v0.8b,  #7
++    sli         v9.8h,  v9.8h,  #1
++.endm
++
++.macro pixman_composite_src_8888_0565_process_pixblock_tail
++    /* Merge the widened planes from '*_head' into 8 packed 16-bit pixels:
++       v8 is inserted below the top 5 bits of v14, v9 into the low 5 bits;
++       the result is then split across the low halves of v28 and v29. */
++    sri         v14.8h, v8.8h, #5
++    sri         v14.8h, v9.8h, #11
++    mov         v28.d[0], v14.d[0]
++    mov         v29.d[0], v14.d[1]
++.endm
++
++.macro pixman_composite_src_8888_0565_process_pixblock_tail_head
++    /*
++     * Pipelined loop body: packs and stores the previous block (deeper
++     * indented 'sri'/'mov'/'st1' lines), fetches the next source block and
++     * widens its planes (shallow 'ushll'/'sli' lines), with the source
++     * prefetch logic interleaved on the far-right 'PF' lines. Only the
++     * source is prefetched here; there are no PF_DST accesses.
++     */
++        sri         v14.8h, v8.8h, #5
++                                    PF add PF_X, PF_X, #8
++                                    PF tst PF_CTL, #0xF
++    fetch_src_pixblock
++                                    PF beq 10f
++                                    PF add PF_X, PF_X, #8
++                                    PF sub PF_CTL, PF_CTL, #1
++10:
++        sri         v14.8h, v9.8h, #11
++        mov         v28.d[0], v14.d[0]
++        mov         v29.d[0], v14.d[1]
++                                    PF cmp PF_X, ORIG_W
++                                    PF lsl DUMMY, PF_X, #src_bpp_shift
++                                    PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
++    ushll       v8.8h, v1.8b, #7
++    sli         v8.8h, v8.8h, #1
++    /* v14 must be stored before it is overwritten for the next block */
++        st1        {v14.8h}, [DST_W], #16
++                                    PF ble 10f
++                                    PF sub PF_X, PF_X, ORIG_W
++                                    PF subs PF_CTL, PF_CTL, #0x10
++10:
++    ushll       v14.8h, v2.8b, #7
++    sli         v14.8h, v14.8h, #1
++                                    PF ble 10f
++                                    PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
++                                    PF ldrsb DUMMY, [PF_SRC, DUMMY]
++                                    PF add PF_SRC, PF_SRC, #1
++10:
++    ushll       v9.8h, v0.8b, #7
++    sli         v9.8h, v9.8h, #1
++.endm
++
++/* Instantiate SRC a8r8g8b8 -> r5g6b5: 32bpp source, no mask, 16bpp
++   write-only destination. The trailing base-register arguments are
++   omitted, so the template's defaults apply. */
++generate_composite_function \
++    pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
++    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    10, /* prefetch distance */ \
++    default_init, \
++    default_cleanup, \
++    pixman_composite_src_8888_0565_process_pixblock_head, \
++    pixman_composite_src_8888_0565_process_pixblock_tail, \
++    pixman_composite_src_8888_0565_process_pixblock_tail_head
++
++/******************************************************************************/
++
++.macro pixman_composite_src_0565_8888_process_pixblock_head
++    /*
++     * Expand 8 r5g6b5 pixels held in {v0, v1} into four 8-bit planes:
++     * v30 from the top 5-bit field, v29 from the middle 6-bit field,
++     * v28 from the low 5-bit field, and v31 set to 255 in every lane.
++     * The low bits of each plane are filled with a copy of its top bits.
++     */
++    mov         v0.d[1], v1.d[0]
++    shrn        v30.8b, v0.8h, #8
++    shrn        v29.8b, v0.8h, #3
++    sli         v0.8h,  v0.8h, #5
++    movi        v31.8b, #255
++    sri         v30.8b, v30.8b, #5
++    sri         v29.8b, v29.8b, #6
++    shrn        v28.8b, v0.8h, #2
++.endm
++
++.macro pixman_composite_src_0565_8888_process_pixblock_tail
++    /* nothing to do: the head macro already leaves the finished planes
++       in v28-v31 */
++.endm
++
++/* TODO: expand macros and do better instructions scheduling */
++.macro pixman_composite_src_0565_8888_process_pixblock_tail_head
++    /*
++     * Unscheduled loop body: store the previous block's four planes
++     * interleaved (st4, 32 bytes for 8 pixels), fetch the next source
++     * block, convert it and run the generic prefetch macro.
++     */
++    pixman_composite_src_0565_8888_process_pixblock_tail
++    st4         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
++    fetch_src_pixblock
++    pixman_composite_src_0565_8888_process_pixblock_head
++    cache_preload 8, 8
++.endm
++
++/* Instantiate SRC r5g6b5 -> 32bpp: 16bpp source, no mask, 32bpp write-only
++   destination handled in planar (deinterleaved) form. */
++generate_composite_function \
++    pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \
++    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    10, /* prefetch distance */ \
++    default_init, \
++    default_cleanup, \
++    pixman_composite_src_0565_8888_process_pixblock_head, \
++    pixman_composite_src_0565_8888_process_pixblock_tail, \
++    pixman_composite_src_0565_8888_process_pixblock_tail_head
++
++/******************************************************************************/
++
++.macro pixman_composite_add_8_8_process_pixblock_head
++    /* ADD operator on 32 bytes: per-byte unsigned saturating add of
++       v0-v3 and v4-v7, written to v28-v31 */
++    uqadd       v28.8b, v0.8b, v4.8b
++    uqadd       v29.8b, v1.8b, v5.8b
++    uqadd       v30.8b, v2.8b, v6.8b
++    uqadd       v31.8b, v3.8b, v7.8b
++.endm
++
++.macro pixman_composite_add_8_8_process_pixblock_tail
++    /* nothing to do: the head macro already produces the final result
++       in v28-v31 */
++.endm
++
++.macro pixman_composite_add_8_8_process_pixblock_tail_head
++    /*
++     * Pipelined loop body for 8bpp ADD (32 pixels per block): fetch the
++     * next source block and the next 32 destination bytes, store the
++     * previous result from v28-v31, then compute the next saturating sums.
++     * Far-right 'PF' lines are the interleaved prefetch logic; PF_X is
++     * advanced by 32 pixels per step here.
++     */
++    fetch_src_pixblock
++                                    PF add PF_X, PF_X, #32
++                                    PF tst PF_CTL, #0xF
++    ld1         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
++                                    PF beq 10f
++                                    PF add PF_X, PF_X, #32
++                                    PF sub PF_CTL, PF_CTL, #1
++10:
++    /* the previous result must be stored before v28 is recomputed below */
++        st1     {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
++                                    PF cmp PF_X, ORIG_W
++                                    PF lsl DUMMY, PF_X, #src_bpp_shift
++                                    PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
++                                    PF lsl DUMMY, PF_X, #dst_bpp_shift
++                                    PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
++                                    PF ble 10f
++                                    PF sub PF_X, PF_X, ORIG_W
++                                    PF subs PF_CTL, PF_CTL, #0x10
++10:
++    uqadd       v28.8b, v0.8b, v4.8b
++                                    PF ble 10f
++                                    PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
++                                    PF ldrsb DUMMY, [PF_SRC, DUMMY]
++                                    PF add PF_SRC, PF_SRC, #1
++                                    PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
++                                    PF ldrsb DUMMY, [PF_DST, DUMMY]
++                                    PF add PF_DST, PF_DST, #1
++10:
++    uqadd       v29.8b, v1.8b, v5.8b
++    uqadd       v30.8b, v2.8b, v6.8b
++    uqadd       v31.8b, v3.8b, v7.8b
++.endm
++
++/* Instantiate ADD for 8bpp source and destination, no mask, 32 pixels
++   per block. */
++generate_composite_function \
++    pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
++    FLAG_DST_READWRITE, \
++    32, /* number of pixels, processed in a single block */ \
++    10, /* prefetch distance */ \
++    default_init, \
++    default_cleanup, \
++    pixman_composite_add_8_8_process_pixblock_head, \
++    pixman_composite_add_8_8_process_pixblock_tail, \
++    pixman_composite_add_8_8_process_pixblock_tail_head
++
++/******************************************************************************/
++
++.macro pixman_composite_add_8888_8888_process_pixblock_tail_head
++    /*
++     * Pipelined loop body for 32bpp ADD (8 pixels = 32 bytes per block).
++     * The NEON stream is the same as the 8bpp ADD loop body; only the
++     * prefetch stream differs, advancing PF_X by 8 pixels per step.
++     */
++    fetch_src_pixblock
++                                    PF add PF_X, PF_X, #8
++                                    PF tst PF_CTL, #0xF
++    ld1         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
++                                    PF beq 10f
++                                    PF add PF_X, PF_X, #8
++                                    PF sub PF_CTL, PF_CTL, #1
++10:
++    /* the previous result must be stored before v28 is recomputed below */
++        st1     {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
++                                    PF cmp PF_X, ORIG_W
++                                    PF lsl DUMMY, PF_X, #src_bpp_shift
++                                    PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
++                                    PF lsl DUMMY, PF_X, #dst_bpp_shift
++                                    PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
++                                    PF ble 10f
++                                    PF sub PF_X, PF_X, ORIG_W
++                                    PF subs PF_CTL, PF_CTL, #0x10
++10:
++    uqadd       v28.8b, v0.8b, v4.8b
++                                    PF ble 10f
++                                    PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
++                                    PF ldrsb DUMMY, [PF_SRC, DUMMY]
++                                    PF add PF_SRC, PF_SRC, #1
++                                    PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
++                                    PF ldrsb DUMMY, [PF_DST, DUMMY]
++                                    PF add PF_DST, PF_DST, #1
++10:
++    uqadd       v29.8b, v1.8b, v5.8b
++    uqadd       v30.8b, v2.8b, v6.8b
++    uqadd       v31.8b, v3.8b, v7.8b
++.endm
++
++/* Instantiate ADD for 32bpp source and destination. The head/tail macros
++   are shared with the 8bpp variant because a per-byte saturating add is
++   the same operation on both formats; only the loop body differs. */
++generate_composite_function \
++    pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
++    FLAG_DST_READWRITE, \
++    8, /* number of pixels, processed in a single block */ \
++    10, /* prefetch distance */ \
++    default_init, \
++    default_cleanup, \
++    pixman_composite_add_8_8_process_pixblock_head, \
++    pixman_composite_add_8_8_process_pixblock_tail, \
++    pixman_composite_add_8888_8888_process_pixblock_tail_head
++
++/* Single-scanline variant of the same 32bpp ADD; this generator takes no
++   prefetch-distance argument. */
++generate_composite_function_single_scanline \
++    pixman_composite_scanline_add_asm_neon, 32, 0, 32, \
++    FLAG_DST_READWRITE, \
++    8, /* number of pixels, processed in a single block */ \
++    default_init, \
++    default_cleanup, \
++    pixman_composite_add_8_8_process_pixblock_head, \
++    pixman_composite_add_8_8_process_pixblock_tail, \
++    pixman_composite_add_8888_8888_process_pixblock_tail_head
++
++/******************************************************************************/
++
++.macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
++    /* OUT_REVERSE, first half: multiply the four destination planes
++       v4-v7 by the inverted source alpha (~v3, kept in v24), leaving
++       16-bit products in v8-v11 for the tail macro to normalise */
++    mvn         v24.8b, v3.8b  /* get inverted alpha */
++    /* do alpha blending */
++    umull       v8.8h, v24.8b, v4.8b
++    umull       v9.8h, v24.8b, v5.8b
++    umull       v10.8h, v24.8b, v6.8b
++    umull       v11.8h, v24.8b, v7.8b
++.endm
++
++.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
++    /* OUT_REVERSE, second half: reduce the 16-bit products v8-v11 to
++       8 bits in v28-v31 with a rounded division by 255, computed as
++       (x + ((x + 128) >> 8) + 128) >> 8; v14-v17 are temporaries */
++    urshr       v14.8h, v8.8h, #8
++    urshr       v15.8h, v9.8h, #8
++    urshr       v16.8h, v10.8h, #8
++    urshr       v17.8h, v11.8h, #8
++    raddhn      v28.8b, v14.8h, v8.8h
++    raddhn      v29.8b, v15.8h, v9.8h
++    raddhn      v30.8b, v16.8h, v10.8h
++    raddhn      v31.8b, v17.8h, v11.8h
++.endm
++
++.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
++    /*
++     * Pipelined loop body for OUT_REVERSE: normalises and stores the
++     * previous block (deeper-indented urshr/raddhn/st4 lines), loads the
++     * next destination and source blocks and starts their multiplies
++     * (shallow lines), with prefetch logic on the far-right 'PF' lines.
++     * The inverted alpha for the next block is kept in v22 here, whereas
++     * the stand-alone head macro uses v24.
++     */
++     ld4        {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
++        urshr       v14.8h, v8.8h, #8
++                                    PF add PF_X, PF_X, #8
++                                    PF tst PF_CTL, #0xF
++        urshr       v15.8h, v9.8h, #8
++        urshr       v16.8h, v10.8h, #8
++        urshr       v17.8h, v11.8h, #8
++                                    PF beq 10f
++                                    PF add PF_X, PF_X, #8
++                                    PF sub PF_CTL, PF_CTL, #1
++10:
++        raddhn      v28.8b, v14.8h, v8.8h
++        raddhn      v29.8b, v15.8h, v9.8h
++                                    PF cmp PF_X, ORIG_W
++        raddhn      v30.8b, v16.8h, v10.8h
++        raddhn      v31.8b, v17.8h, v11.8h
++    fetch_src_pixblock
++                                    PF lsl DUMMY, PF_X, #src_bpp_shift
++                                    PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
++    /* inverted alpha of the freshly fetched source block */
++    mvn         v22.8b, v3.8b
++                                    PF lsl DUMMY, PF_X, #dst_bpp_shift
++                                    PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
++         st4        {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
++                                    PF ble 10f
++                                    PF sub PF_X, PF_X, ORIG_W
++10:
++    umull      v8.8h, v22.8b, v4.8b
++                                    PF ble 10f
++                                    PF subs PF_CTL, PF_CTL, #0x10
++10:
++    umull      v9.8h, v22.8b, v5.8b
++                                    PF ble 10f
++                                    PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
++                                    PF ldrsb DUMMY, [PF_SRC, DUMMY]
++                                    PF add PF_SRC, PF_SRC, #1
++10:
++    umull      v10.8h, v22.8b, v6.8b
++                                    PF ble 10f
++                                    PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
++                                    PF ldrsb DUMMY, [PF_DST, DUMMY]
++                                    PF add PF_DST, PF_DST, #1
++10:
++     umull     v11.8h, v22.8b, v7.8b
++.endm
++
++/* Instantiate the single-scanline OUT_REVERSE function for 32bpp source
++   and destination in planar (deinterleaved) form; this generator takes
++   no prefetch-distance argument. */
++generate_composite_function_single_scanline \
++    pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    default_init, \
++    default_cleanup, \
++    pixman_composite_out_reverse_8888_8888_process_pixblock_head, \
++    pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \
++    pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
++
++/******************************************************************************/
++
++.macro pixman_composite_over_8888_8888_process_pixblock_head
++    /* OVER starts exactly like OUT_REVERSE: destination planes times the
++       inverted source alpha, so the head is reused unchanged */
++    pixman_composite_out_reverse_8888_8888_process_pixblock_head
++.endm
++
++.macro pixman_composite_over_8888_8888_process_pixblock_tail
++    /* OVER = OUT_REVERSE result plus the source: normalise the products
++       into v28-v31, then add the source planes v0-v3 with unsigned
++       saturation */
++    pixman_composite_out_reverse_8888_8888_process_pixblock_tail
++    uqadd       v28.8b, v0.8b, v28.8b
++    uqadd       v29.8b, v1.8b, v29.8b
++    uqadd       v30.8b, v2.8b, v30.8b
++    uqadd       v31.8b, v3.8b, v31.8b
++.endm
++
++.macro pixman_composite_over_8888_8888_process_pixblock_tail_head
++    /*
++     * Pipelined loop body for OVER 8888 -> 8888: same structure as the
++     * OUT_REVERSE loop body with four extra 'uqadd' instructions that add
++     * the previous source block to the normalised destination. Those adds
++     * must stay before 'fetch_src_pixblock', which replaces v0-v3.
++     * Far-right 'PF' lines are the interleaved prefetch logic.
++     */
++     ld4        {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
++        urshr       v14.8h, v8.8h, #8
++                                    PF add PF_X, PF_X, #8
++                                    PF tst PF_CTL, #0xF
++        urshr       v15.8h, v9.8h, #8
++        urshr       v16.8h, v10.8h, #8
++        urshr       v17.8h, v11.8h, #8
++                                    PF beq 10f
++                                    PF add PF_X, PF_X, #8
++                                    PF sub PF_CTL, PF_CTL, #1
++10:
++        raddhn      v28.8b, v14.8h, v8.8h
++        raddhn      v29.8b, v15.8h, v9.8h
++                                    PF cmp PF_X, ORIG_W
++        raddhn      v30.8b, v16.8h, v10.8h
++        raddhn      v31.8b, v17.8h, v11.8h
++        uqadd       v28.8b, v0.8b, v28.8b
++        uqadd       v29.8b, v1.8b, v29.8b
++        uqadd       v30.8b, v2.8b, v30.8b
++        uqadd       v31.8b, v3.8b, v31.8b
++    fetch_src_pixblock
++                                    PF lsl DUMMY, PF_X, #src_bpp_shift
++                                    PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
++    /* inverted alpha of the freshly fetched source block */
++    mvn        v22.8b, v3.8b
++                                    PF lsl DUMMY, PF_X, #dst_bpp_shift
++                                    PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
++         st4        {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
++                                    PF ble 10f
++                                    PF sub PF_X, PF_X, ORIG_W
++10:
++    umull      v8.8h, v22.8b, v4.8b
++                                    PF ble 10f
++                                    PF subs PF_CTL, PF_CTL, #0x10
++10:
++    umull      v9.8h, v22.8b, v5.8b
++                                    PF ble 10f
++                                    PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
++                                    PF ldrsb DUMMY, [PF_SRC, DUMMY]
++                                    PF add PF_SRC, PF_SRC, #1
++10:
++    umull      v10.8h, v22.8b, v6.8b
++                                    PF ble 10f
++                                    PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
++                                    PF ldrsb DUMMY, [PF_DST, DUMMY]
++                                    PF add PF_DST, PF_DST, #1
++10:
++    umull      v11.8h, v22.8b, v7.8b
++.endm
++
++/* Instantiate OVER for 32bpp source and destination, no mask, processed
++   in planar (deinterleaved) form. */
++generate_composite_function \
++    pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    default_init, \
++    default_cleanup, \
++    pixman_composite_over_8888_8888_process_pixblock_head, \
++    pixman_composite_over_8888_8888_process_pixblock_tail, \
++    pixman_composite_over_8888_8888_process_pixblock_tail_head
++
++/* Single-scanline variant built from the same three macros; this generator
++   takes no prefetch-distance argument. */
++generate_composite_function_single_scanline \
++    pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    default_init, \
++    default_cleanup, \
++    pixman_composite_over_8888_8888_process_pixblock_head, \
++    pixman_composite_over_8888_8888_process_pixblock_tail, \
++    pixman_composite_over_8888_8888_process_pixblock_tail_head
++
++/******************************************************************************/
++
++.macro pixman_composite_over_n_8888_process_pixblock_head
++    /* OVER with a solid source, first half: multiply the destination
++       planes by the precomputed inverted alpha in v24 (no per-block 'mvn'
++       is needed because the source never changes), products in v8-v11 */
++    /* deinterleaved source pixels in {v0, v1, v2, v3} */
++    /* inverted alpha in {v24} */
++    /* destination pixels in {v4, v5, v6, v7} */
++    umull       v8.8h, v24.8b, v4.8b
++    umull       v9.8h, v24.8b, v5.8b
++    umull       v10.8h, v24.8b, v6.8b
++    umull       v11.8h, v24.8b, v7.8b
++.endm
++
++.macro pixman_composite_over_n_8888_process_pixblock_tail
++    /* OVER with a solid source, second half: rounded division of the
++       products v8-v11 by 255 ((x + ((x + 128) >> 8) + 128) >> 8) into
++       v28-v31, followed by a saturating add of the source planes v0-v3 */
++    urshr       v14.8h, v8.8h, #8
++    urshr       v15.8h, v9.8h, #8
++    urshr       v16.8h, v10.8h, #8
++    urshr       v17.8h, v11.8h, #8
++    raddhn      v28.8b, v14.8h, v8.8h
++    raddhn      v29.8b, v15.8h, v9.8h
++    raddhn      v30.8b, v16.8h, v10.8h
++    raddhn      v31.8b, v17.8h, v11.8h
++    uqadd       v28.8b, v0.8b, v28.8b
++    uqadd       v29.8b, v1.8b, v29.8b
++    uqadd       v30.8b, v2.8b, v30.8b
++    uqadd       v31.8b, v3.8b, v31.8b
++.endm
++
++.macro pixman_composite_over_n_8888_process_pixblock_tail_head
++    /*
++     * Pipelined loop body for OVER solid -> 8888: finishes the previous
++     * block (deeper-indented urshr/raddhn/uqadd/st4 lines), loads the next
++     * 8 destination pixels and starts their multiplies by the constant
++     * inverted alpha in v24 (shallow lines). There is no source image, so
++     * the far-right 'PF' prefetch stream touches the destination only.
++     *
++     * Prefetch stream: PF_X is advanced by one block (twice while the low
++     * 4 bits of PF_CTL are non-zero), the destination is prefetched at
++     * PF_X, and only when PF_X exceeds ORIG_W is it wrapped and 0x10
++     * subtracted from PF_CTL, as in the other loop bodies of this file.
++     */
++        urshr       v14.8h, v8.8h, #8
++        urshr       v15.8h, v9.8h, #8
++        urshr       v16.8h, v10.8h, #8
++        urshr       v17.8h, v11.8h, #8
++        raddhn      v28.8b, v14.8h, v8.8h
++        raddhn      v29.8b, v15.8h, v9.8h
++        raddhn      v30.8b, v16.8h, v10.8h
++        raddhn      v31.8b, v17.8h, v11.8h
++    ld4         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
++        uqadd       v28.8b, v0.8b, v28.8b
++                                    PF add PF_X, PF_X, #8
++                                    PF tst PF_CTL, #0x0F
++                                    PF beq 10f
++                                    PF add PF_X, PF_X, #8
++                                    PF sub PF_CTL, PF_CTL, #1
++10:
++        uqadd       v29.8b, v1.8b, v29.8b
++        uqadd       v30.8b, v2.8b, v30.8b
++        uqadd       v31.8b, v3.8b, v31.8b
++                                    PF cmp PF_X, ORIG_W
++    umull       v8.8h, v24.8b, v4.8b
++                                    PF lsl DUMMY, PF_X, #dst_bpp_shift
++                                    PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
++    umull       v9.8h, v24.8b, v5.8b
++                                    PF ble 10f
++                                    PF sub PF_X, PF_X, ORIG_W
++10:
++    umull       v10.8h, v24.8b, v6.8b
++                                    /* guard the PF_CTL update with the
++                                       result of 'cmp PF_X, ORIG_W' so it
++                                       only happens on a scanline wrap, not
++                                       on every block */
++                                    PF ble 10f
++                                    PF subs PF_CTL, PF_CTL, #0x10
++10:
++    umull       v11.8h, v24.8b, v7.8b
++                                    PF ble 10f
++                                    PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
++                                    PF ldrsb DUMMY, [PF_DST, DUMMY]
++                                    PF add PF_DST, PF_DST, #1
++10:
++        st4         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
++.endm
++
++.macro pixman_composite_over_n_8888_init  /* splat the four bytes of w4 into v0-v3, then v24 = ~v3 */
++    ins         v3.s[0], w4  /* stage the 32-bit value in lane 0 of v3 */
++    dup         v2.8b, v3.b[2]
++    dup         v1.8b, v3.b[1]
++    dup         v0.8b, v3.b[0]
++    dup         v3.8b, v3.b[3]  /* must come last: it overwrites the staged bytes */
++    mvn         v24.8b, v3.8b  /* inverted alpha */
++.endm
++
++generate_composite_function \
++    pixman_composite_over_n_8888_asm_neon, 0, 0, 32, /* presumably src, mask, dst bits per pixel (0 = solid or absent) -- confirm in generate_composite_function */ \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    pixman_composite_over_n_8888_init, \
++    default_cleanup, \
++    pixman_composite_over_8888_8888_process_pixblock_head, /* head and tail are the over_8888_8888 macros */ \
++    pixman_composite_over_8888_8888_process_pixblock_tail, \
++    pixman_composite_over_n_8888_process_pixblock_tail_head
++
++/******************************************************************************/
++
++.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head  /* finishes one 8-pixel block while starting the next; PF lines are prefetch bookkeeping */
++        urshr       v14.8h, v8.8h, #8  /* t = round(p >> 8) for the 16-bit products p in v8-v11 */
++                                    PF add PF_X, PF_X, #8
++                                    PF tst PF_CTL, #0xF
++        urshr       v15.8h, v9.8h, #8
++        urshr       v12.8h, v10.8h, #8
++        urshr       v13.8h, v11.8h, #8
++                                    PF beq 10f
++                                    PF add PF_X, PF_X, #8
++                                    PF sub PF_CTL, PF_CTL, #1
++10:
++        raddhn      v28.8b, v14.8h, v8.8h  /* v28-v31 = rounded high byte of (t + p) */
++        raddhn      v29.8b, v15.8h, v9.8h
++                                    PF cmp PF_X, ORIG_W
++        raddhn      v30.8b, v12.8h, v10.8h
++        raddhn      v31.8b, v13.8h, v11.8h
++        uqadd       v28.8b, v0.8b, v28.8b  /* saturating add of v0-v3, the dst pixels loaded for this block */
++        uqadd       v29.8b, v1.8b, v29.8b
++        uqadd       v30.8b, v2.8b, v30.8b
++        uqadd       v31.8b, v3.8b, v31.8b
++    ld4         {v0.8b, v1.8b, v2.8b, v3.8b}, [DST_R], #32  /* next 32 dst bytes, de-interleaved into v0-v3 */
++    mvn         v22.8b, v3.8b  /* v22 = inverted fourth channel of the new dst pixels */
++                                    PF lsl DUMMY, PF_X, #dst_bpp_shift
++                                    PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
++        st4         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32  /* store the finished block, re-interleaved */
++                                    PF blt 10f
++                                    PF sub PF_X, PF_X, ORIG_W
++10:
++    umull       v8.8h, v22.8b, v4.8b  /* new products: v22 * v4-v7 (filled by the init macro) */
++                                    PF blt 10f
++                                    PF subs PF_CTL, PF_CTL, #0x10
++10:
++    umull       v9.8h, v22.8b, v5.8b
++    umull       v10.8h, v22.8b, v6.8b
++                                    PF blt 10f
++                                    PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
++                                    PF ldrsb DUMMY, [PF_DST, DUMMY]
++                                    PF add PF_DST, PF_DST, #1
++10:
++    umull       v11.8h, v22.8b, v7.8b
++.endm
++
++.macro pixman_composite_over_reverse_n_8888_init  /* splat the four bytes of w4 into v4-v7 */
++    ins         v7.s[0], w4  /* stage the 32-bit value in lane 0 of v7 */
++    dup         v6.8b, v7.b[2]
++    dup         v5.8b, v7.b[1]
++    dup         v4.8b, v7.b[0]
++    dup         v7.8b, v7.b[3]  /* must come last: it overwrites the staged bytes */
++.endm
++
++generate_composite_function \
++    pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, /* presumably src, mask, dst bits per pixel (0 = solid or absent) -- confirm in generate_composite_function */ \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    pixman_composite_over_reverse_n_8888_init, \
++    default_cleanup, \
++    pixman_composite_over_8888_8888_process_pixblock_head, /* head and tail are the over_8888_8888 macros */ \
++    pixman_composite_over_8888_8888_process_pixblock_tail, \
++    pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    0,  /* dst_r_basereg */ \
++    4,  /* src_basereg   */ \
++    24  /* mask_basereg  */
++
++/******************************************************************************/
++
++.macro pixman_composite_over_8888_8_0565_process_pixblock_head  /* src IN mask, widen the 16-bit dst to 8-bit channels, start the blend multiplies */
++    umull       v0.8h,  v24.8b, v8.8b    /* IN for SRC pixels (part1) */
++    umull       v1.8h,  v24.8b, v9.8b
++    umull       v2.8h,  v24.8b, v10.8b
++    umull       v3.8h,  v24.8b, v11.8b
++        mov         v4.d[1], v5.d[0]  /* v4.8h = low half of v4 followed by low half of v5 */
++        shrn        v25.8b,  v4.8h, #8 /* convert DST_R data to 32-bpp (part1) */
++        shrn        v26.8b,  v4.8h, #3
++        sli         v4.8h,   v4.8h, #5
++    urshr       v17.8h, v0.8h,  #8    /* IN for SRC pixels (part2) */
++    urshr       v18.8h, v1.8h,  #8
++    urshr       v19.8h, v2.8h,  #8
++    urshr       v20.8h, v3.8h,  #8
++    raddhn      v0.8b,  v0.8h,  v17.8h
++    raddhn      v1.8b,  v1.8h,  v18.8h
++    raddhn      v2.8b,  v2.8h,  v19.8h
++    raddhn      v3.8b,  v3.8h,  v20.8h
++        sri         v25.8b, v25.8b, #5 /* convert DST_R data to 32-bpp (part2) */
++        sri         v26.8b, v26.8b, #6
++    mvn         v3.8b,  v3.8b  /* invert v3 (the masked v11 channel) for the dst multiplies */
++        shrn        v30.8b, v4.8h,  #2
++    umull       v18.8h, v3.8b, v25.8b     /* now do alpha blending */
++    umull       v19.8h, v3.8b, v26.8b
++    umull       v20.8h, v3.8b, v30.8b
++.endm
++
++.macro pixman_composite_over_8888_8_0565_process_pixblock_tail  /* finishes the blend and repacks 8 pixels to 16 bpp in v28/v29 */
++    /* 3 cycle bubble (after umull; the note originally named vmull.u8) */
++    urshr       v5.8h, v18.8h, #8
++    urshr       v6.8h, v19.8h, #8
++    urshr       v7.8h, v20.8h, #8
++    raddhn      v17.8b, v18.8h, v5.8h  /* rounded high byte of (p + round(p >> 8)) for each product */
++    raddhn      v19.8b, v19.8h, v6.8h
++    raddhn      v18.8b, v20.8h, v7.8h
++    uqadd       v5.8b, v2.8b,  v17.8b  /* saturating add of the masked source channels v2/v0/v1 */
++    /* 1 cycle bubble */
++    uqadd       v6.8b, v0.8b,  v18.8b
++    uqadd       v7.8b, v1.8b,  v19.8b
++    ushll       v14.8h, v5.8b, #7    /* convert to 16bpp */
++    sli         v14.8h, v14.8h, #1  /* ushll #7 then sli #1 leaves the byte in bits 15..8 */
++    ushll       v18.8h, v7.8b, #7
++    sli         v18.8h, v18.8h, #1
++    ushll       v19.8h, v6.8b, #7
++    sli         v19.8h, v19.8h, #1
++    sri         v14.8h, v18.8h, #5  /* insert the second channel below the top 5 bits */
++    /* 1 cycle bubble */
++    sri         v14.8h, v19.8h, #11  /* insert the third channel into the low 5 bits */
++    mov         v28.d[0], v14.d[0]  /* split the 8 packed pixels across the low halves of v28 and v29 */
++    mov         v29.d[0], v14.d[1]
++.endm
++
++.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head  /* only the #else branch below is assembled */
++#if 0  /* disabled hand-interleaved variant */
++    ld1         {v4.8h}, [DST_R], #16
++    shrn        v25.8b,  v4.8h,  #8
++    fetch_mask_pixblock
++    shrn        v26.8b,  v4.8h,  #3
++    fetch_src_pixblock
++    umull       v22.8h,  v24.8b, v10.8b
++        urshr       v13.8h, v18.8h, #8
++        urshr       v11.8h, v19.8h, #8
++        urshr       v15.8h, v20.8h, #8
++        raddhn      v17.8b, v18.8h, v13.8h
++        raddhn      v19.8b, v19.8h, v11.8h
++        raddhn      v18.8b, v20.8h, v15.8h
++        uqadd       v17.8b, v2.8b, v17.8b
++    umull       v21.8h,  v24.8b, v9.8b
++        uqadd       v18.8b, v0.8b, v18.8b
++        uqadd       v19.8b, v1.8b, v19.8b
++        ushll       v14.8h, v17.8b, #7
++        sli         v14.8h, v14.8h, #1
++    umull       v20.8h,  v24.8b, v8.8b
++        ushll       v18.8h,  v18.8b, #7
++        sli         v18.8h,  v18.8h, #1
++        ushll       v19.8h,  v19.8b, #7
++        sli         v19.8h,  v19.8h, #1
++        sri         v14.8h,  v18.8h, #5
++    umull       v23.8h,  v24.8b, v11.8b
++        sri         v14.8h,  v19.8h, #11
++        mov         v28.d[0], v14.d[0]
++        mov         v29.d[0], v14.d[1]
++
++    cache_preload 8, 8
++    
++    sli         v4.8h,  v4.8h,   #5
++    urshr       v16.8h, v20.8h,  #8
++    urshr       v17.8h, v21.8h,  #8
++    urshr       v18.8h, v22.8h,  #8
++    urshr       v19.8h, v23.8h,  #8
++    raddhn      v0.8b,  v20.8h, v16.8h
++    raddhn      v1.8b,  v21.8h, v17.8h
++    raddhn      v2.8b,  v22.8h, v18.8h
++    raddhn      v3.8b,  v23.8h, v19.8h
++    sri         v25.8b,  v25.8b,  #5
++    sri         v26.8b,  v26.8b,  #6
++    mvn         v3.8b,  v3.8b
++    shrn        v30.8b, v4.8h,  #2
++    st1         {v14.8h}, [DST_W], #16
++    umull       v18.8h, v3.8b, v25.8b
++    umull       v19.8h, v3.8b, v26.8b
++    umull       v20.8h, v3.8b, v30.8b
++#else  /* active variant: tail, store, load, fetch mask and src, head -- no interleaving */
++    pixman_composite_over_8888_8_0565_process_pixblock_tail
++    st1         {v28.4h, v29.4h}, [DST_W], #16  /* write the 8 finished 16-bit pixels */
++    ld1         {v4.4h, v5.4h}, [DST_R], #16  /* read the next 8 dst pixels (the head macro merges v5 into v4) */
++    fetch_mask_pixblock
++    fetch_src_pixblock
++    pixman_composite_over_8888_8_0565_process_pixblock_head
++#endif
++.endm
++
++generate_composite_function \
++    pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, /* presumably src, mask, dst bits per pixel -- confirm in generate_composite_function */ \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    default_init_need_all_regs, \
++    default_cleanup_need_all_regs, \
++    pixman_composite_over_8888_8_0565_process_pixblock_head, \
++    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
++    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    4,  /* dst_r_basereg */ \
++    8,  /* src_basereg   */ \
++    24  /* mask_basereg  */
++
++/******************************************************************************/
++
++/*
++ * This function needs a special initialization of the solid source.
++ * The solid source pixel is taken from the w4 register, split into
++ * its four 8-bit components and replicated across v8-v11 by the
++ * 'init' macro below. The 'cleanup' macro is empty: nothing is saved
++ * or restored here.
++ * NOTE(review): v8-v15 are partly callee saved under AAPCS64, so the
++ * save/restore is presumably done by generate_composite_function
++ * itself -- confirm against its definition.
++ */
++.macro pixman_composite_over_n_8_0565_init  /* splat the four bytes of w4 into v8-v11 */
++    ins         v11.s[0], w4  /* stage the 32-bit value in lane 0 of v11 */
++    dup         v10.8b, v11.b[2]
++    dup         v9.8b, v11.b[1]
++    dup         v8.8b, v11.b[0]
++    dup         v11.8b, v11.b[3]  /* must come last: it overwrites the staged bytes */
++.endm
++
++.macro pixman_composite_over_n_8_0565_cleanup  /* no cleanup work */
++.endm
++
++generate_composite_function \
++    pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, /* presumably src, mask, dst bits per pixel (0 = solid) -- confirm in generate_composite_function */ \
++    FLAG_DST_READWRITE, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    pixman_composite_over_n_8_0565_init, \
++    pixman_composite_over_n_8_0565_cleanup, \
++    pixman_composite_over_8888_8_0565_process_pixblock_head, /* pixblock macros are the over_8888_8_0565 ones */ \
++    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
++    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    4,  /* dst_r_basereg */ \
++    8,  /* src_basereg   */ \
++    24  /* mask_basereg  */
++
++/******************************************************************************/
++
++.macro pixman_composite_over_8888_n_0565_init  /* v24 = byte 3 of w6 replicated into 8 lanes */
++    mov         v24.s[0], w6  /* stage the 32-bit value (presumably the solid mask -- confirm) */
++    dup         v24.8b, v24.b[3]  /* keep only its top byte, in every lane */
++.endm
++
++.macro pixman_composite_over_8888_n_0565_cleanup  /* no cleanup work */
++.endm
++
++generate_composite_function \
++    pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, /* presumably src, mask, dst bits per pixel (0 = solid) -- confirm in generate_composite_function */ \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    pixman_composite_over_8888_n_0565_init, \
++    pixman_composite_over_8888_n_0565_cleanup, \
++    pixman_composite_over_8888_8_0565_process_pixblock_head, /* pixblock macros are the over_8888_8_0565 ones */ \
++    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
++    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    4,  /* dst_r_basereg */ \
++    8,  /* src_basereg   */ \
++    24  /* mask_basereg  */
++
++/******************************************************************************/
++
++.macro pixman_composite_src_0565_0565_process_pixblock_head  /* empty: no arithmetic in this variant */
++.endm
++
++.macro pixman_composite_src_0565_0565_process_pixblock_tail  /* empty: no arithmetic in this variant */
++.endm
++
++.macro pixman_composite_src_0565_0565_process_pixblock_tail_head  /* store the current block, fetch the next, prefetch */
++    st1     {v0.4h, v1.4h, v2.4h, v3.4h}, [DST_W], #32  /* write 32 bytes from v0-v3 unchanged */
++    fetch_src_pixblock
++    cache_preload 16, 16
++.endm
++
++generate_composite_function \
++    pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, /* presumably src, mask, dst bits per pixel (0 = absent) -- confirm in generate_composite_function */ \
++    FLAG_DST_WRITEONLY, \
++    16, /* number of pixels, processed in a single block */ \
++    10, /* prefetch distance */ \
++    default_init, \
++    default_cleanup, \
++    pixman_composite_src_0565_0565_process_pixblock_head, \
++    pixman_composite_src_0565_0565_process_pixblock_tail, \
++    pixman_composite_src_0565_0565_process_pixblock_tail_head, \
++    0, /* dst_w_basereg */ \
++    0, /* dst_r_basereg */ \
++    0, /* src_basereg   */ \
++    0  /* mask_basereg  */
++
++/******************************************************************************/
++
++.macro pixman_composite_src_n_8_process_pixblock_head  /* empty: v0-v3 are filled once by the init macro */
++.endm
++
++.macro pixman_composite_src_n_8_process_pixblock_tail  /* empty: no arithmetic in this variant */
++.endm
++
++.macro pixman_composite_src_n_8_process_pixblock_tail_head  /* write 32 bytes from v0-v3 and advance DST_W */
++    st1         {v0.8b, v1.8b, v2.8b, v3.8b}, [DST_W], #32  /* '#' added to match the other post-index stores */
++.endm
++
++.macro pixman_composite_src_n_8_init  /* fill v0-v3 with the low byte of w4 */
++    ins         v0.s[0], w4  /* stage the 32-bit value in lane 0 of v0 */
++    dup         v1.8b, v0.b[0]
++    dup         v2.8b, v0.b[0]
++    dup         v3.8b, v0.b[0]
++    dup         v0.8b, v0.b[0]  /* must come last: v0 is the staging register */
++.endm
++
++.macro pixman_composite_src_n_8_cleanup  /* no cleanup work */
++.endm
++
++generate_composite_function \
++    pixman_composite_src_n_8_asm_neon, 0, 0, 8, /* presumably src, mask, dst bits per pixel (0 = solid or absent) -- confirm in generate_composite_function */ \
++    FLAG_DST_WRITEONLY, \
++    32, /* number of pixels, processed in a single block */ \
++    0,  /* prefetch distance */ \
++    pixman_composite_src_n_8_init, \
++    pixman_composite_src_n_8_cleanup, \
++    pixman_composite_src_n_8_process_pixblock_head, \
++    pixman_composite_src_n_8_process_pixblock_tail, \
++    pixman_composite_src_n_8_process_pixblock_tail_head, \
++    0, /* dst_w_basereg */ \
++    0, /* dst_r_basereg */ \
++    0, /* src_basereg   */ \
++    0  /* mask_basereg  */
++
++/******************************************************************************/
++
++.macro pixman_composite_src_n_0565_process_pixblock_head  /* empty: v0-v3 are filled once by the init macro */
++.endm
++
++.macro pixman_composite_src_n_0565_process_pixblock_tail  /* empty: no arithmetic in this variant */
++.endm
++
++.macro pixman_composite_src_n_0565_process_pixblock_tail_head  /* write 32 bytes from v0-v3 and advance DST_W */
++    st1     {v0.4h, v1.4h, v2.4h, v3.4h}, [DST_W], #32  /* 16 halfwords per store */
++.endm
++
++.macro pixman_composite_src_n_0565_init  /* fill v0-v3 with the low halfword of w4 */
++    ins         v0.s[0], w4  /* stage the 32-bit value in lane 0 of v0 */
++    dup         v1.4h, v0.h[0]
++    dup         v2.4h, v0.h[0]
++    dup         v3.4h, v0.h[0]
++    dup         v0.4h, v0.h[0]  /* must come last: v0 is the staging register */
++.endm
++
++.macro pixman_composite_src_n_0565_cleanup  /* no cleanup work */
++.endm
++
++generate_composite_function \
++    pixman_composite_src_n_0565_asm_neon, 0, 0, 16, /* presumably src, mask, dst bits per pixel (0 = solid or absent) -- confirm in generate_composite_function */ \
++    FLAG_DST_WRITEONLY, \
++    16, /* number of pixels, processed in a single block */ \
++    0,  /* prefetch distance */ \
++    pixman_composite_src_n_0565_init, \
++    pixman_composite_src_n_0565_cleanup, \
++    pixman_composite_src_n_0565_process_pixblock_head, \
++    pixman_composite_src_n_0565_process_pixblock_tail, \
++    pixman_composite_src_n_0565_process_pixblock_tail_head, \
++    0, /* dst_w_basereg */ \
++    0, /* dst_r_basereg */ \
++    0, /* src_basereg   */ \
++    0  /* mask_basereg  */
++
++/******************************************************************************/
++
++.macro pixman_composite_src_n_8888_process_pixblock_head  /* empty: v0-v3 are filled once by the init macro */
++.endm
++
++.macro pixman_composite_src_n_8888_process_pixblock_tail  /* empty: no arithmetic in this variant */
++.endm
++
++.macro pixman_composite_src_n_8888_process_pixblock_tail_head  /* write 32 bytes from v0-v3 and advance DST_W */
++    st1         {v0.2s, v1.2s, v2.2s, v3.2s}, [DST_W], #32  /* 8 words per store */
++.endm
++
++.macro pixman_composite_src_n_8888_init  /* fill v0-v3 with the 32-bit value in w4 */
++    ins         v0.s[0], w4  /* stage the value in lane 0 of v0 */
++    dup         v1.2s, v0.s[0]
++    dup         v2.2s, v0.s[0]
++    dup         v3.2s, v0.s[0]
++    dup         v0.2s, v0.s[0]  /* must come last: v0 is the staging register */
++.endm
++
++.macro pixman_composite_src_n_8888_cleanup  /* no cleanup work */
++.endm
++
++generate_composite_function \
++    pixman_composite_src_n_8888_asm_neon, 0, 0, 32, /* presumably src, mask, dst bits per pixel (0 = solid or absent) -- confirm in generate_composite_function */ \
++    FLAG_DST_WRITEONLY, \
++    8, /* number of pixels, processed in a single block */ \
++    0, /* prefetch distance */ \
++    pixman_composite_src_n_8888_init, \
++    pixman_composite_src_n_8888_cleanup, \
++    pixman_composite_src_n_8888_process_pixblock_head, \
++    pixman_composite_src_n_8888_process_pixblock_tail, \
++    pixman_composite_src_n_8888_process_pixblock_tail_head, \
++    0, /* dst_w_basereg */ \
++    0, /* dst_r_basereg */ \
++    0, /* src_basereg   */ \
++    0  /* mask_basereg  */
++
++/******************************************************************************/
++
++.macro pixman_composite_src_8888_8888_process_pixblock_head  /* empty: no arithmetic in this variant */
++.endm
++
++.macro pixman_composite_src_8888_8888_process_pixblock_tail  /* empty: no arithmetic in this variant */
++.endm
++
++.macro pixman_composite_src_8888_8888_process_pixblock_tail_head  /* store the current block, fetch the next, prefetch */
++    st1  {v0.2s, v1.2s, v2.2s, v3.2s}, [DST_W], #32  /* write 32 bytes from v0-v3 unchanged */
++    fetch_src_pixblock
++    cache_preload 8, 8
++.endm
++
++generate_composite_function \
++    pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, /* presumably src, mask, dst bits per pixel (0 = absent) -- confirm in generate_composite_function */ \
++    FLAG_DST_WRITEONLY, \
++    8, /* number of pixels, processed in a single block */ \
++    10, /* prefetch distance */ \
++    default_init, \
++    default_cleanup, \
++    pixman_composite_src_8888_8888_process_pixblock_head, \
++    pixman_composite_src_8888_8888_process_pixblock_tail, \
++    pixman_composite_src_8888_8888_process_pixblock_tail_head, \
++    0, /* dst_w_basereg */ \
++    0, /* dst_r_basereg */ \
++    0, /* src_basereg   */ \
++    0  /* mask_basereg  */
++
++/******************************************************************************/
++
++.macro pixman_composite_src_x888_8888_process_pixblock_head  /* OR the constant in v4 into each of v0-v3 */
++    orr      v0.8b, v0.8b, v4.8b  /* v4 is set by the init macro */
++    orr      v1.8b, v1.8b, v4.8b
++    orr      v2.8b, v2.8b, v4.8b
++    orr      v3.8b, v3.8b, v4.8b
++.endm
++
++.macro pixman_composite_src_x888_8888_process_pixblock_tail  /* empty: all work is done in the head */
++.endm
++
++.macro pixman_composite_src_x888_8888_process_pixblock_tail_head  /* store the current block, fetch the next and repeat the head work on it */
++    st1      {v0.2s, v1.2s, v2.2s, v3.2s}, [DST_W], #32  /* write 32 bytes from v0-v3 */
++    fetch_src_pixblock
++    orr      v0.8b, v0.8b, v4.8b  /* same four ORs as the head macro, applied to the new block */
++    orr      v1.8b, v1.8b, v4.8b
++    orr      v2.8b, v2.8b, v4.8b
++    orr      v3.8b, v3.8b, v4.8b
++    cache_preload 8, 8
++.endm
++
++.macro pixman_composite_src_x888_8888_init  /* prepare the OR constant used by the pixblock macros */
++    movi    v4.2s, #0xff, lsl 24  /* 0xff000000 in each 32-bit lane of v4 */
++.endm
++
++generate_composite_function \
++    pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, /* presumably src, mask, dst bits per pixel (0 = absent) -- confirm in generate_composite_function */ \
++    FLAG_DST_WRITEONLY, \
++    8, /* number of pixels, processed in a single block */ \
++    10, /* prefetch distance */ \
++    pixman_composite_src_x888_8888_init, \
++    default_cleanup, \
++    pixman_composite_src_x888_8888_process_pixblock_head, \
++    pixman_composite_src_x888_8888_process_pixblock_tail, \
++    pixman_composite_src_x888_8888_process_pixblock_tail_head, \
++    0, /* dst_w_basereg */ \
++    0, /* dst_r_basereg */ \
++    0, /* src_basereg   */ \
++    0  /* mask_basereg  */
++
++/******************************************************************************/
++
++.macro pixman_composite_src_n_8_8888_process_pixblock_head  /* multiply each solid channel by the mask */
++    /* expecting solid source in {v0, v1, v2, v3} */
++    /* mask is in v24 (v25, v26, v27 are unused) */
++
++    /* in */
++    umull       v8.8h,  v24.8b, v0.8b  /* 16-bit products p = mask * channel */
++    umull       v9.8h,  v24.8b, v1.8b
++    umull       v10.8h, v24.8b, v2.8b
++    umull       v11.8h, v24.8b, v3.8b
++    ursra       v8.8h,  v8.8h, #8  /* p += round(p >> 8) */
++    ursra       v9.8h,  v9.8h, #8
++    ursra       v10.8h, v10.8h, #8
++    ursra       v11.8h, v11.8h, #8
++.endm
++
++.macro pixman_composite_src_n_8_8888_process_pixblock_tail  /* narrow the head results to bytes in v28-v31 */
++    rshrn       v28.8b, v8.8h, #8  /* rounding shift right by 8, narrowed to 8 bits */
++    rshrn       v29.8b, v9.8h, #8
++    rshrn       v30.8b, v10.8h, #8
++    rshrn       v31.8b, v11.8h, #8
++.endm
++
++.macro pixman_composite_src_n_8_8888_process_pixblock_tail_head  /* finishes one 8-pixel block while starting the next; PF lines are prefetch bookkeeping */
++    fetch_mask_pixblock
++                                    PF add PF_X, PF_X, #8
++        rshrn       v28.8b, v8.8h, #8  /* tail work: narrow the previous block's results into v28-v31 */
++                                    PF tst PF_CTL, #0x0F
++        rshrn       v29.8b, v9.8h, #8
++                                    PF beq 10f
++                                    PF add PF_X, PF_X, #8
++10:
++        rshrn      v30.8b, v10.8h, #8
++                                    PF beq 10f
++                                    PF sub PF_CTL, PF_CTL, #1
++10:
++        rshrn      v31.8b, v11.8h, #8
++                                    PF cmp PF_X, ORIG_W
++    umull          v8.8h, v24.8b, v0.8b  /* head work: products of the new mask with v0-v3 */
++                                    PF lsl DUMMY, PF_X, #mask_bpp_shift
++                                    PF prfm PREFETCH_MODE, [PF_MASK, DUMMY]
++    umull          v9.8h, v24.8b, v1.8b
++                                    PF ble 10f
++                                    PF sub PF_X, PF_X, ORIG_W
++10:
++    umull          v10.8h, v24.8b, v2.8b
++                                    PF ble 10f
++                                    PF subs PF_CTL, PF_CTL, #0x10
++10:
++    umull          v11.8h, v24.8b, v3.8b
++                                    PF ble 10f
++                                    PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift
++                                    PF ldrsb DUMMY, [PF_MASK, DUMMY]
++                                    PF add PF_MASK, PF_MASK, #1
++10:
++        st4        {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32  /* store the finished block, re-interleaved */
++    ursra       v8.8h, v8.8h, #8  /* p += round(p >> 8), as in the head macro */
++    ursra       v9.8h, v9.8h, #8
++    ursra       v10.8h, v10.8h, #8
++    ursra       v11.8h, v11.8h, #8
++.endm
++
++.macro pixman_composite_src_n_8_8888_init  /* splat the four bytes of w4 into v0-v3 */
++    ins         v3.s[0], w4  /* stage the 32-bit value in lane 0 of v3 */
++    dup         v2.8b, v3.b[2]
++    dup         v1.8b, v3.b[1]
++    dup         v0.8b, v3.b[0]
++    dup         v3.8b, v3.b[3]  /* must come last: it overwrites the staged bytes */
++.endm
++
++.macro pixman_composite_src_n_8_8888_cleanup  /* no cleanup work */
++.endm
++
++generate_composite_function \
++    pixman_composite_src_n_8_8888_asm_neon, 0, 8, 32, /* presumably src, mask, dst bits per pixel (0 = solid) -- confirm in generate_composite_function */ \
++    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    pixman_composite_src_n_8_8888_init, \
++    pixman_composite_src_n_8_8888_cleanup, \
++    pixman_composite_src_n_8_8888_process_pixblock_head, \
++    pixman_composite_src_n_8_8888_process_pixblock_tail, \
++    pixman_composite_src_n_8_8888_process_pixblock_tail_head
++
++/******************************************************************************/
++
++.macro pixman_composite_src_n_8_8_process_pixblock_head  /* multiply the mask bytes in v24-v27 by the constant in v16 */
++    umull       v0.8h, v24.8b, v16.8b  /* 16-bit products p = mask * v16 (v16 is set by the init macro) */
++    umull       v1.8h, v25.8b, v16.8b
++    umull       v2.8h, v26.8b, v16.8b
++    umull       v3.8h, v27.8b, v16.8b
++    ursra       v0.8h, v0.8h,  #8  /* p += round(p >> 8) */
++    ursra       v1.8h, v1.8h,  #8
++    ursra       v2.8h, v2.8h,  #8
++    ursra       v3.8h, v3.8h,  #8
++.endm
++
++.macro pixman_composite_src_n_8_8_process_pixblock_tail  /* narrow the head results to bytes in v28-v31 */
++    rshrn       v28.8b, v0.8h, #8  /* rounding shift right by 8, narrowed to 8 bits */
++    rshrn       v29.8b, v1.8h, #8
++    rshrn       v30.8b, v2.8h, #8
++    rshrn       v31.8b, v3.8h, #8
++.endm
++
++.macro pixman_composite_src_n_8_8_process_pixblock_tail_head  /* finishes one block while starting the next; PF lines are prefetch bookkeeping */
++    fetch_mask_pixblock
++                                    PF add PF_X, PF_X, #8
++        rshrn       v28.8b, v0.8h, #8  /* tail work: narrow the previous block's results into v28-v31 */
++                                    PF tst PF_CTL, #0x0F
++        rshrn       v29.8b, v1.8h, #8
++                                    PF beq 10f
++                                    PF add PF_X, PF_X, #8
++10:
++        rshrn       v30.8b, v2.8h, #8
++                                    PF beq 10f
++                                    PF sub PF_CTL, PF_CTL, #1
++10:
++        rshrn       v31.8b, v3.8h, #8
++                                    PF cmp PF_X, ORIG_W
++    umull       v0.8h,  v24.8b, v16.8b  /* head work: products of the new mask bytes with v16 */
++                                    PF lsl DUMMY, PF_X, #mask_bpp_shift
++                                    PF prfm PREFETCH_MODE, [PF_MASK, DUMMY]
++    umull       v1.8h,  v25.8b, v16.8b
++                                    PF ble 10f
++                                    PF sub PF_X, PF_X, ORIG_W
++10:
++    umull       v2.8h,  v26.8b, v16.8b
++                                    PF ble 10f
++                                    PF subs PF_CTL, PF_CTL, #0x10
++10:
++    umull       v3.8h,  v27.8b, v16.8b
++                                    PF ble 10f
++                                    PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift
++                                    PF ldrsb DUMMY, [PF_MASK, DUMMY]
++                                    PF add PF_MASK, PF_MASK, #1
++10:
++        st1         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32  /* store the 32 finished bytes */
++    ursra       v0.8h, v0.8h,  #8  /* p += round(p >> 8), as in the head macro */
++    ursra       v1.8h, v1.8h,  #8
++    ursra       v2.8h, v2.8h,  #8
++    ursra       v3.8h, v3.8h,  #8
++.endm
++
++.macro pixman_composite_src_n_8_8_init  /* v16 = byte 3 of w4 replicated into 8 lanes */
++    mov         v16.s[0], w4  /* stage the 32-bit value in lane 0 of v16 */
++    dup         v16.8b, v16.b[3]  /* keep only its top byte, in every lane */
++.endm
++
++.macro pixman_composite_src_n_8_8_cleanup  /* no cleanup work */
++.endm
++
++generate_composite_function \
++    pixman_composite_src_n_8_8_asm_neon, 0, 8, 8, /* presumably src, mask, dst bits per pixel (0 = solid) -- confirm in generate_composite_function */ \
++    FLAG_DST_WRITEONLY, \
++    32, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    pixman_composite_src_n_8_8_init, \
++    pixman_composite_src_n_8_8_cleanup, \
++    pixman_composite_src_n_8_8_process_pixblock_head, \
++    pixman_composite_src_n_8_8_process_pixblock_tail, \
++    pixman_composite_src_n_8_8_process_pixblock_tail_head
++
++/******************************************************************************/
++
++.macro pixman_composite_over_n_8_8888_process_pixblock_head  /* src IN mask, then start the dst * inverted-alpha multiplies */
++    /* expecting deinterleaved source data in {v8, v9, v10, v11} */
++    /* v8 - blue, v9 - green, v10 - red, v11 - alpha */
++    /* and destination data in {v4, v5, v6, v7} */
++    /* mask is in v24 (v25, v26, v27 are unused) */
++
++    /* in */
++    umull       v12.8h, v24.8b, v8.8b  /* 16-bit products p = mask * source channel */
++    umull       v13.8h, v24.8b, v9.8b
++    umull       v14.8h, v24.8b, v10.8b
++    umull       v15.8h, v24.8b, v11.8b
++    urshr       v16.8h, v12.8h, #8  /* t = round(p >> 8) */
++    urshr       v17.8h, v13.8h, #8
++    urshr       v18.8h, v14.8h, #8
++    urshr       v19.8h, v15.8h, #8
++    raddhn      v0.8b, v12.8h, v16.8h  /* v0-v3 = rounded high byte of (p + t) */
++    raddhn      v1.8b, v13.8h, v17.8h
++    raddhn      v2.8b, v14.8h, v18.8h
++    raddhn      v3.8b, v15.8h, v19.8h
++    mvn         v25.8b, v3.8b  /* get inverted alpha */
++    /* source:      v0 - blue, v1 - green, v2 - red, v3 - alpha */
++    /* destination: v4 - blue, v5 - green, v6 - red, v7 - alpha */
++    /* now do alpha blending */
++    umull       v12.8h, v25.8b, v4.8b  /* products are completed by the tail macro */
++    umull       v13.8h, v25.8b, v5.8b
++    umull       v14.8h, v25.8b, v6.8b
++    umull       v15.8h, v25.8b, v7.8b
++.endm
++
++.macro pixman_composite_over_n_8_8888_process_pixblock_tail  /* finish the blend: scale the products and add the masked source */
++    urshr       v16.8h, v12.8h, #8  /* t = round(p >> 8) */
++    urshr       v17.8h, v13.8h, #8
++    urshr       v18.8h, v14.8h, #8
++    urshr       v19.8h, v15.8h, #8
++    raddhn      v28.8b, v16.8h, v12.8h  /* v28-v31 = rounded high byte of (t + p) */
++    raddhn      v29.8b, v17.8h, v13.8h
++    raddhn      v30.8b, v18.8h, v14.8h
++    raddhn      v31.8b, v19.8h, v15.8h
++    uqadd       v28.8b, v0.8b, v28.8b  /* saturating add of the masked source in v0-v3 */
++    uqadd       v29.8b, v1.8b, v29.8b
++    uqadd       v30.8b, v2.8b, v30.8b
++    uqadd       v31.8b, v3.8b, v31.8b
++.endm
++
++.macro pixman_composite_over_n_8_8888_process_pixblock_tail_head  /* finishes one 8-pixel block while starting the next; PF lines are prefetch bookkeeping */
++        urshr       v16.8h, v12.8h, #8  /* tail work: t = round(p >> 8) for the previous block's products */
++     ld4        {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32  /* next 32 dst bytes, de-interleaved into v4-v7 */
++        urshr       v17.8h, v13.8h, #8
++    fetch_mask_pixblock
++        urshr       v18.8h, v14.8h, #8
++                                    PF add PF_X, PF_X, #8
++        urshr       v19.8h, v15.8h, #8
++                                    PF tst PF_CTL, #0x0F
++        raddhn      v28.8b, v16.8h, v12.8h  /* v28-v31 = rounded high byte of (t + p) */
++                                    PF beq 10f
++                                    PF add PF_X, PF_X, #8
++10:
++        raddhn      v29.8b, v17.8h, v13.8h
++                                    PF beq 10f
++                                    PF sub PF_CTL, PF_CTL, #1
++10:
++        raddhn      v30.8b, v18.8h, v14.8h
++                                    PF cmp PF_X, ORIG_W
++        raddhn      v31.8b, v19.8h, v15.8h
++                                    PF lsl DUMMY, PF_X, #dst_bpp_shift
++                                    PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
++    umull       v16.8h, v24.8b, v8.8b  /* head work: new mask * solid source channels */
++                                    PF lsl DUMMY, PF_X, #mask_bpp_shift
++                                    PF prfm PREFETCH_MODE, [PF_MASK, DUMMY]
++    umull       v17.8h, v24.8b, v9.8b
++                                    PF ble 10f
++                                    PF sub PF_X, PF_X, ORIG_W
++10:
++    umull       v18.8h, v24.8b, v10.8b
++                                    PF ble 10f
++                                    PF subs PF_CTL, PF_CTL, #0x10
++10:
++    umull       v19.8h, v24.8b, v11.8b
++                                    PF ble 10f
++                                    PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
++                                    PF ldrsb DUMMY, [PF_DST, DUMMY]
++                                    PF add PF_DST, PF_DST, #1
++10:
++        uqadd       v28.8b, v0.8b, v28.8b  /* saturating add of the previous block's masked source (v0-v3) */
++                                    PF ble 10f
++                                    PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift
++                                    PF ldrsb DUMMY, [PF_MASK, DUMMY]
++                                    PF add PF_MASK, PF_MASK, #1
++10:
++        uqadd        v29.8b, v1.8b, v29.8b
++        uqadd        v30.8b, v2.8b, v30.8b
++        uqadd        v31.8b, v3.8b, v31.8b
++    urshr       v12.8h, v16.8h, #8  /* v0-v3 are rewritten only after the uqadd lines above have read them */
++    urshr       v13.8h, v17.8h, #8
++    urshr       v14.8h, v18.8h, #8
++    urshr       v15.8h, v19.8h, #8
++    raddhn      v0.8b, v16.8h, v12.8h
++    raddhn      v1.8b, v17.8h, v13.8h
++    raddhn      v2.8b, v18.8h, v14.8h
++    raddhn      v3.8b, v19.8h, v15.8h
++        st4          {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32  /* store the finished block, re-interleaved */
++    mvn         v25.8b, v3.8b  /* inverted alpha of the new masked source */
++    umull       v12.8h, v25.8b, v4.8b  /* products for the next tail step */
++    umull       v13.8h, v25.8b, v5.8b
++    umull       v14.8h, v25.8b, v6.8b
++    umull       v15.8h, v25.8b, v7.8b
++.endm
++
++.macro pixman_composite_over_n_8_8888_init  /* splat the four bytes of w4 into v8-v11 */
++    ins         v11.s[0], w4  /* stage the 32-bit value in lane 0 of v11 */
++    dup         v10.8b, v11.b[2]
++    dup         v9.8b, v11.b[1]
++    dup         v8.8b, v11.b[0]
++    dup         v11.8b, v11.b[3]  /* must come last: it overwrites the staged bytes */
++.endm
++
++.macro pixman_composite_over_n_8_8888_cleanup  /* no cleanup work */
++.endm
++
++generate_composite_function \
++    pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, /* presumably src, mask, dst bits per pixel (0 = solid) -- confirm in generate_composite_function */ \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    pixman_composite_over_n_8_8888_init, \
++    pixman_composite_over_n_8_8888_cleanup, \
++    pixman_composite_over_n_8_8888_process_pixblock_head, \
++    pixman_composite_over_n_8_8888_process_pixblock_tail, \
++    pixman_composite_over_n_8_8888_process_pixblock_tail_head
++
++/******************************************************************************/
++
++.macro pixman_composite_over_n_8_8_process_pixblock_head  /* mask * v8, then start the dst * inverted-result multiplies */
++    umull       v0.8h,  v24.8b, v8.8b  /* 16-bit products p = mask byte * v8 (v8 is set by the init macro) */
++    umull       v1.8h,  v25.8b, v8.8b
++    umull       v2.8h,  v26.8b, v8.8b
++    umull       v3.8h,  v27.8b, v8.8b
++    urshr       v10.8h, v0.8h,  #8  /* t = round(p >> 8) */
++    urshr       v11.8h, v1.8h,  #8
++    urshr       v12.8h, v2.8h,  #8
++    urshr       v13.8h, v3.8h,  #8
++    raddhn      v0.8b,  v0.8h,  v10.8h  /* v0-v3 = rounded high byte of (p + t) */
++    raddhn      v1.8b,  v1.8h,  v11.8h
++    raddhn      v2.8b,  v2.8h,  v12.8h
++    raddhn      v3.8b,  v3.8h,  v13.8h
++    mvn         v24.8b, v0.8b  /* v24-v27 are reused for the inverted values; the mask is no longer needed */
++    mvn         v25.8b, v1.8b
++    mvn         v26.8b, v2.8b
++    mvn         v27.8b, v3.8b
++    umull       v10.8h, v24.8b, v4.8b  /* dst bytes in v4-v7 times the inverted values */
++    umull       v11.8h, v25.8b, v5.8b
++    umull       v12.8h, v26.8b, v6.8b
++    umull       v13.8h, v27.8b, v7.8b
++.endm
++
++.macro pixman_composite_over_n_8_8_process_pixblock_tail  /* finish the blend into v28-v31 */
++    urshr       v14.8h, v10.8h,  #8  /* t = round(p >> 8) */
++    urshr       v15.8h, v11.8h,  #8
++    urshr       v16.8h, v12.8h, #8
++    urshr       v17.8h, v13.8h, #8
++    raddhn      v28.8b, v14.8h, v10.8h  /* rounded high byte of (t + p) */
++    raddhn      v29.8b, v15.8h, v11.8h
++    raddhn      v30.8b, v16.8h, v12.8h
++    raddhn      v31.8b, v17.8h, v13.8h
++    uqadd       v28.8b, v0.8b,  v28.8b  /* saturating add of the head results in v0-v3 */
++    uqadd       v29.8b, v1.8b,  v29.8b
++    uqadd       v30.8b, v2.8b,  v30.8b
++    uqadd       v31.8b, v3.8b,  v31.8b
++.endm
++
++/* TODO: expand the macros and improve the instruction scheduling */
++.macro pixman_composite_over_n_8_8_process_pixblock_tail_head  /* plain sequence: load dst, tail, fetch mask, store, head */
++    ld1         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32  /* next 32 dst bytes; the tail below does not read v4-v7 */
++    pixman_composite_over_n_8_8_process_pixblock_tail
++    fetch_mask_pixblock
++    cache_preload 32, 32
++    st1         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32  /* store the 32 bytes produced by the tail */
++    pixman_composite_over_n_8_8_process_pixblock_head
++.endm
++
++.macro pixman_composite_over_n_8_8_init /* prepares v8 for the _head macro */
++    mov         v8.s[0], w4 /* w4 presumably carries the solid source colour - confirm against the caller */
++    dup         v8.8b, v8.b[3] /* replicate byte 3 of w4 into all 8 lanes of v8 */
++.endm
++
++.macro pixman_composite_over_n_8_8_cleanup /* intentionally empty */
++.endm
++
++generate_composite_function \
++    pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, /* presumably src, mask, dst bpp - confirm against the macro definition */ \
++    FLAG_DST_READWRITE, \
++    32, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    pixman_composite_over_n_8_8_init, \
++    pixman_composite_over_n_8_8_cleanup, \
++    pixman_composite_over_n_8_8_process_pixblock_head, \
++    pixman_composite_over_n_8_8_process_pixblock_tail, \
++    pixman_composite_over_n_8_8_process_pixblock_tail_head
++
++/******************************************************************************/
++
++.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
++    /*
++     * 'combine_mask_ca' replacement
++     *
++     * input:  solid src (n) in {v8,  v9,  v10, v11}
++     *         dest in          {v4,  v5,  v6,  v7 }
++     *         mask in          {v24, v25, v26, v27}
++     * output: updated src in   {v0,  v1,  v2,  v3 }
++     *         updated mask in  {v24, v25, v26, v3 }
++     */
++    umull       v0.8h,  v24.8b, v8.8b /* per-channel 16-bit products mask * solid src */
++    umull       v1.8h,  v25.8b, v9.8b
++    umull       v2.8h,  v26.8b, v10.8b
++    umull       v3.8h,  v27.8b, v11.8b
++    umull       v12.8h, v11.8b, v25.8b /* v11 times each mask channel; note v12 pairs with v25 and v13 with v24 */
++    umull       v13.8h, v11.8b, v24.8b
++    umull       v14.8h, v11.8b, v26.8b
++    urshr       v15.8h, v0.8h,  #8 /* urshr #8 followed by raddhn = rounded division of the product by 255 */
++    urshr       v16.8h, v1.8h,  #8
++    urshr       v17.8h, v2.8h,  #8
++    raddhn      v0.8b,  v0.8h,  v15.8h
++    raddhn      v1.8b,  v1.8h,  v16.8h
++    raddhn      v2.8b,  v2.8h,  v17.8h
++    urshr       v15.8h, v13.8h, #8 /* v15-v17 are reused as scratch for the second group of products */
++    urshr       v16.8h, v12.8h, #8
++    urshr       v17.8h, v14.8h, #8
++    urshr       v18.8h, v3.8h,  #8
++    raddhn      v24.8b, v13.8h, v15.8h
++    raddhn      v25.8b, v12.8h, v16.8h
++    raddhn      v26.8b, v14.8h, v17.8h
++    raddhn      v3.8b,  v3.8h,  v18.8h
++    /*
++     * 'combine_over_ca' replacement
++     *
++     * output: updated dest in {v28, v29, v30, v31}
++     */
++    mvn         v24.8b, v24.8b /* bitwise NOT = 255 - value per byte */
++    mvn         v25.8b, v25.8b
++    mvn         v26.8b, v26.8b
++    mvn         v27.8b, v3.8b /* inverse goes to v27; v3 itself is still needed by the uqadd in _tail */
++    umull       v12.8h, v24.8b, v4.8b /* start dst * inverted mask; the /255 step is left to the _tail macro */
++    umull       v13.8h, v25.8b, v5.8b
++    umull       v14.8h, v26.8b, v6.8b
++    umull       v15.8h, v27.8b, v7.8b
++.endm
++
++.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
++    /* ... continue 'combine_over_ca' replacement */
++    urshr       v16.8h, v12.8h, #8 /* urshr + raddhn pair: rounded division of the v12-v15 products by 255 */
++    urshr       v17.8h, v13.8h, #8
++    urshr       v18.8h, v14.8h, #8
++    urshr       v19.8h, v15.8h, #8
++    raddhn      v28.8b, v16.8h, v12.8h
++    raddhn      v29.8b, v17.8h, v13.8h
++    raddhn      v30.8b, v18.8h, v14.8h
++    raddhn      v31.8b, v19.8h, v15.8h
++    uqadd       v28.8b, v0.8b,  v28.8b /* saturating add of the updated src v0-v3 from _head */
++    uqadd       v29.8b, v1.8b,  v29.8b
++    uqadd       v30.8b, v2.8b,  v30.8b
++    uqadd       v31.8b, v3.8b,  v31.8b
++.endm
++
++.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head /* deeper-indented lines repeat the _tail macro body; the rest prepares the next block */
++        urshr       v16.8h, v12.8h, #8
++        urshr       v17.8h, v13.8h, #8
++    ld4         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 /* next dst block; the tail steps here do not read v4-v7 */
++        urshr       v18.8h, v14.8h, #8
++        urshr       v19.8h, v15.8h, #8
++        raddhn      v28.8b, v16.8h, v12.8h
++        raddhn      v29.8b, v17.8h, v13.8h
++        raddhn      v30.8b, v18.8h, v14.8h
++        raddhn      v31.8b, v19.8h, v15.8h
++    fetch_mask_pixblock
++        uqadd       v28.8b, v0.8b, v28.8b
++        uqadd       v29.8b, v1.8b, v29.8b
++        uqadd       v30.8b, v2.8b, v30.8b
++        uqadd       v31.8b, v3.8b, v31.8b
++    cache_preload 8, 8
++    pixman_composite_over_n_8888_8888_ca_process_pixblock_head
++    st4         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 /* store the finished block; _head does not write v28-v31 */
++.endm
++
++.macro pixman_composite_over_n_8888_8888_ca_init /* splats the four bytes of w4 into v8-v11 for the _head macro */
++    mov         v13.s[0], w4 /* v13 is only a staging register; w4 presumably holds the solid source colour - confirm */
++    dup         v8.8b, v13.b[0]
++    dup         v9.8b, v13.b[1]
++    dup         v10.8b, v13.b[2]
++    dup         v11.8b, v13.b[3]
++.endm
++
++.macro pixman_composite_over_n_8888_8888_ca_cleanup /* intentionally empty */
++.endm
++
++generate_composite_function \
++    pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, /* presumably src, mask, dst bpp - confirm against the macro definition */ \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    pixman_composite_over_n_8888_8888_ca_init, \
++    pixman_composite_over_n_8888_8888_ca_cleanup, \
++    pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
++    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
++    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
++
++/******************************************************************************/
++
++.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_head
++    /*
++     * 'combine_mask_ca' replacement
++     *
++     * input:  solid src (n) in {v8,  v9,  v10, v11}  [B, G, R, A]
++     *         mask in          {v24, v25, v26}       [B, G, R]
++     * output: updated src in   {v0,  v1,  v2 }       [B, G, R]
++     *         updated mask in  {v24, v25, v26}       [B, G, R]
++     */
++    umull       v0.8h,  v24.8b, v8.8b /* per-channel 16-bit products mask * solid src */
++    umull       v1.8h,  v25.8b, v9.8b
++    umull       v2.8h,  v26.8b, v10.8b
++    umull       v12.8h, v11.8b, v24.8b /* solid alpha (v11) times each mask channel */
++    umull       v13.8h, v11.8b, v25.8b
++    umull       v14.8h, v11.8b, v26.8b
++    urshr       v15.8h, v0.8h,  #8 /* urshr #8 followed by raddhn = rounded division of the product by 255 */
++    urshr       v16.8h, v1.8h,  #8
++    urshr       v17.8h, v2.8h,  #8
++    raddhn      v0.8b,  v0.8h,  v15.8h
++    raddhn      v1.8b,  v1.8h,  v16.8h
++    raddhn      v2.8b,  v2.8h,  v17.8h
++    urshr       v19.8h, v12.8h, #8
++    urshr       v20.8h, v13.8h, #8
++    urshr       v21.8h, v14.8h, #8
++    raddhn      v24.8b, v12.8h, v19.8h
++    raddhn      v25.8b, v13.8h, v20.8h
++    /*
++     * convert 8 r5g6b5 pixel data from {v4} to planar 8-bit format
++     * and put data into v16 - blue, v17 - green, v18 - red
++     */
++       mov         v4.d[1], v5.d[0] /* merge the low half of v5 into the upper half of v4 */
++       shrn        v17.8b, v4.8h,  #3
++       shrn        v18.8b, v4.8h,  #8
++    raddhn      v26.8b, v14.8h, v21.8h /* last mask channel, interleaved with the format conversion */
++       sli         v4.8h,  v4.8h,  #5
++       sri         v18.8b, v18.8b, #5
++       sri         v17.8b, v17.8b, #6
++    /*
++     * 'combine_over_ca' replacement
++     *
++     * output: updated dest in v16 - blue, v17 - green, v18 - red
++     */
++    mvn         v24.8b, v24.8b /* bitwise NOT = 255 - value per byte */
++    mvn         v25.8b, v25.8b
++       shrn       v16.8b, v4.8h,  #2
++    mvn         v26.8b, v26.8b
++    umull       v5.8h, v16.8b, v24.8b /* start dst * inverted mask in v5-v7; the /255 step is left to the _tail macro */
++    umull       v6.8h, v17.8b, v25.8b
++    umull       v7.8h, v18.8b, v26.8b
++.endm
++
++.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail
++    /* ... continue 'combine_over_ca' replacement */
++    urshr       v13.8h, v5.8h, #8 /* urshr + raddhn pair: rounded division of the v5-v7 products by 255 */
++    urshr       v14.8h, v6.8h, #8
++    urshr       v15.8h, v7.8h, #8
++    raddhn      v16.8b, v13.8h, v5.8h
++    raddhn      v17.8b, v14.8h, v6.8h
++    raddhn      v18.8b, v15.8h, v7.8h
++    uqadd       v16.8b, v0.8b, v16.8b /* saturating add of the updated src v0-v2 from _head */
++    uqadd       v17.8b, v1.8b, v17.8b
++    uqadd       v18.8b, v2.8b, v18.8b
++    /*
++     * convert the results in v16, v17, v18 to r5g6b5 and store
++     * them into {v14}
++     */
++    ushll       v14.8h, v18.8b, #7 /* ushll #7 then sli #1 leaves the channel byte in bits 15:8 of each 16-bit lane */
++    sli         v14.8h, v14.8h, #1
++    ushll       v12.8h, v17.8b, #7
++    sli         v12.8h, v12.8h, #1
++    ushll       v13.8h, v16.8b, #7
++    sli         v13.8h, v13.8h, #1
++    sri         v14.8h, v12.8h, #5 /* insert green below the top 5 (red) bits */
++    sri         v14.8h, v13.8h, #11 /* insert blue below the top 11 bits */
++    mov         v28.d[0], v14.d[0] /* split the 128-bit result across the low halves of v28 and v29 */
++    mov         v29.d[0], v14.d[1]
++.endm
++
++.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head /* hand-interleaved copy of _tail (previous block) and _head (next block) */
++    fetch_mask_pixblock
++        urshr       v13.8h, v5.8h, #8
++        urshr       v14.8h, v6.8h, #8
++    ld1         {v4.8h}, [DST_R], #16 /* next 8 destination pixels (16 bytes) */
++        urshr       v15.8h, v7.8h, #8
++        raddhn      v16.8b, v13.8h, v5.8h
++        raddhn      v17.8b, v14.8h, v6.8h
++        raddhn      v18.8b, v15.8h, v7.8h
++    mov         v5.d[0], v4.d[1] /* done only after the raddhn above has consumed the old v5 product */
++            /* process_pixblock_head */
++            /*
++             * 'combine_mask_ca' replacement
++             *
++             * input:  solid src (n) in {v8,  v9,  v10, v11}  [B, G, R, A]
++             *         mask in          {v24, v25, v26}       [B, G, R]
++             * output: updated src in   {v0,  v1,  v2 }       [B, G, R]
++             *         updated mask in  {v24, v25, v26}       [B, G, R]
++             */
++        uqadd       v16.8b, v0.8b, v16.8b
++        uqadd       v17.8b, v1.8b, v17.8b
++        uqadd       v18.8b, v2.8b, v18.8b
++            umull       v0.8h,  v24.8b, v8.8b /* v0-v2 may be overwritten now: the uqadd above was their last use */
++            umull       v1.8h,  v25.8b, v9.8b
++            umull       v2.8h,  v26.8b, v10.8b
++        /*
++         * convert the result in v16, v17, v18 to r5g6b5 and store
++         * it into {v14}
++         */
++        ushll       v14.8h, v18.8b, #7
++        sli         v14.8h, v14.8h, #1
++        ushll       v18.8h, v16.8b, #7 /* unlike the _tail macro, blue/green are staged in v18/v19 here */
++        sli         v18.8h, v18.8h, #1
++        ushll       v19.8h, v17.8b, #7
++        sli         v19.8h, v19.8h, #1
++            umull       v12.8h, v11.8b, v24.8b
++        sri         v14.8h, v19.8h, #5
++            umull       v13.8h, v11.8b, v25.8b
++            umull       v15.8h, v11.8b, v26.8b /* v15 is used where _head uses v14, because v14 holds the pending result */
++        sri         v14.8h, v18.8h, #11
++        mov         v28.d[0], v14.d[0]
++        mov         v29.d[0], v14.d[1]
++    cache_preload 8, 8
++            urshr       v16.8h, v0.8h,  #8
++            urshr       v17.8h, v1.8h,  #8
++            urshr       v18.8h, v2.8h,  #8
++            raddhn      v0.8b,  v0.8h,  v16.8h
++            raddhn      v1.8b,  v1.8h,  v17.8h
++            raddhn      v2.8b,  v2.8h,  v18.8h
++            urshr       v19.8h, v12.8h, #8
++            urshr       v20.8h, v13.8h, #8
++            urshr       v21.8h, v15.8h, #8
++            raddhn      v24.8b, v12.8h, v19.8h
++            raddhn      v25.8b, v13.8h, v20.8h
++                /*
++                 * convert 8 r5g6b5 pixel data from {v4, v5} to planar
++             * 8-bit format and put data into v16 - blue, v17 - green,
++             * v18 - red
++                 */
++		mov         v4.d[1], v5.d[0]
++                shrn        v17.8b, v4.8h,  #3
++                shrn        v18.8b, v4.8h,  #8
++            raddhn      v26.8b, v15.8h, v21.8h
++                sli         v4.8h,  v4.8h,  #5
++                sri         v17.8b, v17.8b, #6
++                sri         v18.8b, v18.8b, #5
++            /*
++             * 'combine_over_ca' replacement
++             *
++             * output: updated dest in v16 - blue, v17 - green, v18 - red
++             */
++            mvn         v24.8b, v24.8b
++            mvn         v25.8b, v25.8b
++                shrn        v16.8b, v4.8h,  #2
++            mvn         v26.8b, v26.8b
++            umull       v5.8h, v16.8b, v24.8b
++            umull       v6.8h, v17.8b, v25.8b
++            umull       v7.8h, v18.8b, v26.8b
++    st1         {v14.8h}, [DST_W], #16 /* store the previous block's r5g6b5 result, still intact in v14 */
++.endm
++
++.macro pixman_composite_over_n_8888_0565_ca_init /* splats the four bytes of w4 into v8-v11 for the _head macro */
++    mov         v13.s[0], w4 /* v13 is only a staging register; w4 presumably holds the solid source colour - confirm */
++    dup         v8.8b, v13.b[0]
++    dup         v9.8b, v13.b[1]
++    dup         v10.8b, v13.b[2]
++    dup         v11.8b, v13.b[3]
++.endm
++
++.macro pixman_composite_over_n_8888_0565_ca_cleanup /* intentionally empty */
++.endm
++
++generate_composite_function \
++    pixman_composite_over_n_8888_0565_ca_asm_neon, 0, 32, 16, /* presumably src, mask, dst bpp - confirm against the macro definition */ \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    pixman_composite_over_n_8888_0565_ca_init, \
++    pixman_composite_over_n_8888_0565_ca_cleanup, \
++    pixman_composite_over_n_8888_0565_ca_process_pixblock_head, \
++    pixman_composite_over_n_8888_0565_ca_process_pixblock_tail, \
++    pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
++
++/******************************************************************************/
++
++.macro pixman_composite_in_n_8_process_pixblock_head
++    /* expecting source data in {v0, v1, v2, v3} */
++    /* and destination data in {v4, v5, v6, v7} */
++    umull       v8.8h,  v4.8b,  v3.8b /* of the source set only v3 (filled by the _init macro) is read here */
++    umull       v9.8h,  v5.8b,  v3.8b
++    umull       v10.8h, v6.8b,  v3.8b
++    umull       v11.8h, v7.8b,  v3.8b
++.endm
++
++.macro pixman_composite_in_n_8_process_pixblock_tail
++    urshr       v14.8h, v8.8h,  #8 /* urshr + raddhn pair: rounded division of the v8-v11 products by 255 */
++    urshr       v15.8h, v9.8h,  #8
++    urshr       v12.8h, v10.8h, #8
++    urshr       v13.8h, v11.8h, #8
++    raddhn      v28.8b, v8.8h,  v14.8h /* result block is v28-v31 */
++    raddhn      v29.8b, v9.8h,  v15.8h
++    raddhn      v30.8b, v10.8h, v12.8h
++    raddhn      v31.8b, v11.8h, v13.8h
++.endm
++
++.macro pixman_composite_in_n_8_process_pixblock_tail_head /* _tail for the current block, then _head for the next one */
++    pixman_composite_in_n_8_process_pixblock_tail
++    ld1         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
++    cache_preload 32, 32
++    pixman_composite_in_n_8_process_pixblock_head
++    st1         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 /* _head does not write v28-v31, so the store can follow it */
++.endm
++
++.macro pixman_composite_in_n_8_init
++    mov         v3.s[0], w4 /* w4 presumably carries the solid source colour - confirm against the caller */
++    dup         v3.8b, v3.b[3] /* replicate byte 3 of w4 into all 8 lanes of v3 */
++.endm
++
++.macro pixman_composite_in_n_8_cleanup /* intentionally empty */
++.endm
++
++generate_composite_function \
++    pixman_composite_in_n_8_asm_neon, 0, 0, 8, /* presumably src, mask, dst bpp - confirm against the macro definition */ \
++    FLAG_DST_READWRITE, \
++    32, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    pixman_composite_in_n_8_init, \
++    pixman_composite_in_n_8_cleanup, \
++    pixman_composite_in_n_8_process_pixblock_head, \
++    pixman_composite_in_n_8_process_pixblock_tail, \
++    pixman_composite_in_n_8_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    4,  /* dst_r_basereg */ \
++    0,  /* src_basereg   */ \
++    24  /* mask_basereg  */
++
++.macro pixman_composite_add_n_8_8_process_pixblock_head
++    /* expecting source data in {v8, v9, v10, v11} */
++    /* v8 - blue, v9 - green, v10 - red, v11 - alpha */
++    /* and destination data in {v4, v5, v6, v7} */
++    /* mask is in v24, v25, v26, v27 */
++    umull       v0.8h, v24.8b, v11.8b /* of the source set only v11 (filled by the _init macro) is read here */
++    umull       v1.8h, v25.8b, v11.8b
++    umull       v2.8h, v26.8b, v11.8b
++    umull       v3.8h, v27.8b, v11.8b
++    urshr       v12.8h, v0.8h, #8 /* urshr #8 followed by raddhn = rounded division of the product by 255 */
++    urshr       v13.8h, v1.8h, #8
++    urshr       v14.8h, v2.8h, #8
++    urshr       v15.8h, v3.8h, #8
++    raddhn      v0.8b, v0.8h, v12.8h
++    raddhn      v1.8b, v1.8h, v13.8h
++    raddhn      v2.8b, v2.8h, v14.8h
++    raddhn      v3.8b, v3.8h, v15.8h
++    uqadd       v28.8b, v0.8b, v4.8b /* saturating add onto dst; the whole result is already complete in _head */
++    uqadd       v29.8b, v1.8b, v5.8b
++    uqadd       v30.8b, v2.8b, v6.8b
++    uqadd       v31.8b, v3.8b, v7.8b
++.endm
++
++.macro pixman_composite_add_n_8_8_process_pixblock_tail /* empty: _head leaves nothing pending */
++.endm
++
++/* TODO: expand macros and do better instructions scheduling */
++.macro pixman_composite_add_n_8_8_process_pixblock_tail_head
++    pixman_composite_add_n_8_8_process_pixblock_tail
++    st1         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 /* store before _head overwrites v28-v31 */
++    ld1         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
++    fetch_mask_pixblock
++    cache_preload 32, 32
++    pixman_composite_add_n_8_8_process_pixblock_head
++.endm
++
++.macro pixman_composite_add_n_8_8_init
++    mov         v11.s[0], w4 /* w4 presumably carries the solid source colour - confirm against the caller */
++    dup         v11.8b, v11.b[3] /* replicate byte 3 of w4 into all 8 lanes of v11 */
++.endm
++
++.macro pixman_composite_add_n_8_8_cleanup /* intentionally empty */
++.endm
++
++generate_composite_function \
++    pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, /* presumably src, mask, dst bpp - confirm against the macro definition */ \
++    FLAG_DST_READWRITE, \
++    32, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    pixman_composite_add_n_8_8_init, \
++    pixman_composite_add_n_8_8_cleanup, \
++    pixman_composite_add_n_8_8_process_pixblock_head, \
++    pixman_composite_add_n_8_8_process_pixblock_tail, \
++    pixman_composite_add_n_8_8_process_pixblock_tail_head
++
++/******************************************************************************/
++
++.macro pixman_composite_add_8_8_8_process_pixblock_head
++    /* expecting source data in {v0, v1, v2, v3} */
++    /* destination data in {v4, v5, v6, v7} */
++    /* mask in {v24, v25, v26, v27} */
++    umull       v8.8h, v24.8b, v0.8b /* 16-bit products mask * src */
++    umull       v9.8h, v25.8b, v1.8b
++    umull       v10.8h, v26.8b, v2.8b
++    umull       v11.8h, v27.8b, v3.8b
++    urshr       v0.8h, v8.8h, #8 /* v0/v1 are reused as scratch: the source bytes were consumed by the umull above */
++    urshr       v1.8h, v9.8h, #8
++    urshr       v12.8h, v10.8h, #8
++    urshr       v13.8h, v11.8h, #8
++    raddhn      v0.8b, v0.8h, v8.8h /* urshr + raddhn pair: rounded division of the product by 255 */
++    raddhn      v1.8b, v1.8h, v9.8h
++    raddhn      v2.8b, v12.8h, v10.8h
++    raddhn      v3.8b, v13.8h, v11.8h
++    uqadd       v28.8b, v0.8b, v4.8b /* saturating add onto dst; the whole result is already complete in _head */
++    uqadd       v29.8b, v1.8b, v5.8b
++    uqadd       v30.8b, v2.8b, v6.8b
++    uqadd       v31.8b, v3.8b, v7.8b
++.endm
++
++.macro pixman_composite_add_8_8_8_process_pixblock_tail /* empty: _head leaves nothing pending */
++.endm
++
++/* TODO: expand macros and do better instructions scheduling */
++.macro pixman_composite_add_8_8_8_process_pixblock_tail_head
++    pixman_composite_add_8_8_8_process_pixblock_tail
++    st1         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 /* store before _head overwrites v28-v31 */
++    ld1         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
++    fetch_mask_pixblock
++    fetch_src_pixblock
++    cache_preload 32, 32
++    pixman_composite_add_8_8_8_process_pixblock_head
++.endm
++
++.macro pixman_composite_add_8_8_8_init /* intentionally empty */
++.endm
++
++.macro pixman_composite_add_8_8_8_cleanup /* intentionally empty */
++.endm
++
++generate_composite_function \
++    pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, /* presumably src, mask, dst bpp - confirm against the macro definition */ \
++    FLAG_DST_READWRITE, \
++    32, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    pixman_composite_add_8_8_8_init, \
++    pixman_composite_add_8_8_8_cleanup, \
++    pixman_composite_add_8_8_8_process_pixblock_head, \
++    pixman_composite_add_8_8_8_process_pixblock_tail, \
++    pixman_composite_add_8_8_8_process_pixblock_tail_head
++
++/******************************************************************************/
++
++.macro pixman_composite_add_8888_8888_8888_process_pixblock_head
++    /* expecting source data in {v0, v1, v2, v3} */
++    /* destination data in {v4, v5, v6, v7} */
++    /* mask in {v24, v25, v26, v27} */
++    umull       v8.8h,  v27.8b, v0.8b /* of the mask set only v27 is read: every source channel is scaled by it */
++    umull       v9.8h,  v27.8b, v1.8b
++    umull       v10.8h, v27.8b, v2.8b
++    umull       v11.8h, v27.8b, v3.8b
++    /* 1 cycle bubble */
++    ursra       v8.8h,  v8.8h,  #8 /* t += rounded (t >> 8); with the rshrn #8 in _tail this is a rounded division by 255 */
++    ursra       v9.8h,  v9.8h,  #8
++    ursra       v10.8h, v10.8h, #8
++    ursra       v11.8h, v11.8h, #8
++.endm
++
++.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
++    /* 2 cycle bubble */
++    rshrn       v28.8b, v8.8h,  #8 /* rounded narrowing shift completes the division started in _head */
++    rshrn       v29.8b, v9.8h,  #8
++    rshrn       v30.8b, v10.8h, #8
++    rshrn       v31.8b, v11.8h, #8
++    uqadd       v28.8b, v4.8b,  v28.8b /* saturating add onto dst; result block is v28-v31 */
++    uqadd       v29.8b, v5.8b,  v29.8b
++    uqadd       v30.8b, v6.8b,  v30.8b
++    uqadd       v31.8b, v7.8b,  v31.8b
++.endm
++
++.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head /* deeper-indented lines repeat _tail; the rest repeats _head for the next block */
++    fetch_src_pixblock
++        rshrn       v28.8b, v8.8h,  #8
++    fetch_mask_pixblock
++        rshrn       v29.8b, v9.8h,  #8
++    umull       v8.8h,  v27.8b, v0.8b /* v8 may be overwritten: its rshrn above has already been issued */
++        rshrn       v30.8b, v10.8h, #8
++    umull       v9.8h,  v27.8b, v1.8b
++        rshrn       v31.8b, v11.8h, #8
++    umull       v10.8h, v27.8b, v2.8b
++    umull       v11.8h, v27.8b, v3.8b
++        uqadd       v28.8b, v4.8b,  v28.8b
++        uqadd       v29.8b, v5.8b,  v29.8b
++        uqadd       v30.8b, v6.8b,  v30.8b
++        uqadd       v31.8b, v7.8b,  v31.8b
++    ursra       v8.8h,  v8.8h,  #8
++    ld4         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 /* reload dst only after the uqadd above consumed the old v4-v7 */
++    ursra       v9.8h,  v9.8h,  #8
++        st4         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
++    ursra       v10.8h, v10.8h, #8
++
++    cache_preload 8, 8
++
++    ursra       v11.8h, v11.8h, #8
++.endm
++
++generate_composite_function \
++    pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, /* presumably src, mask, dst bpp - confirm against the macro definition */ \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    10, /* prefetch distance */ \
++    default_init, \
++    default_cleanup, \
++    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
++    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
++    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    4,  /* dst_r_basereg */ \
++    0,  /* src_basereg   */ \
++    24  /* mask_basereg  */
++
++generate_composite_function_single_scanline \
++    pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, /* presumably src, mask, dst bpp - confirm against the macro definition */ \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block; no prefetch distance argument follows in this variant */ \
++    default_init, \
++    default_cleanup, \
++    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
++    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
++    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    4,  /* dst_r_basereg */ \
++    0,  /* src_basereg   */ \
++    24  /* mask_basereg  */
++
++/******************************************************************************/
++
++generate_composite_function \
++    pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, /* presumably src, mask, dst bpp - confirm against the macro definition */ \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    default_init, \
++    default_cleanup, \
++    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
++    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
++    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    4,  /* dst_r_basereg */ \
++    0,  /* src_basereg   */ \
++    27  /* mask_basereg: v27 is the only mask register the shared add_8888_8888_8888 macros read */
++
++/******************************************************************************/
++
++.macro pixman_composite_add_n_8_8888_init /* splats the four bytes of w4 into v0-v3, the source registers of the shared macros */
++    mov         v3.s[0], w4 /* w4 presumably carries the solid source colour - confirm against the caller */
++    dup         v0.8b, v3.b[0]
++    dup         v1.8b, v3.b[1]
++    dup         v2.8b, v3.b[2]
++    dup         v3.8b, v3.b[3] /* v3 must be splatted last because it also holds the staged value */
++.endm
++
++.macro pixman_composite_add_n_8_8888_cleanup /* intentionally empty */
++.endm
++
++generate_composite_function \
++    pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, /* presumably src, mask, dst bpp - confirm against the macro definition */ \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    pixman_composite_add_n_8_8888_init, \
++    pixman_composite_add_n_8_8888_cleanup, \
++    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
++    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
++    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    4,  /* dst_r_basereg */ \
++    0,  /* src_basereg   */ \
++    27  /* mask_basereg: v27 is the only mask register the shared add_8888_8888_8888 macros read */
++
++/******************************************************************************/
++
++.macro pixman_composite_add_8888_n_8888_init /* fills v27, the mask register read by the shared add_8888_8888_8888 macros */
++    mov         v27.s[0], w6 /* w6 presumably carries the solid mask value - confirm against the caller */
++    dup         v27.8b, v27.b[3] /* replicate byte 3 of w6 into all 8 lanes of v27 */
++.endm
++
++.macro pixman_composite_add_8888_n_8888_cleanup /* intentionally empty */
++.endm
++
++generate_composite_function \
++    pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, /* presumably src, mask, dst bpp - confirm against the macro definition */ \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    pixman_composite_add_8888_n_8888_init, \
++    pixman_composite_add_8888_n_8888_cleanup, \
++    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
++    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
++    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    4,  /* dst_r_basereg */ \
++    0,  /* src_basereg   */ \
++    27  /* mask_basereg  */
++
++/******************************************************************************/
++
++.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
++    /* expecting source data in {v0, v1, v2, v3} */
++    /* destination data in {v4, v5, v6, v7} */
++    /* solid mask is in v15 */
++
++    /* 'in' */
++    umull       v11.8h, v15.8b, v3.8b /* 16-bit products src * v15 for all four channels */
++    umull       v10.8h, v15.8b, v2.8b
++    umull       v9.8h,  v15.8b, v1.8b
++    umull       v8.8h,  v15.8b, v0.8b
++    urshr       v16.8h, v11.8h, #8 /* urshr #8 followed by raddhn = rounded division of the product by 255 */
++    urshr       v14.8h, v10.8h, #8
++    urshr       v13.8h,  v9.8h, #8
++    urshr       v12.8h,  v8.8h, #8
++    raddhn      v3.8b, v11.8h, v16.8h /* v0-v3 now hold the masked source */
++    raddhn      v2.8b, v10.8h, v14.8h
++    raddhn      v1.8b,  v9.8h, v13.8h
++    raddhn      v0.8b,  v8.8h, v12.8h
++    mvn         v24.8b, v3.8b  /* get inverted alpha */
++    /* now do alpha blending */
++    umull       v8.8h, v24.8b, v4.8b /* start dst * inverted alpha; the /255 step is left to the _tail macro */
++    umull       v9.8h, v24.8b, v5.8b
++    umull       v10.8h, v24.8b, v6.8b
++    umull       v11.8h, v24.8b, v7.8b
++.endm
++
++.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
++    urshr       v16.8h, v8.8h, #8 /* urshr + raddhn pair: rounded division of the v8-v11 products by 255 */
++    urshr       v17.8h, v9.8h, #8
++    urshr       v18.8h, v10.8h, #8
++    urshr       v19.8h, v11.8h, #8
++    raddhn      v28.8b, v16.8h, v8.8h /* result block is v28-v31 */
++    raddhn      v29.8b, v17.8h, v9.8h
++    raddhn      v30.8b, v18.8h, v10.8h
++    raddhn      v31.8b, v19.8h, v11.8h
++.endm
++
++/* TODO: expand macros and do better instructions scheduling */
++.macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head /* reuses the 8888_n_8888 head/tail and adds a per-block mask fetch */
++    ld4        {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 /* next dst block; _tail does not read v4-v7 */
++    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
++    fetch_src_pixblock
++    cache_preload 8, 8
++    fetch_mask_pixblock
++    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
++    st4        {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 /* _head does not write v28-v31, so the store can follow it */
++.endm
++
++generate_composite_function_single_scanline \
++    pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, /* presumably src, mask, dst bpp - confirm against the macro definition */ \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    default_init_need_all_regs, \
++    default_cleanup_need_all_regs, \
++    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \
++    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \
++    pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head, /* explicit comma: do not rely on whitespace to separate macro arguments */ \
++    28, /* dst_w_basereg */ \
++    4,  /* dst_r_basereg */ \
++    0,  /* src_basereg   */ \
++    12  /* mask_basereg: presumably places the mask block in v12-v15, so v15 (read by _head) is its last channel - confirm */
++
++/******************************************************************************/
++
++.macro pixman_composite_over_8888_n_8888_process_pixblock_head /* identical to the out_reverse head */
++    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
++.endm
++
++.macro pixman_composite_over_8888_n_8888_process_pixblock_tail /* out_reverse tail plus a saturating add of the masked source */
++    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
++    uqadd       v28.8b, v0.8b, v28.8b /* v0-v3 were left by the out_reverse head */
++    uqadd       v29.8b, v1.8b, v29.8b
++    uqadd       v30.8b, v2.8b, v30.8b
++    uqadd       v31.8b, v3.8b, v31.8b
++.endm
++
++/* TODO: expand macros and do better instructions scheduling */
++.macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head /* no fetch_mask_pixblock here: v15 is set once by _init */
++    ld4        {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
++    pixman_composite_over_8888_n_8888_process_pixblock_tail
++    fetch_src_pixblock
++    cache_preload 8, 8
++    pixman_composite_over_8888_n_8888_process_pixblock_head
++    st4        {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 /* _head does not write v28-v31, so the store can follow it */
++.endm
++
++.macro pixman_composite_over_8888_n_8888_init
++    mov         v15.s[0], w6 /* w6 presumably carries the solid mask value - confirm against the caller */
++    dup         v15.8b, v15.b[3] /* replicate byte 3 of w6 into all 8 lanes of v15 */
++.endm
++
++.macro pixman_composite_over_8888_n_8888_cleanup /* intentionally empty */
++.endm
++
++generate_composite_function \
++    pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, /* presumably src, mask, dst bpp - confirm against the macro definition */ \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    pixman_composite_over_8888_n_8888_init, \
++    pixman_composite_over_8888_n_8888_cleanup, \
++    pixman_composite_over_8888_n_8888_process_pixblock_head, \
++    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
++    pixman_composite_over_8888_n_8888_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    4,  /* dst_r_basereg */ \
++    0,  /* src_basereg   */ \
++    12  /* mask_basereg  */
++
++/******************************************************************************/
++
++/* TODO: expand macros and do better instructions scheduling */
++.macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head /* over_8888_n_8888 tail/head plus a per-block mask fetch */
++    ld4        {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 /* next dst block; the tail does not read v4-v7 */
++    pixman_composite_over_8888_n_8888_process_pixblock_tail
++    fetch_src_pixblock
++    cache_preload 8, 8
++    fetch_mask_pixblock
++    pixman_composite_over_8888_n_8888_process_pixblock_head
++    st4        {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 /* the head does not write v28-v31, so the store can follow it */
++.endm
++
++generate_composite_function \
++    pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, /* presumably src, mask, dst bpp - confirm against the macro definition */ \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    default_init_need_all_regs, \
++    default_cleanup_need_all_regs, \
++    pixman_composite_over_8888_n_8888_process_pixblock_head, \
++    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
++    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, /* explicit comma: do not rely on whitespace to separate macro arguments */ \
++    28, /* dst_w_basereg */ \
++    4,  /* dst_r_basereg */ \
++    0,  /* src_basereg   */ \
++    12  /* mask_basereg: presumably places the mask block in v12-v15, so v15 (read by the head) is its last channel - confirm */
++
++generate_composite_function_single_scanline \
++    pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, /* presumably src, mask, dst bpp - confirm against the macro definition */ \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    default_init_need_all_regs, \
++    default_cleanup_need_all_regs, \
++    pixman_composite_over_8888_n_8888_process_pixblock_head, \
++    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
++    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, /* explicit comma: do not rely on whitespace to separate macro arguments */ \
++    28, /* dst_w_basereg */ \
++    4,  /* dst_r_basereg */ \
++    0,  /* src_basereg   */ \
++    12  /* mask_basereg: presumably places the mask block in v12-v15, so v15 (read by the head) is its last channel - confirm */
++
++/******************************************************************************/
++
++/* TODO: expand macros and do better instructions scheduling */
++.macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head /* same body as the over_8888_8888_8888 tail_head macro */
++    ld4        {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 /* next dst block; the tail does not read v4-v7 */
++    pixman_composite_over_8888_n_8888_process_pixblock_tail
++    fetch_src_pixblock
++    cache_preload 8, 8
++    fetch_mask_pixblock
++    pixman_composite_over_8888_n_8888_process_pixblock_head
++    st4        {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 /* the head does not write v28-v31, so the store can follow it */
++.endm
++
++generate_composite_function \
++    pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, /* presumably src, mask, dst bpp - confirm against the macro definition */ \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    default_init_need_all_regs, \
++    default_cleanup_need_all_regs, \
++    pixman_composite_over_8888_n_8888_process_pixblock_head, \
++    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
++    pixman_composite_over_8888_8_8888_process_pixblock_tail_head, /* explicit comma: do not rely on whitespace to separate macro arguments */ \
++    28, /* dst_w_basereg */ \
++    4,  /* dst_r_basereg */ \
++    0,  /* src_basereg   */ \
++    15  /* mask_basereg: v15 is the mask register read by the shared head macro */
++
++/******************************************************************************/
++
++.macro pixman_composite_src_0888_0888_process_pixblock_head /* empty: pixels are stored exactly as fetched */
++.endm
++
++.macro pixman_composite_src_0888_0888_process_pixblock_tail /* empty, see _head */
++.endm
++
++.macro pixman_composite_src_0888_0888_process_pixblock_tail_head
++    st3     {v0.8b, v1.8b, v2.8b}, [DST_W], #24 /* store the current 8 pixels (3 bytes each) before v0-v2 are refilled */
++    fetch_src_pixblock
++    cache_preload 8, 8
++.endm
++
++generate_composite_function \
++    pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, /* presumably src, mask, dst bpp - confirm against the macro definition */ \
++    FLAG_DST_WRITEONLY, \
++    8, /* number of pixels, processed in a single block */ \
++    10, /* prefetch distance */ \
++    default_init, \
++    default_cleanup, \
++    pixman_composite_src_0888_0888_process_pixblock_head, \
++    pixman_composite_src_0888_0888_process_pixblock_tail, \
++    pixman_composite_src_0888_0888_process_pixblock_tail_head, \
++    0, /* dst_w_basereg */ \
++    0, /* dst_r_basereg */ \
++    0, /* src_basereg   */ \
++    0  /* mask_basereg  */
++
++/******************************************************************************/
++
++.macro pixman_composite_src_0888_8888_rev_process_pixblock_head
++    mov    v31.8b, v2.8b /* v31 is scratch for the exchange */
++    mov    v2.8b, v0.8b
++    mov    v0.8b, v31.8b /* net effect: v0 and v2 exchanged */
++.endm
++
++.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail /* empty: emits no instructions */
++.endm
++
++.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
++    st4    {v0.8b, v1.8b, v2.8b, v3.8b}, [DST_W], #32 /* interleaving store of v0-v3, DST_W += 32 */
++    fetch_src_pixblock
++    mov    v31.8b, v2.8b /* same v0/v2 exchange as in the head macro, v31 is scratch */
++    mov    v2.8b, v0.8b
++    mov    v0.8b, v31.8b
++    cache_preload 8, 8
++.endm
++
++.macro pixman_composite_src_0888_8888_rev_init
++    eor    v3.8b, v3.8b, v3.8b /* v3 = 0; v3 is the fourth register stored by st4 in the tail_head macro */
++.endm
++
++generate_composite_function \
++    pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, /* presumably src/mask/dst bpp - TODO confirm */ \
++    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    10, /* prefetch distance */ \
++    pixman_composite_src_0888_8888_rev_init, \
++    default_cleanup, \
++    pixman_composite_src_0888_8888_rev_process_pixblock_head, \
++    pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
++    pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
++    0, /* dst_w_basereg */ \
++    0, /* dst_r_basereg */ \
++    0, /* src_basereg   */ \
++    0  /* mask_basereg  */
++
++/******************************************************************************/
++
++.macro pixman_composite_src_0888_0565_rev_process_pixblock_head
++    ushll       v8.8h, v1.8b, #7 /* widen v1 to 16 bit, shift left by 7 */
++    sli         v8.8h, v8.8h, #1 /* one more left shift: each v1 byte now sits in bits 15:8 of v8 */
++    ushll       v9.8h, v2.8b, #7
++    sli         v9.8h, v9.8h, #1 /* same for v2 into v9 */
++.endm
++
++.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
++    ushll       v14.8h, v0.8b, #7
++    sli         v14.8h, v14.8h, #1 /* each v0 byte now sits in bits 15:8 of v14 */
++    sri         v14.8h, v8.8h, #5 /* insert v8 >> 5, keeping the top 5 bits of v14 */
++    sri         v14.8h, v9.8h, #11 /* insert v9 >> 11, keeping the top 11 bits of v14 */
++    mov         v28.d[0], v14.d[0] /* low 64 bits of v14 -> v28 */
++    mov         v29.d[0], v14.d[1] /* high 64 bits of v14 -> v29 */
++.endm
++
++.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
++        ushll       v14.8h, v0.8b, #7 /* deeper-indented lines repeat the tail macro instructions */
++        sli         v14.8h, v14.8h, #1
++    fetch_src_pixblock
++        sri         v14.8h, v8.8h, #5
++        sri         v14.8h, v9.8h, #11
++        mov         v28.d[0], v14.d[0]
++        mov         v29.d[0], v14.d[1]
++    ushll       v8.8h, v1.8b, #7 /* shallower-indented lines repeat the head macro instructions */
++    sli         v8.8h, v8.8h, #1
++        st1     {v14.8h}, [DST_W], #16 /* store 16 bytes from v14, DST_W += 16 */
++    ushll       v9.8h, v2.8b, #7
++    sli         v9.8h, v9.8h, #1
++.endm
++
++generate_composite_function \
++    pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, /* presumably src/mask/dst bpp - TODO confirm */ \
++    FLAG_DST_WRITEONLY, \
++    8, /* number of pixels, processed in a single block */ \
++    10, /* prefetch distance */ \
++    default_init, \
++    default_cleanup, \
++    pixman_composite_src_0888_0565_rev_process_pixblock_head, \
++    pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
++    pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    0, /* dst_r_basereg */ \
++    0, /* src_basereg   */ \
++    0  /* mask_basereg  */
++
++/******************************************************************************/
++
++.macro pixman_composite_src_pixbuf_8888_process_pixblock_head
++    umull       v8.8h, v3.8b, v0.8b /* widening 8x8 -> 16 bit products of v0, v1, v2 with v3 */
++    umull       v9.8h, v3.8b, v1.8b
++    umull       v10.8h, v3.8b, v2.8b
++.endm
++
++.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
++    urshr       v11.8h, v8.8h, #8 /* rounding shift right by 8 */
++    mov         v30.8b, v31.8b /* exchange v3 and v31, v30 is scratch */
++    mov         v31.8b, v3.8b
++    mov         v3.8b, v30.8b
++    urshr       v12.8h, v9.8h, #8
++    urshr       v13.8h, v10.8h, #8
++    raddhn      v30.8b, v11.8h, v8.8h /* rounding add, keep the high byte: v8 product -> v30 */
++    raddhn      v29.8b, v12.8h, v9.8h /* v9 product -> v29 */
++    raddhn      v28.8b, v13.8h, v10.8h /* v10 product -> v28 */
++.endm
++
++.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
++        urshr       v11.8h, v8.8h, #8 /* deeper-indented lines repeat the tail macro instructions */
++        mov         v30.8b, v31.8b /* exchange v3 and v31, v30 is scratch */
++        mov         v31.8b, v3.8b
++        mov         v3.8b, v30.8b /* complete the exchange from the scratch copy, as the tail macro does */
++        urshr       v12.8h, v9.8h, #8
++        urshr       v13.8h, v10.8h, #8
++    fetch_src_pixblock
++        raddhn      v30.8b, v11.8h, v8.8h
++                                    PF add PF_X, PF_X, #8
++                                    PF tst PF_CTL, #0xF
++                                    PF beq 10f
++                                    PF add PF_X, PF_X, #8
++                                    PF sub PF_CTL, PF_CTL, #1
++10:
++        raddhn      v29.8b, v12.8h, v9.8h
++        raddhn      v28.8b, v13.8h, v10.8h
++    umull       v8.8h, v3.8b, v0.8b /* shallower-indented lines repeat the head macro instructions */
++    umull       v9.8h, v3.8b, v1.8b
++    umull       v10.8h, v3.8b, v2.8b
++         st4    {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 /* interleaving store of v28-v31, DST_W += 32 */
++                                    PF cmp PF_X, ORIG_W
++                                    PF lsl DUMMY, PF_X, src_bpp_shift
++                                    PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
++                                    PF ble 10f
++                                    PF sub PF_X, PF_X, ORIG_W
++                                    PF subs PF_CTL, PF_CTL, #0x10
++                                    PF ble 10f
++                                    PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
++                                    PF ldrsb DUMMY, [PF_SRC, DUMMY]
++                                    PF add PF_SRC, PF_SRC, #1
++10:
++.endm
++
++generate_composite_function \
++    pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, /* presumably src/mask/dst bpp - TODO confirm */ \
++    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    10, /* prefetch distance */ \
++    default_init, \
++    default_cleanup, \
++    pixman_composite_src_pixbuf_8888_process_pixblock_head, \
++    pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
++    pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    0, /* dst_r_basereg */ \
++    0, /* src_basereg   */ \
++    0  /* mask_basereg  */
++
++/******************************************************************************/
++
++.macro pixman_composite_src_rpixbuf_8888_process_pixblock_head
++    umull       v8.8h, v3.8b, v0.8b /* widening 8x8 -> 16 bit products of v0, v1, v2 with v3 */
++    umull       v9.8h, v3.8b, v1.8b
++    umull       v10.8h, v3.8b, v2.8b
++.endm
++
++.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail
++    urshr       v11.8h, v8.8h, #8 /* rounding shift right by 8 */
++    mov         v30.8b, v31.8b /* exchange v3 and v31, v30 is scratch */
++    mov         v31.8b, v3.8b
++    mov         v3.8b, v30.8b
++    urshr       v12.8h, v9.8h, #8
++    urshr       v13.8h, v10.8h, #8
++    raddhn      v28.8b, v11.8h, v8.8h /* rounding add, keep the high byte: v8 product -> v28 */
++    raddhn      v29.8b, v12.8h, v9.8h /* v9 product -> v29 */
++    raddhn      v30.8b, v13.8h, v10.8h /* v10 product -> v30 */
++.endm
++
++.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head
++        urshr       v11.8h, v8.8h, #8 /* deeper-indented lines repeat the tail macro instructions */
++        mov         v30.8b, v31.8b /* exchange v3 and v31, v30 is scratch */
++        mov         v31.8b, v3.8b
++        mov         v3.8b, v30.8b
++        urshr       v12.8h, v9.8h, #8
++        urshr       v13.8h, v10.8h, #8
++    fetch_src_pixblock
++        raddhn      v28.8b, v11.8h, v8.8h
++                                    PF add PF_X, PF_X, #8
++                                    PF tst PF_CTL, #0xF
++                                    PF beq 10f
++                                    PF add PF_X, PF_X, #8
++                                    PF sub PF_CTL, PF_CTL, #1
++10:
++        raddhn      v29.8b, v12.8h, v9.8h
++        raddhn      v30.8b, v13.8h, v10.8h
++    umull       v8.8h, v3.8b, v0.8b /* shallower-indented lines repeat the head macro instructions */
++    umull       v9.8h, v3.8b, v1.8b
++    umull       v10.8h, v3.8b, v2.8b
++         st4    {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 /* interleaving store of v28-v31, DST_W += 32 */
++                                    PF cmp PF_X, ORIG_W
++                                    PF lsl DUMMY, PF_X, src_bpp_shift
++                                    PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
++                                    PF ble 10f
++                                    PF sub PF_X, PF_X, ORIG_W
++                                    PF subs PF_CTL, PF_CTL, #0x10
++                                    PF ble 10f
++                                    PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
++                                    PF ldrsb DUMMY, [PF_SRC, DUMMY]
++                                    PF add PF_SRC, PF_SRC, #1
++10:
++.endm
++
++generate_composite_function \
++    pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, /* presumably src/mask/dst bpp - TODO confirm */ \
++    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    10, /* prefetch distance */ \
++    default_init, \
++    default_cleanup, \
++    pixman_composite_src_rpixbuf_8888_process_pixblock_head, \
++    pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \
++    pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    0, /* dst_r_basereg */ \
++    0, /* src_basereg   */ \
++    0  /* mask_basereg  */
++
++/******************************************************************************/
++
++.macro pixman_composite_over_0565_8_0565_process_pixblock_head
++    /* mask is in v15 */
++    mov         v4.d[0], v8.d[0] /* v4 = low halves of v8 and v9 */
++    mov         v4.d[1], v9.d[0]
++    mov         v13.d[0], v10.d[0] /* v13 = low halves of v10 and v11 */
++    mov         v13.d[1], v11.d[0]
++    convert_0565_to_x888 v4, v2, v1, v0
++    convert_0565_to_x888 v13, v6, v5, v4
++    /* source pixel data is in      {v0, v1, v2, XX} */
++    /* destination pixel data is in {v4, v5, v6, XX} */
++    mvn         v7.8b,  v15.8b /* v7 = bitwise NOT of the mask */
++    umull       v10.8h, v15.8b, v2.8b /* source channels times mask */
++    umull       v9.8h,  v15.8b, v1.8b
++    umull       v8.8h,  v15.8b, v0.8b
++    umull       v11.8h, v7.8b,  v4.8b /* destination channels times inverted mask */
++    umull       v12.8h, v7.8b,  v5.8b
++    umull       v13.8h, v7.8b,  v6.8b
++    urshr       v19.8h, v10.8h, #8 /* rounding shift right by 8 */
++    urshr       v18.8h, v9.8h,  #8
++    urshr       v17.8h, v8.8h,  #8
++    raddhn      v2.8b,  v10.8h, v19.8h /* rounding add, keep the high byte */
++    raddhn      v1.8b,  v9.8h,  v18.8h
++    raddhn      v0.8b,  v8.8h,  v17.8h
++.endm
++
++.macro pixman_composite_over_0565_8_0565_process_pixblock_tail
++    urshr       v17.8h, v11.8h,  #8 /* rounding shift right by 8 of the v11-v13 products */
++    urshr       v18.8h, v12.8h,  #8
++    urshr       v19.8h, v13.8h,  #8
++    raddhn      v28.8b, v17.8h, v11.8h /* rounding add, keep the high byte */
++    raddhn      v29.8b, v18.8h, v12.8h
++    raddhn      v30.8b, v19.8h, v13.8h
++    uqadd       v0.8b,  v0.8b,  v28.8b /* saturating add into v0-v2 */
++    uqadd       v1.8b,  v1.8b,  v29.8b
++    uqadd       v2.8b,  v2.8b,  v30.8b
++    /* 32bpp result is in {v0, v1, v2, XX} */
++    convert_8888_to_0565 v2, v1, v0, v14, v30, v13
++    mov         v28.d[0], v14.d[0] /* low 64 bits of v14 -> v28 */
++    mov         v29.d[0], v14.d[1] /* high 64 bits of v14 -> v29 */
++.endm
++
++/* Invokes the tail macro, the fetch helpers and the head macro back to back. TODO: expand macros and do better instruction scheduling */
++.macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head
++    fetch_mask_pixblock
++    pixman_composite_over_0565_8_0565_process_pixblock_tail
++    fetch_src_pixblock
++    ld1        {v10.4h, v11.4h}, [DST_R], #16 /* load 16 bytes into v10 and v11, DST_R += 16 */
++    cache_preload 8, 8
++    pixman_composite_over_0565_8_0565_process_pixblock_head
++    st1        {v14.8h}, [DST_W], #16 /* store 16 bytes from v14, DST_W += 16 */
++.endm
++
++generate_composite_function \
++    pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, /* presumably src/mask/dst bpp - TODO confirm */ \
++    FLAG_DST_READWRITE, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    default_init_need_all_regs, \
++    default_cleanup_need_all_regs, \
++    pixman_composite_over_0565_8_0565_process_pixblock_head, \
++    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
++    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    10,  /* dst_r_basereg */ \
++    8,  /* src_basereg   */ \
++    15  /* mask_basereg  */
++
++/******************************************************************************/
++
++.macro pixman_composite_over_0565_n_0565_init
++    mov         v15.s[0], w6 /* w6 presumably carries the solid mask argument - TODO confirm */
++    dup         v15.8b, v15.b[3] /* broadcast byte 3 of that word to all 8 byte lanes of v15 */
++.endm
++
++.macro pixman_composite_over_0565_n_0565_cleanup /* empty: emits no instructions */
++.endm
++
++generate_composite_function \
++    pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, /* presumably src/mask/dst bpp - TODO confirm */ \
++    FLAG_DST_READWRITE, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    pixman_composite_over_0565_n_0565_init, \
++    pixman_composite_over_0565_n_0565_cleanup, \
++    pixman_composite_over_0565_8_0565_process_pixblock_head, /* reuses the over_0565_8_0565 pixblock macros */ \
++    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
++    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    10,  /* dst_r_basereg */ \
++    8,  /* src_basereg   */ \
++    15  /* mask_basereg  */
++
++/******************************************************************************/
++
++.macro pixman_composite_add_0565_8_0565_process_pixblock_head
++    /* mask is in v15 */
++    mov         v4.d[0], v8.d[0] /* v4 = low halves of v8 and v9 */
++    mov         v4.d[1], v9.d[0]
++    mov         v13.d[0], v10.d[0] /* v13 = low halves of v10 and v11 */
++    mov         v13.d[1], v11.d[0]
++    convert_0565_to_x888 v4,  v2, v1, v0
++    convert_0565_to_x888 v13, v6, v5, v4
++    /* source pixel data is in      {v0, v1, v2, XX} */
++    /* destination pixel data is in {v4, v5, v6, XX} */
++    umull       v9.8h,  v15.8b, v2.8b /* source channels times mask */
++    umull       v8.8h,  v15.8b, v1.8b
++    umull       v7.8h,  v15.8b, v0.8b
++    urshr       v12.8h, v9.8h,  #8 /* rounding shift right by 8 */
++    urshr       v11.8h, v8.8h,  #8
++    urshr       v10.8h, v7.8h,  #8
++    raddhn      v2.8b,  v9.8h,  v12.8h /* rounding add, keep the high byte */
++    raddhn      v1.8b,  v8.8h,  v11.8h
++    raddhn      v0.8b,  v7.8h,  v10.8h
++.endm
++
++.macro pixman_composite_add_0565_8_0565_process_pixblock_tail
++    uqadd       v0.8b,  v0.8b,  v4.8b /* saturating add of v4-v6 into v0-v2 */
++    uqadd       v1.8b,  v1.8b,  v5.8b
++    uqadd       v2.8b,  v2.8b,  v6.8b
++    /* 32bpp result is in {v0, v1, v2, XX} */
++    convert_8888_to_0565 v2, v1, v0, v14, v30, v13
++    mov         v28.d[0], v14.d[0] /* low 64 bits of v14 -> v28 */
++    mov         v29.d[0], v14.d[1] /* high 64 bits of v14 -> v29 */
++.endm
++
++/* Invokes the tail macro, the fetch helpers and the head macro back to back. TODO: expand macros and do better instruction scheduling */
++.macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
++    fetch_mask_pixblock
++    pixman_composite_add_0565_8_0565_process_pixblock_tail
++    fetch_src_pixblock
++    ld1        {v10.4h, v11.4h}, [DST_R], #16 /* load 16 bytes into v10 and v11, DST_R += 16 */
++    cache_preload 8, 8
++    pixman_composite_add_0565_8_0565_process_pixblock_head
++    st1        {v14.8h}, [DST_W], #16 /* store 16 bytes from v14, DST_W += 16 */
++.endm
++
++generate_composite_function \
++    pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, /* presumably src/mask/dst bpp - TODO confirm */ \
++    FLAG_DST_READWRITE, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    default_init_need_all_regs, \
++    default_cleanup_need_all_regs, \
++    pixman_composite_add_0565_8_0565_process_pixblock_head, \
++    pixman_composite_add_0565_8_0565_process_pixblock_tail, \
++    pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    10, /* dst_r_basereg */ \
++    8,  /* src_basereg   */ \
++    15  /* mask_basereg  */
++
++/******************************************************************************/
++
++.macro pixman_composite_out_reverse_8_0565_process_pixblock_head
++    /* mask is in v15 */
++    mov         v12.d[0], v10.d[0] /* v12 = low halves of v10 and v11 */
++    mov         v12.d[1], v11.d[0]
++    convert_0565_to_x888 v12, v6, v5, v4
++    /* destination pixel data is in {v4, v5, v6, xx} */
++    mvn         v24.8b, v15.8b /* get inverted alpha */
++    /* now do alpha blending */
++    umull       v8.8h,  v24.8b, v4.8b /* widening products of v4-v6 with the inverted alpha */
++    umull       v9.8h,  v24.8b, v5.8b
++    umull       v10.8h, v24.8b, v6.8b
++.endm
++
++.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail
++    urshr       v11.8h, v8.8h, #8 /* rounding shift right by 8 */
++    urshr       v12.8h, v9.8h, #8
++    urshr       v13.8h, v10.8h, #8
++    raddhn      v0.8b, v11.8h, v8.8h /* rounding add, keep the high byte */
++    raddhn      v1.8b, v12.8h, v9.8h
++    raddhn      v2.8b, v13.8h, v10.8h
++    /* 32bpp result is in {v0, v1, v2, XX} */
++    convert_8888_to_0565 v2, v1, v0, v14, v12, v3
++    mov         v28.d[0], v14.d[0] /* low 64 bits of v14 -> v28 */
++    mov         v29.d[0], v14.d[1] /* high 64 bits of v14 -> v29 */
++.endm
++
++/* Invokes the tail macro, the fetch helpers and the head macro back to back. TODO: expand macros and do better instruction scheduling */
++.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head
++    fetch_src_pixblock
++    pixman_composite_out_reverse_8_0565_process_pixblock_tail
++    ld1        {v10.4h, v11.4h}, [DST_R], #16 /* load 16 bytes into v10 and v11, DST_R += 16 */
++    cache_preload 8, 8
++    pixman_composite_out_reverse_8_0565_process_pixblock_head
++    st1        {v14.8h}, [DST_W], #16 /* store 16 bytes from v14, DST_W += 16 */
++.endm
++
++generate_composite_function \
++    pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, /* presumably src/mask/dst bpp - TODO confirm */ \
++    FLAG_DST_READWRITE, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    default_init_need_all_regs, \
++    default_cleanup_need_all_regs, \
++    pixman_composite_out_reverse_8_0565_process_pixblock_head, \
++    pixman_composite_out_reverse_8_0565_process_pixblock_tail, \
++    pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    10,  /* dst_r_basereg */ \
++    15, /* src_basereg   */ \
++    0   /* mask_basereg  */
++
++/******************************************************************************/
++
++.macro pixman_composite_out_reverse_8_8888_process_pixblock_head
++    /* src is in v0 */
++    /* destination pixel data is in {v4, v5, v6, v7} */
++    mvn         v1.8b, v0.8b /* get inverted alpha */
++    /* now do alpha blending */
++    umull       v8.8h, v1.8b, v4.8b /* widening products of v4-v7 with the inverted alpha */
++    umull       v9.8h, v1.8b, v5.8b
++    umull       v10.8h, v1.8b, v6.8b
++    umull       v11.8h, v1.8b, v7.8b
++.endm
++
++.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail
++    urshr       v14.8h, v8.8h, #8 /* rounding shift right by 8 */
++    urshr       v15.8h, v9.8h, #8
++    urshr       v12.8h, v10.8h, #8
++    urshr       v13.8h, v11.8h, #8
++    raddhn      v28.8b, v14.8h, v8.8h /* rounding add, keep the high byte */
++    raddhn      v29.8b, v15.8h, v9.8h
++    raddhn      v30.8b, v12.8h, v10.8h
++    raddhn      v31.8b, v13.8h, v11.8h
++    /* 32bpp result is in {v28, v29, v30, v31} */
++.endm
++
++/* Invokes the tail macro, the fetch helpers and the head macro back to back. TODO: expand macros and do better instruction scheduling */
++.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail_head
++    fetch_src_pixblock
++    pixman_composite_out_reverse_8_8888_process_pixblock_tail
++    ld4       {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 /* de-interleaving load of 32 bytes into v4-v7, DST_R += 32 */
++    cache_preload 8, 8
++    pixman_composite_out_reverse_8_8888_process_pixblock_head
++    st4       {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 /* interleaving store of v28-v31, DST_W += 32 */
++.endm
++
++generate_composite_function \
++    pixman_composite_out_reverse_8_8888_asm_neon, 8, 0, 32, /* presumably src/mask/dst bpp - TODO confirm */ \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    default_init, \
++    default_cleanup, \
++    pixman_composite_out_reverse_8_8888_process_pixblock_head, \
++    pixman_composite_out_reverse_8_8888_process_pixblock_tail, \
++    pixman_composite_out_reverse_8_8888_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    4, /* dst_r_basereg */ \
++    0, /* src_basereg   */ \
++    0   /* mask_basereg  */
++    
++/******************************************************************************/
++
++generate_composite_function_nearest_scanline \
++    pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, /* presumably src/mask/dst bpp - TODO confirm */ \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    default_init, \
++    default_cleanup, \
++    pixman_composite_over_8888_8888_process_pixblock_head, \
++    pixman_composite_over_8888_8888_process_pixblock_tail, \
++    pixman_composite_over_8888_8888_process_pixblock_tail_head /* no basereg arguments are passed here */
++
++generate_composite_function_nearest_scanline \
++    pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, /* presumably src/mask/dst bpp - TODO confirm */ \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    default_init, \
++    default_cleanup, \
++    pixman_composite_over_8888_0565_process_pixblock_head, \
++    pixman_composite_over_8888_0565_process_pixblock_tail, \
++    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    4,  /* dst_r_basereg */ \
++    0,  /* src_basereg   */ \
++    24  /* mask_basereg  */
++
++generate_composite_function_nearest_scanline \
++    pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, /* presumably src/mask/dst bpp - TODO confirm */ \
++    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    default_init, \
++    default_cleanup, \
++    pixman_composite_src_8888_0565_process_pixblock_head, \
++    pixman_composite_src_8888_0565_process_pixblock_tail, \
++    pixman_composite_src_8888_0565_process_pixblock_tail_head /* last argument: no trailing comma or line continuation */
++    
++generate_composite_function_nearest_scanline \
++    pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, /* presumably src/mask/dst bpp - TODO confirm */ \
++    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    default_init, \
++    default_cleanup, \
++    pixman_composite_src_0565_8888_process_pixblock_head, \
++    pixman_composite_src_0565_8888_process_pixblock_tail, \
++    pixman_composite_src_0565_8888_process_pixblock_tail_head /* no basereg arguments are passed here */
++
++generate_composite_function_nearest_scanline \
++    pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon, 32, 8, 16, /* presumably src/mask/dst bpp - TODO confirm */ \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    default_init_need_all_regs, \
++    default_cleanup_need_all_regs, \
++    pixman_composite_over_8888_8_0565_process_pixblock_head, \
++    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
++    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    4,  /* dst_r_basereg */ \
++    8,  /* src_basereg   */ \
++    24  /* mask_basereg  */
++
++generate_composite_function_nearest_scanline \
++    pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_neon, 16, 8, 16, /* presumably src/mask/dst bpp - TODO confirm */ \
++    FLAG_DST_READWRITE, \
++    8, /* number of pixels, processed in a single block */ \
++    default_init_need_all_regs, \
++    default_cleanup_need_all_regs, \
++    pixman_composite_over_0565_8_0565_process_pixblock_head, \
++    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
++    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    10,  /* dst_r_basereg */ \
++    8,  /* src_basereg   */ \
++    15  /* mask_basereg  */
++
++/******************************************************************************/
++
++/*
++ * Bilinear scaling support code which tries to provide pixel fetching, color
++ * format conversion, and interpolation as separate macros which can be used
++ * as the basic building blocks for constructing bilinear scanline functions.
++ */
++
++.macro bilinear_load_8888 reg1, reg2, tmp
++    asr       TMP1, X, #16 /* TMP1 = X >> 16 (X is presumably 16.16 fixed point - TODO confirm) */
++    add       X, X, UX /* advance X by UX */
++    add       TMP1, TOP, TMP1, lsl #2 /* TMP1 = TOP + TMP1 * 4 */
++    ld1       {&reg1&.2s}, [TMP1], STRIDE /* load 8 bytes into reg1, then TMP1 += STRIDE */
++    ld1       {&reg2&.2s}, [TMP1] /* load 8 bytes into reg2 from the advanced address; the tmp argument is not used */
++.endm
++
++.macro bilinear_load_0565 reg1, reg2, tmp
++    asr       TMP1, X, #16 /* TMP1 = X >> 16 (X is presumably 16.16 fixed point - TODO confirm) */
++    add       X, X, UX /* advance X by UX */
++    add       TMP1, TOP, TMP1, lsl #1 /* TMP1 = TOP + TMP1 * 2 */
++    ld1       {&reg2&.s}[0], [TMP1], STRIDE /* load 4 bytes into lane 0 of reg2, then TMP1 += STRIDE */
++    ld1       {&reg2&.s}[1], [TMP1] /* load 4 bytes into lane 1 of reg2 from the advanced address */
++    convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
++.endm
++
++.macro bilinear_load_and_vertical_interpolate_two_8888 \
++                    acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
++
++    bilinear_load_8888 reg1, reg2, tmp1
++    umull     &acc1&.8h, &reg1&.8b, v28.8b /* acc1 = reg1 * v28 + reg2 * v29 (widening to 16 bit) */
++    umlal     &acc1&.8h, &reg2&.8b, v29.8b
++    bilinear_load_8888 reg3, reg4, tmp2
++    umull     &acc2&.8h, &reg3&.8b, v28.8b /* acc2 = reg3 * v28 + reg4 * v29 (widening to 16 bit) */
++    umlal     &acc2&.8h, &reg4&.8b, v29.8b
++.endm
++
++.macro bilinear_load_and_vertical_interpolate_four_8888 \
++                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi, \
++                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
++
++    bilinear_load_and_vertical_interpolate_two_8888 /* first pair, using the x* arguments */ \
++                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
++    bilinear_load_and_vertical_interpolate_two_8888 /* second pair, using the y* arguments */ \
++                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
++.endm
++
++.macro vzip reg1, reg2
++    umov      TMP4, v31.d[0] /* save the low 64 bits of v31, which serves as scratch below */
++    zip1      v31.8b, reg1, reg2
++    zip2      reg2,   reg1, reg2 /* reg2 = zip2 of the original pair */
++    mov       reg1,   v31.8b /* reg1 = zip1 of the original pair */
++    mov       v31.d[0], TMP4 /* restore the low 64 bits of v31 */
++.endm
++
++.macro vuzp reg1, reg2
++    umov      TMP4, v31.d[0] /* save the low 64 bits of v31, which serves as scratch below */
++    uzp1      v31.8b, reg1, reg2
++    uzp2      reg2,   reg1, reg2 /* reg2 = uzp2 of the original pair */
++    mov       reg1,   v31.8b /* reg1 = uzp1 of the original pair */
++    mov       v31.d[0], TMP4 /* restore the low 64 bits of v31 */
++.endm
++
++.macro bilinear_load_and_vertical_interpolate_two_0565 \
++                acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
++    asr       TMP1, X, #16 /* TMP1 = X >> 16, then X advances by UX */
++    add       X, X, UX
++    add       TMP1, TOP, TMP1, lsl #1 /* TMP1 = TOP + TMP1 * 2 */
++    asr       TMP2, X, #16 /* same address computation for the second position, into TMP2 */
++    add       X, X, UX
++    add       TMP2, TOP, TMP2, lsl #1
++    ld1       {&acc2&.s}[0], [TMP1], STRIDE /* lanes 0 and 2: loads at TMP1/TMP2, each then advanced by STRIDE */
++    ld1       {&acc2&.s}[2], [TMP2], STRIDE
++    ld1       {&acc2&.s}[1], [TMP1] /* lanes 1 and 3: loads at the advanced addresses */
++    ld1       {&acc2&.s}[3], [TMP2]
++    convert_0565_to_x888 acc2, reg3, reg2, reg1
++    vzip      &reg1&.8b, &reg3&.8b
++    vzip      &reg2&.8b, &reg4&.8b
++    vzip      &reg3&.8b, &reg4&.8b
++    vzip      &reg1&.8b, &reg2&.8b
++    umull     &acc1&.8h, &reg1&.8b, v28.8b /* acc1 = reg1 * v28 + reg2 * v29 (widening to 16 bit) */
++    umlal     &acc1&.8h, &reg2&.8b, v29.8b
++    umull     &acc2&.8h, &reg3&.8b, v28.8b /* acc2 = reg3 * v28 + reg4 * v29; acc2lo and acc2hi are not referenced */
++    umlal     &acc2&.8h, &reg4&.8b, v29.8b
++.endm
++
++.macro bilinear_load_and_vertical_interpolate_four_0565 \
++                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi, \
++                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
++    asr       TMP1, X, #16 /* TMP1 = X >> 16, then X advances by UX */
++    add       X, X, UX
++    add       TMP1, TOP, TMP1, lsl #1 /* TMP1 = TOP + TMP1 * 2 */
++    asr       TMP2, X, #16 /* same address computation for the second position, into TMP2 */
++    add       X, X, UX
++    add       TMP2, TOP, TMP2, lsl #1
++    ld1       {&xacc2&.s}[0], [TMP1], STRIDE /* lanes 0 and 2: loads at TMP1/TMP2, each then advanced by STRIDE */
++    ld1       {&xacc2&.s}[2], [TMP2], STRIDE
++    ld1       {&xacc2&.s}[1], [TMP1] /* lanes 1 and 3: loads at the advanced addresses */
++    ld1       {&xacc2&.s}[3], [TMP2]
++    convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
++    asr       TMP1, X, #16 /* third and fourth positions, loaded into yacc2 below */
++    add       X, X, UX
++    add       TMP1, TOP, TMP1, lsl #1
++    asr       TMP2, X, #16
++    add       X, X, UX
++    add       TMP2, TOP, TMP2, lsl #1
++    ld1       {&yacc2&.s}[0], [TMP1], STRIDE /* y loads are interleaved with the vzip steps on the x registers */
++    vzip      &xreg1&.8b, &xreg3&.8b
++    ld1       {&yacc2&.s}[2], [TMP2], STRIDE
++    vzip      &xreg2&.8b, &xreg4&.8b
++    ld1       {&yacc2&.s}[1], [TMP1]
++    vzip      &xreg3&.8b, &xreg4&.8b
++    ld1       {&yacc2&.s}[3], [TMP2]
++    vzip      &xreg1&.8b, &xreg2&.8b
++    convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
++    umull     &xacc1&.8h, &xreg1&.8b, v28.8b /* xacc1 = xreg1 * v28 + xreg2 * v29 (widening to 16 bit) */
++    vzip      &yreg1&.8b, &yreg3&.8b
++    umlal     &xacc1&.8h, &xreg2&.8b, v29.8b
++    vzip      &yreg2&.8b, &yreg4&.8b
++    umull     &xacc2&.8h, &xreg3&.8b, v28.8b /* xacc2 = xreg3 * v28 + xreg4 * v29 */
++    vzip      &yreg3&.8b, &yreg4&.8b
++    umlal     &xacc2&.8h, &xreg4&.8b, v29.8b
++    vzip      &yreg1&.8b, &yreg2&.8b
++    umull     &yacc1&.8h, &yreg1&.8b, v28.8b /* yacc1 = yreg1 * v28 + yreg2 * v29 */
++    umlal     &yacc1&.8h, &yreg2&.8b, v29.8b
++    umull     &yacc2&.8h, &yreg3&.8b, v28.8b /* yacc2 = yreg3 * v28 + yreg4 * v29 */
++    umlal     &yacc2&.8h, &yreg4&.8b, v29.8b
++.endm
++
++.macro bilinear_store_8888 numpix, tmp1, tmp2
++.if numpix == 4
++    st1       {v0.2s, v1.2s}, [OUT], #16 /* store 16 bytes from v0 and v1, OUT += 16 */
++.elseif numpix == 2
++    st1       {v0.2s}, [OUT], #8 /* store 8 bytes from v0, OUT += 8 */
++.elseif numpix == 1
++    st1       {v0.s}[0], [OUT], #4 /* store lane 0 of v0 (4 bytes), OUT += 4 */
++.else
++    .error "bilinear_store_8888 numpix is unsupported" /* quoted: .error takes a string; tmp1 and tmp2 are not used by this macro */
++.endif
++.endm
++
++.macro bilinear_store_0565 numpix, tmp1, tmp2
++    vuzp      v0.8b, v1.8b /* four unzip steps over v0-v3 before the format conversion */
++    vuzp      v2.8b, v3.8b
++    vuzp      v1.8b, v3.8b
++    vuzp      v0.8b, v2.8b
++    convert_8888_to_0565 v2, v1, v0, v1, tmp1, tmp2
++.if numpix == 4
++    st1       {v1.4h}, [OUT], #8 /* store 8 bytes from v1, OUT += 8 */
++.elseif numpix == 2
++    st1       {v1.s}[0], [OUT], #4 /* store lane 0 of v1 (4 bytes), OUT += 4 */
++.elseif numpix == 1
++    st1       {v1.h}[0], [OUT], #2 /* store lane 0 of v1 (2 bytes), OUT += 2 */
++.else
++    .error "bilinear_store_0565 numpix is unsupported" /* quoted: .error takes a string argument */
++.endif
++.endm
++
++.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
++    bilinear_load_&src_fmt v0, v1, v2
++    umull     v2.8h, v0.8b, v28.8b /* v2 = v0 * v28 + v1 * v29 (widening to 16 bit) */
++    umlal     v2.8h, v1.8b, v29.8b
++    /* 5 cycles bubble */
++    ushll     v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS /* v0 = lo(v2) << BITS - lo(v2) * w + hi(v2) * w, with w = v15.h[0] */
++    umlsl     v0.4s, v2.4h, v15.h[0]
++    umlal2    v0.4s, v2.8h, v15.h[0]
++    /* 5 cycles bubble */
++    shrn      v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) /* narrow to 16 bit after shifting right by 2 * BITS */
++    /* 3 cycles bubble */
++    xtn       v0.8b, v0.8h /* narrow to 8 bit */
++    /* 1 cycle bubble */
++    bilinear_store_&dst_fmt 1, v3, v4
++.endm
++
++.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
++    bilinear_load_and_vertical_interpolate_two_&src_fmt \
++                v1, v11, v2, v3, v20, v21, v22, v23
++    ushll     v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS /* v0 = lo(v1) << BITS - lo(v1) * w + hi(v1) * w, with w = v15.h[0] */
++    umlsl     v0.4s, v1.4h, v15.h[0]
++    umlal2    v0.4s, v1.8h, v15.h[0]
++    ushll     v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS /* same for v11 into v10, with w = v15.h[4] */
++    umlsl     v10.4s, v11.4h, v15.h[4]
++    umlal2    v10.4s, v11.8h, v15.h[4]
++    shrn      v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) /* narrow both results into the halves of v0 */
++    shrn2     v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++    ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) /* recompute v15 from v12 */
++    add       v12.8h, v12.8h, v13.8h /* advance v12 by v13 */
++    xtn       v0.8b, v0.8h /* narrow to 8 bit */
++    bilinear_store_&dst_fmt 2, v3, v4
++.endm
++
++.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
++    bilinear_load_and_vertical_interpolate_four_&src_fmt \
++                v1, v11, v14, v20, v16, v17, v22, v23, \
++                v3, v9,  v24, v25, v26, v27, v18, v19
++    prfm      PREFETCH_MODE, [TMP1, PF_OFFS] /* prefetch at TMP1 + PF_OFFS */
++    sub       TMP1, TMP1, STRIDE
++    ushll     v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS /* v0 = lo(v1) << BITS - lo(v1) * w + hi(v1) * w, with w = v15.h[0] */
++    umlsl     v0.4s, v1.4h, v15.h[0]
++    umlal2    v0.4s, v1.8h, v15.h[0]
++    ushll     v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS /* same for v11 into v10, with w = v15.h[4] */
++    umlsl     v10.4s, v11.4h, v15.h[4]
++    umlal2    v10.4s, v11.8h, v15.h[4]
++    ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) /* recompute v15 from v12 for the second pair */
++    ushll     v2.4s, v3.4h, #BILINEAR_INTERPOLATION_BITS /* same interpolation for v3 into v2 and v9 into v8 */
++    umlsl     v2.4s, v3.4h, v15.h[0]
++    umlal2    v2.4s, v3.8h, v15.h[0]
++    ushll     v8.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS
++    prfm      PREFETCH_MODE, [TMP2, PF_OFFS] /* prefetch at TMP2 + PF_OFFS */
++    umlsl     v8.4s, v9.4h, v15.h[4]
++    umlal2    v8.4s, v9.8h, v15.h[4]
++    add       v12.8h, v12.8h, v13.8h /* advance v12 by v13 */
++    shrn      v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) /* narrow the four results into v0 and v2 */
++    shrn2     v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++    shrn      v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++    shrn2     v2.8h, v8.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++    ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) /* recompute v15 from the advanced v12 */
++    xtn       v0.8b, v0.8h /* narrow to 8 bit: results end up in v0 and v1 */
++    xtn       v1.8b, v2.8h
++    add       v12.8h, v12.8h, v13.8h /* advance v12 by v13 again */
++    bilinear_store_&dst_fmt 4, v3, v4
++.endm
++
++.macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
++.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt /* a format-specific implementation is flagged by this symbol */
++    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head
++.else /* otherwise the generic four-pixel macro does all the work here */
++    bilinear_interpolate_four_pixels src_fmt, dst_fmt
++.endif
++.endm
++
++.macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
++.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt /* emits nothing when the symbol is not defined */
++    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail
++.endif
++.endm
++
++.macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
++.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt /* a format-specific implementation is flagged by this symbol */
++    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head
++.else /* otherwise fall back to the generic four-pixel macro */
++    bilinear_interpolate_four_pixels src_fmt, dst_fmt
++.endif
++.endm
++
++.macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
++.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt /* a format-specific implementation is flagged by this symbol */
++    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head
++.else /* otherwise compose from the four-pixel head and tail_head macros */
++    bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
++    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
++.endif
++.endm
++
++.macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt /* last stage of an 8-pixel step */
++.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
++    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail
++.else /* fallback: close the 4-pixel stage left open by the head / tail_head fallback */
++    bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
++.endif
++.endm
++
++.macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt /* main-loop body of the 8-pixel loop */
++.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
++    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head
++.else /* fallback: two 4-pixel loop bodies back to back */
++    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
++    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
++.endif
++.endm
++
++.set BILINEAR_FLAG_UNROLL_4,          0 /* no bit set: the generator emits its 4-pixels-per-iteration loop */
++.set BILINEAR_FLAG_UNROLL_8,          1 /* bit 0: the generator emits its 8-pixels-per-iteration loop */
++.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2 /* NOTE(review): generate_bilinear_scanline_func never tests this bit - confirm it is informational only */
++
++/*
++ * Main template macro for generating NEON optimized bilinear scanline functions.
++ *
++ * Bilinear scanline scaler macro template uses the following arguments:
++ *  fname             - name of the function to generate
++ *  src_fmt           - source color format (8888 or 0565)
++ *  dst_fmt           - destination color format (8888 or 0565)
++ *  src_bpp_shift     - (1 << src_bpp_shift) is the source pixel size in bytes
++ *  dst_bpp_shift     - (1 << dst_bpp_shift) is the destination pixel size in bytes
++ *  prefetch_distance - prefetch in the source image by that many pixels ahead
++ *  flags             - BILINEAR_FLAG_* bits (BILINEAR_FLAG_UNROLL_8 selects the 8-pixel loop)
++ */
++
++.macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
++                                       src_bpp_shift, dst_bpp_shift, \
++                                       prefetch_distance, flags
++
++pixman_asm_function fname
++    OUT       .req      x0  /* destination pixel pointer */
++    TOP       .req      x1  /* top source scanline */
++    BOTTOM    .req      x2  /* bottom source scanline, only used to derive STRIDE */
++    WT        .req      x3  /* weight of the top scanline (broadcast into v28) */
++    WB        .req      x4  /* weight of the bottom scanline (broadcast into v29) */
++    X         .req      x5  /* source x position, pixel index is X >> 16 */
++    UX        .req      x6  /* increment added to X for every destination pixel */
++    WIDTH     .req      x7  /* number of destination pixels still to produce */
++    TMP1      .req      x8
++    TMP2      .req      x9
++    PF_OFFS   .req      x10 /* prefetch offset added to the current source address */
++    TMP3      .req      x11
++    TMP4      .req      x12
++    STRIDE    .req      x13 /* BOTTOM - TOP, in bytes */
++    
++    sxtw      x3, w3  /* widen the 32-bit signed values in w3..w7 to 64 bits */
++    sxtw      x4, w4
++    sxtw      x5, w5
++    sxtw      x6, w6
++    sxtw      x7, w7
++
++    stp       x29, x30, [sp, -16]!
++    mov       x29, sp
++    sub       sp,  sp, 112  /* push all registers */
++    sub       x29, x29, 64  /* frame: [x29-64, x29) low halves of v8-v15, [x29-112, x29-64) x8-x13 */
++    st1       {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32
++    st1       {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32 /* post-increments bring x29 back to the frame base */
++    stp        x8,  x9, [x29, -80]
++    stp       x10, x11, [x29, -96]
++    stp       x12, x13, [x29, -112]
++
++    mov       PF_OFFS, #prefetch_distance
++    mul       PF_OFFS, PF_OFFS, UX  /* scaled to bytes later by asr #(16 - src_bpp_shift) */
++
++    subs      STRIDE, BOTTOM, TOP
++    .unreq    BOTTOM
++
++    cmp       WIDTH, #0
++    ble       300f  /* nothing to do: jump to the epilogue */
++
++    dup       v12.8h, w5
++    dup       v13.8h, w6
++    dup       v28.8b, w3
++    dup       v29.8b, w4
++    mov       v25.d[0], v12.d[1]
++    mov       v26.d[0], v13.d[0]
++    add       v25.4h, v25.4h, v26.4h
++    mov       v12.d[1], v25.d[0]  /* v12 = low 16 bits of X in lanes 0-3 and of X + UX in lanes 4-7 */
++
++    /* ensure good destination alignment  */
++    cmp       WIDTH, #1
++    blt       100f
++    tst       OUT, #(1 << dst_bpp_shift)
++    beq       100f
++    ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
++    add       v12.8h, v12.8h, v13.8h
++    bilinear_interpolate_last_pixel src_fmt, dst_fmt
++    sub       WIDTH, WIDTH, #1
++100:
++    add       v13.8h, v13.8h, v13.8h  /* v13 = 2 * UX: v12 now advances two pixels per add */
++    ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
++    add       v12.8h, v12.8h, v13.8h
++
++    cmp       WIDTH, #2
++    blt       100f
++    tst       OUT, #(1 << (dst_bpp_shift + 1))
++    beq       100f
++    bilinear_interpolate_two_pixels src_fmt, dst_fmt
++    sub       WIDTH, WIDTH, #2
++100:
++.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0
++/*********** 8 pixels per iteration *****************/
++    cmp       WIDTH, #4
++    blt       100f
++    tst       OUT, #(1 << (dst_bpp_shift + 2))
++    beq       100f
++    bilinear_interpolate_four_pixels src_fmt, dst_fmt
++    sub       WIDTH, WIDTH, #4
++100:
++    subs      WIDTH, WIDTH, #8
++    blt       100f
++    asr       PF_OFFS, PF_OFFS, #(16 - src_bpp_shift)
++    bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
++    subs      WIDTH, WIDTH, #8
++    blt       500f
++1000:
++    bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
++    subs      WIDTH, WIDTH, #8
++    bge       1000b
++500:
++    bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
++100:
++    tst       WIDTH, #4  /* WIDTH may be negative here; only its low bits are tested from now on */
++    beq       200f
++    bilinear_interpolate_four_pixels src_fmt, dst_fmt
++200:
++.else
++/*********** 4 pixels per iteration *****************/
++    subs      WIDTH, WIDTH, #4
++    blt       100f
++    asr       PF_OFFS, PF_OFFS, #(16 - src_bpp_shift)
++    bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
++    subs      WIDTH, WIDTH, #4
++    blt       500f
++1000:
++    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
++    subs      WIDTH, WIDTH, #4
++    bge       1000b
++500:
++    bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
++100:
++/****************************************************/
++.endif
++    /* handle the remaining trailing pixels */
++    tst       WIDTH, #2
++    beq       200f
++    bilinear_interpolate_two_pixels src_fmt, dst_fmt
++200:
++    tst       WIDTH, #1
++    beq       300f
++    bilinear_interpolate_last_pixel src_fmt, dst_fmt
++300:
++    sub       x29, x29, 64
++    ld1       {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32
++    ld1       {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32
++    ldp        x8,  x9, [x29, -80]
++    ldp       x10, x11, [x29, -96]
++    ldp       x12, x13, [x29, -112] /* must be the slot the prologue stp wrote (was -104, which reloaded x13/x10 instead) */
++    mov       sp, x29
++    ldp       x29, x30, [sp], 16 
++    ret
++
++    .unreq    OUT
++    .unreq    TOP
++    .unreq    WT
++    .unreq    WB
++    .unreq    X
++    .unreq    UX
++    .unreq    WIDTH
++    .unreq    TMP1
++    .unreq    TMP2
++    .unreq    PF_OFFS
++    .unreq    TMP3
++    .unreq    TMP4
++    .unreq    STRIDE
++.endfunc
++
++.endm
++
++/*****************************************************************************/
++
++.set have_bilinear_interpolate_four_pixels_8888_8888, 1 /* makes the four_pixels head/tail/tail_head wrappers select the macros below */
++
++.macro bilinear_interpolate_four_pixels_8888_8888_head /* loads 4 pixel pairs, blends rows, starts the horizontal blend */
++    asr       TMP1, X, #16  /* pixel index of output pixel 0 */
++    add       X, X, UX
++    add       TMP1, TOP, TMP1, lsl #2  /* its address in the top row, 4 bytes per pixel */
++    asr       TMP2, X, #16
++    add       X, X, UX
++    add       TMP2, TOP, TMP2, lsl #2  /* same for output pixel 1 */
++
++    ld1       {v22.2s}, [TMP1], STRIDE  /* two adjacent top-row pixels, then step by STRIDE to the bottom row */
++    ld1       {v23.2s}, [TMP1]
++    asr       TMP3, X, #16
++    add       X, X, UX
++    add       TMP3, TOP, TMP3, lsl #2
++    umull     v8.8h, v22.8b, v28.8b  /* row blend for pixel 0: top * v28 ... */
++    umlal     v8.8h, v23.8b, v29.8b  /* ... + bottom * v29 */
++
++    ld1       {v22.2s}, [TMP2], STRIDE
++    ld1       {v23.2s}, [TMP2]
++    asr       TMP4, X, #16
++    add       X, X, UX
++    add       TMP4, TOP, TMP4, lsl #2
++    umull     v9.8h, v22.8b, v28.8b
++    umlal     v9.8h, v23.8b, v29.8b
++
++    ld1       {v22.2s}, [TMP3], STRIDE
++    ld1       {v23.2s}, [TMP3]
++    umull     v10.8h, v22.8b, v28.8b
++    umlal     v10.8h, v23.8b, v29.8b
++
++    ushll     v0.4s, v8.4h, #BILINEAR_INTERPOLATION_BITS  /* column blend for pixel 0 using the weight in v15.h[0] */
++    umlsl     v0.4s, v8.4h, v15.h[0]
++    umlal2    v0.4s, v8.8h, v15.h[0]
++
++    prfm      PREFETCH_MODE, [TMP4, PF_OFFS]  /* prefetch ahead in the top row ... */
++    ld1       {v16.2s}, [TMP4], STRIDE
++    ld1       {v17.2s}, [TMP4]
++    prfm      PREFETCH_MODE, [TMP4, PF_OFFS]  /* ... and in the bottom row */
++    umull     v11.8h, v16.8b, v28.8b
++    umlal     v11.8h, v17.8b, v29.8b
++
++    ushll     v1.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS
++    umlsl     v1.4s, v9.4h, v15.h[4]  /* pixel 1 is left half-done: the tail resumes with the matching umlal2 */
++.endm
++
++.macro bilinear_interpolate_four_pixels_8888_8888_tail /* finishes the 4 pixels started by the head and stores them */
++    umlal2    v1.4s, v9.8h, v15.h[4]  /* completes the column blend of pixel 1 begun in the head */
++    ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)  /* weights for pixels 2 and 3 */
++    ushll     v2.4s, v10.4h, #BILINEAR_INTERPOLATION_BITS
++    umlsl     v2.4s, v10.4h, v15.h[0]
++    umlal2    v2.4s, v10.8h, v15.h[0]
++    ushll     v3.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
++    umlsl     v3.4s, v11.4h, v15.h[4]
++    umlal2    v3.4s, v11.8h, v15.h[4]
++    add       v12.8h, v12.8h, v13.8h
++    shrn      v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)  /* drop the fractional bits of both blends */
++    shrn2     v0.8h, v1.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++    shrn      v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++    ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)  /* weights prepared for whatever runs next */
++    shrn2     v2.8h, v3.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++    xtn       v6.8b, v0.8h  /* narrow to 8 bits per channel */
++    xtn       v7.8b, v2.8h
++    add       v12.8h, v12.8h, v13.8h
++    st1       {v6.2s, v7.2s}, [OUT], #16  /* write 4 pixels (16 bytes) and advance OUT */
++.endm
++
++.macro bilinear_interpolate_four_pixels_8888_8888_tail_head /* loop body: tail of the previous 4 pixels interleaved with the head of the next 4 */
++    asr       TMP1, X, #16  /* normally indented lines follow the _head macro (next 4 pixels) */
++    add       X, X, UX
++    add       TMP1, TOP, TMP1, lsl #2
++    asr       TMP2, X, #16
++    add       X, X, UX
++    add       TMP2, TOP, TMP2, lsl #2
++        umlal2    v1.4s, v9.8h, v15.h[4]  /* deeper indented lines follow the _tail macro (previous 4 pixels) */
++        ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
++        ushll     v2.4s, v10.4h, #BILINEAR_INTERPOLATION_BITS
++        umlsl     v2.4s, v10.4h, v15.h[0]
++        umlal2    v2.4s, v10.8h, v15.h[0]
++        ushll     v3.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
++    ld1       {v20.2s}, [TMP1], STRIDE
++        umlsl     v3.4s, v11.4h, v15.h[4]
++        umlal2    v3.4s, v11.8h, v15.h[4]
++    ld1       {v21.2s}, [TMP1]
++    umull     v8.8h, v20.8b, v28.8b
++    umlal     v8.8h, v21.8b, v29.8b
++        shrn      v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++        shrn2     v0.8h, v1.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++        shrn      v4.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)  /* v4 is used here where the plain tail uses v2 */
++    ld1       {v22.2s}, [TMP2], STRIDE
++        shrn2     v4.8h, v3.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++        add       v12.8h, v12.8h, v13.8h
++    ld1       {v23.2s}, [TMP2]
++    umull     v9.8h, v22.8b, v28.8b
++    asr       TMP3, X, #16
++    add       X, X, UX
++    add       TMP3, TOP, TMP3, lsl #2
++    asr       TMP4, X, #16
++    add       X, X, UX
++    add       TMP4, TOP, TMP4, lsl #2
++    umlal     v9.8h, v23.8b, v29.8b
++    ld1       {v22.2s}, [TMP3], STRIDE
++        ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
++    ld1       {v23.2s}, [TMP3]
++    umull     v10.8h, v22.8b, v28.8b
++    umlal     v10.8h, v23.8b, v29.8b
++        xtn       v6.8b, v0.8h  /* v0 is consumed here, just before the next line reuses it for the new pixel 0 */
++    ushll     v0.4s, v8.4h, #BILINEAR_INTERPOLATION_BITS
++        xtn       v7.8b, v4.8h
++    umlsl     v0.4s, v8.4h, v15.h[0]
++    umlal2    v0.4s, v8.8h, v15.h[0]
++    prfm      PREFETCH_MODE, [TMP4, PF_OFFS]
++    ld1       {v16.2s}, [TMP4], STRIDE
++        add       v12.8h, v12.8h, v13.8h
++    ld1       {v17.2s}, [TMP4]
++    prfm      PREFETCH_MODE, [TMP4, PF_OFFS]
++    umull     v11.8h, v16.8b, v28.8b
++    umlal     v11.8h, v17.8b, v29.8b
++        st1       {v6.2s, v7.2s}, [OUT], #16  /* store the previous 4 pixels */
++    ushll     v1.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS
++    umlsl     v1.4s, v9.4h, v15.h[4]  /* as in the head, pixel 1 is finished by the next tail */
++.endm
++
++/*****************************************************************************/
++
++generate_bilinear_scanline_func \
++    pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \
++    2, 2, 28, BILINEAR_FLAG_UNROLL_4
++
++generate_bilinear_scanline_func \
++    pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, \
++    2, 1, 28, BILINEAR_FLAG_UNROLL_8 | BILINEAR_FLAG_USE_ALL_NEON_REGS
++
++generate_bilinear_scanline_func \
++    pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, \
++    1, 2, 28, BILINEAR_FLAG_UNROLL_4
++
++generate_bilinear_scanline_func \
++    pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, \
++    1, 1, 28, BILINEAR_FLAG_UNROLL_4
+diff --git a/pixman/pixman-arma64-neon-asm.h b/pixman/pixman-arma64-neon-asm.h
+new file mode 100644
+index 0000000..0389d12
+--- /dev/null
++++ b/pixman/pixman-arma64-neon-asm.h
+@@ -0,0 +1,1310 @@
++/*
++ * Copyright © 2009 Nokia Corporation
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the "Software"),
++ * to deal in the Software without restriction, including without limitation
++ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
++ * and/or sell copies of the Software, and to permit persons to whom the
++ * Software is furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice (including the next
++ * paragraph) shall be included in all copies or substantial portions of the
++ * Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ *
++ * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
++ */
++
++/*
++ * This file contains a macro ('generate_composite_function') which can
++ * construct 2D image processing functions, based on a common template.
++ * Any combinations of source, destination and mask images with 8bpp,
++ * 16bpp, 24bpp, 32bpp color formats are supported.
++ *
++ * This macro takes care of:
++ *  - handling of leading and trailing unaligned pixels
++ *  - doing most of the work related to L2 cache preload
++ *  - encourages the use of software pipelining for better instructions
++ *    scheduling
++ *
++ * The user of this macro has to provide some configuration parameters
++ * (bit depths for the images, prefetch distance, etc.) and a set of
++ * macros, which should implement basic code chunks responsible for
++ * pixels processing. See 'pixman-armv8-neon-asm.S' file for the usage
++ * examples.
++ *
++ * TODO:
++ *  - try overlapped pixel method (from Ian Rickards) when processing
++ *    exactly two blocks of pixels
++ *  - maybe add an option to do reverse scanline processing
++ */
++
++/*
++ * Bit flags for 'generate_composite_function' macro which are used
++ * to tune generated functions behavior.
++ */
++.set FLAG_DST_WRITEONLY,       0
++.set FLAG_DST_READWRITE,       1
++.set FLAG_DEINTERLEAVE_32BPP,  2
++
++/*
++ * Constants for selecting preferable prefetch type.
++ */
++.set PREFETCH_TYPE_NONE,       0 /* No prefetch at all */
++.set PREFETCH_TYPE_SIMPLE,     1 /* A simple, fixed-distance-ahead prefetch */
++.set PREFETCH_TYPE_ADVANCED,   2 /* Advanced fine-grained prefetch */
++
++/*
++ * prefetch mode
++ * available modes are:
++ * pldl1keep
++ * pldl1strm
++ * pldl2keep
++ * pldl2strm
++ * pldl3keep
++ * pldl3strm
++ */
++#define PREFETCH_MODE pldl1keep
++
++/*
++ * Definitions of supplementary pixld/pixst macros (for partial load/store of
++ * pixel data).
++ */
++
++.macro pixldst1 op, elem_size, reg1, mem_operand, abits /* op on one vector register; abits is accepted but not used */
++    op {v&reg1&.&elem_size}, [&mem_operand&], #8 /* pointer is post-incremented by 8 bytes */
++.endm
++
++.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits /* op on two vector registers; abits is accepted but not used */
++    op {v&reg1&.&elem_size, v&reg2&.&elem_size}, [&mem_operand&], #16 /* pointer is post-incremented by 16 bytes */
++.endm
++
++.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits /* op on four vector registers; abits is accepted but not used */
++    op {v&reg1&.&elem_size, v&reg2&.&elem_size, v&reg3&.&elem_size, v&reg4&.&elem_size}, [&mem_operand&], #32 /* post-increment by 32 bytes */
++.endm
++
++.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits, bytes /* op on a single lane; abits is accepted but not used */
++    op {v&reg1&.&elem_size}[idx], [&mem_operand&], #&bytes& /* lane idx of reg1, pointer post-incremented by bytes */
++.endm
++
++.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand /* op on three vector registers (used with ld3/st3 for 24bpp) */
++    op {v&reg1&.&elem_size, v&reg2&.&elem_size, v&reg3&.&elem_size}, [&mem_operand&], #24 /* post-increment by 24 bytes */
++.endm
++
++.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand /* op on lane idx of three registers (one 24bpp pixel) */
++    op {v&reg1&.&elem_size, v&reg2&.&elem_size, v&reg3&.&elem_size}[idx], [&mem_operand&], #3 /* post-increment by 3 bytes */
++.endm
++
++.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits /* pick registers/lanes for a numbytes-sized load or store */
++.if numbytes == 32 /* four registers: basereg+4 .. basereg+7 */
++    .if elem_size==32
++        pixldst4 op, 2s, %(basereg+4), %(basereg+5), \
++                              %(basereg+6), %(basereg+7), mem_operand, abits
++    .elseif elem_size==16
++        pixldst4 op, 4h, %(basereg+4), %(basereg+5), \
++                              %(basereg+6), %(basereg+7), mem_operand, abits
++    .else
++        pixldst4 op, 8b, %(basereg+4), %(basereg+5), \
++                              %(basereg+6), %(basereg+7), mem_operand, abits
++    .endif
++.elseif numbytes == 16 /* two registers: basereg+2 and basereg+3 */
++    .if elem_size==32
++          pixldst2 op, 2s, %(basereg+2), %(basereg+3), mem_operand, abits
++    .elseif elem_size==16
++          pixldst2 op, 4h, %(basereg+2), %(basereg+3), mem_operand, abits
++    .else
++          pixldst2 op, 8b, %(basereg+2), %(basereg+3), mem_operand, abits
++    .endif
++.elseif numbytes == 8 /* one register: basereg+1 */
++    .if elem_size==32
++        pixldst1 op, 2s, %(basereg+1), mem_operand, abits
++    .elseif elem_size==16
++        pixldst1 op, 4h, %(basereg+1), mem_operand, abits
++    .else
++        pixldst1 op, 8b, %(basereg+1), mem_operand, abits
++    .endif
++.elseif numbytes == 4 /* bytes 4..7 of basereg+0: one s lane, two h lanes or four b lanes */
++    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)
++        pixldst0 op, s, %(basereg+0), 1, mem_operand, abits, 4
++    .elseif elem_size == 16
++        pixldst0 op, h, %(basereg+0), 2, mem_operand, abits, 2
++        pixldst0 op, h, %(basereg+0), 3, mem_operand, abits, 2
++    .else
++        pixldst0 op, b, %(basereg+0), 4, mem_operand, abits, 1
++        pixldst0 op, b, %(basereg+0), 5, mem_operand, abits, 1
++        pixldst0 op, b, %(basereg+0), 6, mem_operand, abits, 1
++        pixldst0 op, b, %(basereg+0), 7, mem_operand, abits, 1
++    .endif
++.elseif numbytes == 2 /* bytes 2..3 of basereg+0: one h lane or two b lanes */
++    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
++        pixldst0 op, h, %(basereg+0), 1, mem_operand, abits, 2
++    .else
++        pixldst0 op, b, %(basereg+0), 2, mem_operand, abits, 1
++        pixldst0 op, b, %(basereg+0), 3, mem_operand, abits, 1
++    .endif
++.elseif numbytes == 1 /* byte 1 of basereg+0 */
++        pixldst0 op, b, %(basereg+0), 1, mem_operand, abits, 1
++.else
++    .error "unsupported size: numbytes"
++.endif
++.endm
++
++.macro pixld numpix, bpp, basereg, mem_operand, abits=0 /* load numpix pixels of bpp bits from mem_operand */
++.if bpp > 0 /* a bpp of 0 expands to nothing */
++.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) /* ld4 into basereg+4 .. basereg+7 */
++    pixldst4 ld4, 8b, %(basereg+4), %(basereg+5), \
++                      %(basereg+6), %(basereg+7), mem_operand, abits
++.elseif (bpp == 24) && (numpix == 8) /* 24bpp always goes through ld3 into three registers */
++    pixldst3 ld3, 8b, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
++.elseif (bpp == 24) && (numpix == 4) /* byte lanes 4..7, one pixel per ld3 */
++    pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
++    pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
++    pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
++    pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
++.elseif (bpp == 24) && (numpix == 2) /* byte lanes 2..3 */
++    pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
++    pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
++.elseif (bpp == 24) && (numpix == 1) /* byte lane 1 */
++    pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
++.else /* everything else: plain ld1 of numpix * bpp / 8 bytes */
++    pixldst %(numpix * bpp / 8), ld1, %(bpp), basereg, mem_operand, abits
++.endif
++.endif
++.endm
++
++.macro pixst numpix, bpp, basereg, mem_operand, abits=0 /* store numpix pixels of bpp bits to mem_operand */
++.if bpp > 0 /* a bpp of 0 expands to nothing */
++.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) /* st4 from basereg+4 .. basereg+7 */
++    pixldst4 st4, 8b, %(basereg+4), %(basereg+5), \
++                      %(basereg+6), %(basereg+7), mem_operand, abits
++.elseif (bpp == 24) && (numpix == 8) /* 24bpp always goes through st3 from three registers */
++    pixldst3 st3, 8b, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
++.elseif (bpp == 24) && (numpix == 4) /* byte lanes 4..7, one pixel per st3 */
++    pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
++    pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
++    pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
++    pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
++.elseif (bpp == 24) && (numpix == 2) /* byte lanes 2..3 */
++    pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
++    pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
++.elseif (bpp == 24) && (numpix == 1) /* byte lane 1 */
++    pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
++.elseif numpix * bpp == 32 && abits == 32 /* 32 bits with abits of 32: pass element size 32 so one s-lane store is used */
++    pixldst 4, st1, 32, basereg, mem_operand, abits
++.elseif numpix * bpp == 16 && abits == 16 /* 16 bits with abits of 16: pass element size 16 so one h-lane store is used */
++    pixldst 2, st1, 16, basereg, mem_operand, abits
++.else /* everything else: plain st1 of numpix * bpp / 8 bytes */
++    pixldst %(numpix * bpp / 8), st1, %(bpp), basereg, mem_operand, abits
++.endif
++.endif
++.endm
++
++.macro pixld_a numpix, bpp, basereg, mem_operand /* pixld with abits derived from the block size */
++.if (bpp * numpix) <= 128
++    pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix)
++.else /* abits is capped at 128 */
++    pixld numpix, bpp, basereg, mem_operand, 128
++.endif
++.endm
++
++.macro pixst_a numpix, bpp, basereg, mem_operand /* pixst with abits derived from the block size */
++.if (bpp * numpix) <= 128
++    pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix)
++.else /* abits is capped at 128 */
++    pixst numpix, bpp, basereg, mem_operand, 128
++.endif
++.endm
++
++/*
++ * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register
++ * aliases to be defined)
++ */
++.macro pixld1_s elem_size, reg1, mem_operand /* fill one 64-bit register with pixels fetched at successive VX positions */
++.if elem_size == 16 /* four 16-bit pixels into lanes h[0]..h[3] */
++    asr     TMP1, VX, #16 /* pixel index is VX >> 16 */
++    adds    VX, VX, UNIT_X
++    bmi     55f /* VX still negative after the step: skip the adjustment loop */
++5:  subs    VX, VX, SRC_WIDTH_FIXED /* otherwise subtract SRC_WIDTH_FIXED until VX is negative again */
++    bpl     5b
++55:
++    add     TMP1, mem_operand, TMP1, lsl #1 /* 2 bytes per pixel */
++    asr     TMP2, VX, #16
++    adds    VX, VX, UNIT_X
++    bmi     55f
++5:  subs    VX, VX, SRC_WIDTH_FIXED
++    bpl     5b
++55:
++    add     TMP2, mem_operand, TMP2, lsl #1
++    ld1     {v&reg1&.h}[0], [TMP1]
++    asr     TMP1, VX, #16
++    adds    VX, VX, UNIT_X
++    bmi     55f
++5:  subs    VX, VX, SRC_WIDTH_FIXED
++    bpl     5b
++55:
++    add     TMP1, mem_operand, TMP1, lsl #1
++    ld1     {v&reg1&.h}[1], [TMP2]
++    asr     TMP2, VX, #16
++    adds    VX, VX, UNIT_X
++    bmi     55f
++5:  subs    VX, VX, SRC_WIDTH_FIXED
++    bpl     5b
++55:
++    add     TMP2, mem_operand, TMP2, lsl #1
++    ld1     {v&reg1&.h}[2], [TMP1]
++    ld1     {v&reg1&.h}[3], [TMP2]
++.elseif elem_size == 32 /* two 32-bit pixels into lanes s[0] and s[1] */
++    asr     TMP1, VX, #16
++    adds    VX, VX, UNIT_X
++    bmi     55f
++5:  subs    VX, VX, SRC_WIDTH_FIXED
++    bpl     5b
++55:
++    add     TMP1, mem_operand, TMP1, lsl #2 /* 4 bytes per pixel */
++    asr     TMP2, VX, #16
++    adds    VX, VX, UNIT_X
++    bmi     55f
++5:  subs    VX, VX, SRC_WIDTH_FIXED
++    bpl     5b
++55:
++    add     TMP2, mem_operand, TMP2, lsl #2
++    ld1     {v&reg1&.s}[0], [TMP1]
++    ld1     {v&reg1&.s}[1], [TMP2]
++.else
++    .error "unsupported"
++.endif
++.endm
++
++.macro pixld2_s elem_size, reg1, reg2, mem_operand /* fill two registers; only the .else branch is ever assembled */
++.if 0 /* elem_size == 32 */
++    mov     TMP1, VX, asr #16 /* disabled branch: everything down to .else is skipped by the .if 0 above */
++    add     VX, VX, UNIT_X, asl #1
++    add     TMP1, mem_operand, TMP1, asl #2
++    mov     TMP2, VX, asr #16
++    sub     VX, VX, UNIT_X
++    add     TMP2, mem_operand, TMP2, asl #2
++    ld1     {v&reg1&.s}[0], [TMP1]
++    mov     TMP1, VX, asr #16
++    add     VX, VX, UNIT_X, asl #1
++    add     TMP1, mem_operand, TMP1, asl #2
++    ld1     {v&reg2&.s}[0], [TMP2, :32]
++    mov     TMP2, VX, asr #16
++    add     VX, VX, UNIT_X
++    add     TMP2, mem_operand, TMP2, asl #2
++    ld1     {v&reg1&.s}[1], [TMP1]
++    ld1     {v&reg2&.s}[1], [TMP2]
++.else /* live path: two back-to-back single-register fetches */
++    pixld1_s elem_size, reg1, mem_operand
++    pixld1_s elem_size, reg2, mem_operand
++.endif
++.endm
++
++.macro pixld0_s elem_size, reg1, idx, mem_operand /* fetch one pixel at position VX into lane idx of reg1 */
++.if elem_size == 16
++    asr     TMP1, VX, #16 /* pixel index is VX >> 16 */
++    adds    VX, VX, UNIT_X
++    bmi     55f /* VX still negative after the step: skip the adjustment loop */
++5:  subs    VX, VX, SRC_WIDTH_FIXED /* otherwise subtract SRC_WIDTH_FIXED until VX is negative again */
++    bpl     5b
++55:
++    add     TMP1, mem_operand, TMP1, lsl #1
++    ld1     {v&reg1&.h}[idx], [TMP1]
++.elseif elem_size == 32
++    asr     DUMMY, VX, #16 /* same index computation, routed through DUMMY before landing in TMP1 */
++    mov     TMP1, DUMMY
++    adds    VX, VX, UNIT_X
++    bmi     55f
++5:  subs    VX, VX, SRC_WIDTH_FIXED
++    bpl     5b
++55:
++    add     TMP1, mem_operand, TMP1, lsl #2
++    ld1     {v&reg1&.s}[idx], [TMP1]
++.endif /* no .else: any other elem_size expands to nothing */
++.endm
++
++.macro pixld_s_internal numbytes, elem_size, basereg, mem_operand /* scaled-fetch counterpart of pixldst: same register and lane layout */
++.if numbytes == 32 /* basereg+4 .. basereg+7, then deinterleave if enabled */
++    pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand
++    pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand
++    pixdeinterleave elem_size, %(basereg+4)
++.elseif numbytes == 16 /* basereg+2 and basereg+3 */
++    pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand
++.elseif numbytes == 8 /* basereg+1 */
++    pixld1_s elem_size, %(basereg+1), mem_operand
++.elseif numbytes == 4 /* lanes of basereg+0 */
++    .if elem_size == 32
++        pixld0_s elem_size, %(basereg+0), 1, mem_operand
++    .elseif elem_size == 16
++        pixld0_s elem_size, %(basereg+0), 2, mem_operand
++        pixld0_s elem_size, %(basereg+0), 3, mem_operand
++    .else /* NOTE(review): pixld0_s only handles elem_size 16 and 32, so these four calls expand to nothing */
++        pixld0_s elem_size, %(basereg+0), 4, mem_operand
++        pixld0_s elem_size, %(basereg+0), 5, mem_operand
++        pixld0_s elem_size, %(basereg+0), 6, mem_operand
++        pixld0_s elem_size, %(basereg+0), 7, mem_operand
++    .endif
++.elseif numbytes == 2
++    .if elem_size == 16
++        pixld0_s elem_size, %(basereg+0), 1, mem_operand
++    .else /* NOTE(review): same remark, these calls expand to nothing */
++        pixld0_s elem_size, %(basereg+0), 2, mem_operand
++        pixld0_s elem_size, %(basereg+0), 3, mem_operand
++    .endif
++.elseif numbytes == 1
++    pixld0_s elem_size, %(basereg+0), 1, mem_operand
++.else
++    .error "unsupported size: numbytes"
++.endif
++.endm
++
++.macro pixld_s numpix, bpp, basereg, mem_operand /* scaled fetch of numpix pixels of bpp bits */
++.if bpp > 0 /* a bpp of 0 expands to nothing */
++    pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand
++.endif
++.endm
++
++.macro vuzp8 reg1, reg2 /* in-place byte unzip of a register pair, using v16 as scratch */
++    umov DUMMY, v16.d[0] /* save the low 64 bits of v16 in DUMMY */
++    uzp1 v16.8b,     v&reg1&.8b, v&reg2&.8b
++    uzp2 v&reg2&.8b, v&reg1&.8b, v&reg2&.8b /* reg2 receives the uzp2 result while reg1 is still intact */
++    mov  v&reg1&.8b, v16.8b /* reg1 receives the uzp1 result */
++    mov  v16.d[0], DUMMY /* put the saved 64 bits back into v16 */
++.endm
++
++.macro vzip8 reg1, reg2 /* in-place byte zip of a register pair, using v16 as scratch */
++    umov DUMMY, v16.d[0] /* save the low 64 bits of v16 in DUMMY */
++    zip1 v16.8b,     v&reg1&.8b, v&reg2&.8b
++    zip2 v&reg2&.8b, v&reg1&.8b, v&reg2&.8b /* reg2 receives the zip2 result while reg1 is still intact */
++    mov  v&reg1&.8b, v16.8b /* reg1 receives the zip1 result */
++    mov  v16.d[0], DUMMY /* put the saved 64 bits back into v16 */
++.endm
++
++/* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
++.macro pixdeinterleave bpp, basereg /* works on basereg .. basereg+3 */
++.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) /* expands to nothing for other depths or when deinterleaving is off */
++    vuzp8 %(basereg+0), %(basereg+1)
++    vuzp8 %(basereg+2), %(basereg+3)
++    vuzp8 %(basereg+1), %(basereg+3)
++    vuzp8 %(basereg+0), %(basereg+2)
++.endif
++.endm
++
++/* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
++.macro pixinterleave bpp, basereg /* works on basereg .. basereg+3 */
++.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) /* expands to nothing for other depths or when deinterleaving is off */
++    vzip8 %(basereg+0), %(basereg+2)
++    vzip8 %(basereg+1), %(basereg+3)
++    vzip8 %(basereg+2), %(basereg+3)
++    vzip8 %(basereg+0), %(basereg+1)
++.endif
++.endm
++
++/*
++ * This is a macro for implementing cache preload. The main idea is that
++ * cache preload logic is mostly independent from the rest of pixels
++ * processing code. It starts at the top left pixel and moves forward
++ * across pixels and can jump across scanlines. Prefetch distance is
++ * handled in an 'incremental' way: it starts from 0 and advances to the
++ * optimal distance over time. After reaching optimal prefetch distance,
++ * it is kept constant. There are some checks which prevent prefetching
++ * unneeded pixel lines below the image (but it still can prefetch a bit
++ * more data on the right side of the image - not a big issue and may
++ * be actually helpful when rendering text glyphs). Additional trick is
++ * the use of LDR instruction for prefetch instead of PLD when moving to
++ * the next line, the point is that we have a high chance of getting TLB
++ * miss in this case, and PLD would be useless.
++ *
++ * This sounds like it may introduce a noticeable overhead (when working with
++ * fully cached data). But in reality, due to having a separate pipeline and
++ * instruction queue for NEON unit in ARM Cortex-A8, normal ARM code can
++ * execute simultaneously with NEON and be completely shadowed by it. Thus
++ * we get no performance overhead at all (*). This looks like a very nice
++ * feature of Cortex-A8, if used wisely. We don't have a hardware prefetcher,
++ * but still can implement some rather advanced prefetch logic in software
++ * for almost zero cost!
++ *
++ * (*) The overhead of the prefetcher is visible when running some trivial
++ * pixels processing like simple copy. Anyway, having prefetch is a must
++ * when working with the graphics data.
++ */
++.macro PF a, x:vararg /* prefix that keeps an instruction only when advanced prefetch is selected */
++.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
++    a x /* a is the mnemonic, x the rest of the line */
++.endif
++.endm
++
++.macro cache_preload std_increment, boost_increment /* advanced prefetcher step; every instruction is gated by PF */
++.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0) /* emitted only if src, dst-read or mask prefetch applies */
++.if std_increment != 0 /* regular advance of the prefetch position PF_X */
++    PF add PF_X, PF_X, #std_increment
++.endif
++    PF tst PF_CTL, #0xF
++    PF beq 71f
++    PF add PF_X, PF_X, #boost_increment
++    PF sub PF_CTL, PF_CTL, #1
++71:
++    PF cmp PF_X, ORIG_W
++.if src_bpp_shift >= 0 /* prefetch the source PF_X pixels past PF_SRC; lsl/prfm leave the cmp flags intact */
++    PF lsl DUMMY, PF_X, #src_bpp_shift
++    PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
++.endif
++.if dst_r_bpp != 0 /* same for the destination */
++    PF lsl DUMMY, PF_X, #dst_bpp_shift
++    PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
++.endif
++.if mask_bpp_shift >= 0 /* same for the mask */
++    PF lsl DUMMY, PF_X, #mask_bpp_shift
++    PF prfm PREFETCH_MODE, [PF_MASK, DUMMY]
++.endif
++    PF ble 71f
++    PF sub PF_X, PF_X, ORIG_W
++    PF subs PF_CTL, PF_CTL, #0x10
++71:
++    PF ble 72f
++.if src_bpp_shift >= 0 /* real load at PF_SRC + (SRC_STRIDE << src_bpp_shift); NOTE(review): PF_SRC then advances by 1, not by that offset - confirm intended */
++    PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
++    PF ldrsb DUMMY, [PF_SRC, DUMMY]
++    PF add PF_SRC, PF_SRC, #1
++.endif
++.if dst_r_bpp != 0 /* same pattern for the destination; NOTE(review): same remark about the #1 advance */
++    PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
++    PF ldrsb DUMMY, [PF_DST, DUMMY]
++    PF add PF_DST, PF_DST, #1
++.endif
++.if mask_bpp_shift >= 0 /* same pattern for the mask; NOTE(review): same remark about the #1 advance */
++    PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift
++    PF ldrsb DUMMY, [PF_MASK, DUMMY]
++    PF add PF_MASK, PF_MASK, #1
++.endif
++72:
++.endif
++.endm
++
++.macro cache_preload_simple /* fixed-distance prefetch, emitted only for PREFETCH_TYPE_SIMPLE */
++.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE)
++.if src_bpp > 0 /* PREFETCH_DISTANCE_SIMPLE pixels, converted to bytes, ahead of SRC */
++    prfm PREFETCH_MODE, [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)]
++.endif
++.if dst_r_bpp > 0 /* same distance ahead of DST_R */
++    prfm PREFETCH_MODE, [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)]
++.endif
++.if mask_bpp > 0 /* same distance ahead of MASK */
++    prfm PREFETCH_MODE, [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)]
++.endif
++.endif
++.endm
++
++.macro fetch_mask_pixblock /* load one pixblock of mask pixels from MASK via pixld */
++    pixld       pixblock_size, mask_bpp, \
++                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
++.endm
++
++/*
++ * Macro which is used to process leading pixels until destination
++ * pointer is properly aligned (at 16 bytes boundary). When destination
++ * buffer uses 16bpp format, this is unnecessary, or even pointless.
++ */
++.macro ensure_destination_ptr_alignment process_pixblock_head, \
++                                        process_pixblock_tail, \
++                                        process_pixblock_tail_head
++.if dst_w_bpp != 24 /* nothing is emitted for 24bpp destinations */
++    tst         DST_R, #0xF
++    beq         52f /* already 16-byte aligned: skip the whole pass */
++
++.if src_bpp > 0 || mask_bpp > 0 || dst_r_bpp > 0 /* load phase, needed only if some image is read */
++.irp lowbit, 1, 2, 4, 8, 16
++local skip1
++.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp)) /* chunk holds at least one pixel and is smaller than a pixblock */
++.if lowbit < 16 /* we don't need more than 16-byte alignment */
++    tst         DST_R, #lowbit
++    beq         51f
++.endif
++    pixld_src   (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
++    pixld       (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
++.if dst_r_bpp > 0
++    pixld_a     (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
++.else
++    add         DST_R, DST_R, #lowbit /* destination is not read: just step DST_R over the chunk */
++.endif
++    PF add      PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
++    sub         W, W, #(lowbit * 8 / dst_w_bpp) /* these pixels are accounted for */
++51:
++.endif
++.endr
++.endif
++    pixdeinterleave src_bpp, src_basereg
++    pixdeinterleave mask_bpp, mask_basereg
++    pixdeinterleave dst_r_bpp, dst_r_basereg
++
++    process_pixblock_head /* one full head/tail pass over the partially filled registers */
++    cache_preload 0, pixblock_size
++    cache_preload_simple
++    process_pixblock_tail
++
++    pixinterleave dst_w_bpp, dst_w_basereg
++
++.irp lowbit, 1, 2, 4, 8, 16
++.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp)) /* store phase, same chunk selection as the load phase */
++.if lowbit < 16 /* we don't need more than 16-byte alignment */
++    tst         DST_W, #lowbit
++    beq         51f
++.endif
++.if src_bpp == 0 && mask_bpp == 0 && dst_r_bpp == 0 /* the load phase was not emitted, so W is adjusted here instead */
++    sub         W, W, #(lowbit * 8 / dst_w_bpp)
++.endif
++    pixst_a     (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
++51:
++.endif
++.endr
++.endif
++52:
++.endm
++
++/*
++ * Special code for processing up to (pixblock_size - 1) remaining
++ * trailing pixels. As SIMD processing performs operation on
++ * pixblock_size pixels, anything smaller than this has to be loaded
++ * and stored in a special way. Loading and storing of pixel data is
++ * performed in such a way that we fill some 'slots' in the NEON
++ * registers (some slots naturally are unused), then perform compositing
++ * operation as usual. In the end, the data is taken from these 'slots'
++ * and saved to memory.
++ *
++ * cache_preload_flag - allows to suppress prefetch if
++ *                      set to 0
++ * dst_aligned_flag   - selects whether destination buffer
++ *                      is aligned
++ */
++.macro process_trailing_pixels cache_preload_flag, \
++                               dst_aligned_flag, \
++                               process_pixblock_head, \
++                               process_pixblock_tail, \
++                               process_pixblock_tail_head
++    tst         W, #(pixblock_size - 1) /* any of the low bits of W set? */
++    beq         52f /* no leftover pixels: skip everything */
++.if src_bpp > 0 || mask_bpp > 0 || dst_r_bpp > 0
++.irp chunk_size, 16, 8, 4, 2, 1
++.if pixblock_size > chunk_size
++    tst         W, #chunk_size /* load a chunk only if its bit is set in W */
++    beq         51f
++    pixld_src   chunk_size, src_bpp, src_basereg, SRC
++    pixld       chunk_size, mask_bpp, mask_basereg, MASK
++.if dst_aligned_flag != 0
++    pixld_a     chunk_size, dst_r_bpp, dst_r_basereg, DST_R
++.else
++    pixld       chunk_size, dst_r_bpp, dst_r_basereg, DST_R
++.endif
++.if cache_preload_flag != 0
++    PF add      PF_X, PF_X, #chunk_size
++.endif
++51:
++.endif
++.endr
++.endif
++    pixdeinterleave src_bpp, src_basereg
++    pixdeinterleave mask_bpp, mask_basereg
++    pixdeinterleave dst_r_bpp, dst_r_basereg
++
++    process_pixblock_head
++.if cache_preload_flag != 0
++    cache_preload 0, pixblock_size
++    cache_preload_simple
++.endif
++    process_pixblock_tail
++    pixinterleave dst_w_bpp, dst_w_basereg
++.irp chunk_size, 16, 8, 4, 2, 1
++.if pixblock_size > chunk_size
++    tst         W, #chunk_size /* store uses the same bit tests as the loads above */
++    beq         51f
++.if dst_aligned_flag != 0
++    pixst_a     chunk_size, dst_w_bpp, dst_w_basereg, DST_W
++.else
++    pixst       chunk_size, dst_w_bpp, dst_w_basereg, DST_W
++.endif
++51:
++.endif
++.endr
++52:
++.endm
++
++/*
++ * Macro, which performs all the needed operations to switch to the next
++ * scanline and start the next loop iteration unless all the scanlines
++ * are already processed.
++ */
++.macro advance_to_next_scanline start_of_loop_label
++    mov         W, ORIG_W /* restore the saved scanline width */
++    add         DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift /* step forward by one stride */
++.if src_bpp != 0
++    add         SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift
++.endif
++.if mask_bpp != 0
++    add         MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift
++.endif
++.if (dst_w_bpp != 24) /* rewind by the W pixels already walked; 24bpp strides are pre-adjusted in generate_composite_function */
++    sub         DST_W, DST_W, W, lsl #dst_bpp_shift
++.endif
++.if (src_bpp != 24) && (src_bpp != 0)
++    sub         SRC, SRC, W, lsl #src_bpp_shift
++.endif
++.if (mask_bpp != 24) && (mask_bpp != 0)
++    sub         MASK, MASK, W, lsl #mask_bpp_shift
++.endif
++    subs        H, H, #1 /* sets the flags consumed by bge below */
++    mov         DST_R, DST_W /* read pointer restarts at the new write pointer; mov leaves flags intact */
++    bge         start_of_loop_label /* loop while the decremented H is >= 0 */
++.endm
++
++/*
++ * Registers are allocated in the following way by default:
++ * v0, v1, v2, v3     - reserved for loading source pixel data
++ * v4, v5, v6, v7     - reserved for loading destination pixel data
++ * v24, v25, v26, v27 - reserved for loading mask pixel data
++ * v28, v29, v30, v31 - final destination pixel data for writeback to memory
++ */
++.macro generate_composite_function fname, \
++                                   src_bpp_, \
++                                   mask_bpp_, \
++                                   dst_w_bpp_, \
++                                   flags, \
++                                   pixblock_size_, \
++                                   prefetch_distance, \
++                                   init, \
++                                   cleanup, \
++                                   process_pixblock_head, \
++                                   process_pixblock_tail, \
++                                   process_pixblock_tail_head, \
++                                   dst_w_basereg_ = 28, \
++                                   dst_r_basereg_ = 4, \
++                                   src_basereg_   = 0, \
++                                   mask_basereg_  = 24
++
++    pixman_asm_function fname
++    stp         x29, x30, [sp, -16]!
++    mov         x29, sp
++    sub         sp,   sp, 232  /* push all registers */
++    sub         x29, x29, 64   /* x29 -> 64-byte save area for v8-v15 */
++    st1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32
++    st1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32 /* the two post-increments bring x29 back to the value set by mov */
++    stp          x8,   x9, [x29, -80]
++    stp         x10,  x11, [x29, -96]
++    stp         x12,  x13, [x29, -112]
++    stp         x14,  x15, [x29, -128]
++    stp         x16,  x17, [x29, -144]
++    stp         x18,  x19, [x29, -160]
++    stp         x20,  x21, [x29, -176]
++    stp         x22,  x23, [x29, -192]
++    stp         x24,  x25, [x29, -208]
++    stp         x26,  x27, [x29, -224]
++    str         x28, [x29, -232]
++
++/*
++ * Select prefetch type for this function. If prefetch distance is
++ * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch
++ * has to be used instead of ADVANCED.
++ */
++    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
++.if prefetch_distance == 0
++    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
++.elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
++        ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24))
++    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
++.endif
++
++/*
++ * Make some macro arguments globally visible and accessible
++ * from other macros
++ */
++    .set src_bpp, src_bpp_
++    .set mask_bpp, mask_bpp_
++    .set dst_w_bpp, dst_w_bpp_
++    .set pixblock_size, pixblock_size_
++    .set dst_w_basereg, dst_w_basereg_
++    .set dst_r_basereg, dst_r_basereg_
++    .set src_basereg, src_basereg_
++    .set mask_basereg, mask_basereg_
++
++    .macro pixld_src x:vararg
++        pixld x
++    .endm
++    .macro fetch_src_pixblock
++        pixld_src   pixblock_size, src_bpp, \
++                    (src_basereg - pixblock_size * src_bpp / 64), SRC
++    .endm
++/*
++ * Assign symbolic names to registers
++ */
++    W           .req       x0      /* width (is updated during processing) */
++    H           .req       x1      /* height (is updated during processing) */
++    DST_W       .req       x2      /* destination buffer pointer for writes */
++    DST_STRIDE  .req       x3      /* destination image stride */
++    SRC         .req       x4      /* source buffer pointer */
++    SRC_STRIDE  .req       x5      /* source image stride */
++    MASK        .req       x6      /* mask pointer */
++    MASK_STRIDE .req       x7      /* mask stride */
++
++    DST_R       .req       x8      /* destination buffer pointer for reads */
++
++    PF_CTL      .req       x9      /* combined lines counter and prefetch */
++                                    /* distance increment counter */
++    PF_X        .req       x10     /* pixel index in a scanline for current */
++                                    /* prefetch position */
++    PF_SRC      .req       x11     /* pointer to source scanline start */
++                                    /* for prefetch purposes */
++    PF_DST      .req       x12     /* pointer to destination scanline start */
++                                    /* for prefetch purposes */
++    PF_MASK     .req       x13     /* pointer to mask scanline start */
++                                    /* for prefetch purposes */
++
++    ORIG_W      .req       x14     /* saved original width */
++    DUMMY       .req       x15     /* temporary register */
++
++    sxtw        x0, w0
++    sxtw        x1, w1
++    sxtw        x3, w3
++    sxtw        x5, w5
++    sxtw        x7, w7
++
++    .set mask_bpp_shift, -1
++.if src_bpp == 32
++    .set src_bpp_shift, 2
++.elseif src_bpp == 24
++    .set src_bpp_shift, 0
++.elseif src_bpp == 16
++    .set src_bpp_shift, 1
++.elseif src_bpp == 8
++    .set src_bpp_shift, 0
++.elseif src_bpp == 0
++    .set src_bpp_shift, -1
++.else
++    .error "requested src bpp (src_bpp) is not supported"
++.endif
++.if mask_bpp == 32
++    .set mask_bpp_shift, 2
++.elseif mask_bpp == 24
++    .set mask_bpp_shift, 0
++.elseif mask_bpp == 8
++    .set mask_bpp_shift, 0
++.elseif mask_bpp == 0
++    .set mask_bpp_shift, -1
++.else
++    .error "requested mask bpp (mask_bpp) is not supported"
++.endif
++.if dst_w_bpp == 32
++    .set dst_bpp_shift, 2
++.elseif dst_w_bpp == 24
++    .set dst_bpp_shift, 0
++.elseif dst_w_bpp == 16
++    .set dst_bpp_shift, 1
++.elseif dst_w_bpp == 8
++    .set dst_bpp_shift, 0
++.else
++    .error "requested dst bpp (dst_w_bpp) is not supported"
++.endif
++
++.if (((flags) & FLAG_DST_READWRITE) != 0)
++    .set dst_r_bpp, dst_w_bpp
++.else
++    .set dst_r_bpp, 0
++.endif
++.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
++    .set DEINTERLEAVE_32BPP_ENABLED, 1
++.else
++    .set DEINTERLEAVE_32BPP_ENABLED, 0
++.endif
++
++.if prefetch_distance < 0 || prefetch_distance > 15
++    .error "invalid prefetch distance (prefetch_distance)"
++.endif
++
++    PF mov      PF_X, #0
++    mov         DST_R, DST_W
++
++.if src_bpp == 24
++    sub         SRC_STRIDE, SRC_STRIDE, W
++    sub         SRC_STRIDE, SRC_STRIDE, W, lsl #1
++.endif
++.if mask_bpp == 24
++    sub         MASK_STRIDE, MASK_STRIDE, W
++    sub         MASK_STRIDE, MASK_STRIDE, W, lsl #1
++.endif
++.if dst_w_bpp == 24
++    sub         DST_STRIDE, DST_STRIDE, W
++    sub         DST_STRIDE, DST_STRIDE, W, lsl #1
++.endif
++
++/*
++ * Setup advanced prefetcher initial state
++ */
++    PF mov      PF_SRC, SRC
++    PF mov      PF_DST, DST_R
++    PF mov      PF_MASK, MASK
++    /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
++    PF lsl      DUMMY, H, #4
++    PF mov      PF_CTL, DUMMY
++    PF add      PF_CTL, PF_CTL, #(prefetch_distance - 0x10)
++
++    init
++    subs        H, H, #1
++    mov         ORIG_W, W
++    blt         9f /* uses the flags of the subs above: decremented H is negative */
++    cmp         W, #(pixblock_size * 2)
++    blt         800f /* narrow scanlines take the non-pipelined loop */
++/*
++ * This is the start of the pipelined loop, which is optimized for
++ * long scanlines
++ */
++0:
++    ensure_destination_ptr_alignment process_pixblock_head, \
++                                     process_pixblock_tail, \
++                                     process_pixblock_tail_head
++
++    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
++    pixld_a     pixblock_size, dst_r_bpp, \
++                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
++    fetch_src_pixblock
++    pixld       pixblock_size, mask_bpp, \
++                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
++    PF add      PF_X, PF_X, #pixblock_size
++    process_pixblock_head
++    cache_preload 0, pixblock_size
++    cache_preload_simple
++    subs        W, W, #(pixblock_size * 2)
++    blt         200f
++
++100:
++    process_pixblock_tail_head
++    cache_preload_simple
++    subs        W, W, #pixblock_size
++    bge         100b
++
++200:
++    process_pixblock_tail
++    pixst_a     pixblock_size, dst_w_bpp, \
++                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
++
++    /* Process the remaining trailing pixels in the scanline */
++    process_trailing_pixels 1, 1, \
++                            process_pixblock_head, \
++                            process_pixblock_tail, \
++                            process_pixblock_tail_head
++    advance_to_next_scanline 0b
++
++    cleanup
++1000:
++    /* pop all registers */
++    sub         x29, x29, 64
++    ld1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
++    ld1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
++    ldp          x8,   x9, [x29, -80]
++    ldp         x10,  x11, [x29, -96]
++    ldp         x12,  x13, [x29, -112]
++    ldp         x14,  x15, [x29, -128]
++    ldp         x16,  x17, [x29, -144]
++    ldp         x18,  x19, [x29, -160]
++    ldp         x20,  x21, [x29, -176]
++    ldp         x22,  x23, [x29, -192]
++    ldp         x24,  x25, [x29, -208]
++    ldp         x26,  x27, [x29, -224]
++    ldr         x28, [x29, -232]
++    mov         sp, x29
++    ldp         x29, x30, [sp], 16 
++    ret  /* exit */
++/*
++ * This is the start of the loop, designed to process images with small width
++ * (less than pixblock_size * 2 pixels). In this case neither pipelining
++ * nor prefetch are used.
++ */
++800:
++.if src_bpp_shift >= 0
++    PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
++    PF prfm PREFETCH_MODE, [SRC, DUMMY]
++.endif
++.if dst_r_bpp != 0
++    PF lsl  DUMMY, DST_STRIDE, #dst_bpp_shift
++    PF prfm PREFETCH_MODE, [DST_R, DUMMY]
++.endif
++.if mask_bpp_shift >= 0
++    PF lsl  DUMMY, MASK_STRIDE, #mask_bpp_shift
++    PF prfm PREFETCH_MODE, [MASK, DUMMY]
++.endif
++    /* Process exactly pixblock_size pixels if needed */
++    tst         W, #pixblock_size
++    beq         100f
++    pixld       pixblock_size, dst_r_bpp, \
++                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
++    fetch_src_pixblock
++    pixld       pixblock_size, mask_bpp, \
++                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
++    process_pixblock_head
++    process_pixblock_tail
++    pixst       pixblock_size, dst_w_bpp, \
++                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
++100:
++    /* Process the remaining trailing pixels in the scanline */
++    process_trailing_pixels 0, 0, \
++                            process_pixblock_head, \
++                            process_pixblock_tail, \
++                            process_pixblock_tail_head
++    advance_to_next_scanline 800b
++9:
++    cleanup
++    /* pop all registers */
++    sub         x29, x29, 64
++    ld1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
++    ld1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
++    ldp          x8,   x9, [x29, -80]
++    ldp         x10,  x11, [x29, -96]
++    ldp         x12,  x13, [x29, -112]
++    ldp         x14,  x15, [x29, -128]
++    ldp         x16,  x17, [x29, -144]
++    ldp         x18,  x19, [x29, -160]
++    ldp         x20,  x21, [x29, -176]
++    ldp         x22,  x23, [x29, -192]
++    ldp         x24,  x25, [x29, -208]
++    ldp         x26,  x27, [x29, -224]
++    ldr         x28, [x29, -232]
++    mov         sp, x29
++    ldp         x29, x30, [sp], 16 
++    ret  /* exit */
++
++    .purgem     fetch_src_pixblock
++    .purgem     pixld_src
++
++    .unreq      SRC
++    .unreq      MASK
++    .unreq      DST_R
++    .unreq      DST_W
++    .unreq      ORIG_W
++    .unreq      W
++    .unreq      H
++    .unreq      SRC_STRIDE
++    .unreq      DST_STRIDE
++    .unreq      MASK_STRIDE
++    .unreq      PF_CTL
++    .unreq      PF_X
++    .unreq      PF_SRC
++    .unreq      PF_DST
++    .unreq      PF_MASK
++    .unreq      DUMMY
++    .endfunc
++.endm
++
++/*
++ * A simplified variant of function generation template for a single
++ * scanline processing (for implementing pixman combine functions)
++ */
++.macro generate_composite_function_scanline        use_nearest_scaling, \
++                                                   fname, \
++                                                   src_bpp_, \
++                                                   mask_bpp_, \
++                                                   dst_w_bpp_, \
++                                                   flags, \
++                                                   pixblock_size_, \
++                                                   init, \
++                                                   cleanup, \
++                                                   process_pixblock_head, \
++                                                   process_pixblock_tail, \
++                                                   process_pixblock_tail_head, \
++                                                   dst_w_basereg_ = 28, \
++                                                   dst_r_basereg_ = 4, \
++                                                   src_basereg_   = 0, \
++                                                   mask_basereg_  = 24
++
++    pixman_asm_function fname
++    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
++
++/*
++ * Make some macro arguments globally visible and accessible
++ * from other macros
++ */
++    .set src_bpp, src_bpp_
++    .set mask_bpp, mask_bpp_
++    .set dst_w_bpp, dst_w_bpp_
++    .set pixblock_size, pixblock_size_
++    .set dst_w_basereg, dst_w_basereg_
++    .set dst_r_basereg, dst_r_basereg_
++    .set src_basereg, src_basereg_
++    .set mask_basereg, mask_basereg_
++    
++.if use_nearest_scaling != 0
++    /*
++     * Assign symbolic names to registers for nearest scaling
++     */
++    W           .req        x0
++    DST_W       .req        x1
++    SRC         .req        x2
++    VX          .req        x3
++    UNIT_X      .req        x4
++    SRC_WIDTH_FIXED .req    x5
++    MASK        .req        x6
++    TMP1        .req        x8
++    TMP2        .req        x9
++    DST_R       .req        x10
++    DUMMY       .req        x30
++
++    .macro pixld_src x:vararg
++        pixld_s x
++    .endm
++
++    sxtw        x0, w0
++    sxtw        x3, w3
++    sxtw        x4, w4
++    sxtw        x5, w5
++
++    stp         x29, x30, [sp, -16]!
++    mov         x29, sp
++    sub         sp, sp, 88 /* 64 (v8-v15) + 16 (x8, x9) + 8 (x10) */
++    sub         x29, x29, 64
++    st1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
++    st1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
++    stp         x8, x9, [x29, -80]
++    str         x10, [x29, -88] /* both epilogues must reload x10 from this slot */
++.else
++    /*
++     * Assign symbolic names to registers
++     */
++    W           .req        x0      /* width (is updated during processing) */
++    DST_W       .req        x1      /* destination buffer pointer for writes */
++    SRC         .req        x2      /* source buffer pointer */
++    MASK        .req        x3      /* mask pointer */
++    DST_R       .req        x4      /* destination buffer pointer for reads */
++    DUMMY       .req        x30
++
++    .macro pixld_src x:vararg
++        pixld x
++    .endm
++
++    sxtw        x0, w0
++
++    stp         x29, x30, [sp, -16]!
++    mov         x29, sp
++    sub         sp, sp, 64 /* only v8-v15 are saved on this path */
++    sub         x29, x29, 64
++    st1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
++    st1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
++.endif
++
++.if (((flags) & FLAG_DST_READWRITE) != 0)
++    .set dst_r_bpp, dst_w_bpp
++.else
++    .set dst_r_bpp, 0
++.endif
++.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
++    .set DEINTERLEAVE_32BPP_ENABLED, 1
++.else
++    .set DEINTERLEAVE_32BPP_ENABLED, 0
++.endif
++
++    .macro fetch_src_pixblock
++        pixld_src   pixblock_size, src_bpp, \
++                    (src_basereg - pixblock_size * src_bpp / 64), SRC
++    .endm
++
++    init
++    mov         DST_R, DST_W
++
++    cmp         W, #pixblock_size
++    blt         800f /* fewer than pixblock_size pixels: unaligned trailing path */
++
++    ensure_destination_ptr_alignment process_pixblock_head, \
++                                     process_pixblock_tail, \
++                                     process_pixblock_tail_head
++
++    subs        W, W, #pixblock_size
++    blt         700f
++
++    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
++    pixld_a     pixblock_size, dst_r_bpp, \
++                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
++    fetch_src_pixblock
++    pixld       pixblock_size, mask_bpp, \
++                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
++    process_pixblock_head
++    subs        W, W, #pixblock_size
++    blt         200f
++100:
++    process_pixblock_tail_head
++    subs        W, W, #pixblock_size
++    bge         100b
++200:
++    process_pixblock_tail
++    pixst_a     pixblock_size, dst_w_bpp, \
++                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
++700:
++    /* Process the remaining trailing pixels in the scanline (dst aligned) */
++    process_trailing_pixels 0, 1, \
++                            process_pixblock_head, \
++                            process_pixblock_tail, \
++                            process_pixblock_tail_head
++
++    cleanup
++.if use_nearest_scaling != 0
++    sub         x29, x29, 64
++    ld1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
++    ld1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
++    ldp         x8, x9, [x29, -80]
++    ldr         x10, [x29, -88] /* was -96: outside the 88-byte frame and not the slot written by the prologue */
++    mov         sp, x29
++    ldp         x29, x30, [sp], 16 
++    ret  /* exit */
++.else
++    sub         x29, x29, 64
++    ld1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
++    ld1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
++    mov         sp, x29
++    ldp         x29, x30, [sp], 16 
++    ret  /* exit */
++.endif
++800:
++    /* Process the remaining trailing pixels in the scanline (dst unaligned) */
++    process_trailing_pixels 0, 0, \
++                            process_pixblock_head, \
++                            process_pixblock_tail, \
++                            process_pixblock_tail_head
++
++    cleanup
++.if use_nearest_scaling != 0
++    sub         x29, x29, 64
++    ld1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
++    ld1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
++    ldp         x8, x9, [x29, -80]
++    ldr         x10, [x29, -88]
++    mov         sp, x29
++    ldp         x29, x30, [sp], 16 
++    ret  /* exit */
++
++    .unreq      DUMMY
++    .unreq      DST_R
++    .unreq      SRC
++    .unreq      W
++    .unreq      VX
++    .unreq      UNIT_X
++    .unreq      TMP1
++    .unreq      TMP2
++    .unreq      DST_W
++    .unreq      MASK
++    .unreq      SRC_WIDTH_FIXED
++
++.else
++    sub         x29, x29, 64
++    ld1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
++    ld1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
++    mov          sp, x29
++    ldp          x29, x30, [sp], 16 
++    ret  /* exit */
++
++    .unreq      DUMMY
++    .unreq      SRC
++    .unreq      MASK
++    .unreq      DST_R
++    .unreq      DST_W
++    .unreq      W
++.endif
++
++    .purgem     fetch_src_pixblock
++    .purgem     pixld_src
++
++    .endfunc
++.endm
++
++.macro generate_composite_function_single_scanline x:vararg /* forwards with use_nearest_scaling = 0 */
++    generate_composite_function_scanline 0, x
++.endm
++
++.macro generate_composite_function_nearest_scanline x:vararg /* forwards with use_nearest_scaling = 1 */
++    generate_composite_function_scanline 1, x
++.endm
++
++/* Default prologue/epilogue, nothing special needs to be done */
++
++.macro default_init /* intentionally empty */
++.endm
++
++.macro default_cleanup /* intentionally empty */
++.endm
++
++/*
++ * Prologue/epilogue variant for code that needs all the NEON registers
++ * (v8-v15 need to be saved/restored by callee according to ABI). Both are
++ * empty here: the function templates above already save/restore v8-v15.
++ */
++
++.macro default_init_need_all_regs
++.endm
++
++.macro default_cleanup_need_all_regs
++.endm
++
++/******************************************************************************/
++
++/*
++ * Conversion of 8 r5g6b5 pixels packed in 128-bit register (in)
++ * into a planar a8r8g8b8 format (with a, r, g, b color components
++ * stored into 64-bit registers out_a, out_r, out_g, out_b respectively).
++ *
++ * Warning: the conversion is destructive and the original
++ *          value (in) is lost.
++ */
++.macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b
++    shrn        &out_r&.8b, &in&.8h,    #8  /* R in the top 5 bits */
++    shrn        &out_g&.8b, &in&.8h,    #3  /* G in the top 6 bits */
++    sli         &in&.8h,    &in&.8h,    #5  /* in = (in << 5) with its low 5 bits kept */
++    movi        &out_a&.8b, #255            /* alpha is always 255 */
++    sri         &out_r&.8b, &out_r&.8b, #5  /* copy top 3 bits of R into the low 3 bits */
++    sri         &out_g&.8b, &out_g&.8b, #6  /* copy top 2 bits of G into the low 2 bits */
++    shrn        &out_b&.8b, &in&.8h,    #2  /* B in the top 5 bits, its top 3 bits below */
++.endm
++
++.macro convert_0565_to_x888 in, out_r, out_g, out_b /* same steps as convert_0565_to_8888 without the alpha output */
++    shrn        &out_r&.8b, &in&.8h,    #8  /* R in the top 5 bits */
++    shrn        &out_g&.8b, &in&.8h,    #3  /* G in the top 6 bits */
++    sli         &in&.8h,    &in&.8h,    #5  /* destructive: the input register is modified */
++    sri         &out_r&.8b, &out_r&.8b, #5  /* copy top 3 bits of R into the low 3 bits */
++    sri         &out_g&.8b, &out_g&.8b, #6  /* copy top 2 bits of G into the low 2 bits */
++    shrn        &out_b&.8b, &in&.8h,    #2  /* B in the top 5 bits, its top 3 bits below */
++.endm
++
++/*
++ * Conversion from planar 8-bit r, g, b color components (in 64-bit
++ * registers in_r, in_g, in_b respectively; alpha is not an input) into
++ * 8 r5g6b5 pixels packed in 128-bit register (out). Requires two temporary
++ * 128-bit registers (tmp1, tmp2)
++ */
++.macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2
++    ushll       &tmp1&.8h, &in_g&.8b, #7    /* widen G and shift left by 7 ... */
++    shl         &tmp1&.8h, &tmp1&.8h, #1    /* ... plus 1: G now sits in the top byte */
++    ushll       &out&.8h,  &in_r&.8b, #7
++    shl         &out&.8h,  &out&.8h,  #1    /* R in the top byte of out */
++    ushll       &tmp2&.8h, &in_b&.8b, #7
++    shl         &tmp2&.8h, &tmp2&.8h, #1    /* B in the top byte of tmp2 */
++    sri         &out&.8h, &tmp1&.8h, #5     /* keep top 5 bits (R), insert G below them */
++    sri         &out&.8h, &tmp2&.8h, #11    /* keep top 11 bits (R, G), insert B in the low 5 */
++.endm
++
++/*
++ * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels
++ * returned in (out0, out1) registers pair. Requires one temporary
++ * 64-bit register (tmp). 'out1' and 'in' may overlap, the original
++ * value from 'in' is lost
++ */
++.macro convert_four_0565_to_x888_packed in, out0, out1, tmp
++    shl         &out0&.4h, &in&.4h,   #5  /* G top 6 bits */
++    shl         &tmp&.4h,  &in&.4h,   #11 /* B top 5 bits */
++    sri         &in&.4h,   &in&.4h,   #5  /* R is ready in top bits */
++    sri         &out0&.4h, &out0&.4h, #6  /* G is ready in top bits */
++    sri         &tmp&.4h,  &tmp&.4h,  #5  /* B is ready in top bits */
++    ushr        &out1&.4h, &in&.4h,   #8  /* R is in place */
++    sri         &out0&.4h, &tmp&.4h,  #8  /* G & B is in place */
++    zip1        &tmp&.4h,  &out0&.4h, &out1&.4h  /* everything is in place */
++    zip2        &out1&.4h, &out0&.4h, &out1&.4h  /* out1 = interleaved upper halves (pixels 2 and 3) */
++    mov         &out0&.d[0], &tmp&.d[0]  /* out0 = interleaved lower halves (pixels 0 and 1) */
++.endm
+diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h
+index 73a5414..22c8ccc
+--- a/pixman/pixman-private.h
++++ b/pixman/pixman-private.h
+@@ -607,6 +607,11 @@ pixman_implementation_t *
+ _pixman_implementation_create_arm_neon (pixman_implementation_t *fallback);
+ #endif
+ 
++#ifdef USE_ARM_A64_NEON
++pixman_implementation_t *
++_pixman_implementation_create_arm_neon (pixman_implementation_t *fallback);
++#endif
++
+ #ifdef USE_MIPS_DSPR2
+ pixman_implementation_t *
+ _pixman_implementation_create_mips_dspr2 (pixman_implementation_t *fallback);
+-- 
+2.8.0
+
diff --git a/SDK/board/funkey/patches/sdl/0001-pixel-alpha-multiply.patch b/SDK/board/funkey/patches/sdl/0001-pixel-alpha-multiply.patch
new file mode 100644
index 0000000..29e6af0
--- /dev/null
+++ b/SDK/board/funkey/patches/sdl/0001-pixel-alpha-multiply.patch
@@ -0,0 +1,295 @@
+ SDL_blit_A.c |  270 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 270 insertions(+)
+
+diff --git a/src/video/SDL_blit_A.c b/src/video/SDL_blit_A.c
+index 219cdcc..d8e921e 100644
+--- a/src/video/SDL_blit_A.c
++++ b/src/video/SDL_blit_A.c
+@@ -27,3 +34,270 @@
++
++/*!  
++\brief Alpha adjustment table for custom blitter.
++
++The table provides values for a modified, non-linear 
++transfer function which maintain brightness.
++
++*/
++const unsigned int GFX_ALPHA_ADJUST_ARRAY[256] = {
++	0,  /* 0 */
++	15,  /* 1 */
++	22,  /* 2 */
++	27,  /* 3 */
++	31,  /* 4 */
++	35,  /* 5 */
++	39,  /* 6 */
++	42,  /* 7 */
++	45,  /* 8 */
++	47,  /* 9 */
++	50,  /* 10 */
++	52,  /* 11 */
++	55,  /* 12 */
++	57,  /* 13 */
++	59,  /* 14 */
++	61,  /* 15 */
++	63,  /* 16 */
++	65,  /* 17 */
++	67,  /* 18 */
++	69,  /* 19 */
++	71,  /* 20 */
++	73,  /* 21 */
++	74,  /* 22 */
++	76,  /* 23 */
++	78,  /* 24 */
++	79,  /* 25 */
++	81,  /* 26 */
++	82,  /* 27 */
++	84,  /* 28 */
++	85,  /* 29 */
++	87,  /* 30 */
++	88,  /* 31 */
++	90,  /* 32 */
++	91,  /* 33 */
++	93,  /* 34 */
++	94,  /* 35 */
++	95,  /* 36 */
++	97,  /* 37 */
++	98,  /* 38 */
++	99,  /* 39 */
++	100,  /* 40 */
++	102,  /* 41 */
++	103,  /* 42 */
++	104,  /* 43 */
++	105,  /* 44 */
++	107,  /* 45 */
++	108,  /* 46 */
++	109,  /* 47 */
++	110,  /* 48 */
++	111,  /* 49 */
++	112,  /* 50 */
++	114,  /* 51 */
++	115,  /* 52 */
++	116,  /* 53 */
++	117,  /* 54 */
++	118,  /* 55 */
++	119,  /* 56 */
++	120,  /* 57 */
++	121,  /* 58 */
++	122,  /* 59 */
++	123,  /* 60 */
++	124,  /* 61 */
++	125,  /* 62 */
++	126,  /* 63 */
++	127,  /* 64 */
++	128,  /* 65 */
++	129,  /* 66 */
++	130,  /* 67 */
++	131,  /* 68 */
++	132,  /* 69 */
++	133,  /* 70 */
++	134,  /* 71 */
++	135,  /* 72 */
++	136,  /* 73 */
++	137,  /* 74 */
++	138,  /* 75 */
++	139,  /* 76 */
++	140,  /* 77 */
++	141,  /* 78 */
++	141,  /* 79 */
++	142,  /* 80 */
++	143,  /* 81 */
++	144,  /* 82 */
++	145,  /* 83 */
++	146,  /* 84 */
++	147,  /* 85 */
++	148,  /* 86 */
++	148,  /* 87 */
++	149,  /* 88 */
++	150,  /* 89 */
++	151,  /* 90 */
++	152,  /* 91 */
++	153,  /* 92 */
++	153,  /* 93 */
++	154,  /* 94 */
++	155,  /* 95 */
++	156,  /* 96 */
++	157,  /* 97 */
++	158,  /* 98 */
++	158,  /* 99 */
++	159,  /* 100 */
++	160,  /* 101 */
++	161,  /* 102 */
++	162,  /* 103 */
++	162,  /* 104 */
++	163,  /* 105 */
++	164,  /* 106 */
++	165,  /* 107 */
++	165,  /* 108 */
++	166,  /* 109 */
++	167,  /* 110 */
++	168,  /* 111 */
++	168,  /* 112 */
++	169,  /* 113 */
++	170,  /* 114 */
++	171,  /* 115 */
++	171,  /* 116 */
++	172,  /* 117 */
++	173,  /* 118 */
++	174,  /* 119 */
++	174,  /* 120 */
++	175,  /* 121 */
++	176,  /* 122 */
++	177,  /* 123 */
++	177,  /* 124 */
++	178,  /* 125 */
++	179,  /* 126 */
++	179,  /* 127 */
++	180,  /* 128 */
++	181,  /* 129 */
++	182,  /* 130 */
++	182,  /* 131 */
++	183,  /* 132 */
++	184,  /* 133 */
++	184,  /* 134 */
++	185,  /* 135 */
++	186,  /* 136 */
++	186,  /* 137 */
++	187,  /* 138 */
++	188,  /* 139 */
++	188,  /* 140 */
++	189,  /* 141 */
++	190,  /* 142 */
++	190,  /* 143 */
++	191,  /* 144 */
++	192,  /* 145 */
++	192,  /* 146 */
++	193,  /* 147 */
++	194,  /* 148 */
++	194,  /* 149 */
++	195,  /* 150 */
++	196,  /* 151 */
++	196,  /* 152 */
++	197,  /* 153 */
++	198,  /* 154 */
++	198,  /* 155 */
++	199,  /* 156 */
++	200,  /* 157 */
++	200,  /* 158 */
++	201,  /* 159 */
++	201,  /* 160 */
++	202,  /* 161 */
++	203,  /* 162 */
++	203,  /* 163 */
++	204,  /* 164 */
++	205,  /* 165 */
++	205,  /* 166 */
++	206,  /* 167 */
++	206,  /* 168 */
++	207,  /* 169 */
++	208,  /* 170 */
++	208,  /* 171 */
++	209,  /* 172 */
++	210,  /* 173 */
++	210,  /* 174 */
++	211,  /* 175 */
++	211,  /* 176 */
++	212,  /* 177 */
++	213,  /* 178 */
++	213,  /* 179 */
++	214,  /* 180 */
++	214,  /* 181 */
++	215,  /* 182 */
++	216,  /* 183 */
++	216,  /* 184 */
++	217,  /* 185 */
++	217,  /* 186 */
++	218,  /* 187 */
++	218,  /* 188 */
++	219,  /* 189 */
++	220,  /* 190 */
++	220,  /* 191 */
++	221,  /* 192 */
++	221,  /* 193 */
++	222,  /* 194 */
++	222,  /* 195 */
++	223,  /* 196 */
++	224,  /* 197 */
++	224,  /* 198 */
++	225,  /* 199 */
++	225,  /* 200 */
++	226,  /* 201 */
++	226,  /* 202 */
++	227,  /* 203 */
++	228,  /* 204 */
++	228,  /* 205 */
++	229,  /* 206 */
++	229,  /* 207 */
++	230,  /* 208 */
++	230,  /* 209 */
++	231,  /* 210 */
++	231,  /* 211 */
++	232,  /* 212 */
++	233,  /* 213 */
++	233,  /* 214 */
++	234,  /* 215 */
++	234,  /* 216 */
++	235,  /* 217 */
++	235,  /* 218 */
++	236,  /* 219 */
++	236,  /* 220 */
++	237,  /* 221 */
++	237,  /* 222 */
++	238,  /* 223 */
++	238,  /* 224 */
++	239,  /* 225 */
++	240,  /* 226 */
++	240,  /* 227 */
++	241,  /* 228 */
++	241,  /* 229 */
++	242,  /* 230 */
++	242,  /* 231 */
++	243,  /* 232 */
++	243,  /* 233 */
++	244,  /* 234 */
++	244,  /* 235 */
++	245,  /* 236 */
++	245,  /* 237 */
++	246,  /* 238 */
++	246,  /* 239 */
++	247,  /* 240 */
++	247,  /* 241 */
++	248,  /* 242 */
++	248,  /* 243 */
++	249,  /* 244 */
++	249,  /* 245 */
++	250,  /* 246 */
++	250,  /* 247 */
++	251,  /* 248 */
++	251,  /* 249 */
++	252,  /* 250 */
++	252,  /* 251 */
++	253,  /* 252 */
++	253,  /* 253 */
++	254,  /* 254 */
++	255   /* 255 */
++};
++
+ /*
+   In Visual C, VC6 has mmintrin.h in the "Processor Pack" add-on.
+    Checking if _mm_free is #defined in malloc.h is is the only way to
+@@ -2679,6 +2985,7 @@ static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
+ 	int dstskip = info->d_skip;
+ 	SDL_PixelFormat *srcfmt = info->src;
+ 	SDL_PixelFormat *dstfmt = info->dst;
++	uint8_t alpha_multiply = srcfmt->alpha;
+ 
+ 	int  srcbpp;
+ 	int  dstbpp;
+@@ -2705,6 +3012,8 @@ static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
+ 		unsigned sA;
+ 		unsigned dA;
+ 		DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
++		sA=(sA*alpha_multiply)>>8;
++		sA=GFX_ALPHA_ADJUST_ARRAY[sA & 255];
+ 		if(sA) {
+ 		  DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
+ 		  ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
diff --git a/SDK/board/funkey/patches/sdl/003-alsa-fix-excessiveio.patch b/SDK/board/funkey/patches/sdl/003-alsa-fix-excessiveio.patch
new file mode 100644
index 0000000..46e9524
--- /dev/null
+++ b/SDK/board/funkey/patches/sdl/003-alsa-fix-excessiveio.patch
@@ -0,0 +1,26 @@
+Merge this bug fix as it can affect performance:
+https://github.com/OpenDingux/SDL/commit/e51100dce8da9099278dac9f5affbecf6396378b
+
+--- a/src/audio/alsa/SDL_alsa_audio.c 
++++ b/src/audio/alsa/SDL_alsa_audio.c 
+@@ -479,6 +479,10 @@
+ 		return(-1);
+ 	}
+ 
++	/* Switch to blocking mode for playback */
++	/* Note: this must happen before hw/sw params are set. */
++	SDL_NAME(snd_pcm_nonblock)(pcm_handle, 0);
++
+ 	/* Figure out what the hardware is capable of */
+ 	snd_pcm_hw_params_alloca(&hwparams);
+ 	status = SDL_NAME(snd_pcm_hw_params_any)(pcm_handle, hwparams);
+@@ -611,9 +615,6 @@
+ 	}
+ 	SDL_memset(mixbuf, spec->silence, spec->size);
+ 
+-	/* Switch to blocking mode for playback */
+-	SDL_NAME(snd_pcm_nonblock)(pcm_handle, 0);
+-
+ 	/* We're ready to rock and roll. :-) */
+ 	return(0);
+ }
diff --git a/SDK/board/funkey/patches/sdl/sdl-fix-kb-input.patch b/SDK/board/funkey/patches/sdl/sdl-fix-kb-input.patch
new file mode 100644
index 0000000..8f7db83
--- /dev/null
+++ b/SDK/board/funkey/patches/sdl/sdl-fix-kb-input.patch
@@ -0,0 +1,22 @@
+diff --git a/src/video/fbcon/SDL_fbevents.c b/src/video/fbcon/SDL_fbevents.c
+index 5e369a4..549a7ad 100644
+--- a/src/video/fbcon/SDL_fbevents.c
++++ b/src/video/fbcon/SDL_fbevents.c
+@@ -270,17 +270,6 @@ int FB_OpenKeyboard(_THIS)
+ 				fprintf(stderr, "vtpath = %s, fd = %d\n",
+ 					vtpath, keyboard_fd);
+ #endif /* DEBUG_KEYBOARD */
+-
+-				/* This needs to be our controlling tty
+-				   so that the kernel ioctl() calls work
+-				*/
+-				if ( keyboard_fd >= 0 ) {
+-					tty0_fd = open("/dev/tty", O_RDWR, 0);
+-					if ( tty0_fd >= 0 ) {
+-						ioctl(tty0_fd, TIOCNOTTY, 0);
+-						close(tty0_fd);
+-					}
+-				}
+ 			}
+ 		}
+  		if ( keyboard_fd < 0 ) {
diff --git a/SDK/configs/funkey_defconfig b/SDK/configs/funkey_defconfig
new file mode 100644
index 0000000..56e64a0
--- /dev/null
+++ b/SDK/configs/funkey_defconfig
@@ -0,0 +1,82 @@
+BR2_arm=y
+BR2_cortex_a7=y
+BR2_ARM_FPU_VFPV4=y
+BR2_DL_DIR="../download"
+BR2_CCACHE=y
+BR2_OPTIMIZE_FAST=y
+BR2_SHARED_STATIC_LIBS=y
+BR2_GLOBAL_PATCH_DIR="$(BR2_EXTERNAL_SDK_PATH)/board/funkey/patches"
+BR2_TOOLCHAIN_BUILDROOT_VENDOR="funkey"
+BR2_TOOLCHAIN_BUILDROOT_MUSL=y
+BR2_KERNEL_HEADERS_VERSION=y
+BR2_DEFAULT_KERNEL_VERSION="4.14.14"
+BR2_PACKAGE_HOST_LINUX_HEADERS_CUSTOM_4_14=y
+BR2_BINUTILS_VERSION_2_35_X=y
+BR2_GCC_VERSION_10_X=y
+BR2_TOOLCHAIN_BUILDROOT_CXX=y
+BR2_GCC_ENABLE_LTO=y
+BR2_PACKAGE_HOST_GDB=y
+BR2_PACKAGE_HOST_GDB_TUI=y
+BR2_GDB_VERSION_9_2=y
+BR2_TARGET_OPTIMIZATION="-fno-PIC -march=armv7-a+neon-vfpv4 -mtune=cortex-a7 -mfpu=neon-vfpv4 -mvectorize-with-neon-quad"
+BR2_INIT_NONE=y
+BR2_PACKAGE_BUSYBOX_SHOW_OTHERS=y
+BR2_PACKAGE_ALSA_UTILS=y
+BR2_PACKAGE_GSTREAMER1=y
+BR2_PACKAGE_GST1_PLUGINS_BASE=y
+BR2_PACKAGE_MPG123=y
+BR2_PACKAGE_LZOP=y
+BR2_PACKAGE_E2FSPROGS=y
+BR2_PACKAGE_E2FSPROGS_RESIZE2FS=y
+BR2_PACKAGE_SDL_GFX=y
+BR2_PACKAGE_SDL_IMAGE=y
+BR2_PACKAGE_SDL_IMAGE_GIF=y
+BR2_PACKAGE_SDL_IMAGE_JPEG=y
+BR2_PACKAGE_SDL_IMAGE_PNG=y
+BR2_PACKAGE_SDL_MIXER=y
+BR2_PACKAGE_SDL_NET=y
+BR2_PACKAGE_SDL_SOUND=y
+BR2_PACKAGE_SDL_TTF=y
+BR2_PACKAGE_PARTED=y
+BR2_PACKAGE_UBOOT_TOOLS=y
+BR2_PACKAGE_UBOOT_TOOLS_MKIMAGE=y
+BR2_PACKAGE_UBOOT_TOOLS_MKENVIMAGE=y
+BR2_PACKAGE_LUA=y
+BR2_PACKAGE_LIBSAMPLERATE=y
+BR2_PACKAGE_LIBSNDFILE=y
+BR2_PACKAGE_OPENAL=y
+BR2_PACKAGE_TINYALSA=y
+BR2_PACKAGE_TREMOR=y
+BR2_PACKAGE_LIBARCHIVE=y
+BR2_PACKAGE_LIBARCHIVE_BSDTAR=y
+BR2_PACKAGE_LIBARCHIVE_BSDCPIO=y
+BR2_PACKAGE_LIBARCHIVE_BSDCAT=y
+BR2_PACKAGE_SQLITE=y
+BR2_PACKAGE_LIBCONFIG=y
+BR2_PACKAGE_GIFLIB=y
+BR2_PACKAGE_LIBQRENCODE=y
+BR2_PACKAGE_LIBQRENCODE_TOOLS=y
+BR2_PACKAGE_PIXMAN=y
+BR2_PACKAGE_TINYXML2=y
+BR2_PACKAGE_LIBNL=y
+BR2_PACKAGE_LIBRSYNC=y
+BR2_PACKAGE_FMT=y
+BR2_PACKAGE_ICU=y
+BR2_PACKAGE_ACL=y
+BR2_PACKAGE_PROCPS_NG=y
+# BR2_TARGET_ROOTFS_TAR is not set
+BR2_PACKAGE_HOST_DOSFSTOOLS=y
+BR2_PACKAGE_HOST_DTC=y
+BR2_PACKAGE_HOST_E2FSPROGS=y
+BR2_PACKAGE_HOST_ENVIRONMENT_SETUP=y
+BR2_PACKAGE_HOST_GENIMAGE=y
+BR2_PACKAGE_HOST_KMOD=y
+BR2_PACKAGE_HOST_MKPASSWD=y
+BR2_PACKAGE_HOST_MTOOLS=y
+BR2_PACKAGE_HOST_SQUASHFS=y
+BR2_PACKAGE_HOST_UBOOT_TOOLS=y
+BR2_PACKAGE_LIBOPK=y
+BR2_PACKAGE_LIBXDGMIME=y
+BR2_PACKAGE_AGG=y
+BR2_PACKAGE_FLUIDLITE=y
+BR2_PACKAGE_LIBMIKMOD=y
diff --git a/SDK/external.desc b/SDK/external.desc
new file mode 100644
index 0000000..1623a12
--- /dev/null
+++ b/SDK/external.desc
@@ -0,0 +1,2 @@
+name: SDK
+desc: FunKey SDK
diff --git a/SDK/external.mk b/SDK/external.mk
new file mode 100644
index 0000000..7433fb4
--- /dev/null
+++ b/SDK/external.mk
@@ -0,0 +1 @@
+include $(sort $(wildcard $(BR2_EXTERNAL_SDK_PATH)/package/*/*.mk))
diff --git a/SDK/package/agg/0001-Fix-non-terminating-loop-conditions-when-len-1.patch b/SDK/package/agg/0001-Fix-non-terminating-loop-conditions-when-len-1.patch
new file mode 100644
index 0000000..eaf0467
--- /dev/null
+++ b/SDK/package/agg/0001-Fix-non-terminating-loop-conditions-when-len-1.patch
@@ -0,0 +1,81 @@
+From efd33aad5e69f36ab343b1f28839a55db4538104 Mon Sep 17 00:00:00 2001
+From: Tom Hughes <tom@compton.nu>
+Date: Sun, 19 May 2013 10:55:37 +0100
+Subject: [PATCH 01/15] Fix non-terminating loop conditions when len=1
+
+-   while(abs(sx - lp.x1) + abs(sy - lp.y1) > lp2.len)
++   while(abs(sx - lp.x1) + abs(sy - lp.y1) > 1 + lp2.len)
+    {
+        sx = (lp.x1 + sx) >> 1;
+        sy = (lp.y1 + sy) >> 1;
+    }
+---
+ include/agg_renderer_outline_aa.h    | 8 ++++----
+ include/agg_renderer_outline_image.h | 4 ++--
+ 2 files changed, 6 insertions(+), 6 deletions(-)
+
+diff --git a/include/agg_renderer_outline_aa.h b/include/agg_renderer_outline_aa.h
+index ce25a2e..cb2aa00 100644
+--- a/include/agg_renderer_outline_aa.h
++++ b/include/agg_renderer_outline_aa.h
+@@ -1659,7 +1659,7 @@ namespace agg
+                         }
+                         else
+                         {
+-                            while(abs(sx - lp.x1) + abs(sy - lp.y1) > lp2.len)
++                            while(abs(sx - lp.x1) + abs(sy - lp.y1) > 1 + lp2.len)
+                             {
+                                 sx = (lp.x1 + sx) >> 1;
+                                 sy = (lp.y1 + sy) >> 1;
+@@ -1726,7 +1726,7 @@ namespace agg
+                         }
+                         else
+                         {
+-                            while(abs(ex - lp.x2) + abs(ey - lp.y2) > lp2.len)
++                            while(abs(ex - lp.x2) + abs(ey - lp.y2) > 1 + lp2.len)
+                             {
+                                 ex = (lp.x2 + ex) >> 1;
+                                 ey = (lp.y2 + ey) >> 1;
+@@ -1798,7 +1798,7 @@ namespace agg
+                         }
+                         else
+                         {
+-                            while(abs(sx - lp.x1) + abs(sy - lp.y1) > lp2.len)
++                            while(abs(sx - lp.x1) + abs(sy - lp.y1) > 1 + lp2.len)
+                             {
+                                 sx = (lp.x1 + sx) >> 1;
+                                 sy = (lp.y1 + sy) >> 1;
+@@ -1811,7 +1811,7 @@ namespace agg
+                         }
+                         else
+                         {
+-                            while(abs(ex - lp.x2) + abs(ey - lp.y2) > lp2.len)
++                            while(abs(ex - lp.x2) + abs(ey - lp.y2) > 1 + lp2.len)
+                             {
+                                 ex = (lp.x2 + ex) >> 1;
+                                 ey = (lp.y2 + ey) >> 1;
+diff --git a/include/agg_renderer_outline_image.h b/include/agg_renderer_outline_image.h
+index fbfac10..66d2b9a 100644
+--- a/include/agg_renderer_outline_image.h
++++ b/include/agg_renderer_outline_image.h
+@@ -969,7 +969,7 @@ namespace agg
+                         }
+                         else
+                         {
+-                            while(abs(sx - lp.x1) + abs(sy - lp.y1) > lp2.len)
++                            while(abs(sx - lp.x1) + abs(sy - lp.y1) > 1 + lp2.len)
+                             {
+                                 sx = (lp.x1 + sx) >> 1;
+                                 sy = (lp.y1 + sy) >> 1;
+@@ -982,7 +982,7 @@ namespace agg
+                         }
+                         else
+                         {
+-                            while(abs(ex - lp.x2) + abs(ey - lp.y2) > lp2.len)
++                            while(abs(ex - lp.x2) + abs(ey - lp.y2) > 1 + lp2.len)
+                             {
+                                 ex = (lp.x2 + ex) >> 1;
+                                 ey = (lp.y2 + ey) >> 1;
+-- 
+1.8.1.4
+
diff --git a/SDK/package/agg/0001-autogen.patch b/SDK/package/agg/0001-autogen.patch
new file mode 100644
index 0000000..b773f12
--- /dev/null
+++ b/SDK/package/agg/0001-autogen.patch
@@ -0,0 +1,15 @@
+Author: Andrea Veri <and@debian.org>
+Description: Disable configure's run from the autogen file.
+
+Index: agg-2.5+dfsg1/autogen.sh
+===================================================================
+--- a/autogen.sh	2007-10-11 00:06:16.000000000 +0200
++++ b/autogen.sh	2012-05-01 16:57:37.916862783 +0200
+@@ -18,6 +18,6 @@
+ automake --foreign --add-missing --ignore-deps
+ 
+ # and finally invoke our new configure
+-./configure $*
++[ -n "$NOCONFIGURE" ] || ./configure $*
+ 
+ # end
diff --git a/SDK/package/agg/0002-Cure-recursion-by-aborting-if-the-co-ordinates-are-t.patch b/SDK/package/agg/0002-Cure-recursion-by-aborting-if-the-co-ordinates-are-t.patch
new file mode 100644
index 0000000..4fe7434
--- /dev/null
+++ b/SDK/package/agg/0002-Cure-recursion-by-aborting-if-the-co-ordinates-are-t.patch
@@ -0,0 +1,40 @@
+From e269fe9b62af6fe314cebe0ee7a6d6d1a4a84d1c Mon Sep 17 00:00:00 2001
+From: Tom Hughes <tom@compton.nu>
+Date: Sun, 19 May 2013 11:03:26 +0100
+Subject: [PATCH 02/15] Cure recursion by aborting if the co-ordinates are too
+ big to handle
+
+---
+ include/agg_rasterizer_cells_aa.h | 9 ++++++++-
+ 1 file changed, 8 insertions(+), 1 deletion(-)
+
+diff --git a/include/agg_rasterizer_cells_aa.h b/include/agg_rasterizer_cells_aa.h
+index d3bb138..3a616d9 100644
+--- a/include/agg_rasterizer_cells_aa.h
++++ b/include/agg_rasterizer_cells_aa.h
+@@ -40,7 +40,8 @@
+ #define AGG_RASTERIZER_CELLS_AA_INCLUDED
+ 
+ #include <string.h>
+-#include <math.h>
++#include <cstdlib>
++#include <limits>
+ #include "agg_math.h"
+ #include "agg_array.h"
+ 
+@@ -333,6 +334,12 @@ namespace agg
+         {
+             int cx = (x1 + x2) >> 1;
+             int cy = (y1 + y2) >> 1;
++
++            // Bail if values are so large they are likely to wrap
++            if ((std::abs(x1) >= std::numeric_limits<int>::max()/2) || (std::abs(y1) >= std::numeric_limits<int>::max()/2) ||
++                (std::abs(x2) >= std::numeric_limits<int>::max()/2) || (std::abs(y2) >= std::numeric_limits<int>::max()/2))
++                    return;
++
+             line(x1, y1, cx, cy);
+             line(cx, cy, x2, y2);
+         }
+-- 
+1.8.1.4
+
diff --git a/SDK/package/agg/0002-caca.patch b/SDK/package/agg/0002-caca.patch
new file mode 100644
index 0000000..f98a573
--- /dev/null
+++ b/SDK/package/agg/0002-caca.patch
@@ -0,0 +1,34 @@
+--- a/configure.in
++++ b/configure.in
+@@ -39,7 +39,7 @@
+ # used as platform library in examples:
+ # todo, make the PREFERED_PLATFORM selectable, after the set of possible 
+ # Platforms to link the examples have been evaluated.
+-PREFERED_PLATFORM=X11
++PREFERED_PLATFORM=sdl
+ case "$host" in
+   *darwin* )
+     OSX_LIBS="-framework Carbon -framework QuickTime"
+@@ -120,9 +120,7 @@
+ if test "$no_x" = "yes"; then
+   AC_MSG_WARN([*** X11 not found! Omitting X11 layer.])
+ fi
+-AM_CONDITIONAL(ENABLE_X11,[test x$no_x = x -a xno != x$enable_platform -a x$win32_host != xyes])
+-AC_SUBST(x_includes)
+-AC_SUBST(x_libraries)
++AM_CONDITIONAL(ENABLE_X11,[false]) dnl always false: the X11 platform layer is never built
+ dnl ###############################################
+ 
+ dnl Settung up library version
+
+--- a/include/agg_renderer_outline_aa.h
++++ b/include/agg_renderer_outline_aa.h
+@@ -1375,7 +1375,7 @@
+         //---------------------------------------------------------------------
+         void profile(const line_profile_aa& prof) { m_profile = &prof; }
+         const line_profile_aa& profile() const { return *m_profile; }
+-        line_profile_aa& profile() { return *m_profile; }
++//        line_profile_aa& profile() { return *m_profile; }
+ 
+         //---------------------------------------------------------------------
+         int subpixel_width() const { return m_profile->subpixel_width(); }
diff --git a/SDK/package/agg/0003-Get-coordinates-from-previous-vertex-if-last-command.patch b/SDK/package/agg/0003-Get-coordinates-from-previous-vertex-if-last-command.patch
new file mode 100644
index 0000000..b12684d
--- /dev/null
+++ b/SDK/package/agg/0003-Get-coordinates-from-previous-vertex-if-last-command.patch
@@ -0,0 +1,30 @@
+From 032d5342430f4c5dfbc34a2817d67386a14fd51b Mon Sep 17 00:00:00 2001
+From: Tom Hughes <tom@compton.nu>
+Date: Sun, 19 May 2013 11:40:49 +0100
+Subject: [PATCH 03/15] Get coordinates from previous vertex if last command is
+ path_cmd_end_poly
+
+---
+ include/agg_path_storage.h | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+diff --git a/include/agg_path_storage.h b/include/agg_path_storage.h
+index 7be7393..8922fc8 100644
+--- a/include/agg_path_storage.h
++++ b/include/agg_path_storage.h
+@@ -878,6 +878,12 @@ namespace agg
+                 *x += x2;
+                 *y += y2;
+             }
++            else if (!is_stop(m_vertices.last_command()) &&
++                     is_vertex(m_vertices.prev_vertex(&x2, &y2)))
++            {
++                *x += x2;
++                *y += y2;
++            }
+         }
+     }
+ 
+-- 
+1.8.1.4
+
diff --git a/SDK/package/agg/0004-Make-rasterizer_outline_aa-ignore-close_polygon-when.patch b/SDK/package/agg/0004-Make-rasterizer_outline_aa-ignore-close_polygon-when.patch
new file mode 100644
index 0000000..0cecaf7
--- /dev/null
+++ b/SDK/package/agg/0004-Make-rasterizer_outline_aa-ignore-close_polygon-when.patch
@@ -0,0 +1,138 @@
+From b9c4b1c72b4ad6b24c37f402d3eec39ef393b0eb Mon Sep 17 00:00:00 2001
+From: Tom Hughes <tom@compton.nu>
+Date: Sun, 19 May 2013 14:17:43 +0100
+Subject: [PATCH 04/15] Make rasterizer_outline_aa ignore close_polygon when
+ vertex count < 3
+
+---
+ include/agg_rasterizer_outline_aa.h | 107 ++++++++++++++++++------------------
+ 1 file changed, 52 insertions(+), 55 deletions(-)
+
+diff --git a/include/agg_rasterizer_outline_aa.h b/include/agg_rasterizer_outline_aa.h
+index 4d6dd57..24301d5 100644
+--- a/include/agg_rasterizer_outline_aa.h
++++ b/include/agg_rasterizer_outline_aa.h
+@@ -333,68 +333,65 @@ namespace agg
+         int y2;
+         int lprev;
+ 
+-        if(close_polygon)
++        if(close_polygon && (m_src_vertices.size() >= 3))
+         {
+-            if(m_src_vertices.size() >= 3)
++            dv.idx = 2;
++
++            v     = &m_src_vertices[m_src_vertices.size() - 1];
++            x1    = v->x;
++            y1    = v->y;
++            lprev = v->len;
++
++            v  = &m_src_vertices[0];
++            x2 = v->x;
++            y2 = v->y;
++            dv.lcurr = v->len;
++            line_parameters prev(x1, y1, x2, y2, lprev);
++
++            v = &m_src_vertices[1];
++            dv.x1    = v->x;
++            dv.y1    = v->y;
++            dv.lnext = v->len;
++            dv.curr = line_parameters(x2, y2, dv.x1, dv.y1, dv.lcurr);
++
++            v = &m_src_vertices[dv.idx];
++            dv.x2 = v->x;
++            dv.y2 = v->y;
++            dv.next = line_parameters(dv.x1, dv.y1, dv.x2, dv.y2, dv.lnext);
++
++            dv.xb1 = 0;
++            dv.yb1 = 0;
++            dv.xb2 = 0;
++            dv.yb2 = 0;
++
++            switch(m_line_join)
+             {
+-                dv.idx = 2;
+-
+-                v     = &m_src_vertices[m_src_vertices.size() - 1];
+-                x1    = v->x;
+-                y1    = v->y;
+-                lprev = v->len;
+-
+-                v  = &m_src_vertices[0];
+-                x2 = v->x;
+-                y2 = v->y;
+-                dv.lcurr = v->len;
+-                line_parameters prev(x1, y1, x2, y2, lprev);
+-
+-                v = &m_src_vertices[1];
+-                dv.x1    = v->x;
+-                dv.y1    = v->y;
+-                dv.lnext = v->len;
+-                dv.curr = line_parameters(x2, y2, dv.x1, dv.y1, dv.lcurr);
+-
+-                v = &m_src_vertices[dv.idx];
+-                dv.x2 = v->x;
+-                dv.y2 = v->y;
+-                dv.next = line_parameters(dv.x1, dv.y1, dv.x2, dv.y2, dv.lnext);
+-
+-                dv.xb1 = 0;
+-                dv.yb1 = 0;
+-                dv.xb2 = 0;
+-                dv.yb2 = 0;
+-
+-                switch(m_line_join)
+-                {
+-                case outline_no_join:
+-                    dv.flags = 3;
+-                    break;
++            case outline_no_join:
++                dv.flags = 3;
++                break;
+ 
+-                case outline_miter_join:
+-                case outline_round_join:
+-                    dv.flags = 
+-                            (prev.diagonal_quadrant() == dv.curr.diagonal_quadrant()) |
+-                        ((dv.curr.diagonal_quadrant() == dv.next.diagonal_quadrant()) << 1);
+-                    break;
++            case outline_miter_join:
++            case outline_round_join:
++                dv.flags = 
++                        (prev.diagonal_quadrant() == dv.curr.diagonal_quadrant()) |
++                    ((dv.curr.diagonal_quadrant() == dv.next.diagonal_quadrant()) << 1);
++                break;
+ 
+-                case outline_miter_accurate_join:
+-                    dv.flags = 0;
+-                    break;
+-                }
++            case outline_miter_accurate_join:
++                dv.flags = 0;
++                break;
++            }
+ 
+-                if((dv.flags & 1) == 0 && m_line_join != outline_round_join)
+-                {
+-                    bisectrix(prev, dv.curr, &dv.xb1, &dv.yb1);
+-                }
++            if((dv.flags & 1) == 0 && m_line_join != outline_round_join)
++            {
++                bisectrix(prev, dv.curr, &dv.xb1, &dv.yb1);
++            }
+ 
+-                if((dv.flags & 2) == 0 && m_line_join != outline_round_join)
+-                {
+-                    bisectrix(dv.curr, dv.next, &dv.xb2, &dv.yb2);
+-                }
+-                draw(dv, 0, m_src_vertices.size());
++            if((dv.flags & 2) == 0 && m_line_join != outline_round_join)
++            {
++                bisectrix(dv.curr, dv.next, &dv.xb2, &dv.yb2);
+             }
++            draw(dv, 0, m_src_vertices.size());
+         }
+         else
+         {
+-- 
+1.8.1.4
+
diff --git a/SDK/package/agg/0005-Remove-VC-6-workaround.patch b/SDK/package/agg/0005-Remove-VC-6-workaround.patch
new file mode 100644
index 0000000..f38f7c4
--- /dev/null
+++ b/SDK/package/agg/0005-Remove-VC-6-workaround.patch
@@ -0,0 +1,52 @@
+From b8c43fb0ba13af0cc2b1050f48f81d76d2fdf0c7 Mon Sep 17 00:00:00 2001
+From: Tom Hughes <tom@compton.nu>
+Date: Sun, 19 May 2013 15:04:05 +0100
+Subject: [PATCH 05/15] Remove VC++ 6 workaround
+
+---
+ include/agg_renderer_scanline.h | 29 +----------------------------
+ 1 file changed, 1 insertion(+), 28 deletions(-)
+
+diff --git a/include/agg_renderer_scanline.h b/include/agg_renderer_scanline.h
+index c3bb6f0..c27ca60 100644
+--- a/include/agg_renderer_scanline.h
++++ b/include/agg_renderer_scanline.h
+@@ -79,34 +79,7 @@ namespace agg
+             sl.reset(ras.min_x(), ras.max_x());
+             while(ras.sweep_scanline(sl))
+             {
+-                //render_scanline_aa_solid(sl, ren, ren_color);
+-
+-                // This code is equivalent to the above call (copy/paste). 
+-                // It's just a "manual" optimization for old compilers,
+-                // like Microsoft Visual C++ v6.0
+-                //-------------------------------
+-                int y = sl.y();
+-                unsigned num_spans = sl.num_spans();
+-                typename Scanline::const_iterator span = sl.begin();
+-
+-                for(;;)
+-                {
+-                    int x = span->x;
+-                    if(span->len > 0)
+-                    {
+-                        ren.blend_solid_hspan(x, y, (unsigned)span->len, 
+-                                              ren_color, 
+-                                              span->covers);
+-                    }
+-                    else
+-                    {
+-                        ren.blend_hline(x, y, (unsigned)(x - span->len - 1), 
+-                                        ren_color, 
+-                                        *(span->covers));
+-                    }
+-                    if(--num_spans == 0) break;
+-                    ++span;
+-                }
++                render_scanline_aa_solid(sl, ren, ren_color);
+             }
+         }
+     }
+-- 
+1.8.1.4
+
diff --git a/SDK/package/agg/0006-Implement-grain-merge-blending-mode-GIMP.patch b/SDK/package/agg/0006-Implement-grain-merge-blending-mode-GIMP.patch
new file mode 100644
index 0000000..f1e465b
--- /dev/null
+++ b/SDK/package/agg/0006-Implement-grain-merge-blending-mode-GIMP.patch
@@ -0,0 +1,85 @@
+From 9422570f4e099a834fc43619f7b2a7eb6b442e25 Mon Sep 17 00:00:00 2001
+From: Tom Hughes <tom@compton.nu>
+Date: Sun, 19 May 2013 15:31:01 +0100
+Subject: [PATCH 06/15] Implement grain-merge blending mode (GIMP)
+
+---
+ include/agg_pixfmt_rgba.h | 42 ++++++++++++++++++++++++++++++++++++++++--
+ 1 file changed, 40 insertions(+), 2 deletions(-)
+
+diff --git a/include/agg_pixfmt_rgba.h b/include/agg_pixfmt_rgba.h
+index 79d10dc..f576ce4 100644
+--- a/include/agg_pixfmt_rgba.h
++++ b/include/agg_pixfmt_rgba.h
+@@ -1401,9 +1401,46 @@ namespace agg
+         }
+     };
+ 
++    //================================================comp_op_rgba_grain_merge
++    template <typename ColorT, typename Order> struct comp_op_rgba_grain_merge
++    {
++        typedef ColorT color_type;
++        typedef Order order_type;
++        typedef typename color_type::value_type value_type;
++        typedef typename color_type::calc_type calc_type;
++        typedef typename color_type::long_type long_type;
++        enum base_scale_e
++        {
++            base_shift = color_type::base_shift,
++            base_mask  = color_type::base_mask
++        };
+ 
++        // E = I + M - 128 per colour channel, clamped to [0, 255]; source is scaled by cover first, and nothing is written when the scaled sa is 0.
++        static AGG_INLINE void blend_pix(value_type* p,
++                                         unsigned sr, unsigned sg, unsigned sb,
++                                         unsigned sa, unsigned cover)
++        {
+ 
+-
++            if(cover < 255)
++            {
++                sr = (sr * cover + 255) >> 8;
++                sg = (sg * cover + 255) >> 8;
++                sb = (sb * cover + 255) >> 8;
++                sa = (sa * cover + 255) >> 8;
++            }
++            if(sa)
++            {
++                calc_type da = p[Order::A];
++                int dr = sr + p[Order::R] - 128;
++                int dg = sg + p[Order::G] - 128;
++                int db = sb + p[Order::B] - 128;
++                p[Order::R] = (value_type)(dr < 0 ? 0 : (dr > 255 ? 255 : dr));
++                p[Order::G] = (value_type)(dg < 0 ? 0 : (dg > 255 ? 255 : dg));
++                p[Order::B] = (value_type)(db < 0 ? 0 : (db > 255 ? 255 : db));
++                p[Order::A] = (value_type)(sa + da - ((sa * da + base_mask) >> base_shift));
++            }
++        }
++    };
+ 
+     //======================================================comp_op_table_rgba
+     template<class ColorT, class Order> struct comp_op_table_rgba
+@@ -1451,6 +1488,7 @@ namespace agg
+         comp_op_rgba_contrast   <ColorT,Order>::blend_pix,
+         comp_op_rgba_invert     <ColorT,Order>::blend_pix,
+         comp_op_rgba_invert_rgb <ColorT,Order>::blend_pix,
++        comp_op_rgba_grain_merge<ColorT,Order>::blend_pix,
+         0
+     };
+ 
+@@ -1486,6 +1524,7 @@ namespace agg
+         comp_op_contrast,      //----comp_op_contrast
+         comp_op_invert,        //----comp_op_invert
+         comp_op_invert_rgb,    //----comp_op_invert_rgb
++        comp_op_grain_merge,   //----comp_op_grain_merge
+ 
+         end_of_comp_op_e
+     };
+@@ -2908,4 +2947,3 @@ namespace agg
+ }
+ 
+ #endif
+-
+-- 
+1.8.1.4
+
diff --git a/SDK/package/agg/0007-Implement-grain-extract-blending-mode-GIMP.patch b/SDK/package/agg/0007-Implement-grain-extract-blending-mode-GIMP.patch
new file mode 100644
index 0000000..cafb36e
--- /dev/null
+++ b/SDK/package/agg/0007-Implement-grain-extract-blending-mode-GIMP.patch
@@ -0,0 +1,85 @@
+From abd440342e166a90d08610bf5b31d2a8357eafbe Mon Sep 17 00:00:00 2001
+From: Tom Hughes <tom@compton.nu>
+Date: Sun, 19 May 2013 15:43:18 +0100
+Subject: [PATCH 07/15] Implement grain-extract blending mode (GIMP)
+
+---
+ include/agg_pixfmt_rgba.h | 48 +++++++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 48 insertions(+)
+
+diff --git a/include/agg_pixfmt_rgba.h b/include/agg_pixfmt_rgba.h
+index f576ce4..42f0a05 100644
+--- a/include/agg_pixfmt_rgba.h
++++ b/include/agg_pixfmt_rgba.h
+@@ -1442,6 +1442,52 @@ namespace agg
+         }
+     };
+ 
++    //==============================================comp_op_rgba_grain_extract
++    template <typename ColorT, typename Order> struct comp_op_rgba_grain_extract
++    {
++        typedef ColorT color_type;
++        typedef Order order_type;
++        typedef typename color_type::value_type value_type;
++        typedef typename color_type::calc_type calc_type;
++        typedef typename color_type::long_type long_type;
++        enum base_scale_e
++        {
++            base_shift = color_type::base_shift,
++            base_mask  = color_type::base_mask
++        };
++
++        // E = I - M + 128 per colour channel, clamped to [0, 255]; new alpha da = (dest alpha * sa + 255) >> 8, and RGB is scaled by da when da < 255.
++        static AGG_INLINE void blend_pix(value_type* p,
++                                         unsigned sr, unsigned sg, unsigned sb,
++                                         unsigned sa, unsigned cover) // NOTE(review): cover is never read in this body
++        {
++            calc_type da = (p[Order::A] * sa + 255) >> 8;
++
++            int dr = p[Order::R] - sr + 128;
++            int dg = p[Order::G] - sg + 128;
++            int db = p[Order::B] - sb + 128;
++
++            dr = dr < 0 ? 0 : (dr > 255 ? 255 : dr);
++            dg = dg < 0 ? 0 : (dg > 255 ? 255 : dg);
++            db = db < 0 ? 0 : (db > 255 ? 255 : db);
++
++            p[Order::A] = da;
++
++            if(da < 255)
++            {
++                p[Order::R] = (dr * da + 255) >> 8;
++                p[Order::G] = (dg * da + 255) >> 8;
++                p[Order::B] = (db * da + 255) >> 8;
++            }
++            else
++            {
++                p[Order::R] = dr;
++                p[Order::G] = dg;
++                p[Order::B] = db;
++            }
++        }
++    };
++
+     //======================================================comp_op_table_rgba
+     template<class ColorT, class Order> struct comp_op_table_rgba
+     {
+@@ -1489,6 +1535,7 @@ namespace agg
+         comp_op_rgba_invert     <ColorT,Order>::blend_pix,
+         comp_op_rgba_invert_rgb <ColorT,Order>::blend_pix,
+         comp_op_rgba_grain_merge<ColorT,Order>::blend_pix,
++        comp_op_rgba_grain_extract<ColorT,Order>::blend_pix,
+         0
+     };
+ 
+@@ -1525,6 +1572,7 @@ namespace agg
+         comp_op_invert,        //----comp_op_invert
+         comp_op_invert_rgb,    //----comp_op_invert_rgb
+         comp_op_grain_merge,   //----comp_op_grain_merge
++        comp_op_grain_extract, //----comp_op_grain_extract
+ 
+         end_of_comp_op_e
+     };
+-- 
+1.8.1.4
+
diff --git a/SDK/package/agg/0008-Declare-multiplication-and-division-operators-as-con.patch b/SDK/package/agg/0008-Declare-multiplication-and-division-operators-as-con.patch
new file mode 100644
index 0000000..0ed92ee
--- /dev/null
+++ b/SDK/package/agg/0008-Declare-multiplication-and-division-operators-as-con.patch
@@ -0,0 +1,36 @@
+From 2688af280836b95908d3cfd6915510d55de673b8 Mon Sep 17 00:00:00 2001
+From: Tom Hughes <tom@compton.nu>
+Date: Sun, 19 May 2013 16:15:01 +0100
+Subject: [PATCH 08/15] Declare multiplication and division operators as const
+
+---
+ include/agg_trans_affine.h | 8 ++++----
+ 1 file changed, 4 insertions(+), 4 deletions(-)
+
+diff --git a/include/agg_trans_affine.h b/include/agg_trans_affine.h
+index a662099..2f602a0 100644
+--- a/include/agg_trans_affine.h
++++ b/include/agg_trans_affine.h
+@@ -216,15 +216,15 @@ namespace agg
+         }
+ 
+         // Multiply the matrix by another one and return
+-        // the result in a separete matrix.
+-        trans_affine operator * (const trans_affine& m)
++        // the result in a separate matrix.
++        trans_affine operator * (const trans_affine& m) const
+         {
+             return trans_affine(*this).multiply(m);
+         }
+ 
+         // Multiply the matrix by inverse of another one 
+-        // and return the result in a separete matrix.
+-        trans_affine operator / (const trans_affine& m)
++        // and return the result in a separate matrix.
++        trans_affine operator / (const trans_affine& m) const
+         {
+             return trans_affine(*this).multiply_inv(m);
+         }
+-- 
+1.8.1.4
+
diff --git a/SDK/package/agg/0009-Add-a-static-identity-transformation.patch b/SDK/package/agg/0009-Add-a-static-identity-transformation.patch
new file mode 100644
index 0000000..01555cb
--- /dev/null
+++ b/SDK/package/agg/0009-Add-a-static-identity-transformation.patch
@@ -0,0 +1,37 @@
+From be9ed90897bc43b4547a3a1f8046827caaf13b4c Mon Sep 17 00:00:00 2001
+From: Tom Hughes <tom@compton.nu>
+Date: Sun, 19 May 2013 16:15:36 +0100
+Subject: [PATCH 09/15] Add a static identity transformation
+
+---
+ include/agg_trans_affine.h | 1 +
+ src/agg_trans_affine.cpp   | 1 +
+ 2 files changed, 2 insertions(+)
+
+diff --git a/include/agg_trans_affine.h b/include/agg_trans_affine.h
+index 2f602a0..67fe5ca 100644
+--- a/include/agg_trans_affine.h
++++ b/include/agg_trans_affine.h
+@@ -92,6 +92,7 @@ namespace agg
+     //----------------------------------------------------------------------
+     struct trans_affine
+     {
++        static const trans_affine identity;
+         double sx, shy, shx, sy, tx, ty;
+ 
+         //------------------------------------------ Construction
+diff --git a/src/agg_trans_affine.cpp b/src/agg_trans_affine.cpp
+index aca18c2..b3d9bc0 100644
+--- a/src/agg_trans_affine.cpp
++++ b/src/agg_trans_affine.cpp
+@@ -28,6 +28,7 @@
+ 
+ namespace agg
+ {
++    const trans_affine trans_affine::identity;
+ 
+     //------------------------------------------------------------------------
+     const trans_affine& trans_affine::parl_to_parl(const double* src, 
+-- 
+1.8.1.4
+
diff --git a/SDK/package/agg/0010-Add-renderer_scanline_aa_alpha.patch b/SDK/package/agg/0010-Add-renderer_scanline_aa_alpha.patch
new file mode 100644
index 0000000..b0be258
--- /dev/null
+++ b/SDK/package/agg/0010-Add-renderer_scanline_aa_alpha.patch
@@ -0,0 +1,193 @@
+From 749c8cd11e9e6f81e93ae5ce19258431722b6bdf Mon Sep 17 00:00:00 2001
+From: Tom Hughes <tom@compton.nu>
+Date: Sun, 19 May 2013 16:43:25 +0100
+Subject: [PATCH 10/15] Add renderer_scanline_aa_alpha
+
+---
+ include/agg_pixfmt_rgba.h       | 24 +++++++++++++-
+ include/agg_renderer_base.h     | 28 ++++++++++++++++
+ include/agg_renderer_scanline.h | 71 +++++++++++++++++++++++++++++++++++++++++
+ 3 files changed, 122 insertions(+), 1 deletion(-)
+
+diff --git a/include/agg_pixfmt_rgba.h b/include/agg_pixfmt_rgba.h
+index 42f0a05..6c4bc37 100644
+--- a/include/agg_pixfmt_rgba.h
++++ b/include/agg_pixfmt_rgba.h
+@@ -2247,7 +2247,6 @@ namespace agg
+         }
+ 
+ 
+-
+         //--------------------------------------------------------------------
+         void blend_color_vspan(int x, int y,
+                                unsigned len, 
+@@ -2751,6 +2750,29 @@ namespace agg
+         }
+ 
+         //--------------------------------------------------------------------
++        void blend_color_hspan_alpha(int x, int y, unsigned len,
++                                     const color_type* colors,
++                                     value_type alpha,
++                                     const int8u* covers,
++                                     int8u cover)
++        {
++            value_type* p = (value_type*)m_rbuf->row_ptr(x, y, len) + (x << 2);
++            do
++            {
++                blender_type::blend_pix(m_comp_op,
++                                        p,
++                                        (colors->r * alpha + 255) >> 8,
++                                        (colors->g * alpha + 255) >> 8,
++                                        (colors->b * alpha + 255) >> 8,
++                                        (colors->a * alpha + 255) >> 8,
++                                        covers ? *covers++ : cover);
++                p += 4;
++                ++colors;
++            }
++            while(--len);
++        }
++
++        //--------------------------------------------------------------------
+         void blend_color_vspan(int x, int y, unsigned len, 
+                                const color_type* colors, 
+                                const int8u* covers,
+diff --git a/include/agg_renderer_base.h b/include/agg_renderer_base.h
+index 1808944..25f07c3 100644
+--- a/include/agg_renderer_base.h
++++ b/include/agg_renderer_base.h
+@@ -37,6 +37,7 @@ namespace agg
+     public:
+         typedef PixelFormat pixfmt_type;
+         typedef typename pixfmt_type::color_type color_type;
++        typedef typename pixfmt_type::color_type::value_type value_type;
+         typedef typename pixfmt_type::row_data row_data;
+ 
+         //--------------------------------------------------------------------
+@@ -383,6 +384,33 @@ namespace agg
+         }
+ 
+         //--------------------------------------------------------------------
++        void blend_color_hspan_alpha(int x, int y, int len,
++                               const color_type* colors,
++                               value_type alpha,
++                               const cover_type* covers,
++                               cover_type cover = agg::cover_full)
++        {
++            if(y > ymax()) return;
++            if(y < ymin()) return;
++
++            if(x < xmin())
++            {
++                int d = xmin() - x;
++                len -= d;
++                if(len <= 0) return;
++                if(covers) covers += d;
++                colors += d;
++                x = xmin();
++            }
++            if(x + len > xmax())
++            {
++                len = xmax() - x + 1;
++                if(len <= 0) return;
++            }
++            m_ren->blend_color_hspan_alpha(x, y, len, colors, alpha,  covers, cover);
++        }
++
++        //--------------------------------------------------------------------
+         void blend_color_vspan(int x, int y, int len, 
+                                const color_type* colors, 
+                                const cover_type* covers,
+diff --git a/include/agg_renderer_scanline.h b/include/agg_renderer_scanline.h
+index c27ca60..4fcb557 100644
+--- a/include/agg_renderer_scanline.h
++++ b/include/agg_renderer_scanline.h
+@@ -156,6 +156,35 @@ namespace agg
+         }
+     }
+ 
++    //================================================render_scanline_aa_alpha
++    template<class Scanline, class BaseRenderer,
++             class SpanAllocator, class SpanGenerator>
++    void render_scanline_aa_alpha(const Scanline& sl, BaseRenderer& ren,
++                                  SpanAllocator& alloc, SpanGenerator& span_gen,
++                                  unsigned alpha)
++    {
++        int y = sl.y();
++
++        unsigned num_spans = sl.num_spans();
++        typename Scanline::const_iterator span = sl.begin();
++        for(;;)
++        {
++            int x = span->x;
++            int len = span->len;
++            const typename Scanline::cover_type* covers = span->covers;
++
++            if(len < 0) len = -len;
++            typename BaseRenderer::color_type* colors = alloc.allocate(len);
++            span_gen.generate(colors, x, y, len);
++            ren.blend_color_hspan_alpha(x, y, len, colors, alpha,
++                                  (span->len < 0) ? 0 : covers, *covers);
++
++            if(--num_spans == 0) break;
++            ++span;
++        }
++    }
++
++
+     //=====================================================render_scanlines_aa
+     template<class Rasterizer, class Scanline, class BaseRenderer, 
+              class SpanAllocator, class SpanGenerator>
+@@ -216,8 +245,50 @@ namespace agg
+     };
+ 
+ 
++    //==============================================renderer_scanline_aa_alpha
++    template<class BaseRenderer, class SpanAllocator, class SpanGenerator>
++    class renderer_scanline_aa_alpha
++    {
++    public:
++        typedef BaseRenderer  base_ren_type;
++        typedef SpanAllocator alloc_type;
++        typedef SpanGenerator span_gen_type;
+ 
++        //--------------------------------------------------------------------
++        renderer_scanline_aa_alpha() : m_ren(0), m_alloc(0), m_span_gen(0), m_alpha(1.0) {}
++        renderer_scanline_aa_alpha(base_ren_type& ren,
++                             alloc_type& alloc,
++                             span_gen_type& span_gen,
++                             unsigned alpha) :
++            m_ren(&ren),
++            m_alloc(&alloc),
++            m_span_gen(&span_gen),
++            m_alpha(alpha)
++        {}
++        void attach(base_ren_type& ren,
++                    alloc_type& alloc,
++                    span_gen_type& span_gen)
++        {
++            m_ren = &ren;
++            m_alloc = &alloc;
++            m_span_gen = &span_gen;
++        }
+ 
++        //--------------------------------------------------------------------
++        void prepare() { m_span_gen->prepare(); }
++
++        //--------------------------------------------------------------------
++        template<class Scanline> void render(const Scanline& sl)
++        {
++            render_scanline_aa_alpha(sl, *m_ren, *m_alloc, *m_span_gen, m_alpha);
++        }
++
++    private:
++        base_ren_type* m_ren;
++        alloc_type*    m_alloc;
++        span_gen_type* m_span_gen;
++        unsigned       m_alpha;
++    };
+ 
+ 
+     //===============================================render_scanline_bin_solid
+-- 
+1.8.1.4
+
diff --git a/SDK/package/agg/0011-Avoid-division-by-zero-in-color-burn-mode.patch b/SDK/package/agg/0011-Avoid-division-by-zero-in-color-burn-mode.patch
new file mode 100644
index 0000000..2a0d198
--- /dev/null
+++ b/SDK/package/agg/0011-Avoid-division-by-zero-in-color-burn-mode.patch
@@ -0,0 +1,58 @@
+From 0ec68d7f5695403eccac75025ba7f6f7ecf1814e Mon Sep 17 00:00:00 2001
+From: Tom Hughes <tom@compton.nu>
+Date: Sun, 19 May 2013 16:49:08 +0100
+Subject: [PATCH 11/15] Avoid division by zero in color-burn mode
+
+FIXME: re-work using latest math from http://www.w3.org/TR/SVGCompositing/
+---
+ include/agg_pixfmt_rgba.h | 21 ++++++++++++++++++---
+ 1 file changed, 18 insertions(+), 3 deletions(-)
+
+diff --git a/include/agg_pixfmt_rgba.h b/include/agg_pixfmt_rgba.h
+index 6c4bc37..5d6b511 100644
+--- a/include/agg_pixfmt_rgba.h
++++ b/include/agg_pixfmt_rgba.h
+@@ -1027,6 +1027,21 @@ namespace agg
+         //   Dca' = Sa.(Sca.Da + Dca.Sa - Sa.Da)/Sca + Sca.(1 - Da) + Dca.(1 - Sa)
+         // 
+         // Da'  = Sa + Da - Sa.Da 
++
++
++        // http://www.w3.org/TR/SVGCompositing/
++        // if Sca == 0 and Dca == Da
++        //   Dca' = Sa × Da + Sca × (1 - Da) + Dca × (1 - Sa)
++        //        = Sa × Da + Dca × (1 - Sa)
++        //        = Da = Dca
++        // otherwise if Sca == 0
++        //   Dca' = Sca × (1 - Da) + Dca × (1 - Sa)
++        //        = Dca × (1 - Sa)
++        // otherwise if Sca > 0
++        //   Dca' = Sa × Da - Sa × Da × min(1, (1 - Dca/Da) × Sa/Sca) + Sca × (1 - Da) + Dca × (1 - Sa)
++        //        = Sa × Da × (1 - min(1, (1 - Dca/Da) × Sa/Sca)) + Sca × (1 - Da) + Dca × (1 - Sa)
++
++        //   sa * da * (255 - std::min(255, (255 - p[0]/da)*(sa/(sc*sa)) +
+         static AGG_INLINE void blend_pix(value_type* p, 
+                                          unsigned sr, unsigned sg, unsigned sb, 
+                                          unsigned sa, unsigned cover)
+@@ -1056,15 +1071,15 @@ namespace agg
+ 
+                 p[Order::R] = (value_type)(((srda + drsa <= sada) ? 
+                     sr * d1a + dr * s1a :
+-                    sa * (srda + drsa - sada) / sr + sr * d1a + dr * s1a + base_mask) >> base_shift);
++                   (sr > 0 ? sa * (srda + drsa - sada) / sr + sr * d1a + dr * s1a + base_mask : 0)) >> base_shift);
+ 
+                 p[Order::G] = (value_type)(((sgda + dgsa <= sada) ? 
+                     sg * d1a + dg * s1a :
+-                    sa * (sgda + dgsa - sada) / sg + sg * d1a + dg * s1a + base_mask) >> base_shift);
++                   (sg > 0 ? sa * (sgda + dgsa - sada) / sg + sg * d1a + dg * s1a + base_mask : 0)) >> base_shift);
+ 
+                 p[Order::B] = (value_type)(((sbda + dbsa <= sada) ? 
+                     sb * d1a + db * s1a :
+-                    sa * (sbda + dbsa - sada) / sb + sb * d1a + db * s1a + base_mask) >> base_shift);
++                   (sb > 0 ? sa * (sbda + dbsa - sada) / sb + sb * d1a + db * s1a + base_mask : 0)) >> base_shift);
+ 
+                 p[Order::A] = (value_type)(sa + da - ((sa * da + base_mask) >> base_shift));
+             }
+-- 
+1.8.1.4
+
diff --git a/SDK/package/agg/0012-Avoid-pixel-artifacts-when-compositing.patch b/SDK/package/agg/0012-Avoid-pixel-artifacts-when-compositing.patch
new file mode 100644
index 0000000..b3e641e
--- /dev/null
+++ b/SDK/package/agg/0012-Avoid-pixel-artifacts-when-compositing.patch
@@ -0,0 +1,26 @@
+From bf0e0b71360cfbc690a29f4abe15d7b9b61b8479 Mon Sep 17 00:00:00 2001
+From: Tom Hughes <tom@compton.nu>
+Date: Sat, 22 Jun 2013 12:11:54 +0100
+Subject: [PATCH 12/15] Avoid pixel artifacts when compositing
+
+Change src_over alpha to avoid pixel artifacts by reordering computations.
+---
+ include/agg_pixfmt_rgba.h | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/include/agg_pixfmt_rgba.h b/include/agg_pixfmt_rgba.h
+index 5d6b511..bb255cd 100644
+--- a/include/agg_pixfmt_rgba.h
++++ b/include/agg_pixfmt_rgba.h
+@@ -346,7 +346,7 @@ namespace agg
+             p[Order::R] = (value_type)(sr + ((p[Order::R] * s1a + base_mask) >> base_shift));
+             p[Order::G] = (value_type)(sg + ((p[Order::G] * s1a + base_mask) >> base_shift));
+             p[Order::B] = (value_type)(sb + ((p[Order::B] * s1a + base_mask) >> base_shift));
+-            p[Order::A] = (value_type)(sa + p[Order::A] - ((sa * p[Order::A] + base_mask) >> base_shift));
++            p[Order::A] = (value_type)(sa + ((p[Order::A] * s1a + base_mask) >> base_shift));
+         }
+     };
+ 
+-- 
+1.8.1.4
+
diff --git a/SDK/package/agg/0013-Modify-agg-conv-classes-to-allow-access-to-the-origi.patch b/SDK/package/agg/0013-Modify-agg-conv-classes-to-allow-access-to-the-origi.patch
new file mode 100644
index 0000000..9deb904
--- /dev/null
+++ b/SDK/package/agg/0013-Modify-agg-conv-classes-to-allow-access-to-the-origi.patch
@@ -0,0 +1,93 @@
+From 6f1ab5f4b470bcf4e7e72aac6e2f7f6ee3e7b424 Mon Sep 17 00:00:00 2001
+From: Tom Hughes <tom@compton.nu>
+Date: Sat, 22 Jun 2013 12:16:42 +0100
+Subject: [PATCH 13/15] Modify agg conv classes to allow access to the original
+ geometry type
+
+---
+ include/agg_conv_adaptor_vcgen.h | 2 ++
+ include/agg_conv_adaptor_vpgen.h | 1 +
+ include/agg_conv_clip_polygon.h  | 1 +
+ include/agg_conv_clip_polyline.h | 1 +
+ include/agg_conv_smooth_poly1.h  | 2 ++
+ 5 files changed, 7 insertions(+)
+
+diff --git a/include/agg_conv_adaptor_vcgen.h b/include/agg_conv_adaptor_vcgen.h
+index 7bd9b07..fef4579 100644
+--- a/include/agg_conv_adaptor_vcgen.h
++++ b/include/agg_conv_adaptor_vcgen.h
+@@ -38,6 +38,7 @@ namespace agg
+ 
+         void rewind(unsigned) {}
+         unsigned vertex(double*, double*) { return path_cmd_stop; }
++        unsigned type() const { return 0; }
+     };
+ 
+ 
+@@ -73,6 +74,7 @@ namespace agg
+         }
+ 
+         unsigned vertex(double* x, double* y);
++        unsigned type() const { return m_source->type(); }
+ 
+     private:
+         // Prohibit copying
+diff --git a/include/agg_conv_adaptor_vpgen.h b/include/agg_conv_adaptor_vpgen.h
+index dca9415..a39102d 100644
+--- a/include/agg_conv_adaptor_vpgen.h
++++ b/include/agg_conv_adaptor_vpgen.h
+@@ -42,6 +42,7 @@ namespace agg
+ 
+         void rewind(unsigned path_id);
+         unsigned vertex(double* x, double* y);
++        unsigned type() const { return m_source->type(); }
+ 
+     private:
+         conv_adaptor_vpgen(const conv_adaptor_vpgen<VertexSource, VPGen>&);
+diff --git a/include/agg_conv_clip_polygon.h b/include/agg_conv_clip_polygon.h
+index 3c34590..e417a7d 100644
+--- a/include/agg_conv_clip_polygon.h
++++ b/include/agg_conv_clip_polygon.h
+@@ -60,6 +60,7 @@ namespace agg
+         double y1() const { return base_type::vpgen().y1(); }
+         double x2() const { return base_type::vpgen().x2(); }
+         double y2() const { return base_type::vpgen().y2(); }
++        unsigned type() const { return base_type::type(); }
+ 
+     private:
+         conv_clip_polygon(const conv_clip_polygon<VertexSource>&);
+diff --git a/include/agg_conv_clip_polyline.h b/include/agg_conv_clip_polyline.h
+index d45067f..0de4b57 100644
+--- a/include/agg_conv_clip_polyline.h
++++ b/include/agg_conv_clip_polyline.h
+@@ -60,6 +60,7 @@ namespace agg
+         double y1() const { return base_type::vpgen().y1(); }
+         double x2() const { return base_type::vpgen().x2(); }
+         double y2() const { return base_type::vpgen().y2(); }
++        unsigned type() const { return base_type::type(); }
+ 
+     private:
+         conv_clip_polyline(const conv_clip_polyline<VertexSource>&);
+diff --git a/include/agg_conv_smooth_poly1.h b/include/agg_conv_smooth_poly1.h
+index 15f7f8d..0956c4e 100644
+--- a/include/agg_conv_smooth_poly1.h
++++ b/include/agg_conv_smooth_poly1.h
+@@ -48,6 +48,7 @@ namespace agg
+ 
+         void   smooth_value(double v) { base_type::generator().smooth_value(v); }
+         double smooth_value() const { return base_type::generator().smooth_value(); }
++        unsigned type() const { return base_type::type(); }
+ 
+     private:
+         conv_smooth_poly1(const conv_smooth_poly1<VertexSource>&);
+@@ -70,6 +71,7 @@ namespace agg
+ 
+         void   smooth_value(double v) { m_smooth.generator().smooth_value(v); }
+         double smooth_value() const { return m_smooth.generator().smooth_value(); }
++        unsigned type() const { return m_smooth.type(); }
+ 
+     private:
+         conv_smooth_poly1_curve(const conv_smooth_poly1_curve<VertexSource>&);
+-- 
+1.8.1.4
+
diff --git a/SDK/package/agg/0014-Avoid-potential-zero-division-resulting-in-nan-in-ag.patch b/SDK/package/agg/0014-Avoid-potential-zero-division-resulting-in-nan-in-ag.patch
new file mode 100644
index 0000000..547b0d2
--- /dev/null
+++ b/SDK/package/agg/0014-Avoid-potential-zero-division-resulting-in-nan-in-ag.patch
@@ -0,0 +1,30 @@
+From 6433a64f4cd41e88499386b0b7c7ae05d30683b8 Mon Sep 17 00:00:00 2001
+From: Tom Hughes <tom@compton.nu>
+Date: Sat, 22 Jun 2013 12:33:32 +0100
+Subject: [PATCH 14/15] Avoid potential zero division resulting in nan in
+ agg::gamma_linear
+
+---
+ include/agg_gamma_functions.h | 6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+diff --git a/include/agg_gamma_functions.h b/include/agg_gamma_functions.h
+index fa38a45..beb0c04 100644
+--- a/include/agg_gamma_functions.h
++++ b/include/agg_gamma_functions.h
+@@ -94,7 +94,11 @@ namespace agg
+         {
+             if(x < m_start) return 0.0;
+             if(x > m_end) return 1.0;
+-            return (x - m_start) / (m_end - m_start);
++            double delta = m_end - m_start;
++            // avoid nan from potential zero division
++            // https://github.com/mapnik/mapnik/issues/761
++            if (delta <= 0.0) return 0.0;
++            return (x - m_start) / delta;
+         }
+ 
+     private:
+-- 
+1.8.1.4
+
diff --git a/SDK/package/agg/0015-Ensure-first-value-in-the-gamma-table-is-always-zero.patch b/SDK/package/agg/0015-Ensure-first-value-in-the-gamma-table-is-always-zero.patch
new file mode 100644
index 0000000..6214bd6
--- /dev/null
+++ b/SDK/package/agg/0015-Ensure-first-value-in-the-gamma-table-is-always-zero.patch
@@ -0,0 +1,24 @@
+From ca818d4dcd428c5560fc3c341fbaf427a7485e32 Mon Sep 17 00:00:00 2001
+From: Tom Hughes <tom@compton.nu>
+Date: Sat, 22 Jun 2013 12:34:37 +0100
+Subject: [PATCH 15/15] Ensure first value in the gamma table is always zero
+
+---
+ include/agg_gamma_functions.h | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/include/agg_gamma_functions.h b/include/agg_gamma_functions.h
+index beb0c04..b8eda52 100644
+--- a/include/agg_gamma_functions.h
++++ b/include/agg_gamma_functions.h
+@@ -49,6 +49,7 @@ namespace agg
+ 
+         double operator() (double x) const
+         {
++            if (x == 0.0) return 0.0;
+             return pow(x, m_gamma);
+         }
+ 
+-- 
+1.8.1.4
+
diff --git a/SDK/package/agg/CREATE_FILES.patch b/SDK/package/agg/CREATE_FILES.patch
new file mode 100644
index 0000000..1a78125
--- /dev/null
+++ b/SDK/package/agg/CREATE_FILES.patch
@@ -0,0 +1,14 @@
+--- a/README.orig	2007-01-07 13:58:28.000000000 +0000
++++ b/README	2007-01-07 14:02:40.000000000 +0000
+@@ -0,0 +1 @@
++cacac
+
+--- a/NEWS.orig	2007-01-07 13:58:28.000000000 +0000
++++ b/NEWS	2007-01-07 14:02:40.000000000 +0000
+@@ -0,0 +1 @@
++cacac
+
+--- a/AUTHORS.orig	2007-01-07 13:58:28.000000000 +0000
++++ b/AUTHORS	2007-01-07 14:02:40.000000000 +0000
+@@ -0,0 +1 @@
++cacac
diff --git a/SDK/package/agg/Config.in b/SDK/package/agg/Config.in
new file mode 100644
index 0000000..a842098
--- /dev/null
+++ b/SDK/package/agg/Config.in
@@ -0,0 +1,13 @@
+config BR2_PACKAGE_AGG
+	bool "agg"
+	depends on BR2_INSTALL_LIBSTDCPP
+	select BR2_PACKAGE_SDL
+	help
+	  The Anti-Grain Geometry project. A High Quality 2D Graphics Rendering
+	  Engine for C++.
+	  We select the SDL backend by default.
+
+	  http://www.antigrain.com/index.html
+
+comment "agg needs a toolchain with C++ support"
+	depends on !BR2_INSTALL_LIBSTDCPP
diff --git a/SDK/package/agg/agg-2.4-depends.patch b/SDK/package/agg/agg-2.4-depends.patch
new file mode 100644
index 0000000..f5506e2
--- /dev/null
+++ b/SDK/package/agg/agg-2.4-depends.patch
@@ -0,0 +1,36 @@
+--- agg-2.4.orig/font_freetype/Makefile.am	2005-10-18 11:45:40.000000000 +0100
++++ agg-2.4/font_freetype/Makefile.am	2006-07-10 15:11:55.000000000 +0100
+@@ -4,8 +4,9 @@
+ agginclude_HEADERS = agg_font_freetype.h
+ lib_LTLIBRARIES = libaggfontfreetype.la
+ 
+-libaggfontfreetype_la_LDFLAGS = -version-info @AGG_LIB_VERSION@ @FREETYPE_LIBS@ 
++libaggfontfreetype_la_LDFLAGS = -version-info @AGG_LIB_VERSION@
+ libaggfontfreetype_la_SOURCES = agg_font_freetype.cpp
+ libaggfontfreetype_la_CXXFLAGS = -I$(top_srcdir)/include @FREETYPE_CFLAGS@ 
++libaggfontfreetype_la_LIBADD = ../src/libagg.la @FREETYPE_LIBS@
+ endif
+ 
+--- agg-2.4.orig/src/platform/sdl/Makefile.am	2005-10-17 23:49:35.000000000 +0100
++++ agg-2.4/src/platform/sdl/Makefile.am	2006-07-10 15:11:55.000000000 +0100
+@@ -5,6 +5,6 @@
+ libaggplatformsdl_la_LDFLAGS = -version-info @AGG_LIB_VERSION@
+ libaggplatformsdl_la_SOURCES = agg_platform_support.cpp
+ libaggplatformsdl_la_CXXFLAGS = -I$(top_srcdir)/include @SDL_CFLAGS@
+-libaggplatformsdl_la_LIBADD = @SDL_LIBS@
++libaggplatformsdl_la_LIBADD = ../../libagg.la @SDL_LIBS@
+ endif
+ 
+--- agg-2.5.orig/src/platform/X11/Makefile.am	2006-12-11 00:59:45.000000000 +0000
++++ agg-2.5/src/platform/X11/Makefile.am	2007-01-07 14:07:39.000000000 +0000
+@@ -1,8 +1,8 @@
+ if ENABLE_X11
+ lib_LTLIBRARIES = libaggplatformX11.la
+ 
+-libaggplatformX11_la_LDFLAGS = -version-info @AGG_LIB_VERSION@  -L@x_libraries@
++libaggplatformX11_la_LDFLAGS = -version-info @AGG_LIB_VERSION@ @X_LDFLAGS@
+ libaggplatformX11_la_SOURCES = agg_platform_support.cpp
+ libaggplatformX11_la_CXXFLAGS =  -I$(top_srcdir)/include -I@x_includes@
+-libaggplatformX11_la_LIBADD = -lX11
++libaggplatformX11_la_LIBADD = ../../libagg.la -lX11
+ endif
diff --git a/SDK/package/agg/agg-2.5-autotools.patch b/SDK/package/agg/agg-2.5-autotools.patch
new file mode 100644
index 0000000..1272b65
--- /dev/null
+++ b/SDK/package/agg/agg-2.5-autotools.patch
@@ -0,0 +1,11 @@
+--- a/configure.in	2013-02-22 09:30:00.000000000 -0600
++++ b/configure.in	2013-02-22 09:30:49.030777571 -0600
+@@ -8,7 +8,7 @@
+ AC_PROG_CC
+ AC_PROG_CXX
+ AC_ISC_POSIX
+-AM_C_PROTOTYPES
++#AM_C_PROTOTYPES
+ if test "x$U" != "x"; then
+   AC_MSG_ERROR(Compiler not ANSI compliant)
+ fi
diff --git a/SDK/package/agg/agg-2.5-pkgconfig.patch b/SDK/package/agg/agg-2.5-pkgconfig.patch
new file mode 100644
index 0000000..a303bfb
--- /dev/null
+++ b/SDK/package/agg/agg-2.5-pkgconfig.patch
@@ -0,0 +1,10 @@
+--- agg-2.5/libagg.pc.in.orig	2007-01-07 13:58:28.000000000 +0000
++++ agg-2.5/libagg.pc.in	2007-01-07 14:02:40.000000000 +0000
+@@ -6,5 +6,6 @@
+ Name: libagg
+ Description: Anti Grain Geometry - A High Quality Rendering Engine for C++
+ Version: @VERSION@
+-Libs: -L${libdir} -Wl,-rpath,${exec_prefix}/lib -lagg
++Requires.private: freetype2
++Libs: -L${libdir} -lagg
+ Cflags: -I${includedir}
diff --git a/SDK/package/agg/agg.mk b/SDK/package/agg/agg.mk
new file mode 100644
index 0000000..ecf5749
--- /dev/null
+++ b/SDK/package/agg/agg.mk
@@ -0,0 +1,32 @@
+###############################################################################
+#
+# agg
+#
+###############################################################################
+
+AGG_VERSION = 2.5
+AGG_SOURCE = agg-$(AGG_VERSION).tar.gz
+AGG_SITE = https://ftp.osuosl.org/pub/blfs/8.0/a
+AGG_LICENSE = GPLv3+
+AGG_LICENSE_FILES = COPYING
+AGG_INSTALL_STAGING = YES
+AGG_AUTORECONF = YES
+
+AGG_DEPENDENCIES = host-pkgconf sdl
+
+# Configure against the staged SDL (the backend selected in Config.in). X11 is
+# turned off with --without-x: configure only treats the exact value "no" as off.
+AGG_CONF_OPTS = \
+	--with-sdl-prefix=$(STAGING_DIR)/usr \
+	--disable-sdltest \
+	--without-x \
+	--disable-examples --disable-gpc
+
+ifeq ($(BR2_PACKAGE_FREETYPE),y)
+AGG_DEPENDENCIES += freetype
+AGG_CONF_OPTS += --enable-freetype
+else
+AGG_CONF_OPTS += --disable-freetype
+endif
+
+$(eval $(autotools-package))
diff --git a/SDK/package/dmtx-utils/0001-no-static-debug.patch b/SDK/package/dmtx-utils/0001-no-static-debug.patch
new file mode 100644
index 0000000..7fbfa01
--- /dev/null
+++ b/SDK/package/dmtx-utils/0001-no-static-debug.patch
@@ -0,0 +1,35 @@
+ dmtxquery/Makefile.am |    2 +-
+ dmtxread/Makefile.am  |    2 +-
+ dmtxwrite/Makefile.am |    2 +-
+ 3 files changed, 3 insertions(+), 3 deletions(-)
+
+diff -Naur a/dmtxquery/Makefile.am b/dmtxquery/Makefile.am
+--- a/dmtxquery/Makefile.am
++++ b/dmtxquery/Makefile.am
+@@ -9,5 +9,5 @@
+ dmtxquery_LDADD = $(LIBOBJS)
+ 
+ dmtxquery_debug_SOURCES = dmtxquery.c dmtxquery.h ../common/dmtxutil.c ../common/dmtxutil.h
+-dmtxquery_debug_LDFLAGS = -static $(DMTX_LIBS)
++dmtxquery_debug_LDFLAGS = $(DMTX_LIBS)
+ dmtxquery_debug_LDADD = $(LIBOBJS)
+diff -Naur a/dmtxread/Makefile.am b/dmtxread/Makefile.am
+--- a/dmtxread/Makefile.am
++++ b/dmtxread/Makefile.am
+@@ -11,5 +11,5 @@
+ 
+ dmtxread_debug_SOURCES = dmtxread.c dmtxread.h ../common/dmtxutil.c ../common/dmtxutil.h
+ dmtxread_debug_CFLAGS = $(DMTX_CFLAGS) $(MAGICK_CFLAGS) -D_MAGICK_CONFIG_H
+-dmtxread_debug_LDFLAGS = -static $(DMTX_LIBS) $(MAGICK_LIBS)
++dmtxread_debug_LDFLAGS = $(DMTX_LIBS) $(MAGICK_LIBS)
+ dmtxread_debug_LDADD = $(LIBOBJS)
+diff -Naur a/dmtxwrite/Makefile.am b/dmtxwrite/Makefile.am
+--- a/dmtxwrite/Makefile.am
++++ b/dmtxwrite/Makefile.am
+@@ -11,5 +11,5 @@
+ 
+ dmtxwrite_debug_SOURCES = dmtxwrite.c dmtxwrite.h ../common/dmtxutil.c ../common/dmtxutil.h
+ dmtxwrite_debug_CFLAGS = $(DMTX_FLAGS) $(MAGICK_CFLAGS) -D_MAGICK_CONFIG_H
+-dmtxwrite_debug_LDFLAGS = -static $(DMTX_LIBS) $(MAGICK_LIBS)
++dmtxwrite_debug_LDFLAGS = $(DMTX_LIBS) $(MAGICK_LIBS)
+ dmtxwrite_debug_LDADD = $(LIBOBJS)
diff --git a/SDK/package/dmtx-utils/Config.in b/SDK/package/dmtx-utils/Config.in
new file mode 100644
index 0000000..0d22f43
--- /dev/null
+++ b/SDK/package/dmtx-utils/Config.in
@@ -0,0 +1,12 @@
+config BR2_PACKAGE_DMTX_UTILS
+	bool "dmtx utils"
+	depends on BR2_PACKAGE_LIBDMTX
+	select BR2_PACKAGE_IMAGEMAGICK
+	help
+	  libdmtx is a software library that enables programs to read
+	  and write Data Matrix barcodes of the modern ECC200
+	  variety. This package, dmtx-utils, provides command line
+	  utilities that allow scripts and command line users to use
+	  libdmtx functionality.
+
+	  https://github.com/dmtx/dmtx-utils
diff --git a/SDK/package/dmtx-utils/dmtx-utils.hash b/SDK/package/dmtx-utils/dmtx-utils.hash
new file mode 100644
index 0000000..0b9bf1c
--- /dev/null
+++ b/SDK/package/dmtx-utils/dmtx-utils.hash
@@ -0,0 +1,3 @@
+# Locally computed:
+sha256  0d396ec14f32a8cf9e08369a4122a16aa2e5fa1675e02218f16f1ab777ea2a28  dmtx-utils-0.7.6.tar.gz
+sha256  d8c320ffc0030d1b096ae4732b50d2b811cf95e9a9b7377c1127b2563e0a0388  COPYING
diff --git a/SDK/package/dmtx-utils/dmtx-utils.mk b/SDK/package/dmtx-utils/dmtx-utils.mk
new file mode 100644
index 0000000..4560b59
--- /dev/null
+++ b/SDK/package/dmtx-utils/dmtx-utils.mk
@@ -0,0 +1,20 @@
+################################################################################
+#
+# dmtx-utils
+#
+################################################################################
+
+DMTX_UTILS_VERSION = 0.7.6
+DMTX_UTILS_SITE = $(call github,dmtx,dmtx-utils,v$(DMTX_UTILS_VERSION))
+DMTX_UTILS_DEPENDENCIES = libdmtx imagemagick
+DMTX_UTILS_LICENSE = LGPL-2.1+
+DMTX_UTILS_LICENSE_FILES = COPYING
+# github tarball does not include configure
+DMTX_UTILS_AUTORECONF = YES
+# Pre-configure hook: run the package's ./autogen.sh from $(@D) with PATH set to $(BR_PATH)
+define DMTX_UTILS_RUN_AUTOGEN
+	cd $(@D) && PATH=$(BR_PATH) ./autogen.sh
+endef
+DMTX_UTILS_PRE_CONFIGURE_HOOKS += DMTX_UTILS_RUN_AUTOGEN
+
+$(eval $(autotools-package))
diff --git a/SDK/package/fluidlite/0001-fluidlite.patch b/SDK/package/fluidlite/0001-fluidlite.patch
new file mode 100644
index 0000000..9dc01c4
--- /dev/null
+++ b/SDK/package/fluidlite/0001-fluidlite.patch
@@ -0,0 +1,11 @@
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -125,7 +125,7 @@
+     endif()
+ endif()
+ 
+-option(FLUIDLITE_BUILD_SHARED "Build shared library" TRUE)
++option(FLUIDLITE_BUILD_SHARED "Build shared library" FALSE)
+ if(FLUIDLITE_BUILD_SHARED)
+     add_library(${PROJECT_NAME} SHARED ${SOURCES})
+ 
diff --git a/SDK/package/fluidlite/Config.in b/SDK/package/fluidlite/Config.in
new file mode 100644
index 0000000..bdfb7e0
--- /dev/null
+++ b/SDK/package/fluidlite/Config.in
@@ -0,0 +1,5 @@
+config BR2_PACKAGE_FLUIDLITE
+	bool "FluidLite"
+	depends on BR2_USE_MMU
+	help
+	  FluidLite is a very light version of FluidSynth.
diff --git a/SDK/package/fluidlite/fluidsynth.hash b/SDK/package/fluidlite/fluidsynth.hash
new file mode 100644
index 0000000..d39b0e8
--- /dev/null
+++ b/SDK/package/fluidlite/fluidsynth.hash
@@ -0,0 +1,3 @@
+# Locally computed
+sha256  ef4d008f9fe2fa9a48135505d42dd7e8e9cc4d7494a4b13d6caa13adb5c61ff8  1.0.tar.gz
+sha256  a5564e99fd7f49e1344152a8c5bc1d420d5f973b30e010946764db0b5b9e668c  LICENSE
diff --git a/SDK/package/fluidlite/fluidsynth.mk b/SDK/package/fluidlite/fluidsynth.mk
new file mode 100644
index 0000000..5cd24b7
--- /dev/null
+++ b/SDK/package/fluidlite/fluidsynth.mk
@@ -0,0 +1,19 @@
+################################################################################
+#
+# FLUIDLITE
+#
+################################################################################
+
+FLUIDLITE_VERSION = fdd05bad03cdb24d1f78b5fe3453842890c1b0e8
+FLUIDLITE_SITE = $(call github,gcw0,FluidLite,$(FLUIDLITE_VERSION))
+FLUIDLITE_LICENSE = LGPL-2.1+
+FLUIDLITE_LICENSE_FILES = LICENSE
+FLUIDLITE_INSTALL_STAGING = YES
+FLUIDLITE_DEPENDENCIES = 
+
+# Static-only builds: force the FLUIDLITE_BUILD_SHARED CMake option off (-D<var>=<value>)
+ifeq ($(BR2_STATIC_LIBS),y)
+FLUIDLITE_CONF_OPTS += -DFLUIDLITE_BUILD_SHARED=FALSE
+endif
+
+$(eval $(cmake-package))
diff --git a/SDK/package/libini/Config.in b/SDK/package/libini/Config.in
new file mode 100644
index 0000000..a42967b
--- /dev/null
+++ b/SDK/package/libini/Config.in
@@ -0,0 +1,6 @@
+config BR2_PACKAGE_LIBINI
+	bool "libini"
+	help
+	  Tiny library to help reading INI files.
+
+	  https://github.com/pcercuei/libini
diff --git a/SDK/package/libini/libini.mk b/SDK/package/libini/libini.mk
new file mode 100644
index 0000000..7d50b25
--- /dev/null
+++ b/SDK/package/libini/libini.mk
@@ -0,0 +1,13 @@
+#############################################################
+#
+# libini
+#
+#############################################################
+LIBINI_VERSION = c3413da
+LIBINI_SITE_METHOD = git
+LIBINI_SITE = https://github.com/FunKey-Project/libini.git
+LIBINI_LICENSE = LGPL-2.1
+
+LIBINI_INSTALL_STAGING = YES
+
+$(eval $(cmake-package))
diff --git a/SDK/package/libmikmod/Config.in b/SDK/package/libmikmod/Config.in
new file mode 100644
index 0000000..909cc00
--- /dev/null
+++ b/SDK/package/libmikmod/Config.in
@@ -0,0 +1,7 @@
+config BR2_PACKAGE_LIBMIKMOD
+	bool "libmikmod"
+	help
+	  Mikmod is a module player and library supporting many
+	  tracker formats, including mod, s3m, it, and xm.
+
+	  http://mikmod.shlomifish.org/
diff --git a/SDK/package/libmikmod/libmikmod.mk b/SDK/package/libmikmod/libmikmod.mk
new file mode 100644
index 0000000..cce5190
--- /dev/null
+++ b/SDK/package/libmikmod/libmikmod.mk
@@ -0,0 +1,21 @@
+#############################################################
+#
+# libmikmod
+#
+#############################################################
+LIBMIKMOD_VERSION:=3.3.11.1
+LIBMIKMOD_SITE:=http://sourceforge.net/projects/mikmod/files/libmikmod/$(LIBMIKMOD_VERSION)
+
+LIBMIKMOD_CONF_OPTS = --localstatedir=/var
+
+LIBMIKMOD_LIBTOOL_PATCH = NO
+LIBMIKMOD_INSTALL_STAGING = YES
+
+LIBMIKMOD_CONFIG_SCRIPTS = libmikmod-config
+# Post-install-target hook: move libmikmod-config from $(TARGET_DIR)/usr/bin to $(HOST_DIR)/bin
+define LIBMIKMOD_REMOVE_LIBMIKMOD_CONFIG
+mv $(TARGET_DIR)/usr/bin/libmikmod-config $(HOST_DIR)/bin/
+endef
+LIBMIKMOD_POST_INSTALL_TARGET_HOOKS += LIBMIKMOD_REMOVE_LIBMIKMOD_CONFIG
+
+$(eval $(autotools-package))
diff --git a/SDK/package/libopk/Config.in b/SDK/package/libopk/Config.in
new file mode 100644
index 0000000..db3bce9
--- /dev/null
+++ b/SDK/package/libopk/Config.in
@@ -0,0 +1,8 @@
+config BR2_PACKAGE_LIBOPK
+	bool "libopk"
+	select BR2_PACKAGE_LIBINI
+	select BR2_PACKAGE_ZLIB
+	help
+	  Library to handle OPK packages.
+
+	  https://github.com/pcercuei/libopk
diff --git a/SDK/package/libopk/libopk.mk b/SDK/package/libopk/libopk.mk
new file mode 100644
index 0000000..8cf840a
--- /dev/null
+++ b/SDK/package/libopk/libopk.mk
@@ -0,0 +1,14 @@
+#############################################################
+# libopk - library to handle OPK packages
+# libopk
+# Fetched over git at a pinned abbreviated commit; built via cmake-package.
+#############################################################
+LIBOPK_VERSION = 3c918c8
+LIBOPK_SITE_METHOD = git
+LIBOPK_SITE = https://github.com/FunKey-Project/libopk.git
+# Same two packages that the libopk Config.in entry selects.
+LIBOPK_DEPENDENCIES = libini zlib
+# Install into the staging tree as well as the target.
+LIBOPK_INSTALL_STAGING = YES
+
+$(eval $(cmake-package))
diff --git a/SDK/package/libxdgmime/Config.in b/SDK/package/libxdgmime/Config.in
new file mode 100644
index 0000000..f1d4fc0
--- /dev/null
+++ b/SDK/package/libxdgmime/Config.in
@@ -0,0 +1,12 @@
+config BR2_PACKAGE_LIBXDGMIME
+	bool "libxdgmime"
+	depends on BR2_USE_WCHAR # shared-mime-info
+	select BR2_PACKAGE_SHARED_MIME_INFO # also listed in LIBXDGMIME_DEPENDENCIES
+	help
+	  Simple library that parses the proposed MIME spec
+	  listed at http://freedesktop.org/.
+
+	  https://github.com/pcercuei/libxdgmime
+
+comment "libxdgmime requires a toolchain with WCHAR support"
+	depends on !BR2_USE_WCHAR
diff --git a/SDK/package/libxdgmime/libxdgmime.mk b/SDK/package/libxdgmime/libxdgmime.mk
new file mode 100644
index 0000000..c6e592f
--- /dev/null
+++ b/SDK/package/libxdgmime/libxdgmime.mk
@@ -0,0 +1,30 @@
+#############################################################
+# libxdgmime
+#
+# Fetched over git at a pinned commit; built with its own Makefile (generic-package).
+#############################################################
+LIBXDGMIME_VERSION = db79e7c
+LIBXDGMIME_SITE_METHOD = git
+LIBXDGMIME_SITE = https://github.com/FunKey-Project/libxdgmime.git
+LIBXDGMIME_DEPENDENCIES = shared-mime-info
+LIBXDGMIME_LICENSE = LGPL-2.1+ or AFL-2.1
+
+LIBXDGMIME_INSTALL_STAGING = YES
+# Cross-build settings prepended to every make call below. NOTE(review): confirm BR2_VENDOR is set.
+LIBXDGMIME_MAKE_ENV = CFLAGS="$(TARGET_CFLAGS)" LDFLAGS="$(TARGET_LDFLAGS)" \
+				  CROSS_COMPILE="$(TARGET_CROSS)" PREFIX=/usr \
+				  PLATFORM="$(BR2_VENDOR)"
+# Build: run the default goal of the package's Makefile in the build directory.
+define LIBXDGMIME_BUILD_CMDS
+	$(LIBXDGMIME_MAKE_ENV) $(MAKE) -C $(@D)
+endef
+# Staging: run the "install" goal with DESTDIR pointing at the staging tree.
+define LIBXDGMIME_INSTALL_STAGING_CMDS
+	$(LIBXDGMIME_MAKE_ENV) DESTDIR="$(STAGING_DIR)" $(MAKE) -C $(@D) install
+endef
+# Target: run only the "install-lib" goal with DESTDIR pointing at the target tree.
+define LIBXDGMIME_INSTALL_TARGET_CMDS
+	$(LIBXDGMIME_MAKE_ENV) DESTDIR="$(TARGET_DIR)" $(MAKE) -C $(@D) install-lib
+endef
+
+$(eval $(generic-package))
diff --git a/buildroot b/buildroot
index d8082db..d526838 160000
--- a/buildroot
+++ b/buildroot
@@ -1 +1 @@
-Subproject commit d8082db677046e004a6537828b3e4f4b9a818a4f
+Subproject commit d5268384884357d5f17515ee083939f9596a2e73
diff --git a/genimage.cfg b/genimage.cfg
index 5570864..b91bcac 100644
--- a/genimage.cfg
+++ b/genimage.cfg
@@ -31,7 +31,7 @@ image sdcard.img {
 		partition-type = 0x83
 		bootable = "yes"
 		image = "FunKey/output/images/rootfs.ext4"
-		size = 100M # This will be resized to 1G during first boot
+		size = 160M # This will be resized to 1G during first boot
 	}
 
 # These partitions will be created during first boot