// C/C++ implementation of YUV420SP (NV21) image to RGB image conversion

#ifndef YUV420SP_TO_RGB_FAST_ASM_H
#define YUV420SP_TO_RGB_FAST_ASM_H

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON
#include <algorithm>

// Convert a YUV420SP (NV21: full-resolution Y plane followed by an
// interleaved V/U plane at half resolution) image to packed RGB.
//
// yuv420sp : input buffer, w*h Y bytes followed by w*h/2 interleaved VU bytes
// w, h     : image size; both are assumed to be even (rows are processed in
//            pairs and chroma is shared by 2x2 pixel blocks)
// rgb      : output buffer of w*h*3 bytes, interleaved R,G,B
//
// Fixed-point approximation (Q6):
//   R = ((Y << 6) +  90 * (V-128)) >> 6
//   G = ((Y << 6) -  46 * (V-128) - 22 * (U-128)) >> 6
//   B = ((Y << 6) + 113 * (U-128)) >> 6
static void yuv420sp_to_rgb_fast_asm(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb)
{
    const unsigned char* yptr = yuv420sp;
    const unsigned char* vuptr = yuv420sp + w * h;

#if __ARM_NEON
    // 128 is not representable in int8_t; reinterpret the u8 bit pattern
    // instead of narrowing. vsub.s8 then yields the same two's-complement
    // result as subtracting 128.
    int8x8_t _v128 = vreinterpret_s8_u8(vdup_n_u8(128));
    int8x8_t _v90 = vdup_n_s8(90);
    int8x8_t _v46 = vdup_n_s8(46);
    int8x8_t _v22 = vdup_n_s8(22);
    int8x8_t _v113 = vdup_n_s8(113);
#endif // __ARM_NEON

    for (int y = 0; y < h; y += 2)
    {
        // Two Y rows share one VU row.
        const unsigned char* yptr0 = yptr;
        const unsigned char* yptr1 = yptr + w;
        unsigned char* rgb0 = rgb;
        unsigned char* rgb1 = rgb + w * 3;

#if __ARM_NEON
        int nn = w >> 3;             // 8-pixel-wide vector iterations
        int remain = w - (nn << 3);  // leftover pixels for the scalar tail
#else
        int remain = w;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
        for (; nn > 0; nn--)
        {
            // Y << 6 widened to 16 bits (Q6 fixed point).
            int16x8_t _yy0 = vreinterpretq_s16_u16(vshll_n_u8(vld1_u8(yptr0), 6));
            int16x8_t _yy1 = vreinterpretq_s16_u16(vshll_n_u8(vld1_u8(yptr1), 6));

            // Load interleaved V,U, center on zero, then duplicate each
            // chroma sample so it covers a horizontal pixel pair.
            int8x8_t _vvuu = vsub_s8(vreinterpret_s8_u8(vld1_u8(vuptr)), _v128);
            int8x8x2_t _vvvvuuuu = vtrn_s8(_vvuu, _vvuu);
            int8x8_t _vv = _vvvvuuuu.val[0];
            int8x8_t _uu = _vvvvuuuu.val[1];

            int16x8_t _r0 = vmlal_s8(_yy0, _vv, _v90);
            int16x8_t _g0 = vmlsl_s8(_yy0, _vv, _v46);
            _g0 = vmlsl_s8(_g0, _uu, _v22);
            int16x8_t _b0 = vmlal_s8(_yy0, _uu, _v113);

            int16x8_t _r1 = vmlal_s8(_yy1, _vv, _v90);
            int16x8_t _g1 = vmlsl_s8(_yy1, _vv, _v46);
            _g1 = vmlsl_s8(_g1, _uu, _v22);
            int16x8_t _b1 = vmlal_s8(_yy1, _uu, _v113);

            // Shift back by 6 with unsigned saturation to [0,255].
            uint8x8x3_t _rgb0;
            _rgb0.val[0] = vqshrun_n_s16(_r0, 6);
            _rgb0.val[1] = vqshrun_n_s16(_g0, 6);
            _rgb0.val[2] = vqshrun_n_s16(_b0, 6);

            uint8x8x3_t _rgb1;
            _rgb1.val[0] = vqshrun_n_s16(_r1, 6);
            _rgb1.val[1] = vqshrun_n_s16(_g1, 6);
            _rgb1.val[2] = vqshrun_n_s16(_b1, 6);

            vst3_u8(rgb0, _rgb0);
            vst3_u8(rgb1, _rgb1);

            yptr0 += 8;
            yptr1 += 8;
            vuptr += 8;
            rgb0 += 24;
            rgb1 += 24;
        }
#else
        if (nn > 0)
        {
        // ARMv7 NEON assembly. Register roles per iteration:
        //   d0/d1 = Y row0/row1, d2/d3 = vv/uu (after vtrn),
        //   q8/q9/q2 = R0/G0/B0, q10/q11/q3 = R1/G1/B1.
        // The VU pointer is pre-loaded one step ahead, hence the final
        // "sub %3, #8" to undo the extra advance.
        asm volatile(
            "pld        [%3, #128]          \n"
            "vld1.u8    {d2}, [%3]!         \n"
            "vsub.s8    d2, d2, %P12        \n"
            "0:                             \n"
            "pld        [%1, #128]          \n"
            "vld1.u8    {d0}, [%1]!         \n"
            "pld        [%2, #128]          \n"
            "vld1.u8    {d1}, [%2]!         \n"
            "vshll.u8   q2, d0, #6          \n"
            "vorr       d3, d2, d2          \n"
            "vshll.u8   q3, d1, #6          \n"
            "vorr       q9, q2, q2          \n"
            "vtrn.s8    d2, d3              \n"
            "vorr       q11, q3, q3         \n"
            "vmlsl.s8   q9, d2, %P14        \n"
            "vorr       q8, q2, q2          \n"
            "vmlsl.s8   q11, d2, %P14       \n"
            "vorr       q10, q3, q3         \n"
            "vmlal.s8   q8, d2, %P13        \n"
            "vmlal.s8   q2, d3, %P16        \n"
            "vmlal.s8   q10, d2, %P13       \n"
            "vmlsl.s8   q9, d3, %P15        \n"
            "vmlal.s8   q3, d3, %P16        \n"
            "vmlsl.s8   q11, d3, %P15       \n"
            "vqshrun.s16 d24, q8, #6        \n"
            "vqshrun.s16 d26, q2, #6        \n"
            "vqshrun.s16 d4, q10, #6        \n"
            "vqshrun.s16 d25, q9, #6        \n"
            "vqshrun.s16 d6, q3, #6         \n"
            "vqshrun.s16 d5, q11, #6        \n"
            "pld        [%3, #128]          \n"
            "vld1.u8    {d2}, [%3]!         \n"
            "subs       %0, #1              \n"
            "vst3.u8    {d24-d26}, [%4]!    \n"
            "vsub.s8    d2, d2, %P12        \n"
            "vst3.u8    {d4-d6}, [%5]!      \n"
            "bne        0b                  \n"
            "sub        %3, #8              \n"
            : "=r"(nn),     // %0
              "=r"(yptr0),  // %1
              "=r"(yptr1),  // %2
              "=r"(vuptr),  // %3
              "=r"(rgb0),   // %4
              "=r"(rgb1)    // %5
            : "0"(nn),
              "1"(yptr0),
              "2"(yptr1),
              "3"(vuptr),
              "4"(rgb0),
              "5"(rgb1),
              "w"(_v128),   // %12
              "w"(_v90),    // %13
              "w"(_v46),    // %14
              "w"(_v22),    // %15
              "w"(_v113)    // %16
            : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "d26"
        );
        }
#endif // __aarch64__
#endif // __ARM_NEON

        // Scalar tail: two pixels per iteration (one 2x2 chroma block per
        // pass over both rows). Assumes remain is even.
        for (; remain > 0; remain -= 2)
        {
            // R = (yy +  90 * vv) >> 6
            // G = (yy -  46 * vv - 22 * uu) >> 6
            // B = (yy + 113 * uu) >> 6
            int v = vuptr[0] - 128;
            int u = vuptr[1] - 128;

            int ruv = 90 * v;
            int guv = -46 * v + -22 * u;
            int buv = 113 * u;

#define SATURATE_CAST_UCHAR(X) (unsigned char)std::min(std::max((int)(X), 0), 255)

            int y00 = yptr0[0] << 6;
            rgb0[0] = SATURATE_CAST_UCHAR((y00 + ruv) >> 6);
            rgb0[1] = SATURATE_CAST_UCHAR((y00 + guv) >> 6);
            rgb0[2] = SATURATE_CAST_UCHAR((y00 + buv) >> 6);

            int y01 = yptr0[1] << 6;
            rgb0[3] = SATURATE_CAST_UCHAR((y01 + ruv) >> 6);
            rgb0[4] = SATURATE_CAST_UCHAR((y01 + guv) >> 6);
            rgb0[5] = SATURATE_CAST_UCHAR((y01 + buv) >> 6);

            int y10 = yptr1[0] << 6;
            rgb1[0] = SATURATE_CAST_UCHAR((y10 + ruv) >> 6);
            rgb1[1] = SATURATE_CAST_UCHAR((y10 + guv) >> 6);
            rgb1[2] = SATURATE_CAST_UCHAR((y10 + buv) >> 6);

            int y11 = yptr1[1] << 6;
            rgb1[3] = SATURATE_CAST_UCHAR((y11 + ruv) >> 6);
            rgb1[4] = SATURATE_CAST_UCHAR((y11 + guv) >> 6);
            rgb1[5] = SATURATE_CAST_UCHAR((y11 + buv) >> 6);

#undef SATURATE_CAST_UCHAR

            yptr0 += 2;
            yptr1 += 2;
            vuptr += 2;
            rgb0 += 6;
            rgb1 += 6;
        }

        // Advance past the two rows just written; the VU pointer has already
        // been advanced in-loop (one VU row serves two Y rows).
        yptr += 2 * w;
        rgb += 2 * 3 * w;
    }
}

// Convert a YUV420SP (NV21: full-resolution Y plane followed by an
// interleaved V/U plane at half resolution) image to packed RGBA.
// Alpha is always written as 255 (fully opaque).
//
// yuv420sp : input buffer, w*h Y bytes followed by w*h/2 interleaved VU bytes
// w, h     : image size; both are assumed to be even
// rgba     : output buffer of w*h*4 bytes, interleaved R,G,B,A
//
// Fixed-point approximation (Q6):
//   R = ((Y << 6) +  90 * (V-128)) >> 6
//   G = ((Y << 6) -  46 * (V-128) - 22 * (U-128)) >> 6
//   B = ((Y << 6) + 113 * (U-128)) >> 6
static void yuv420sp_to_rgba_fast_asm(const unsigned char* yuv420sp, int w, int h, unsigned char* rgba)
{
    const unsigned char* yptr = yuv420sp;
    const unsigned char* vuptr = yuv420sp + w * h;

#if __ARM_NEON
    // 128 is not representable in int8_t; reinterpret the u8 bit pattern
    // instead of narrowing.
    int8x8_t _v128 = vreinterpret_s8_u8(vdup_n_u8(128));
    int8x8_t _v90 = vdup_n_s8(90);
    int8x8_t _v46 = vdup_n_s8(46);
    int8x8_t _v22 = vdup_n_s8(22);
    int8x8_t _v113 = vdup_n_s8(113);
#endif // __ARM_NEON

    for (int y = 0; y < h; y += 2)
    {
        // Two Y rows share one VU row.
        const unsigned char* yptr0 = yptr;
        const unsigned char* yptr1 = yptr + w;
        unsigned char* rgba0 = rgba;
        unsigned char* rgba1 = rgba + w * 4;

#if __ARM_NEON
        int nn = w >> 3;             // 8-pixel-wide vector iterations
        int remain = w - (nn << 3);  // leftover pixels for the scalar tail
#else
        int remain = w;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
        for (; nn > 0; nn--)
        {
            // Y << 6 widened to 16 bits (Q6 fixed point).
            int16x8_t _yy0 = vreinterpretq_s16_u16(vshll_n_u8(vld1_u8(yptr0), 6));
            int16x8_t _yy1 = vreinterpretq_s16_u16(vshll_n_u8(vld1_u8(yptr1), 6));

            // Load interleaved V,U, center on zero, then duplicate each
            // chroma sample so it covers a horizontal pixel pair.
            int8x8_t _vvuu = vsub_s8(vreinterpret_s8_u8(vld1_u8(vuptr)), _v128);
            int8x8x2_t _vvvvuuuu = vtrn_s8(_vvuu, _vvuu);
            int8x8_t _vv = _vvvvuuuu.val[0];
            int8x8_t _uu = _vvvvuuuu.val[1];

            int16x8_t _r0 = vmlal_s8(_yy0, _vv, _v90);
            int16x8_t _g0 = vmlsl_s8(_yy0, _vv, _v46);
            _g0 = vmlsl_s8(_g0, _uu, _v22);
            int16x8_t _b0 = vmlal_s8(_yy0, _uu, _v113);

            int16x8_t _r1 = vmlal_s8(_yy1, _vv, _v90);
            int16x8_t _g1 = vmlsl_s8(_yy1, _vv, _v46);
            _g1 = vmlsl_s8(_g1, _uu, _v22);
            int16x8_t _b1 = vmlal_s8(_yy1, _uu, _v113);

            // Shift back by 6 with unsigned saturation; alpha is constant 255
            // (was previously left uninitialized — bug fix).
            uint8x8x4_t _rgba0;
            _rgba0.val[0] = vqshrun_n_s16(_r0, 6);
            _rgba0.val[1] = vqshrun_n_s16(_g0, 6);
            _rgba0.val[2] = vqshrun_n_s16(_b0, 6);
            _rgba0.val[3] = vdup_n_u8(255);

            uint8x8x4_t _rgba1;
            _rgba1.val[0] = vqshrun_n_s16(_r1, 6);
            _rgba1.val[1] = vqshrun_n_s16(_g1, 6);
            _rgba1.val[2] = vqshrun_n_s16(_b1, 6);
            _rgba1.val[3] = vdup_n_u8(255);

            vst4_u8(rgba0, _rgba0);
            vst4_u8(rgba1, _rgba1);

            yptr0 += 8;
            yptr1 += 8;
            vuptr += 8;
            rgba0 += 8 * 4;
            rgba1 += 8 * 4;
        }
#else
        if (nn > 0)
        {
            // ARMv7 NEON assembly; same scheme as the RGB variant but stores
            // 4 planes. d27 (row0 alpha) is set once before the loop and never
            // clobbered inside it; d7 (row1 alpha) is reloaded each iteration.
            asm volatile(
                "pld        [%3, #128]          \n"
                "vld1.u8    {d2}, [%3]!         \n"
                "vsub.s8    d2, d2, %P12        \n"
                "vmov.u8    d27, #255           \n"
                "0:                             \n"
                "pld        [%1, #128]          \n"
                "vld1.u8    {d0}, [%1]!         \n"
                "pld        [%2, #128]          \n"
                "vld1.u8    {d1}, [%2]!         \n"
                "vshll.u8   q2, d0, #6          \n"
                "vorr       d3, d2, d2          \n"
                "vshll.u8   q3, d1, #6          \n"
                "vorr       q9, q2, q2          \n"
                "vtrn.s8    d2, d3              \n"
                "vorr       q11, q3, q3         \n"
                "vmlsl.s8   q9, d2, %P14        \n"
                "vorr       q8, q2, q2          \n"
                "vmlsl.s8   q11, d2, %P14       \n"
                "vorr       q10, q3, q3         \n"
                "vmlal.s8   q8, d2, %P13        \n"
                "vmlal.s8   q2, d3, %P16        \n"
                "vmlal.s8   q10, d2, %P13       \n"
                "vmlsl.s8   q9, d3, %P15        \n"
                "vmlal.s8   q3, d3, %P16        \n"
                "vmlsl.s8   q11, d3, %P15       \n"
                "vqshrun.s16 d24, q8, #6        \n"
                "vqshrun.s16 d26, q2, #6        \n"
                "vqshrun.s16 d4, q10, #6        \n"
                "vqshrun.s16 d25, q9, #6        \n"
                "vqshrun.s16 d6, q3, #6         \n"
                "vqshrun.s16 d5, q11, #6        \n"
                "vmov.u8    d7, #255            \n"
                "pld        [%3, #128]          \n"
                "vld1.u8    {d2}, [%3]!         \n"
                "subs       %0, #1              \n"
                "vst4.u8    {d24-d27}, [%4]!    \n"
                "vsub.s8    d2, d2, %P12        \n"
                "vst4.u8    {d4-d7}, [%5]!      \n"
                "bne        0b                  \n"
                "sub        %3, #8              \n"
                : "=r"(nn),     // %0
                  "=r"(yptr0),  // %1
                  "=r"(yptr1),  // %2
                  "=r"(vuptr),  // %3
                  "=r"(rgba0),  // %4
                  "=r"(rgba1)   // %5
                : "0"(nn),
                  "1"(yptr0),
                  "2"(yptr1),
                  "3"(vuptr),
                  "4"(rgba0),
                  "5"(rgba1),
                  "w"(_v128),   // %12
                  "w"(_v90),    // %13
                  "w"(_v46),    // %14
                  "w"(_v22),    // %15
                  "w"(_v113)    // %16
                : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "d26", "d27"
            );
        }
#endif // __aarch64__
#endif // __ARM_NEON

        // Scalar tail: two pixels per iteration. Assumes remain is even.
        for (; remain > 0; remain -= 2)
        {
            // R = (yy +  90 * vv) >> 6
            // G = (yy -  46 * vv - 22 * uu) >> 6
            // B = (yy + 113 * uu) >> 6
            int v = vuptr[0] - 128;
            int u = vuptr[1] - 128;

            int ruv = 90 * v;
            int guv = -46 * v + -22 * u;
            int buv = 113 * u;

#define SATURATE_CAST_UCHAR(X) (unsigned char)std::min(std::max((int)(X), 0), 255)

            int y00 = yptr0[0] << 6;
            rgba0[0] = SATURATE_CAST_UCHAR((y00 + ruv) >> 6);
            rgba0[1] = SATURATE_CAST_UCHAR((y00 + guv) >> 6);
            rgba0[2] = SATURATE_CAST_UCHAR((y00 + buv) >> 6);
            rgba0[3] = 255; // opaque alpha (was previously uninitialized)

            int y01 = yptr0[1] << 6;
            rgba0[4] = SATURATE_CAST_UCHAR((y01 + ruv) >> 6);
            rgba0[5] = SATURATE_CAST_UCHAR((y01 + guv) >> 6);
            rgba0[6] = SATURATE_CAST_UCHAR((y01 + buv) >> 6);
            rgba0[7] = 255;

            int y10 = yptr1[0] << 6;
            rgba1[0] = SATURATE_CAST_UCHAR((y10 + ruv) >> 6);
            rgba1[1] = SATURATE_CAST_UCHAR((y10 + guv) >> 6);
            rgba1[2] = SATURATE_CAST_UCHAR((y10 + buv) >> 6);
            rgba1[3] = 255;

            int y11 = yptr1[1] << 6;
            rgba1[4] = SATURATE_CAST_UCHAR((y11 + ruv) >> 6);
            rgba1[5] = SATURATE_CAST_UCHAR((y11 + guv) >> 6);
            rgba1[6] = SATURATE_CAST_UCHAR((y11 + buv) >> 6);
            rgba1[7] = 255;

#undef SATURATE_CAST_UCHAR

            yptr0 += 2;
            yptr1 += 2;
            vuptr += 2;
            rgba0 += 2 * 4;
            rgba1 += 2 * 4;
        }

        // Advance past the two rows just written; the VU pointer has already
        // been advanced in-loop (one VU row serves two Y rows).
        yptr += 2 * w;
        rgba += 2 * 4 * w;
    }
}

#endif

// The techniques used above follow standard references; see the ARM NEON
// intrinsics and GCC extended-asm documentation for further reading.