How to use neon to optimize the sine and cosine function (sin & cos)

How to use neon to optimize the sine and cosine function (sin & amp; cos)

basic idea
1. Approximate solution of cosf(x) and sinf(x)
- Taylor series expansion
- Fixed value calculation (actually Taylor series expansion)
2. NEON implementation
- sinf function
- cosf function
3. Verification

Basic idea

If you want to use neon to optimize the cosf() and sinf() functions, you first need to find a method for approximation, and then implement the function for approximation with neon.

1. Approximate solution of cosf(x) and sinf(x)

It should be noted that these approximate solution methods all need to limit x to the [-pi, pi] range. This can be achieved with the following code

float range_neg_pi_to_pi(float x)
{<!-- -->
    float y = 2 * pi;
// Find the remainder, limit the range to [-2 * pi, 2 * pi]
    int quotient = (int)(x / y);
    float val = x - (float)quotient * y;
// limit val to [-pi, pi]
    if (val > pi)
    {<!-- -->
        val -= 2 * pi;
    }
    else if (val < -pi)
    {<!-- -->
        val + = 2 * pi;
    }
    return val;
}

Vector processing, neon code:

// vector processing
static inline void range_neg_pi_to_pi_neon_intrinsics(float *x, float *result, float pi_num, int M)
{<!-- -->
    int i;
    float32x4_t v_2pi = vdupq_n_f32(pi_num * 2);
    float32x4_t v_rec_2pi = vdupq_n_f32(1 / (pi_num * 2));
    float32x4_t v_pi = vdupq_n_f32(pi_num);
    float32x4_t v_neg_pi = vdupq_n_f32(-pi_num);

    for (i = 0; i < M - 3; i + = 4)
    {<!-- -->
        float32x4_t v_x = vld1q_f32(x + i);
        int32x4_t v_quotient = vcvtq_s32_f32(vmulq_f32(v_x, v_rec_2pi));
        float32x4_t v_remainder = vssubq_f32(v_x, vmulq_f32(vcvtq_f32_s32(v_quotient), v_2pi));

        uint32x4_t v_mask1 = vcgtq_f32(v_remainder, v_pi);
        uint32x4_t v_mask2 = vcltq_f32(v_remainder, v_neg_pi);

        float32x4_t v_re1 = vsubq_f32(v_remainder, v_2pi);
        float32x4_t v_re2 = vaddq_f32(v_remainder, v_2pi);
        v_remainder = vbslq_f32(v_mask1, v_re1, v_remainder);
        v_remainder = vbslq_f32(v_mask2, v_re2, v_remainder);

        vst1q_f32(result + i, v_remainder);
    }

    for (; i < M; i ++ )
    {<!-- -->
        float y = 2 * pi_num;
        int quotient = (int)(x[i] / y);
        float val = x[i] - (float)quotient * y;

        if (val > pi_num)
        {<!-- -->
            val -= 2 * pi_num;
        }
        else if (val < -pi_num)
        {<!-- -->
            val += 2 * pi_num;
        }

        result[i] = val;
    }
}

// single float32x4_t
static inline float32x4_t range_neg_pi_to_pi_x4(float32x4_t v_x, float pi_num)
{<!-- -->
    float32x4_t v_2pi = vdupq_n_f32(pi_num * 2);
    float32x4_t v_rec_2pi = vdupq_n_f32(1 / (pi_num * 2));
    float32x4_t v_pi = vdupq_n_f32(pi_num);
    float32x4_t v_neg_pi = vdupq_n_f32(-pi_num);

    int32x4_t v_quotient = vcvtq_s32_f32(vmulq_f32(v_x, v_rec_2pi));
    float32x4_t v_remainder = vssubq_f32(v_x, vmulq_f32(vcvtq_f32_s32(v_quotient), v_2pi));

    uint32x4_t v_mask1 = vcgtq_f32(v_remainder, v_pi);
    uint32x4_t v_mask2 = vcltq_f32(v_remainder, v_neg_pi);

    float32x4_t v_re1 = vsubq_f32(v_remainder, v_2pi);
    float32x4_t v_re2 = vaddq_f32(v_remainder, v_2pi);
    v_remainder = vbslq_f32(v_mask1, v_re1, v_remainder);
    v_remainder = vbslq_f32(v_mask2, v_re2, v_remainder);

    return v_remainder;
}

Taylor series expansion

Fixed value calculation (actually Taylor series expansion)

C code:

// sin
static inline float sinapprox(float val)
{<!-- -->
    float val2 = val * val;
    return val * (0.9999793767f + val2 * (-0.1666243672f + val2 *
          (8.3089787513e-3f + val2 * (-1.9264918228e-4f + val2 *
           2.1478401777e-6f))));
}

//cos
static inline float cosapprox(float val) {<!-- -->
  float val2 = val*val;
  /* Generated in Sollya using:
     > f = remez(cos(x)-1, [|x*x, x*x*x*x, x*x*x*x*x*x, x*x*x*x*x*x* x*x|],
                            [0.000001, pi], 1, 1e-8);
     > plot(f-cos(x) + 1, [0, pi]);
     > f + 1
  */
  return
    1.f + val2 * (-0.4998515820f + val2 * (4.1518035216e-2f + val2 *
      (-1.3422947025e-3f + val2 * 1.8929864824e-5f)));
}

Refer to approxmath

2. NEON implementation

sinf function

#include <arm_neon.h>

#ifndef pi
#define pi 3.1415926535897932384626433832
#endif

// Method 1 Taylor series expansion method
static inline void neon_sin_approx(float32x4_t input, float32x4_t *sin_result)
{<!-- -->
    int iterations = 8;

    float32x4_t term = input;
    *sin_result = input;

    for (int j = 0; j < iterations; j ++ )
    {<!-- -->
        term = vmulq_f32(term, vmulq_f32(input, input));
        float32x4_t factor = vdupq_n_f32(-1.0f / ((2 * j + 3) * (2 * j + 2)));
        term = vmulq_f32(term, factor);
        *sin_result = vaddq_f32(*sin_result, term);
    }
}

// method 2 fixed value method
static inline void sinapprox_neon_intrinsics(float32x4_t val, float32x4_t *sin_result)
{<!-- -->
    float32x4_t num1 = vdupq_n_f32(0.9999793767f);
    float32x4_t num2 = vdupq_n_f32(-0.1666243672f);
    float32x4_t num3 = vdupq_n_f32(8.3089787513e-3f);
    float32x4_t num4 = vdupq_n_f32(-1.9264918228e-4f);
    float32x4_t num5 = vdupq_n_f32(2.1478401777e-6f);

    float32x4_t val2 = vmulq_f32(val, val);
    *sin_result = vmulq_f32(val, vmlaq_f32(num1, val2, vmlaq_f32(num2, val2, vmlaq_f32(num3, val2, vmlaq_f32(num4, val2, num5)))));
}

static inline void vector_sinf_neon_intrinsics(float *input, float *output, int M)
{<!-- -->// Find sinf for the vector input, M is the length of the vector
    int i;
    float32x4_t v_output;
    for (i = 0; i < M - 3; i + = 4)
    {<!-- -->
        float32x4_t v_input = vld1q_f32(input + i);
        sinapprox_neon_intrinsics(v_input, & amp;v_output); // Alternate approximation method neon_sin_approx(v_input, & amp;v_output);
        vst1q_f32(output + i, v_output);
    }
// process remaining elements
    for (; i < M; i ++ )
    {<!-- -->
        output[i] = sinf(input[i]);
    }
}

cosf function

// Method 1 Taylor series expansion method
static inline void neon_cos_approx(float32x4_t input, float32x4_t *cos_result)
{<!-- -->
    int j;
    int iterations = 8;

    float32x4_t term = vdupq_n_f32(1.0f);
    *cos_result = vdupq_n_f32(1.0f);

    for (j = 0; j < iterations; j ++ )
    {<!-- -->
        term = vmulq_f32(term, vmulq_f32(input, input));
        term = vmulq_f32(term, vdupq_n_f32(-1.0f / ((2 * j + 2) * (2 * j + 1))));
        *cos_result = vaddq_f32(*cos_result, term);
    }
}

// method 2 fixed value method
static inline void cosapprox_neon_intrinsics(float32x4_t val, float32x4_t *cos_result)
{<!-- -->
    float32x4_t num1 = vdupq_n_f32(1.f);
    float32x4_t num2 = vdupq_n_f32(-0.4998515820f);
    float32x4_t num3 = vdupq_n_f32(4.1518035216e-2f);
    float32x4_t num4 = vdupq_n_f32(-1.3422947025e-3f);
    float32x4_t num5 = vdupq_n_f32(1.8929864824e-5f);
    float32x4_t val2 = vmulq_f32(val, val);
    *cos_result = vmlaq_f32(num1, val2, vmlaq_f32(num2, val2, vmlaq_f32(num3, val2, vmlaq_f32(num4, val2, num5))));
}

static inline void vector_cosf_neon_intrinsics(float *input, float *output, int M)
{<!-- -->// Find sinf for the vector input, M is the length of the vector
    int i;
    float32x4_t v_output;
    for (i = 0; i < M - 3; i + = 4)
    {<!-- -->
        float32x4_t v_input = vld1q_f32(input + i);
        cosapprox_neon_intrinsics(v_input, & amp;v_output);// can replace the approximation method neon_cos_approx(v_input, & amp;v_output);
        vst1q_f32(output + i, v_output);
    }
// process remaining elements
    for (; i < M; i ++ )
    {<!-- -->
        output[i] = cosf(input[i]);
    }
}

3. Verify

#define pi 3.1415926535897932384626433832

int main {<!-- -->
int frame_length = 512;
    float *test_sin1 = malloc(frame_length * sizeof(float));
    float *test_sin2 = malloc(frame_length * sizeof(float));
    float *t_data_pi = malloc(frame_length * sizeof(float));
    float *t_data = malloc(frame_length * sizeof(float));
    float temp_t;
    for (i = 0; i < frame_length; i ++ )
    {<!-- -->
        t_data[i] = 0.85 * 10 * i;
        test_sin1[i] = cosf(t_data[i]);
    }
    range_neg_pi_to_pi_neon_intrinsics(t_data, t_data_pi, pi, frame_length);
    vector_sinf_neon_intrinsics(t_data_pi, test_sin2, frame_length);
    for (i = 0; i < st->frame_length; i ++ )
    {<!-- -->
        printf("test cos: %.10f, %.10f , cos(%.10f)\\
", test_sin1[i], test_sin2[i], t_data[i]);
        if (abs(test_sin1[i] - test_sin2[i]) > 0.001)
            printf("Bad result !!!!!!!!!!!!!!!\\
");
    }

    free(test_sin1);
    free(test_sin2);
    free(t_data_pi);
    free(t_data);
    return 0;
    }