Skip to content

Commit

Permalink
Merge pull request #729 from argilo/remove-dot-prod-sse41
Browse files Browse the repository at this point in the history
Remove disabled SSE4.1 dot product
  • Loading branch information
jdemel authored Jan 7, 2024
2 parents dedc6a7 + 40adedb commit e527309
Showing 1 changed file with 0 additions and 168 deletions.
168 changes: 0 additions & 168 deletions kernels/volk/volk_32fc_x2_dot_prod_32fc.h
Original file line number Diff line number Diff line change
Expand Up @@ -162,90 +162,6 @@ static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result,

#endif /*LV_HAVE_SSE3*/

// #ifdef LV_HAVE_SSE4_1

// #include <smmintrin.h>

// static inline void volk_32fc_x2_dot_prod_32fc_u_sse4_1(lv_32fc_t* result,
// const lv_32fc_t* input,
// const lv_32fc_t* taps,
// unsigned int num_points)
// {

// unsigned int i = 0;
// const unsigned int qtr_points = num_points / 4;
// const unsigned int isodd = num_points & 3;

// __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
// float *p_input, *p_taps;
// __m64* p_result;

// p_result = (__m64*)result;
// p_input = (float*)input;
// p_taps = (float*)taps;

// static const __m128i neg = { 0x000000000000000080000000 };

// real0 = _mm_setzero_ps();
// real1 = _mm_setzero_ps();
// im0 = _mm_setzero_ps();
// im1 = _mm_setzero_ps();

// for (; i < qtr_points; ++i) {
// xmm0 = _mm_loadu_ps(p_input);
// xmm1 = _mm_loadu_ps(p_taps);

// p_input += 4;
// p_taps += 4;

// xmm2 = _mm_loadu_ps(p_input);
// xmm3 = _mm_loadu_ps(p_taps);

// p_input += 4;
// p_taps += 4;

// xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
// xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
// xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
// xmm2 = _mm_unpacklo_ps(xmm1, xmm3);

// // imaginary vector from input
// xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
// // real vector from input
// xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
// // imaginary vector from taps
// xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
// // real vector from taps
// xmm2 = _mm_unpacklo_ps(xmm2, xmm5);

// xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
// xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);

// xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
// xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);

// real0 = _mm_add_ps(xmm4, real0);
// real1 = _mm_add_ps(xmm5, real1);
// im0 = _mm_add_ps(xmm6, im0);
// im1 = _mm_add_ps(xmm7, im1);
// }

// real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec);

// im0 = _mm_add_ps(im0, im1);
// real0 = _mm_add_ps(real0, real1);

// im0 = _mm_add_ps(im0, real0);

// _mm_storel_pi(p_result, im0);

// for (i = num_points - isodd; i < num_points; i++) {
// *result += input[i] * taps[i];
// }
// }

// #endif /*LV_HAVE_SSE4_1*/

#ifdef LV_HAVE_AVX

#include <immintrin.h>
Expand Down Expand Up @@ -454,90 +370,6 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result,
#endif /*LV_HAVE_SSE3*/


// #ifdef LV_HAVE_SSE4_1

// #include <smmintrin.h>

// static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result,
// const lv_32fc_t* input,
// const lv_32fc_t* taps,
// unsigned int num_points)
// {

// unsigned int i = 0;
// const unsigned int qtr_points = num_points / 4;
// const unsigned int isodd = num_points & 3;

// __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
// float *p_input, *p_taps;
// __m64* p_result;

// static const __m128i neg = { 0x000000000000000080000000 };

// p_result = (__m64*)result;
// p_input = (float*)input;
// p_taps = (float*)taps;

// real0 = _mm_setzero_ps();
// real1 = _mm_setzero_ps();
// im0 = _mm_setzero_ps();
// im1 = _mm_setzero_ps();

// for (; i < qtr_points; ++i) {
// xmm0 = _mm_load_ps(p_input);
// xmm1 = _mm_load_ps(p_taps);

// p_input += 4;
// p_taps += 4;

// xmm2 = _mm_load_ps(p_input);
// xmm3 = _mm_load_ps(p_taps);

// p_input += 4;
// p_taps += 4;

// xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
// xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
// xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
// xmm2 = _mm_unpacklo_ps(xmm1, xmm3);

// // imaginary vector from input
// xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
// // real vector from input
// xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
// // imaginary vector from taps
// xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
// // real vector from taps
// xmm2 = _mm_unpacklo_ps(xmm2, xmm5);

// xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
// xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);

// xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
// xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);

// real0 = _mm_add_ps(xmm4, real0);
// real1 = _mm_add_ps(xmm5, real1);
// im0 = _mm_add_ps(xmm6, im0);
// im1 = _mm_add_ps(xmm7, im1);
// }

// real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec);

// im0 = _mm_add_ps(im0, im1);
// real0 = _mm_add_ps(real0, real1);

// im0 = _mm_add_ps(im0, real0);

// _mm_storel_pi(p_result, im0);

// for (i = num_points - isodd; i < num_points; i++) {
// *result += input[i] * taps[i];
// }
// }

// #endif /*LV_HAVE_SSE4_1*/

#ifdef LV_HAVE_NEON
#include <arm_neon.h>

Expand Down

0 comments on commit e527309

Please sign in to comment.