Merge pull request #725 from argilo/remove-asm-protokernels

Remove SSE protokernels written in assembly
gnuradio · Dec 17, 2023 · eed16da · eed16da
2 parents b8e7801 + ea9bc8d
commit eed16da
Show file tree

Hide file tree

Showing 2 changed files with 0 additions and 410 deletions.
diff --git a/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h b/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h
@@ -422,150 +422,4 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse3(lv_32fc_t* result
 #endif /*LV_HAVE_SSE3*/
 
 
-#if LV_HAVE_SSE && LV_HAVE_64 /* built only when both macros evaluate non-zero */
-/*
- * Conjugate dot product implemented with x86-64 inline assembly:
- *
- *     *result = sum of input[k] * lv_conj(taps[k]) for k in [0, num_points)
- *
- * result      destination; the asm stores 8 bytes there with movlps, and the C
- *             tail loop afterwards does `*result +=` for a leftover odd point.
- * input, taps source arrays, read with movaps in 16-byte steps.
- *             NOTE(review): movaps expects 16-byte-aligned addresses; presumably
- *             the `_a_` in the name marks the aligned variant -- confirm.
- * num_points  number of elements; each one is treated as 8 bytes below.
- *             NOTE(review): assumes lv_32fc_t is an interleaved (re, im) pair of
- *             floats -- the type is defined outside this chunk.
- *
- * NOTE(review): the asm statement modifies xmm0-xmm8 and writes through `result`,
- * yet its clobber list names only rax, r8, r9 and r10 and there is neither a
- * "memory" clobber nor an output operand.
- */
-static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse(lv_32fc_t* result,
-                                                              const lv_32fc_t* input,
-                                                              const lv_32fc_t* taps,
-                                                              unsigned int num_points)
-{
-    /* Byte length of each input array (8 bytes per point). NOTE(review): computed
-     * in unsigned int, so a very large num_points would wrap -- no guard is visible. */
-    const unsigned int num_bytes = num_points * 8;
-    /* XOR mask with bit 31 set in 32-bit lanes 1 and 3. It is loaded into xmm8 and
-     * applied to every taps vector, flipping the float sign bit in those lanes
-     * (presumably the imaginary parts), i.e. conjugating the taps. */
-    __VOLK_ATTR_ALIGNED(16)
-    static const uint32_t conjugator[4] = {
-        0x00000000, 0x80000000, 0x00000000, 0x80000000
-    };
-    /* Register roles inside the asm:
-     *   r9  = input cursor (briefly holds the conjugator address first)
-     *   r10 = taps cursor
-     *   rax = number of 32-byte (4-point) loop iterations
-     *   r8  = number of 16-byte (2-point) pairs; its low bit selects the odd-pair step
-     *   xmm8 = conjugator mask, xmm6 / xmm7 = running accumulators
-     * The leading "#" lines inside the strings are assembler comments. */
-    __VOLK_ASM __VOLK_VOLATILE(
-        "#  ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t"
-        "#                         const float *taps, unsigned num_bytes)\n\t"
-        "#    float sum0 = 0;\n\t"
-        "#    float sum1 = 0;\n\t"
-        "#    float sum2 = 0;\n\t"
-        "#    float sum3 = 0;\n\t"
-        "#    do {\n\t"
-        "#      sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
-        "#      sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
-        "#      sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
-        "#      sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
-        "#      input += 4;\n\t"
-        "#      taps += 4;  \n\t"
-        "#    } while (--n_2_ccomplex_blocks != 0);\n\t"
-        "#    result[0] = sum0 + sum2;\n\t"
-        "#    result[1] = sum1 + sum3;\n\t"
-        "# TODO: prefetch and better scheduling\n\t"
-        "  xor    %%r9,  %%r9\n\t"
-        "  xor    %%r10, %%r10\n\t"
-        "  movq   %[conjugator], %%r9\n\t"
-        "  movq   %%rcx, %%rax\n\t" /* rax = num_bytes */
-        "  movaps 0(%%r9), %%xmm8\n\t" /* xmm8 = conjugator mask */
-        "  movq   %%rcx, %%r8\n\t" /* r8 = num_bytes */
-        "  movq   %[rsi],  %%r9\n\t"
-        "  movq   %[rdx], %%r10\n\t"
-        "	xorps	%%xmm6, %%xmm6		# zero accumulators\n\t"
-        "	xorps	%%xmm7, %%xmm7		# zero accumulators\n\t"
-        "	shr	$5, %%rax		# rax = n_2_ccomplex_blocks / 2\n\t" /* num_bytes / 32 */
-        "  shr     $4, %%r8\n\t" /* num_bytes / 16 */
-        "  xorps  %%xmm8, %%xmm2\n\t" /* NOTE(review): xmm2 is not loaded yet; every path overwrites it before reading it */
-        "	jmp	.%=L1_test\n\t"
-        "	# 4 taps / loop\n\t"
-        "	# something like ?? cycles / loop\n\t"
-        ".%=Loop1:	\n\t"
-        "# complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
-        "#	movaps	(%%r9), %%xmmA\n\t"
-        "#	movaps	(%%r10), %%xmmB\n\t"
-        "#	movaps	%%xmmA, %%xmmZ\n\t"
-        "#	shufps	$0xb1, %%xmmZ, %%xmmZ	# swap internals\n\t"
-        "#	mulps	%%xmmB, %%xmmA\n\t"
-        "#	mulps	%%xmmZ, %%xmmB\n\t"
-        "#	# SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
-        "#	xorps	%%xmmPN, %%xmmA\n\t"
-        "#	movaps	%%xmmA, %%xmmZ\n\t"
-        "#	unpcklps %%xmmB, %%xmmA\n\t"
-        "#	unpckhps %%xmmB, %%xmmZ\n\t"
-        "#	movaps	%%xmmZ, %%xmmY\n\t"
-        "#	shufps	$0x44, %%xmmA, %%xmmZ	# b01000100\n\t"
-        "#	shufps	$0xee, %%xmmY, %%xmmA	# b11101110\n\t"
-        "#	addps	%%xmmZ, %%xmmA\n\t"
-        "#	addps	%%xmmA, %%xmmC\n\t"
-        "# A=xmm0, B=xmm2, Z=xmm4\n\t"
-        "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
-        "	movaps	0(%%r9), %%xmm0\n\t"
-        "	movaps	16(%%r9), %%xmm1\n\t"
-        "	movaps	%%xmm0, %%xmm4\n\t"
-        "	movaps	0(%%r10), %%xmm2\n\t"
-        "  xorps   %%xmm8, %%xmm2\n\t" /* conjugate first taps vector */
-        "	mulps	%%xmm2, %%xmm0\n\t"
-        "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
-        "	movaps	16(%%r10), %%xmm3\n\t"
-        "	movaps	%%xmm1, %%xmm5\n\t"
-        "  xorps   %%xmm8, %%xmm3\n\t" /* conjugate second taps vector */
-        "	addps	%%xmm0, %%xmm6\n\t" /* xmm6 += input * taps, lane by lane */
-        "	mulps	%%xmm3, %%xmm1\n\t"
-        "	shufps	$0xb1, %%xmm5, %%xmm5	# swap internals\n\t"
-        "	addps	%%xmm1, %%xmm6\n\t"
-        "	mulps	%%xmm4, %%xmm2\n\t"
-        "	addps	%%xmm2, %%xmm7\n\t" /* xmm7 += pair-swapped input * taps */
-        "	mulps	%%xmm5, %%xmm3\n\t"
-        "	add	$32, %%r9\n\t"
-        "	addps	%%xmm3, %%xmm7\n\t"
-        "	add	$32, %%r10\n\t"
-        ".%=L1_test:\n\t"
-        "	dec	%%rax\n\t"
-        "	jge	.%=Loop1\n\t" /* loop body runs once per unit of the initial rax */
-        "	# We've handled the bulk of multiplies up to here.\n\t"
-        "	# Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
-        "	# If so, we've got 2 more taps to do.\n\t"
-        "	and	$1, %%r8\n\t"
-        "	je	.%=Leven\n\t" /* skip when the 16-byte pair count is even */
-        "	# The count was odd, do 2 more taps.\n\t"
-        "	# Note that we've already got mm0/mm2 preloaded\n\t"
-        "	# from the main loop.\n\t"
-        "	movaps	0(%%r9), %%xmm0\n\t"
-        "	movaps	%%xmm0, %%xmm4\n\t"
-        "	movaps	0(%%r10), %%xmm2\n\t"
-        "  xorps   %%xmm8, %%xmm2\n\t"
-        "	mulps	%%xmm2, %%xmm0\n\t"
-        "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
-        "	addps	%%xmm0, %%xmm6\n\t"
-        "	mulps	%%xmm4, %%xmm2\n\t"
-        "	addps	%%xmm2, %%xmm7\n\t"
-        ".%=Leven:\n\t"
-        "	# neg inversor\n\t"
-        "	xorps	%%xmm1, %%xmm1\n\t"
-        "	mov	$0x80000000, %%r9\n\t"
-        "	movd	%%r9, %%xmm1\n\t"
-        "	shufps	$0x11, %%xmm1, %%xmm1	# b00010001 # 0 -0 0 -0\n\t"
-        "	# pfpnacc\n\t"
-        "	xorps	%%xmm1, %%xmm6\n\t" /* flip sign bit of lanes 1 and 3 of xmm6 before the horizontal sums */
-        "	movaps	%%xmm6, %%xmm2\n\t"
-        "	unpcklps %%xmm7, %%xmm6\n\t"
-        "	unpckhps %%xmm7, %%xmm2\n\t"
-        "	movaps	%%xmm2, %%xmm3\n\t"
-        "	shufps	$0x44, %%xmm6, %%xmm2	# b01000100\n\t"
-        "	shufps	$0xee, %%xmm3, %%xmm6	# b11101110\n\t"
-        "	addps	%%xmm2, %%xmm6\n\t"
-        "					# xmm6 = r1 i2 r3 i4\n\t"
-        "	movhlps	%%xmm6, %%xmm4		# xmm4 = r3 i4 ?? ??\n\t"
-        "	addps	%%xmm4, %%xmm6		# xmm6 = r1+r3 i2+i4 ?? ??\n\t"
-        "	movlps	%%xmm6, (%[rdi])		# store low 2x32 bits (complex) "
-        "to memory\n\t"
-        : /* no output operands; the result is written through the [rdi] pointer */
-        : [rsi] "r"(input),
-          [rdx] "r"(taps),
-          "c"(num_bytes), /* NOTE(review): 32-bit operand in rcx, but the asm reads the full %rcx */
-          [rdi] "r"(result),
-          [conjugator] "r"(conjugator)
-        : "rax", "r8", "r9", "r10"); /* NOTE(review): xmm0-xmm8 and "memory" are not listed */
-    /* Tail: the asm consumed points in groups of 4 and then 2; one point is left
-     * over when num_bytes is not a multiple of 16 (odd num_points). */
-    int getem = num_bytes % 16;
-
-    for (; getem > 0; getem -= 8) { /* executes at most once: getem is 8 or 0 for num_bytes = 8 * num_points */
-        *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1])); /* last element of each array */
-    }
-}
-#endif /* LV_HAVE_SSE && LV_HAVE_64 */
-
-
 #endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H*/