Skip to content

Commit

Permalink
Merge pull request #736 from argilo/fix-k7-r2
Browse files Browse the repository at this point in the history
Fix bugs in convolutional decoder
  • Loading branch information
jdemel authored Jan 7, 2024
2 parents 4266ee8 + 87cb93d commit a84c7e3
Show file tree
Hide file tree
Showing 2 changed files with 97 additions and 111 deletions.
16 changes: 8 additions & 8 deletions kernels/volk/volk_8u_conv_k7_r2puppet_8u.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,8 @@ static inline int chainback_viterbi(unsigned char* data,
#include <stdio.h>
#include <xmmintrin.h>

static inline void volk_8u_conv_k7_r2puppet_8u_spiral(unsigned char* syms,
unsigned char* dec,
static inline void volk_8u_conv_k7_r2puppet_8u_spiral(unsigned char* dec,
unsigned char* syms,
unsigned int framebits)
{
if (framebits < 12) {
Expand Down Expand Up @@ -181,8 +181,8 @@ static inline void volk_8u_conv_k7_r2puppet_8u_spiral(unsigned char* syms,

#include "volk/sse2neon.h"

static inline void volk_8u_conv_k7_r2puppet_8u_neonspiral(unsigned char* syms,
unsigned char* dec,
static inline void volk_8u_conv_k7_r2puppet_8u_neonspiral(unsigned char* dec,
unsigned char* syms,
unsigned int framebits)
{
if (framebits < 12) {
Expand Down Expand Up @@ -266,8 +266,8 @@ static inline void volk_8u_conv_k7_r2puppet_8u_neonspiral(unsigned char* syms,
//#include <immintrin.h>
//#include <stdio.h>
//
// static inline void volk_8u_conv_k7_r2puppet_8u_avx2(unsigned char* syms,
// unsigned char* dec,
// static inline void volk_8u_conv_k7_r2puppet_8u_avx2(unsigned char* dec,
// unsigned char* syms,
// unsigned int framebits)
//{
// if (framebits < 12) {
Expand Down Expand Up @@ -349,8 +349,8 @@ static inline void volk_8u_conv_k7_r2puppet_8u_neonspiral(unsigned char* syms,
#if LV_HAVE_GENERIC


static inline void volk_8u_conv_k7_r2puppet_8u_generic(unsigned char* syms,
unsigned char* dec,
static inline void volk_8u_conv_k7_r2puppet_8u_generic(unsigned char* dec,
unsigned char* syms,
unsigned int framebits)
{
if (framebits < 12) {
Expand Down
192 changes: 89 additions & 103 deletions kernels/volk/volk_8u_x4_conv_k7_r2_8u.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,19 +57,17 @@ typedef union {
#endif


static inline void renormalize(unsigned char* X, unsigned char threshold)
static inline void renormalize(unsigned char* X)
{
int NUMSTATES = 64;
int i;

unsigned char min = X[0];
// if(min > threshold) {
for (i = 0; i < NUMSTATES; i++)
if (min > X[i])
min = X[i];
for (i = 0; i < NUMSTATES; i++)
X[i] -= min;
//}
}


Expand All @@ -85,16 +83,17 @@ static inline void BFLY(int i,
int j;
unsigned int decision0, decision1;
unsigned char metric, m0, m1, m2, m3;
unsigned short metricsum;

int NUMSTATES = 64;
int RATE = 2;
int METRICSHIFT = 2;
int METRICSHIFT = 1;
int PRECISIONSHIFT = 2;

metric = 0;
metricsum = 1;
for (j = 0; j < RATE; j++)
metric += (Branchtab[i + j * NUMSTATES / 2] ^ syms[s * RATE + j]) >> METRICSHIFT;
metric = metric >> PRECISIONSHIFT;
metricsum += (Branchtab[i + j * NUMSTATES / 2] ^ syms[s * RATE + j]);
metric = (metricsum >> METRICSHIFT) >> PRECISIONSHIFT;

unsigned char max = ((RATE * ((256 - 1) >> METRICSHIFT)) >> PRECISIONSHIFT);

Expand All @@ -103,8 +102,8 @@ static inline void BFLY(int i,
m2 = X[i] + (max - metric);
m3 = X[i + NUMSTATES / 2] + metric;

decision0 = (signed int)(m0 - m1) > 0;
decision1 = (signed int)(m2 - m3) > 0;
decision0 = (signed int)(m0 - m1) >= 0;
decision1 = (signed int)(m2 - m3) >= 0;

Y[2 * i] = decision0 ? m1 : m0;
Y[2 * i + 1] = decision1 ? m3 : m2;
Expand Down Expand Up @@ -297,7 +296,7 @@ static inline void BFLY(int i,
// }
// }
//
// renormalize(X, 210);
// renormalize(X);
//
// unsigned int j;
// for (j = 0; j < (framebits + excess) % 2; ++j) {
Expand All @@ -312,7 +311,7 @@ static inline void BFLY(int i,
// Branchtab);
// }
//
// renormalize(Y, 210);
// renormalize(Y);
// }
// /*skip*/
//}
Expand Down Expand Up @@ -438,27 +437,25 @@ static inline void volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y,
*(a112) = s28;
a113 = (a95 + 3);
*(a113) = s29;
if ((((unsigned char*)Y)[0] > 210)) {
__m128i m5, m6;
m5 = ((__m128i*)Y)[0];
m5 = _mm_min_epu8(m5, ((__m128i*)Y)[1]);
m5 = _mm_min_epu8(m5, ((__m128i*)Y)[2]);
m5 = _mm_min_epu8(m5, ((__m128i*)Y)[3]);
__m128i m7;
m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5);
m7 =
((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 32)), ((__m128i)m7)));
m7 =
((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 16)), ((__m128i)m7)));
m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 8)), ((__m128i)m7)));
m7 = _mm_unpacklo_epi8(m7, m7);
m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
m6 = _mm_unpacklo_epi64(m7, m7);
((__m128i*)Y)[0] = _mm_subs_epu8(((__m128i*)Y)[0], m6);
((__m128i*)Y)[1] = _mm_subs_epu8(((__m128i*)Y)[1], m6);
((__m128i*)Y)[2] = _mm_subs_epu8(((__m128i*)Y)[2], m6);
((__m128i*)Y)[3] = _mm_subs_epu8(((__m128i*)Y)[3], m6);
}

__m128i m5, m6;
m5 = ((__m128i*)Y)[0];
m5 = _mm_min_epu8(m5, ((__m128i*)Y)[1]);
m5 = _mm_min_epu8(m5, ((__m128i*)Y)[2]);
m5 = _mm_min_epu8(m5, ((__m128i*)Y)[3]);
__m128i m7;
m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5);
m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 32)), ((__m128i)m7)));
m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 16)), ((__m128i)m7)));
m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 8)), ((__m128i)m7)));
m7 = _mm_unpacklo_epi8(m7, m7);
m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
m6 = _mm_unpacklo_epi64(m7, m7);
((__m128i*)Y)[0] = _mm_subs_epu8(((__m128i*)Y)[0], m6);
((__m128i*)Y)[1] = _mm_subs_epu8(((__m128i*)Y)[1], m6);
((__m128i*)Y)[2] = _mm_subs_epu8(((__m128i*)Y)[2], m6);
((__m128i*)Y)[3] = _mm_subs_epu8(((__m128i*)Y)[3], m6);

unsigned char a188, a194;
int a186, a205;
short int s48, s49, s54, s55;
Expand Down Expand Up @@ -561,31 +558,27 @@ static inline void volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y,
*(a225) = s56;
a226 = (a208 + 3);
*(a226) = s57;
if ((((unsigned char*)X)[0] > 210)) {
__m128i m12, m13;
m12 = ((__m128i*)X)[0];
m12 = _mm_min_epu8(m12, ((__m128i*)X)[1]);
m12 = _mm_min_epu8(m12, ((__m128i*)X)[2]);
m12 = _mm_min_epu8(m12, ((__m128i*)X)[3]);
__m128i m14;
m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12);
m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 32)),
((__m128i)m14)));
m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 16)),
((__m128i)m14)));
m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 8)),
((__m128i)m14)));
m14 = _mm_unpacklo_epi8(m14, m14);
m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0));
m13 = _mm_unpacklo_epi64(m14, m14);
((__m128i*)X)[0] = _mm_subs_epu8(((__m128i*)X)[0], m13);
((__m128i*)X)[1] = _mm_subs_epu8(((__m128i*)X)[1], m13);
((__m128i*)X)[2] = _mm_subs_epu8(((__m128i*)X)[2], m13);
((__m128i*)X)[3] = _mm_subs_epu8(((__m128i*)X)[3], m13);
}

__m128i m12, m13;
m12 = ((__m128i*)X)[0];
m12 = _mm_min_epu8(m12, ((__m128i*)X)[1]);
m12 = _mm_min_epu8(m12, ((__m128i*)X)[2]);
m12 = _mm_min_epu8(m12, ((__m128i*)X)[3]);
__m128i m14;
m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12);
m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 32)), ((__m128i)m14)));
m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 16)), ((__m128i)m14)));
m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 8)), ((__m128i)m14)));
m14 = _mm_unpacklo_epi8(m14, m14);
m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0));
m13 = _mm_unpacklo_epi64(m14, m14);
((__m128i*)X)[0] = _mm_subs_epu8(((__m128i*)X)[0], m13);
((__m128i*)X)[1] = _mm_subs_epu8(((__m128i*)X)[1], m13);
((__m128i*)X)[2] = _mm_subs_epu8(((__m128i*)X)[2], m13);
((__m128i*)X)[3] = _mm_subs_epu8(((__m128i*)X)[3], m13);
}

renormalize(X, 210);
renormalize(X);

/*int ch;
for(ch = 0; ch < 64; ch++) {
Expand All @@ -607,7 +600,7 @@ static inline void volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y,
}


renormalize(Y, 210);
renormalize(Y);

/*printf("\n");
for(ch = 0; ch < 64; ch++) {
Expand Down Expand Up @@ -734,27 +727,25 @@ static inline void volk_8u_x4_conv_k7_r2_8u_neonspiral(unsigned char* Y,
*(a112) = s28;
a113 = (a95 + 3);
*(a113) = s29;
if ((((unsigned char*)Y)[0] > 210)) {
__m128i m5, m6;
m5 = ((__m128i*)Y)[0];
m5 = _mm_min_epu8(m5, ((__m128i*)Y)[1]);
m5 = _mm_min_epu8(m5, ((__m128i*)Y)[2]);
m5 = _mm_min_epu8(m5, ((__m128i*)Y)[3]);
__m128i m7;
m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5);
m7 =
((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 32)), ((__m128i)m7)));
m7 =
((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 16)), ((__m128i)m7)));
m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 8)), ((__m128i)m7)));
m7 = _mm_unpacklo_epi8(m7, m7);
m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
m6 = _mm_unpacklo_epi64(m7, m7);
((__m128i*)Y)[0] = _mm_subs_epu8(((__m128i*)Y)[0], m6);
((__m128i*)Y)[1] = _mm_subs_epu8(((__m128i*)Y)[1], m6);
((__m128i*)Y)[2] = _mm_subs_epu8(((__m128i*)Y)[2], m6);
((__m128i*)Y)[3] = _mm_subs_epu8(((__m128i*)Y)[3], m6);
}

__m128i m5, m6;
m5 = ((__m128i*)Y)[0];
m5 = _mm_min_epu8(m5, ((__m128i*)Y)[1]);
m5 = _mm_min_epu8(m5, ((__m128i*)Y)[2]);
m5 = _mm_min_epu8(m5, ((__m128i*)Y)[3]);
__m128i m7;
m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5);
m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 32)), ((__m128i)m7)));
m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 16)), ((__m128i)m7)));
m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 8)), ((__m128i)m7)));
m7 = _mm_unpacklo_epi8(m7, m7);
m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
m6 = _mm_unpacklo_epi64(m7, m7);
((__m128i*)Y)[0] = _mm_subs_epu8(((__m128i*)Y)[0], m6);
((__m128i*)Y)[1] = _mm_subs_epu8(((__m128i*)Y)[1], m6);
((__m128i*)Y)[2] = _mm_subs_epu8(((__m128i*)Y)[2], m6);
((__m128i*)Y)[3] = _mm_subs_epu8(((__m128i*)Y)[3], m6);

unsigned char a188, a194;
int a186, a205;
short int s48, s49, s54, s55;
Expand Down Expand Up @@ -857,31 +848,27 @@ static inline void volk_8u_x4_conv_k7_r2_8u_neonspiral(unsigned char* Y,
*(a225) = s56;
a226 = (a208 + 3);
*(a226) = s57;
if ((((unsigned char*)X)[0] > 210)) {
__m128i m12, m13;
m12 = ((__m128i*)X)[0];
m12 = _mm_min_epu8(m12, ((__m128i*)X)[1]);
m12 = _mm_min_epu8(m12, ((__m128i*)X)[2]);
m12 = _mm_min_epu8(m12, ((__m128i*)X)[3]);
__m128i m14;
m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12);
m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 32)),
((__m128i)m14)));
m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 16)),
((__m128i)m14)));
m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 8)),
((__m128i)m14)));
m14 = _mm_unpacklo_epi8(m14, m14);
m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0));
m13 = _mm_unpacklo_epi64(m14, m14);
((__m128i*)X)[0] = _mm_subs_epu8(((__m128i*)X)[0], m13);
((__m128i*)X)[1] = _mm_subs_epu8(((__m128i*)X)[1], m13);
((__m128i*)X)[2] = _mm_subs_epu8(((__m128i*)X)[2], m13);
((__m128i*)X)[3] = _mm_subs_epu8(((__m128i*)X)[3], m13);
}

__m128i m12, m13;
m12 = ((__m128i*)X)[0];
m12 = _mm_min_epu8(m12, ((__m128i*)X)[1]);
m12 = _mm_min_epu8(m12, ((__m128i*)X)[2]);
m12 = _mm_min_epu8(m12, ((__m128i*)X)[3]);
__m128i m14;
m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12);
m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 32)), ((__m128i)m14)));
m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 16)), ((__m128i)m14)));
m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 8)), ((__m128i)m14)));
m14 = _mm_unpacklo_epi8(m14, m14);
m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0));
m13 = _mm_unpacklo_epi64(m14, m14);
((__m128i*)X)[0] = _mm_subs_epu8(((__m128i*)X)[0], m13);
((__m128i*)X)[1] = _mm_subs_epu8(((__m128i*)X)[1], m13);
((__m128i*)X)[2] = _mm_subs_epu8(((__m128i*)X)[2], m13);
((__m128i*)X)[3] = _mm_subs_epu8(((__m128i*)X)[3], m13);
}

renormalize(X, 210);
renormalize(X);

/*int ch;
for(ch = 0; ch < 64; ch++) {
Expand All @@ -903,7 +890,7 @@ static inline void volk_8u_x4_conv_k7_r2_8u_neonspiral(unsigned char* Y,
}


renormalize(Y, 210);
renormalize(Y);

/*printf("\n");
for(ch = 0; ch < 64; ch++) {
Expand All @@ -928,7 +915,6 @@ static inline void volk_8u_x4_conv_k7_r2_8u_generic(unsigned char* Y,
{
int nbits = framebits + excess;
int NUMSTATES = 64;
int RENORMALIZE_THRESHOLD = 210;

int s, i;
for (s = 0; s < nbits; s++) {
Expand All @@ -937,7 +923,7 @@ static inline void volk_8u_x4_conv_k7_r2_8u_generic(unsigned char* Y,
BFLY(i, s, syms, Y, X, (decision_t*)dec, Branchtab);
}

renormalize(Y, RENORMALIZE_THRESHOLD);
renormalize(Y);

/// Swap pointers to old and new metrics
tmp = (void*)X;
Expand Down

0 comments on commit a84c7e3

Please sign in to comment.