diff options
author | Lennart Poettering <lennart@poettering.net> | 2009-02-02 01:44:37 +0100 |
---|---|---|
committer | Lennart Poettering <lennart@poettering.net> | 2009-02-02 01:44:37 +0100 |
commit | a41d72bb2ec84a055bb915803dfa75496f09973b (patch) | |
tree | e9717151400b2eaa15bc149e9987fdab7890df46 | |
parent | 537424a9a9fb8c3fbc930a70874c0983b6c5d1a3 (diff) |
update sbc stuff
-rw-r--r-- | src/Makefile.am | 4 | ||||
-rw-r--r-- | src/modules/bluetooth/sbc.c | 375 | ||||
-rw-r--r-- | src/modules/bluetooth/sbc.h | 1 | ||||
-rw-r--r-- | src/modules/bluetooth/sbc_math.h | 2 | ||||
-rw-r--r-- | src/modules/bluetooth/sbc_primitives.c | 469 | ||||
-rw-r--r-- | src/modules/bluetooth/sbc_primitives.h | 74 | ||||
-rw-r--r-- | src/modules/bluetooth/sbc_primitives_mmx.c | 319 | ||||
-rw-r--r-- | src/modules/bluetooth/sbc_primitives_mmx.h | 40 | ||||
-rw-r--r-- | src/modules/bluetooth/sbc_primitives_neon.c | 245 | ||||
-rw-r--r-- | src/modules/bluetooth/sbc_primitives_neon.h | 40 | ||||
-rw-r--r-- | src/modules/bluetooth/sbc_tables.h | 436 |
11 files changed, 1694 insertions, 311 deletions
diff --git a/src/Makefile.am b/src/Makefile.am index 77123c77a..f85890ead 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1448,7 +1448,7 @@ module_bluetooth_discover_la_LDFLAGS = $(MODULE_LDFLAGS) module_bluetooth_discover_la_LIBADD = $(AM_LIBADD) $(DBUS_LIBS) libpulsecore-@PA_MAJORMINORMICRO@.la libdbus-util.la libbluetooth-util.la libpulsecommon-@PA_MAJORMINORMICRO@.la libpulse.la module_bluetooth_discover_la_CFLAGS = $(AM_CFLAGS) $(DBUS_CFLAGS) -libbluetooth_sbc_la_SOURCES = modules/bluetooth/sbc.c modules/bluetooth/sbc.h modules/bluetooth/sbc_tables.h modules/bluetooth/sbc_math.h +libbluetooth_sbc_la_SOURCES = modules/bluetooth/sbc.c modules/bluetooth/sbc.h modules/bluetooth/sbc_tables.h modules/bluetooth/sbc_math.h modules/bluetooth/sbc_primitives.h modules/bluetooth/sbc_primitives.c modules/bluetooth/sbc_primitives_mmx.h modules/bluetooth/sbc_primitives_neon.h modules/bluetooth/sbc_primitives_mmx.c modules/bluetooth/sbc_primitives_neon.c libbluetooth_sbc_la_LDFLAGS = -avoid-version libbluetooth_sbc_la_LIBADD = $(AM_LIBADD) libpulsecore-@PA_MAJORMINORMICRO@.la libpulsecommon-@PA_MAJORMINORMICRO@.la libpulse.la libbluetooth_sbc_la_CFLAGS = $(AM_CFLAGS) @@ -1457,7 +1457,7 @@ SBC_FILES = $(subst modules/bluetooth/,,$(libbluetooth_sbc_la_SOURCES)) libbluetooth_ipc_la_SOURCES = modules/bluetooth/ipc.c modules/bluetooth/ipc.h libbluetooth_ipc_la_LDFLAGS = -avoid-version libbluetooth_ipc_la_LIBADD = $(AM_LIBADD)libpulsecore-@PA_MAJORMINORMICRO@.la libpulsecommon-@PA_MAJORMINORMICRO@.la libpulse.la -libbluetooth_ipc_la_CFLAGS = $(AM_CFLAGS) -w +libbluetooth_ipc_la_CFLAGS = $(AM_CFLAGS) libbluetooth_util_la_SOURCES = modules/bluetooth/bluetooth-util.c modules/bluetooth/bluetooth-util.h libbluetooth_util_la_LDFLAGS = -avoid-version diff --git a/src/modules/bluetooth/sbc.c b/src/modules/bluetooth/sbc.c index 651981fae..29258d053 100644 --- a/src/modules/bluetooth/sbc.c +++ b/src/modules/bluetooth/sbc.c @@ -46,6 +46,7 @@ #include "sbc_tables.h" #include "sbc.h" 
+#include "sbc_primitives.h" #define SBC_SYNCWORD 0x9C @@ -76,13 +77,16 @@ struct sbc_frame { uint8_t joint; /* only the lower 4 bits of every element are to be used */ - uint8_t scale_factor[2][8]; + uint32_t scale_factor[2][8]; /* raw integer subband samples in the frame */ + int32_t SBC_ALIGNED sb_sample_f[16][2][8]; - int32_t sb_sample_f[16][2][8]; - int32_t sb_sample[16][2][8]; /* modified subband samples */ - int16_t pcm_sample[2][16*8]; /* original pcm audio samples */ + /* modified subband samples */ + int32_t SBC_ALIGNED sb_sample[16][2][8]; + + /* original pcm audio samples */ + int16_t SBC_ALIGNED pcm_sample[2][16*8]; }; struct sbc_decoder_state { @@ -91,16 +95,6 @@ struct sbc_decoder_state { int offset[2][16]; }; -struct sbc_encoder_state { - int subbands; - int position[2]; - int16_t X[2][256]; - void (*sbc_analyze_4b_4s)(int16_t *pcm, int16_t *x, - int32_t *out, int out_stride); - void (*sbc_analyze_4b_8s)(int16_t *pcm, int16_t *x, - int32_t *out, int out_stride); -}; - /* * Calculates the CRC-8 of the first len bits in data */ @@ -368,7 +362,7 @@ static void sbc_calculate_bits(const struct sbc_frame *frame, int (*bits)[8]) static int sbc_unpack_frame(const uint8_t *data, struct sbc_frame *frame, size_t len) { - int consumed; + unsigned int consumed; /* Will copy the parts of the header that are relevant to crc * calculation here */ uint8_t crc_header[11] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; @@ -653,180 +647,41 @@ static int sbc_synthesize_audio(struct sbc_decoder_state *state, } } -static inline void _sbc_analyze_four(const int16_t *in, int32_t *out) -{ - FIXED_A t1[4]; - FIXED_T t2[4]; - int i = 0, hop = 0; - - /* rounding coefficient */ - t1[0] = t1[1] = t1[2] = t1[3] = - (FIXED_A) 1 << (SBC_PROTO_FIXED4_SCALE - 1); - - /* low pass polyphase filter */ - for (hop = 0; hop < 40; hop += 8) { - t1[0] += (FIXED_A) in[hop] * _sbc_proto_fixed4[hop]; - t1[1] += (FIXED_A) in[hop + 1] * _sbc_proto_fixed4[hop + 1]; - t1[2] += (FIXED_A) in[hop + 2] * 
_sbc_proto_fixed4[hop + 2]; - t1[1] += (FIXED_A) in[hop + 3] * _sbc_proto_fixed4[hop + 3]; - t1[0] += (FIXED_A) in[hop + 4] * _sbc_proto_fixed4[hop + 4]; - t1[3] += (FIXED_A) in[hop + 5] * _sbc_proto_fixed4[hop + 5]; - t1[3] += (FIXED_A) in[hop + 7] * _sbc_proto_fixed4[hop + 7]; - } - - /* scaling */ - t2[0] = t1[0] >> SBC_PROTO_FIXED4_SCALE; - t2[1] = t1[1] >> SBC_PROTO_FIXED4_SCALE; - t2[2] = t1[2] >> SBC_PROTO_FIXED4_SCALE; - t2[3] = t1[3] >> SBC_PROTO_FIXED4_SCALE; - - /* do the cos transform */ - for (i = 0, hop = 0; i < 4; hop += 8, i++) { - out[i] = ((FIXED_A) t2[0] * cos_table_fixed_4[0 + hop] + - (FIXED_A) t2[1] * cos_table_fixed_4[1 + hop] + - (FIXED_A) t2[2] * cos_table_fixed_4[2 + hop] + - (FIXED_A) t2[3] * cos_table_fixed_4[5 + hop]) >> - (SBC_COS_TABLE_FIXED4_SCALE - SCALE_OUT_BITS); - } -} - -static void sbc_analyze_4b_4s(int16_t *pcm, int16_t *x, - int32_t *out, int out_stride) -{ - int i; - - /* Input 4 x 4 Audio Samples */ - for (i = 0; i < 16; i += 4) { - x[64 + i] = x[0 + i] = pcm[15 - i]; - x[65 + i] = x[1 + i] = pcm[14 - i]; - x[66 + i] = x[2 + i] = pcm[13 - i]; - x[67 + i] = x[3 + i] = pcm[12 - i]; - } - - /* Analyze four blocks */ - _sbc_analyze_four(x + 12, out); - out += out_stride; - _sbc_analyze_four(x + 8, out); - out += out_stride; - _sbc_analyze_four(x + 4, out); - out += out_stride; - _sbc_analyze_four(x, out); -} - -static inline void _sbc_analyze_eight(const int16_t *in, int32_t *out) -{ - FIXED_A t1[8]; - FIXED_T t2[8]; - int i, hop; - - /* rounding coefficient */ - t1[0] = t1[1] = t1[2] = t1[3] = t1[4] = t1[5] = t1[6] = t1[7] = - (FIXED_A) 1 << (SBC_PROTO_FIXED8_SCALE-1); - - /* low pass polyphase filter */ - for (hop = 0; hop < 80; hop += 16) { - t1[0] += (FIXED_A) in[hop] * _sbc_proto_fixed8[hop]; - t1[1] += (FIXED_A) in[hop + 1] * _sbc_proto_fixed8[hop + 1]; - t1[2] += (FIXED_A) in[hop + 2] * _sbc_proto_fixed8[hop + 2]; - t1[3] += (FIXED_A) in[hop + 3] * _sbc_proto_fixed8[hop + 3]; - t1[4] += (FIXED_A) in[hop + 4] * 
_sbc_proto_fixed8[hop + 4]; - t1[3] += (FIXED_A) in[hop + 5] * _sbc_proto_fixed8[hop + 5]; - t1[2] += (FIXED_A) in[hop + 6] * _sbc_proto_fixed8[hop + 6]; - t1[1] += (FIXED_A) in[hop + 7] * _sbc_proto_fixed8[hop + 7]; - t1[0] += (FIXED_A) in[hop + 8] * _sbc_proto_fixed8[hop + 8]; - t1[5] += (FIXED_A) in[hop + 9] * _sbc_proto_fixed8[hop + 9]; - t1[6] += (FIXED_A) in[hop + 10] * _sbc_proto_fixed8[hop + 10]; - t1[7] += (FIXED_A) in[hop + 11] * _sbc_proto_fixed8[hop + 11]; - t1[7] += (FIXED_A) in[hop + 13] * _sbc_proto_fixed8[hop + 13]; - t1[6] += (FIXED_A) in[hop + 14] * _sbc_proto_fixed8[hop + 14]; - t1[5] += (FIXED_A) in[hop + 15] * _sbc_proto_fixed8[hop + 15]; - } - - /* scaling */ - t2[0] = t1[0] >> SBC_PROTO_FIXED8_SCALE; - t2[1] = t1[1] >> SBC_PROTO_FIXED8_SCALE; - t2[2] = t1[2] >> SBC_PROTO_FIXED8_SCALE; - t2[3] = t1[3] >> SBC_PROTO_FIXED8_SCALE; - t2[4] = t1[4] >> SBC_PROTO_FIXED8_SCALE; - t2[5] = t1[5] >> SBC_PROTO_FIXED8_SCALE; - t2[6] = t1[6] >> SBC_PROTO_FIXED8_SCALE; - t2[7] = t1[7] >> SBC_PROTO_FIXED8_SCALE; - - /* do the cos transform */ - for (i = 0, hop = 0; i < 8; hop += 16, i++) { - out[i] = ((FIXED_A) t2[0] * cos_table_fixed_8[0 + hop] + - (FIXED_A) t2[1] * cos_table_fixed_8[1 + hop] + - (FIXED_A) t2[2] * cos_table_fixed_8[2 + hop] + - (FIXED_A) t2[3] * cos_table_fixed_8[3 + hop] + - (FIXED_A) t2[4] * cos_table_fixed_8[4 + hop] + - (FIXED_A) t2[5] * cos_table_fixed_8[9 + hop] + - (FIXED_A) t2[6] * cos_table_fixed_8[10 + hop] + - (FIXED_A) t2[7] * cos_table_fixed_8[11 + hop]) >> - (SBC_COS_TABLE_FIXED8_SCALE - SCALE_OUT_BITS); - } -} - -static void sbc_analyze_4b_8s(int16_t *pcm, int16_t *x, - int32_t *out, int out_stride) -{ - int i; - - /* Input 4 x 8 Audio Samples */ - for (i = 0; i < 32; i += 8) { - x[128 + i] = x[0 + i] = pcm[31 - i]; - x[129 + i] = x[1 + i] = pcm[30 - i]; - x[130 + i] = x[2 + i] = pcm[29 - i]; - x[131 + i] = x[3 + i] = pcm[28 - i]; - x[132 + i] = x[4 + i] = pcm[27 - i]; - x[133 + i] = x[5 + i] = pcm[26 - i]; - x[134 + i] = x[6 
+ i] = pcm[25 - i]; - x[135 + i] = x[7 + i] = pcm[24 - i]; - } - - /* Analyze four blocks */ - _sbc_analyze_eight(x + 24, out); - out += out_stride; - _sbc_analyze_eight(x + 16, out); - out += out_stride; - _sbc_analyze_eight(x + 8, out); - out += out_stride; - _sbc_analyze_eight(x, out); -} - static int sbc_analyze_audio(struct sbc_encoder_state *state, struct sbc_frame *frame) { int ch, blk; + int16_t *x; switch (frame->subbands) { case 4: - for (ch = 0; ch < frame->channels; ch++) + for (ch = 0; ch < frame->channels; ch++) { + x = &state->X[ch][state->position - 16 + + frame->blocks * 4]; for (blk = 0; blk < frame->blocks; blk += 4) { state->sbc_analyze_4b_4s( - &frame->pcm_sample[ch][blk * 4], - &state->X[ch][state->position[ch]], + x, frame->sb_sample_f[blk][ch], frame->sb_sample_f[blk + 1][ch] - frame->sb_sample_f[blk][ch]); - state->position[ch] -= 16; - if (state->position[ch] < 0) - state->position[ch] = 64 - 16; + x -= 16; } + } return frame->blocks * 4; case 8: - for (ch = 0; ch < frame->channels; ch++) + for (ch = 0; ch < frame->channels; ch++) { + x = &state->X[ch][state->position - 32 + + frame->blocks * 8]; for (blk = 0; blk < frame->blocks; blk += 4) { state->sbc_analyze_4b_8s( - &frame->pcm_sample[ch][blk * 8], - &state->X[ch][state->position[ch]], + x, frame->sb_sample_f[blk][ch], frame->sb_sample_f[blk + 1][ch] - frame->sb_sample_f[blk][ch]); - state->position[ch] -= 32; - if (state->position[ch] < 0) - state->position[ch] = 128 - 32; + x -= 32; } + } return frame->blocks * 8; default: @@ -836,23 +691,31 @@ static int sbc_analyze_audio(struct sbc_encoder_state *state, /* Supplementary bitstream writing macros for 'sbc_pack_frame' */ -#define PUT_BITS(v, n)\ - bits_cache = (v) | (bits_cache << (n));\ - bits_count += (n);\ - if (bits_count >= 16) {\ - bits_count -= 8;\ - *data_ptr++ = (uint8_t) (bits_cache >> bits_count);\ - bits_count -= 8;\ - *data_ptr++ = (uint8_t) (bits_cache >> bits_count);\ - }\ - -#define FLUSH_BITS()\ - while (bits_count >= 
8) {\ - bits_count -= 8;\ - *data_ptr++ = (uint8_t) (bits_cache >> bits_count);\ - }\ - if (bits_count > 0)\ - *data_ptr++ = (uint8_t) (bits_cache << (8 - bits_count));\ +#define PUT_BITS(data_ptr, bits_cache, bits_count, v, n) \ + do { \ + bits_cache = (v) | (bits_cache << (n)); \ + bits_count += (n); \ + if (bits_count >= 16) { \ + bits_count -= 8; \ + *data_ptr++ = (uint8_t) \ + (bits_cache >> bits_count); \ + bits_count -= 8; \ + *data_ptr++ = (uint8_t) \ + (bits_cache >> bits_count); \ + } \ + } while (0) + +#define FLUSH_BITS(data_ptr, bits_cache, bits_count) \ + do { \ + while (bits_count >= 8) { \ + bits_count -= 8; \ + *data_ptr++ = (uint8_t) \ + (bits_cache >> bits_count); \ + } \ + if (bits_count > 0) \ + *data_ptr++ = (uint8_t) \ + (bits_cache << (8 - bits_count)); \ + } while (0) /* * Packs the SBC frame from frame into the memory at data. At most len @@ -869,7 +732,9 @@ static int sbc_analyze_audio(struct sbc_encoder_state *state, * -99 not implemented */ -static int sbc_pack_frame(uint8_t *data, struct sbc_frame *frame, size_t len) +static SBC_ALWAYS_INLINE int sbc_pack_frame_internal( + uint8_t *data, struct sbc_frame *frame, size_t len, + int frame_subbands, int frame_channels) { /* Bitstream writer starts from the fourth byte */ uint8_t *data_ptr = data + 4; @@ -887,8 +752,6 @@ static int sbc_pack_frame(uint8_t *data, struct sbc_frame *frame, size_t len) uint32_t levels[2][8]; /* levels are derived from that */ uint32_t sb_sample_delta[2][8]; - u_int32_t scalefactor[2][8]; /* derived from frame->scale_factor */ - data[0] = SBC_SYNCWORD; data[1] = (frame->frequency & 0x03) << 6; @@ -899,7 +762,7 @@ static int sbc_pack_frame(uint8_t *data, struct sbc_frame *frame, size_t len) data[1] |= (frame->allocation & 0x01) << 1; - switch (frame->subbands) { + switch (frame_subbands) { case 4: /* Nothing to do */ break; @@ -914,11 +777,11 @@ static int sbc_pack_frame(uint8_t *data, struct sbc_frame *frame, size_t len) data[2] = frame->bitpool; if ((frame->mode 
== MONO || frame->mode == DUAL_CHANNEL) && - frame->bitpool > frame->subbands << 4) + frame->bitpool > frame_subbands << 4) return -5; if ((frame->mode == STEREO || frame->mode == JOINT_STEREO) && - frame->bitpool > frame->subbands << 5) + frame->bitpool > frame_subbands << 5) return -5; /* Can't fill in crc yet */ @@ -927,36 +790,24 @@ static int sbc_pack_frame(uint8_t *data, struct sbc_frame *frame, size_t len) crc_header[1] = data[2]; crc_pos = 16; - for (ch = 0; ch < frame->channels; ch++) { - for (sb = 0; sb < frame->subbands; sb++) { - frame->scale_factor[ch][sb] = 0; - scalefactor[ch][sb] = 2 << SCALE_OUT_BITS; - for (blk = 0; blk < frame->blocks; blk++) { - while (scalefactor[ch][sb] < fabs(frame->sb_sample_f[blk][ch][sb])) { - frame->scale_factor[ch][sb]++; - scalefactor[ch][sb] *= 2; - } - } - } - } - if (frame->mode == JOINT_STEREO) { /* like frame->sb_sample but joint stereo */ int32_t sb_sample_j[16][2]; /* scalefactor and scale_factor in joint case */ - u_int32_t scalefactor_j[2]; + uint32_t scalefactor_j[2]; uint8_t scale_factor_j[2]; uint8_t joint = 0; frame->joint = 0; - for (sb = 0; sb < frame->subbands - 1; sb++) { + for (sb = 0; sb < frame_subbands - 1; sb++) { scale_factor_j[0] = 0; scalefactor_j[0] = 2 << SCALE_OUT_BITS; scale_factor_j[1] = 0; scalefactor_j[1] = 2 << SCALE_OUT_BITS; for (blk = 0; blk < frame->blocks; blk++) { + uint32_t tmp; /* Calculate joint stereo signal */ sb_sample_j[blk][0] = ASR(frame->sb_sample_f[blk][0][sb], 1) + @@ -966,11 +817,13 @@ static int sbc_pack_frame(uint8_t *data, struct sbc_frame *frame, size_t len) ASR(frame->sb_sample_f[blk][1][sb], 1); /* calculate scale_factor_j and scalefactor_j for joint case */ - while (scalefactor_j[0] < fabs(sb_sample_j[blk][0])) { + tmp = fabs(sb_sample_j[blk][0]); + while (scalefactor_j[0] < tmp) { scale_factor_j[0]++; scalefactor_j[0] *= 2; } - while (scalefactor_j[1] < fabs(sb_sample_j[blk][1])) { + tmp = fabs(sb_sample_j[blk][1]); + while (scalefactor_j[1] < tmp) { 
scale_factor_j[1]++; scalefactor_j[1] *= 2; } @@ -982,7 +835,7 @@ static int sbc_pack_frame(uint8_t *data, struct sbc_frame *frame, size_t len) (scale_factor_j[0] + scale_factor_j[1])) { /* use joint stereo for this subband */ - joint |= 1 << (frame->subbands - 1 - sb); + joint |= 1 << (frame_subbands - 1 - sb); frame->joint |= 1 << sb; frame->scale_factor[0][sb] = scale_factor_j[0]; frame->scale_factor[1][sb] = scale_factor_j[1]; @@ -995,14 +848,16 @@ static int sbc_pack_frame(uint8_t *data, struct sbc_frame *frame, size_t len) } } - PUT_BITS(joint, frame->subbands); + PUT_BITS(data_ptr, bits_cache, bits_count, + joint, frame_subbands); crc_header[crc_pos >> 3] = joint; - crc_pos += frame->subbands; + crc_pos += frame_subbands; } - for (ch = 0; ch < frame->channels; ch++) { - for (sb = 0; sb < frame->subbands; sb++) { - PUT_BITS(frame->scale_factor[ch][sb] & 0x0F, 4); + for (ch = 0; ch < frame_channels; ch++) { + for (sb = 0; sb < frame_subbands; sb++) { + PUT_BITS(data_ptr, bits_cache, bits_count, + frame->scale_factor[ch][sb] & 0x0F, 4); crc_header[crc_pos >> 3] <<= 4; crc_header[crc_pos >> 3] |= frame->scale_factor[ch][sb] & 0x0F; crc_pos += 4; @@ -1017,8 +872,8 @@ static int sbc_pack_frame(uint8_t *data, struct sbc_frame *frame, size_t len) sbc_calculate_bits(frame, bits); - for (ch = 0; ch < frame->channels; ch++) { - for (sb = 0; sb < frame->subbands; sb++) { + for (ch = 0; ch < frame_channels; ch++) { + for (sb = 0; sb < frame_subbands; sb++) { levels[ch][sb] = ((1 << bits[ch][sb]) - 1) << (32 - (frame->scale_factor[ch][sb] + SCALE_OUT_BITS + 2)); @@ -1029,8 +884,8 @@ static int sbc_pack_frame(uint8_t *data, struct sbc_frame *frame, size_t len) } for (blk = 0; blk < frame->blocks; blk++) { - for (ch = 0; ch < frame->channels; ch++) { - for (sb = 0; sb < frame->subbands; sb++) { + for (ch = 0; ch < frame_channels; ch++) { + for (sb = 0; sb < frame_subbands; sb++) { if (bits[ch][sb] == 0) continue; @@ -1039,33 +894,46 @@ static int sbc_pack_frame(uint8_t 
*data, struct sbc_frame *frame, size_t len) (sb_sample_delta[ch][sb] + frame->sb_sample_f[blk][ch][sb])) >> 32; - PUT_BITS(audio_sample, bits[ch][sb]); + PUT_BITS(data_ptr, bits_cache, bits_count, + audio_sample, bits[ch][sb]); } } } - FLUSH_BITS(); + FLUSH_BITS(data_ptr, bits_cache, bits_count); return data_ptr - data; } +static int sbc_pack_frame(uint8_t *data, struct sbc_frame *frame, size_t len) +{ + if (frame->subbands == 4) { + if (frame->channels == 1) + return sbc_pack_frame_internal(data, frame, len, 4, 1); + else + return sbc_pack_frame_internal(data, frame, len, 4, 2); + } else { + if (frame->channels == 1) + return sbc_pack_frame_internal(data, frame, len, 8, 1); + else + return sbc_pack_frame_internal(data, frame, len, 8, 2); + } +} + static void sbc_encoder_init(struct sbc_encoder_state *state, const struct sbc_frame *frame) { memset(&state->X, 0, sizeof(state->X)); - state->subbands = frame->subbands; - state->position[0] = state->position[1] = 12 * frame->subbands; + state->position = SBC_X_BUFFER_SIZE - frame->subbands * 9; - /* Default implementation for analyze function */ - state->sbc_analyze_4b_4s = sbc_analyze_4b_4s; - state->sbc_analyze_4b_8s = sbc_analyze_4b_8s; + sbc_init_primitives(state); } struct sbc_priv { int init; - struct sbc_frame frame; - struct sbc_decoder_state dec_state; - struct sbc_encoder_state enc_state; + struct SBC_ALIGNED sbc_frame frame; + struct SBC_ALIGNED sbc_decoder_state dec_state; + struct SBC_ALIGNED sbc_encoder_state enc_state; }; static void sbc_set_defaults(sbc_t *sbc, unsigned long flags) @@ -1091,10 +959,13 @@ int sbc_init(sbc_t *sbc, unsigned long flags) memset(sbc, 0, sizeof(sbc_t)); - sbc->priv = malloc(sizeof(struct sbc_priv)); - if (!sbc->priv) + sbc->priv_alloc_base = malloc(sizeof(struct sbc_priv) + SBC_ALIGN_MASK); + if (!sbc->priv_alloc_base) return -ENOMEM; + sbc->priv = (void *) (((uintptr_t) sbc->priv_alloc_base + + SBC_ALIGN_MASK) & ~((uintptr_t) SBC_ALIGN_MASK)); + memset(sbc->priv, 0, 
sizeof(struct sbc_priv)); sbc_set_defaults(sbc, flags); @@ -1177,8 +1048,10 @@ int sbc_encode(sbc_t *sbc, void *input, int input_len, void *output, int output_len, int *written) { struct sbc_priv *priv; - char *ptr; - int i, ch, framelen, samples; + int framelen, samples; + int (*sbc_enc_process_input)(int position, + const uint8_t *pcm, int16_t X[2][SBC_X_BUFFER_SIZE], + int nsamples, int nchannels); if (!sbc && !input) return -EIO; @@ -1213,22 +1086,34 @@ int sbc_encode(sbc_t *sbc, void *input, int input_len, void *output, if (!output || output_len < priv->frame.length) return -ENOSPC; - ptr = input; - - for (i = 0; i < priv->frame.subbands * priv->frame.blocks; i++) { - for (ch = 0; ch < priv->frame.channels; ch++) { - int16_t s; - if (sbc->endian == SBC_BE) - s = (ptr[0] & 0xff) << 8 | (ptr[1] & 0xff); - else - s = (ptr[0] & 0xff) | (ptr[1] & 0xff) << 8; - ptr += 2; - priv->frame.pcm_sample[ch][i] = s; - } + /* Select the needed input data processing function and call it */ + if (priv->frame.subbands == 8) { + if (sbc->endian == SBC_BE) + sbc_enc_process_input = + priv->enc_state.sbc_enc_process_input_8s_be; + else + sbc_enc_process_input = + priv->enc_state.sbc_enc_process_input_8s_le; + } else { + if (sbc->endian == SBC_BE) + sbc_enc_process_input = + priv->enc_state.sbc_enc_process_input_4s_be; + else + sbc_enc_process_input = + priv->enc_state.sbc_enc_process_input_4s_le; } + priv->enc_state.position = sbc_enc_process_input( + priv->enc_state.position, (const uint8_t *) input, + priv->enc_state.X, priv->frame.subbands * priv->frame.blocks, + priv->frame.channels); + samples = sbc_analyze_audio(&priv->enc_state, &priv->frame); + priv->enc_state.sbc_calc_scalefactors( + priv->frame.sb_sample_f, priv->frame.scale_factor, + priv->frame.blocks, priv->frame.channels, priv->frame.subbands); + framelen = sbc_pack_frame(output, &priv->frame, output_len); if (written) @@ -1242,8 +1127,8 @@ void sbc_finish(sbc_t *sbc) if (!sbc) return; - if (sbc->priv) - 
free(sbc->priv); + if (sbc->priv_alloc_base) + free(sbc->priv_alloc_base); memset(sbc, 0, sizeof(sbc_t)); } diff --git a/src/modules/bluetooth/sbc.h b/src/modules/bluetooth/sbc.h index 8ac59309e..b0a14888f 100644 --- a/src/modules/bluetooth/sbc.h +++ b/src/modules/bluetooth/sbc.h @@ -74,6 +74,7 @@ struct sbc_struct { uint8_t endian; void *priv; + void *priv_alloc_base; }; typedef struct sbc_struct sbc_t; diff --git a/src/modules/bluetooth/sbc_math.h b/src/modules/bluetooth/sbc_math.h index 6ca4f5260..b87bc81cb 100644 --- a/src/modules/bluetooth/sbc_math.h +++ b/src/modules/bluetooth/sbc_math.h @@ -29,8 +29,6 @@ #define ASR(val, bits) ((-2 >> 1 == -1) ? \ ((int32_t)(val)) >> (bits) : ((int32_t) (val)) / (1 << (bits))) -#define SCALE_OUT_BITS 15 - #define SCALE_SPROTO4_TBL 12 #define SCALE_SPROTO8_TBL 14 #define SCALE_NPROTO4_TBL 11 diff --git a/src/modules/bluetooth/sbc_primitives.c b/src/modules/bluetooth/sbc_primitives.c new file mode 100644 index 000000000..303f3feea --- /dev/null +++ b/src/modules/bluetooth/sbc_primitives.c @@ -0,0 +1,469 @@ +/* + * + * Bluetooth low-complexity, subband codec (SBC) library + * + * Copyright (C) 2004-2009 Marcel Holtmann <marcel@holtmann.org> + * Copyright (C) 2004-2005 Henryk Ploetz <henryk@ploetzli.ch> + * Copyright (C) 2005-2006 Brad Midgley <bmidgley@xmission.com> + * + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include <stdint.h> +#include <limits.h> +#include <string.h> +#include "sbc.h" +#include "sbc_math.h" +#include "sbc_tables.h" + +#include "sbc_primitives.h" +#include "sbc_primitives_mmx.h" +#include "sbc_primitives_neon.h" + +/* + * A reference C code of analysis filter with SIMD-friendly tables + * reordering and code layout. This code can be used to develop platform + * specific SIMD optimizations. Also it may be used as some kind of test + * for compiler autovectorization capabilities (who knows, if the compiler + * is very good at this stuff, hand optimized assembly may be not strictly + * needed for some platform). + * + * Note: It is also possible to make a simple variant of analysis filter, + * which needs only a single constants table without taking care about + * even/odd cases. This simple variant of filter can be implemented without + * input data permutation. The only thing that would be lost is the + * possibility to use pairwise SIMD multiplications. But for some simple + * CPU cores without SIMD extensions it can be useful. If anybody is + * interested in implementing such variant of a filter, sourcecode from + * bluez versions 4.26/4.27 can be used as a reference and the history of + * the changes in git repository done around that time may be worth checking. 
+ */ + +static inline void sbc_analyze_four_simd(const int16_t *in, int32_t *out, + const FIXED_T *consts) +{ + FIXED_A t1[4]; + FIXED_T t2[4]; + int hop = 0; + + /* rounding coefficient */ + t1[0] = t1[1] = t1[2] = t1[3] = + (FIXED_A) 1 << (SBC_PROTO_FIXED4_SCALE - 1); + + /* low pass polyphase filter */ + for (hop = 0; hop < 40; hop += 8) { + t1[0] += (FIXED_A) in[hop] * consts[hop]; + t1[0] += (FIXED_A) in[hop + 1] * consts[hop + 1]; + t1[1] += (FIXED_A) in[hop + 2] * consts[hop + 2]; + t1[1] += (FIXED_A) in[hop + 3] * consts[hop + 3]; + t1[2] += (FIXED_A) in[hop + 4] * consts[hop + 4]; + t1[2] += (FIXED_A) in[hop + 5] * consts[hop + 5]; + t1[3] += (FIXED_A) in[hop + 6] * consts[hop + 6]; + t1[3] += (FIXED_A) in[hop + 7] * consts[hop + 7]; + } + + /* scaling */ + t2[0] = t1[0] >> SBC_PROTO_FIXED4_SCALE; + t2[1] = t1[1] >> SBC_PROTO_FIXED4_SCALE; + t2[2] = t1[2] >> SBC_PROTO_FIXED4_SCALE; + t2[3] = t1[3] >> SBC_PROTO_FIXED4_SCALE; + + /* do the cos transform */ + t1[0] = (FIXED_A) t2[0] * consts[40 + 0]; + t1[0] += (FIXED_A) t2[1] * consts[40 + 1]; + t1[1] = (FIXED_A) t2[0] * consts[40 + 2]; + t1[1] += (FIXED_A) t2[1] * consts[40 + 3]; + t1[2] = (FIXED_A) t2[0] * consts[40 + 4]; + t1[2] += (FIXED_A) t2[1] * consts[40 + 5]; + t1[3] = (FIXED_A) t2[0] * consts[40 + 6]; + t1[3] += (FIXED_A) t2[1] * consts[40 + 7]; + + t1[0] += (FIXED_A) t2[2] * consts[40 + 8]; + t1[0] += (FIXED_A) t2[3] * consts[40 + 9]; + t1[1] += (FIXED_A) t2[2] * consts[40 + 10]; + t1[1] += (FIXED_A) t2[3] * consts[40 + 11]; + t1[2] += (FIXED_A) t2[2] * consts[40 + 12]; + t1[2] += (FIXED_A) t2[3] * consts[40 + 13]; + t1[3] += (FIXED_A) t2[2] * consts[40 + 14]; + t1[3] += (FIXED_A) t2[3] * consts[40 + 15]; + + out[0] = t1[0] >> + (SBC_COS_TABLE_FIXED4_SCALE - SCALE_OUT_BITS); + out[1] = t1[1] >> + (SBC_COS_TABLE_FIXED4_SCALE - SCALE_OUT_BITS); + out[2] = t1[2] >> + (SBC_COS_TABLE_FIXED4_SCALE - SCALE_OUT_BITS); + out[3] = t1[3] >> + (SBC_COS_TABLE_FIXED4_SCALE - SCALE_OUT_BITS); +} + +static inline 
void sbc_analyze_eight_simd(const int16_t *in, int32_t *out, + const FIXED_T *consts) +{ + FIXED_A t1[8]; + FIXED_T t2[8]; + int i, hop; + + /* rounding coefficient */ + t1[0] = t1[1] = t1[2] = t1[3] = t1[4] = t1[5] = t1[6] = t1[7] = + (FIXED_A) 1 << (SBC_PROTO_FIXED8_SCALE-1); + + /* low pass polyphase filter */ + for (hop = 0; hop < 80; hop += 16) { + t1[0] += (FIXED_A) in[hop] * consts[hop]; + t1[0] += (FIXED_A) in[hop + 1] * consts[hop + 1]; + t1[1] += (FIXED_A) in[hop + 2] * consts[hop + 2]; + t1[1] += (FIXED_A) in[hop + 3] * consts[hop + 3]; + t1[2] += (FIXED_A) in[hop + 4] * consts[hop + 4]; + t1[2] += (FIXED_A) in[hop + 5] * consts[hop + 5]; + t1[3] += (FIXED_A) in[hop + 6] * consts[hop + 6]; + t1[3] += (FIXED_A) in[hop + 7] * consts[hop + 7]; + t1[4] += (FIXED_A) in[hop + 8] * consts[hop + 8]; + t1[4] += (FIXED_A) in[hop + 9] * consts[hop + 9]; + t1[5] += (FIXED_A) in[hop + 10] * consts[hop + 10]; + t1[5] += (FIXED_A) in[hop + 11] * consts[hop + 11]; + t1[6] += (FIXED_A) in[hop + 12] * consts[hop + 12]; + t1[6] += (FIXED_A) in[hop + 13] * consts[hop + 13]; + t1[7] += (FIXED_A) in[hop + 14] * consts[hop + 14]; + t1[7] += (FIXED_A) in[hop + 15] * consts[hop + 15]; + } + + /* scaling */ + t2[0] = t1[0] >> SBC_PROTO_FIXED8_SCALE; + t2[1] = t1[1] >> SBC_PROTO_FIXED8_SCALE; + t2[2] = t1[2] >> SBC_PROTO_FIXED8_SCALE; + t2[3] = t1[3] >> SBC_PROTO_FIXED8_SCALE; + t2[4] = t1[4] >> SBC_PROTO_FIXED8_SCALE; + t2[5] = t1[5] >> SBC_PROTO_FIXED8_SCALE; + t2[6] = t1[6] >> SBC_PROTO_FIXED8_SCALE; + t2[7] = t1[7] >> SBC_PROTO_FIXED8_SCALE; + + + /* do the cos transform */ + t1[0] = t1[1] = t1[2] = t1[3] = t1[4] = t1[5] = t1[6] = t1[7] = 0; + + for (i = 0; i < 4; i++) { + t1[0] += (FIXED_A) t2[i * 2 + 0] * consts[80 + i * 16 + 0]; + t1[0] += (FIXED_A) t2[i * 2 + 1] * consts[80 + i * 16 + 1]; + t1[1] += (FIXED_A) t2[i * 2 + 0] * consts[80 + i * 16 + 2]; + t1[1] += (FIXED_A) t2[i * 2 + 1] * consts[80 + i * 16 + 3]; + t1[2] += (FIXED_A) t2[i * 2 + 0] * consts[80 + i * 16 + 4]; + 
t1[2] += (FIXED_A) t2[i * 2 + 1] * consts[80 + i * 16 + 5]; + t1[3] += (FIXED_A) t2[i * 2 + 0] * consts[80 + i * 16 + 6]; + t1[3] += (FIXED_A) t2[i * 2 + 1] * consts[80 + i * 16 + 7]; + t1[4] += (FIXED_A) t2[i * 2 + 0] * consts[80 + i * 16 + 8]; + t1[4] += (FIXED_A) t2[i * 2 + 1] * consts[80 + i * 16 + 9]; + t1[5] += (FIXED_A) t2[i * 2 + 0] * consts[80 + i * 16 + 10]; + t1[5] += (FIXED_A) t2[i * 2 + 1] * consts[80 + i * 16 + 11]; + t1[6] += (FIXED_A) t2[i * 2 + 0] * consts[80 + i * 16 + 12]; + t1[6] += (FIXED_A) t2[i * 2 + 1] * consts[80 + i * 16 + 13]; + t1[7] += (FIXED_A) t2[i * 2 + 0] * consts[80 + i * 16 + 14]; + t1[7] += (FIXED_A) t2[i * 2 + 1] * consts[80 + i * 16 + 15]; + } + + for (i = 0; i < 8; i++) + out[i] = t1[i] >> + (SBC_COS_TABLE_FIXED8_SCALE - SCALE_OUT_BITS); +} + +static inline void sbc_analyze_4b_4s_simd(int16_t *x, + int32_t *out, int out_stride) +{ + /* Analyze blocks */ + sbc_analyze_four_simd(x + 12, out, analysis_consts_fixed4_simd_odd); + out += out_stride; + sbc_analyze_four_simd(x + 8, out, analysis_consts_fixed4_simd_even); + out += out_stride; + sbc_analyze_four_simd(x + 4, out, analysis_consts_fixed4_simd_odd); + out += out_stride; + sbc_analyze_four_simd(x + 0, out, analysis_consts_fixed4_simd_even); +} + +static inline void sbc_analyze_4b_8s_simd(int16_t *x, + int32_t *out, int out_stride) +{ + /* Analyze blocks */ + sbc_analyze_eight_simd(x + 24, out, analysis_consts_fixed8_simd_odd); + out += out_stride; + sbc_analyze_eight_simd(x + 16, out, analysis_consts_fixed8_simd_even); + out += out_stride; + sbc_analyze_eight_simd(x + 8, out, analysis_consts_fixed8_simd_odd); + out += out_stride; + sbc_analyze_eight_simd(x + 0, out, analysis_consts_fixed8_simd_even); +} + +static inline int16_t unaligned16_be(const uint8_t *ptr) +{ + return (int16_t) ((ptr[0] << 8) | ptr[1]); +} + +static inline int16_t unaligned16_le(const uint8_t *ptr) +{ + return (int16_t) (ptr[0] | (ptr[1] << 8)); +} + +/* + * Internal helper functions for input data 
processing. In order to get + * optimal performance, it is important to have "nsamples", "nchannels" + * and "big_endian" arguments used with this inline function as compile + * time constants. + */ + +static SBC_ALWAYS_INLINE int sbc_encoder_process_input_s4_internal( + int position, + const uint8_t *pcm, int16_t X[2][SBC_X_BUFFER_SIZE], + int nsamples, int nchannels, int big_endian) +{ + /* handle X buffer wraparound */ + if (position < nsamples) { + if (nchannels > 0) + memcpy(&X[0][SBC_X_BUFFER_SIZE - 36], &X[0][position], + 36 * sizeof(int16_t)); + if (nchannels > 1) + memcpy(&X[1][SBC_X_BUFFER_SIZE - 36], &X[1][position], + 36 * sizeof(int16_t)); + position = SBC_X_BUFFER_SIZE - 36; + } + + #define PCM(i) (big_endian ? \ + unaligned16_be(pcm + (i) * 2) : unaligned16_le(pcm + (i) * 2)) + + /* copy/permutate audio samples */ + while ((nsamples -= 8) >= 0) { + position -= 8; + if (nchannels > 0) { + int16_t *x = &X[0][position]; + x[0] = PCM(0 + 7 * nchannels); + x[1] = PCM(0 + 3 * nchannels); + x[2] = PCM(0 + 6 * nchannels); + x[3] = PCM(0 + 4 * nchannels); + x[4] = PCM(0 + 0 * nchannels); + x[5] = PCM(0 + 2 * nchannels); + x[6] = PCM(0 + 1 * nchannels); + x[7] = PCM(0 + 5 * nchannels); + } + if (nchannels > 1) { + int16_t *x = &X[1][position]; + x[0] = PCM(1 + 7 * nchannels); + x[1] = PCM(1 + 3 * nchannels); + x[2] = PCM(1 + 6 * nchannels); + x[3] = PCM(1 + 4 * nchannels); + x[4] = PCM(1 + 0 * nchannels); + x[5] = PCM(1 + 2 * nchannels); + x[6] = PCM(1 + 1 * nchannels); + x[7] = PCM(1 + 5 * nchannels); + } + pcm += 16 * nchannels; + } + #undef PCM + + return position; +} + +static SBC_ALWAYS_INLINE int sbc_encoder_process_input_s8_internal( + int position, + const uint8_t *pcm, int16_t X[2][SBC_X_BUFFER_SIZE], + int nsamples, int nchannels, int big_endian) +{ + /* handle X buffer wraparound */ + if (position < nsamples) { + if (nchannels > 0) + memcpy(&X[0][SBC_X_BUFFER_SIZE - 72], &X[0][position], + 72 * sizeof(int16_t)); + if (nchannels > 1) + 
memcpy(&X[1][SBC_X_BUFFER_SIZE - 72], &X[1][position], + 72 * sizeof(int16_t)); + position = SBC_X_BUFFER_SIZE - 72; + } + + #define PCM(i) (big_endian ? \ + unaligned16_be(pcm + (i) * 2) : unaligned16_le(pcm + (i) * 2)) + + /* copy/permutate audio samples */ + while ((nsamples -= 16) >= 0) { + position -= 16; + if (nchannels > 0) { + int16_t *x = &X[0][position]; + x[0] = PCM(0 + 15 * nchannels); + x[1] = PCM(0 + 7 * nchannels); + x[2] = PCM(0 + 14 * nchannels); + x[3] = PCM(0 + 8 * nchannels); + x[4] = PCM(0 + 13 * nchannels); + x[5] = PCM(0 + 9 * nchannels); + x[6] = PCM(0 + 12 * nchannels); + x[7] = PCM(0 + 10 * nchannels); + x[8] = PCM(0 + 11 * nchannels); + x[9] = PCM(0 + 3 * nchannels); + x[10] = PCM(0 + 6 * nchannels); + x[11] = PCM(0 + 0 * nchannels); + x[12] = PCM(0 + 5 * nchannels); + x[13] = PCM(0 + 1 * nchannels); + x[14] = PCM(0 + 4 * nchannels); + x[15] = PCM(0 + 2 * nchannels); + } + if (nchannels > 1) { + int16_t *x = &X[1][position]; + x[0] = PCM(1 + 15 * nchannels); + x[1] = PCM(1 + 7 * nchannels); + x[2] = PCM(1 + 14 * nchannels); + x[3] = PCM(1 + 8 * nchannels); + x[4] = PCM(1 + 13 * nchannels); + x[5] = PCM(1 + 9 * nchannels); + x[6] = PCM(1 + 12 * nchannels); + x[7] = PCM(1 + 10 * nchannels); + x[8] = PCM(1 + 11 * nchannels); + x[9] = PCM(1 + 3 * nchannels); + x[10] = PCM(1 + 6 * nchannels); + x[11] = PCM(1 + 0 * nchannels); + x[12] = PCM(1 + 5 * nchannels); + x[13] = PCM(1 + 1 * nchannels); + x[14] = PCM(1 + 4 * nchannels); + x[15] = PCM(1 + 2 * nchannels); + } + pcm += 32 * nchannels; + } + #undef PCM + + return position; +} + +/* + * Input data processing functions. The data is endian converted if needed, + * channels are deintrleaved and audio samples are reordered for use in + * SIMD-friendly analysis filter function. The results are put into "X" + * array, getting appended to the previous data (or it is better to say + * prepended, as the buffer is filled from top to bottom). 
Old data is + * discarded when neededed, but availability of (10 * nrof_subbands) + * contiguous samples is always guaranteed for the input to the analysis + * filter. This is achieved by copying a sufficient part of old data + * to the top of the buffer on buffer wraparound. + */ + +static int sbc_enc_process_input_4s_le(int position, + const uint8_t *pcm, int16_t X[2][SBC_X_BUFFER_SIZE], + int nsamples, int nchannels) +{ + if (nchannels > 1) + return sbc_encoder_process_input_s4_internal( + position, pcm, X, nsamples, 2, 0); + else + return sbc_encoder_process_input_s4_internal( + position, pcm, X, nsamples, 1, 0); +} + +static int sbc_enc_process_input_4s_be(int position, + const uint8_t *pcm, int16_t X[2][SBC_X_BUFFER_SIZE], + int nsamples, int nchannels) +{ + if (nchannels > 1) + return sbc_encoder_process_input_s4_internal( + position, pcm, X, nsamples, 2, 1); + else + return sbc_encoder_process_input_s4_internal( + position, pcm, X, nsamples, 1, 1); +} + +static int sbc_enc_process_input_8s_le(int position, + const uint8_t *pcm, int16_t X[2][SBC_X_BUFFER_SIZE], + int nsamples, int nchannels) +{ + if (nchannels > 1) + return sbc_encoder_process_input_s8_internal( + position, pcm, X, nsamples, 2, 0); + else + return sbc_encoder_process_input_s8_internal( + position, pcm, X, nsamples, 1, 0); +} + +static int sbc_enc_process_input_8s_be(int position, + const uint8_t *pcm, int16_t X[2][SBC_X_BUFFER_SIZE], + int nsamples, int nchannels) +{ + if (nchannels > 1) + return sbc_encoder_process_input_s8_internal( + position, pcm, X, nsamples, 2, 1); + else + return sbc_encoder_process_input_s8_internal( + position, pcm, X, nsamples, 1, 1); +} + +/* Supplementary function to count the number of leading zeros */ + +static inline int sbc_clz(uint32_t x) +{ +#ifdef __GNUC__ + return __builtin_clz(x); +#else + /* TODO: this should be replaced with something better if good + * performance is wanted when using compilers other than gcc */ + int cnt = 0; + while (x) { + cnt++; + 
x >>= 1; + } + return 32 - cnt; +#endif +} + +static void sbc_calc_scalefactors( + int32_t sb_sample_f[16][2][8], + uint32_t scale_factor[2][8], + int blocks, int channels, int subbands) +{ + int ch, sb, blk; + for (ch = 0; ch < channels; ch++) { + for (sb = 0; sb < subbands; sb++) { + uint32_t x = 1 << SCALE_OUT_BITS; + for (blk = 0; blk < blocks; blk++) { + int32_t tmp = fabs(sb_sample_f[blk][ch][sb]); + if (tmp != 0) + x |= tmp - 1; + } + scale_factor[ch][sb] = (31 - SCALE_OUT_BITS) - + sbc_clz(x); + } + } +} + +/* + * Detect CPU features and setup function pointers + */ +void sbc_init_primitives(struct sbc_encoder_state *state) +{ + /* Default implementation for analyze functions */ + state->sbc_analyze_4b_4s = sbc_analyze_4b_4s_simd; + state->sbc_analyze_4b_8s = sbc_analyze_4b_8s_simd; + + /* Default implementation for input reordering / deinterleaving */ + state->sbc_enc_process_input_4s_le = sbc_enc_process_input_4s_le; + state->sbc_enc_process_input_4s_be = sbc_enc_process_input_4s_be; + state->sbc_enc_process_input_8s_le = sbc_enc_process_input_8s_le; + state->sbc_enc_process_input_8s_be = sbc_enc_process_input_8s_be; + + /* Default implementation for scale factors calculation */ + state->sbc_calc_scalefactors = sbc_calc_scalefactors; + + /* X86/AMD64 optimizations */ +#ifdef SBC_BUILD_WITH_MMX_SUPPORT + sbc_init_primitives_mmx(state); +#endif + + /* ARM optimizations */ +#ifdef SBC_BUILD_WITH_NEON_SUPPORT + sbc_init_primitives_neon(state); +#endif +} diff --git a/src/modules/bluetooth/sbc_primitives.h b/src/modules/bluetooth/sbc_primitives.h new file mode 100644 index 000000000..2708c829f --- /dev/null +++ b/src/modules/bluetooth/sbc_primitives.h @@ -0,0 +1,74 @@ +/* + * + * Bluetooth low-complexity, subband codec (SBC) library + * + * Copyright (C) 2004-2009 Marcel Holtmann <marcel@holtmann.org> + * Copyright (C) 2004-2005 Henryk Ploetz <henryk@ploetzli.ch> + * Copyright (C) 2005-2006 Brad Midgley <bmidgley@xmission.com> + * + * + * This library is free 
software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef __SBC_PRIMITIVES_H +#define __SBC_PRIMITIVES_H + +#define SCALE_OUT_BITS 15 +#define SBC_X_BUFFER_SIZE 328 + +#ifdef __GNUC__ +#define SBC_ALWAYS_INLINE __attribute__((always_inline)) +#else +#define SBC_ALWAYS_INLINE inline +#endif + +struct sbc_encoder_state { + int position; + int16_t SBC_ALIGNED X[2][SBC_X_BUFFER_SIZE]; + /* Polyphase analysis filter for 4 subbands configuration, + * it handles 4 blocks at once */ + void (*sbc_analyze_4b_4s)(int16_t *x, int32_t *out, int out_stride); + /* Polyphase analysis filter for 8 subbands configuration, + * it handles 4 blocks at once */ + void (*sbc_analyze_4b_8s)(int16_t *x, int32_t *out, int out_stride); + /* Process input data (deinterleave, endian conversion, reordering), + * depending on the number of subbands and input data byte order */ + int (*sbc_enc_process_input_4s_le)(int position, + const uint8_t *pcm, int16_t X[2][SBC_X_BUFFER_SIZE], + int nsamples, int nchannels); + int (*sbc_enc_process_input_4s_be)(int position, + const uint8_t *pcm, int16_t X[2][SBC_X_BUFFER_SIZE], + int nsamples, int nchannels); + int (*sbc_enc_process_input_8s_le)(int position, + const uint8_t *pcm, int16_t X[2][SBC_X_BUFFER_SIZE], + int nsamples, int nchannels); + int (*sbc_enc_process_input_8s_be)(int 
position, + const uint8_t *pcm, int16_t X[2][SBC_X_BUFFER_SIZE], + int nsamples, int nchannels); + /* Scale factors calculation */ + void (*sbc_calc_scalefactors)(int32_t sb_sample_f[16][2][8], + uint32_t scale_factor[2][8], + int blocks, int channels, int subbands); +}; + +/* + * Initialize pointers to the functions which are the basic "building bricks" + * of SBC codec. Best implementation is selected based on target CPU + * capabilities. + */ +void sbc_init_primitives(struct sbc_encoder_state *encoder_state); + +#endif diff --git a/src/modules/bluetooth/sbc_primitives_mmx.c b/src/modules/bluetooth/sbc_primitives_mmx.c new file mode 100644 index 000000000..7db4af72b --- /dev/null +++ b/src/modules/bluetooth/sbc_primitives_mmx.c @@ -0,0 +1,319 @@ +/* + * + * Bluetooth low-complexity, subband codec (SBC) library + * + * Copyright (C) 2004-2009 Marcel Holtmann <marcel@holtmann.org> + * Copyright (C) 2004-2005 Henryk Ploetz <henryk@ploetzli.ch> + * Copyright (C) 2005-2006 Brad Midgley <bmidgley@xmission.com> + * + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
/* File: src/modules/bluetooth/sbc_primitives_mmx.c */
/*
 *
 *  Bluetooth low-complexity, subband codec (SBC) library
 *
 *  Copyright (C) 2004-2009  Marcel Holtmann <marcel@holtmann.org>
 *  Copyright (C) 2004-2005  Henryk Ploetz <henryk@ploetzli.ch>
 *  Copyright (C) 2005-2006  Brad Midgley <bmidgley@xmission.com>
 *
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public
 *  License along with this library; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 *
 */

#include <stdint.h>
#include <limits.h>
#include "sbc.h"
#include "sbc_math.h"
#include "sbc_tables.h"

#include "sbc_primitives_mmx.h"

/*
 * MMX optimizations
 */

#ifdef SBC_BUILD_WITH_MMX_SUPPORT

/*
 * MMX analysis filter for one block in the 4 subbands configuration.
 *
 * asm operands: %0 = in, %1 = consts, %2 = round_c, %3 = out,
 * %4 = SBC_PROTO_FIXED4_SCALE (immediate).
 *
 * Bytes 0..79 of "in" and "consts" are multiplied and pairwise added
 * with pmaddwd and accumulated in mm0/mm1 on top of the rounding
 * constant. The sums are shifted right by %4, packed to 16 bits with
 * signed saturation and combined with the coefficients at bytes
 * 80..111 of "consts"; 16 bytes of results are stored to "out".
 *
 * NOTE(review): no final right shift is applied here. This code is only
 * built when SCALE_OUT_BITS == 15 (see sbc_primitives_mmx.h), which
 * presumably makes that shift a no-op - confirm.
 */
static inline void sbc_analyze_four_mmx(const int16_t *in, int32_t *out,
					const FIXED_T *consts)
{
	/* rounding term, one copy per 32-bit lane of an MMX register */
	static const SBC_ALIGNED int32_t round_c[2] = {
		1 << (SBC_PROTO_FIXED4_SCALE - 1),
		1 << (SBC_PROTO_FIXED4_SCALE - 1),
	};
	asm volatile (
		"movq (%0), %%mm0\n"
		"movq 8(%0), %%mm1\n"
		"pmaddwd (%1), %%mm0\n"
		"pmaddwd 8(%1), %%mm1\n"
		"paddd (%2), %%mm0\n"
		"paddd (%2), %%mm1\n"
		"\n"
		"movq 16(%0), %%mm2\n"
		"movq 24(%0), %%mm3\n"
		"pmaddwd 16(%1), %%mm2\n"
		"pmaddwd 24(%1), %%mm3\n"
		"paddd %%mm2, %%mm0\n"
		"paddd %%mm3, %%mm1\n"
		"\n"
		"movq 32(%0), %%mm2\n"
		"movq 40(%0), %%mm3\n"
		"pmaddwd 32(%1), %%mm2\n"
		"pmaddwd 40(%1), %%mm3\n"
		"paddd %%mm2, %%mm0\n"
		"paddd %%mm3, %%mm1\n"
		"\n"
		"movq 48(%0), %%mm2\n"
		"movq 56(%0), %%mm3\n"
		"pmaddwd 48(%1), %%mm2\n"
		"pmaddwd 56(%1), %%mm3\n"
		"paddd %%mm2, %%mm0\n"
		"paddd %%mm3, %%mm1\n"
		"\n"
		"movq 64(%0), %%mm2\n"
		"movq 72(%0), %%mm3\n"
		"pmaddwd 64(%1), %%mm2\n"
		"pmaddwd 72(%1), %%mm3\n"
		"paddd %%mm2, %%mm0\n"
		"paddd %%mm3, %%mm1\n"
		"\n"
		"psrad %4, %%mm0\n"
		"psrad %4, %%mm1\n"
		"packssdw %%mm0, %%mm0\n"
		"packssdw %%mm1, %%mm1\n"
		"\n"
		"movq %%mm0, %%mm2\n"
		"pmaddwd 80(%1), %%mm0\n"
		"pmaddwd 88(%1), %%mm2\n"
		"\n"
		"movq %%mm1, %%mm3\n"
		"pmaddwd 96(%1), %%mm1\n"
		"pmaddwd 104(%1), %%mm3\n"
		"paddd %%mm1, %%mm0\n"
		"paddd %%mm3, %%mm2\n"
		"\n"
		"movq %%mm0, (%3)\n"
		"movq %%mm2, 8(%3)\n"
		:
		: "r" (in), "r" (consts), "r" (&round_c), "r" (out),
			"i" (SBC_PROTO_FIXED4_SCALE)
		: "memory");
}

/*
 * MMX analysis filter for one block in the 8 subbands configuration.
 *
 * asm operands: %0 = in, %1 = consts, %2 = round_c, %3 = out,
 * %4 = SBC_PROTO_FIXED8_SCALE (immediate).
 *
 * Bytes 0..159 of "in" and "consts" are accumulated in mm0..mm3 on top
 * of the rounding constant, shifted right by %4 and packed to 16 bits
 * with signed saturation. The packed values are then combined with the
 * coefficients at bytes 160..287 of "consts" in two passes: the first
 * stores bytes 0..15 of "out", the second bytes 16..31.
 */
static inline void sbc_analyze_eight_mmx(const int16_t *in, int32_t *out,
							const FIXED_T *consts)
{
	/* rounding term, one copy per 32-bit lane of an MMX register */
	static const SBC_ALIGNED int32_t round_c[2] = {
		1 << (SBC_PROTO_FIXED8_SCALE - 1),
		1 << (SBC_PROTO_FIXED8_SCALE - 1),
	};
	asm volatile (
		"movq (%0), %%mm0\n"
		"movq 8(%0), %%mm1\n"
		"movq 16(%0), %%mm2\n"
		"movq 24(%0), %%mm3\n"
		"pmaddwd (%1), %%mm0\n"
		"pmaddwd 8(%1), %%mm1\n"
		"pmaddwd 16(%1), %%mm2\n"
		"pmaddwd 24(%1), %%mm3\n"
		"paddd (%2), %%mm0\n"
		"paddd (%2), %%mm1\n"
		"paddd (%2), %%mm2\n"
		"paddd (%2), %%mm3\n"
		"\n"
		"movq 32(%0), %%mm4\n"
		"movq 40(%0), %%mm5\n"
		"movq 48(%0), %%mm6\n"
		"movq 56(%0), %%mm7\n"
		"pmaddwd 32(%1), %%mm4\n"
		"pmaddwd 40(%1), %%mm5\n"
		"pmaddwd 48(%1), %%mm6\n"
		"pmaddwd 56(%1), %%mm7\n"
		"paddd %%mm4, %%mm0\n"
		"paddd %%mm5, %%mm1\n"
		"paddd %%mm6, %%mm2\n"
		"paddd %%mm7, %%mm3\n"
		"\n"
		"movq 64(%0), %%mm4\n"
		"movq 72(%0), %%mm5\n"
		"movq 80(%0), %%mm6\n"
		"movq 88(%0), %%mm7\n"
		"pmaddwd 64(%1), %%mm4\n"
		"pmaddwd 72(%1), %%mm5\n"
		"pmaddwd 80(%1), %%mm6\n"
		"pmaddwd 88(%1), %%mm7\n"
		"paddd %%mm4, %%mm0\n"
		"paddd %%mm5, %%mm1\n"
		"paddd %%mm6, %%mm2\n"
		"paddd %%mm7, %%mm3\n"
		"\n"
		"movq 96(%0), %%mm4\n"
		"movq 104(%0), %%mm5\n"
		"movq 112(%0), %%mm6\n"
		"movq 120(%0), %%mm7\n"
		"pmaddwd 96(%1), %%mm4\n"
		"pmaddwd 104(%1), %%mm5\n"
		"pmaddwd 112(%1), %%mm6\n"
		"pmaddwd 120(%1), %%mm7\n"
		"paddd %%mm4, %%mm0\n"
		"paddd %%mm5, %%mm1\n"
		"paddd %%mm6, %%mm2\n"
		"paddd %%mm7, %%mm3\n"
		"\n"
		"movq 128(%0), %%mm4\n"
		"movq 136(%0), %%mm5\n"
		"movq 144(%0), %%mm6\n"
		"movq 152(%0), %%mm7\n"
		"pmaddwd 128(%1), %%mm4\n"
		"pmaddwd 136(%1), %%mm5\n"
		"pmaddwd 144(%1), %%mm6\n"
		"pmaddwd 152(%1), %%mm7\n"
		"paddd %%mm4, %%mm0\n"
		"paddd %%mm5, %%mm1\n"
		"paddd %%mm6, %%mm2\n"
		"paddd %%mm7, %%mm3\n"
		"\n"
		"psrad %4, %%mm0\n"
		"psrad %4, %%mm1\n"
		"psrad %4, %%mm2\n"
		"psrad %4, %%mm3\n"
		"\n"
		"packssdw %%mm0, %%mm0\n"
		"packssdw %%mm1, %%mm1\n"
		"packssdw %%mm2, %%mm2\n"
		"packssdw %%mm3, %%mm3\n"
		"\n"
		"movq %%mm0, %%mm4\n"
		"movq %%mm0, %%mm5\n"
		"pmaddwd 160(%1), %%mm4\n"
		"pmaddwd 168(%1), %%mm5\n"
		"\n"
		"movq %%mm1, %%mm6\n"
		"movq %%mm1, %%mm7\n"
		"pmaddwd 192(%1), %%mm6\n"
		"pmaddwd 200(%1), %%mm7\n"
		"paddd %%mm6, %%mm4\n"
		"paddd %%mm7, %%mm5\n"
		"\n"
		"movq %%mm2, %%mm6\n"
		"movq %%mm2, %%mm7\n"
		"pmaddwd 224(%1), %%mm6\n"
		"pmaddwd 232(%1), %%mm7\n"
		"paddd %%mm6, %%mm4\n"
		"paddd %%mm7, %%mm5\n"
		"\n"
		"movq %%mm3, %%mm6\n"
		"movq %%mm3, %%mm7\n"
		"pmaddwd 256(%1), %%mm6\n"
		"pmaddwd 264(%1), %%mm7\n"
		"paddd %%mm6, %%mm4\n"
		"paddd %%mm7, %%mm5\n"
		"\n"
		"movq %%mm4, (%3)\n"
		"movq %%mm5, 8(%3)\n"
		"\n"
		"movq %%mm0, %%mm5\n"
		"pmaddwd 176(%1), %%mm0\n"
		"pmaddwd 184(%1), %%mm5\n"
		"\n"
		"movq %%mm1, %%mm7\n"
		"pmaddwd 208(%1), %%mm1\n"
		"pmaddwd 216(%1), %%mm7\n"
		"paddd %%mm1, %%mm0\n"
		"paddd %%mm7, %%mm5\n"
		"\n"
		"movq %%mm2, %%mm7\n"
		"pmaddwd 240(%1), %%mm2\n"
		"pmaddwd 248(%1), %%mm7\n"
		"paddd %%mm2, %%mm0\n"
		"paddd %%mm7, %%mm5\n"
		"\n"
		"movq %%mm3, %%mm7\n"
		"pmaddwd 272(%1), %%mm3\n"
		"pmaddwd 280(%1), %%mm7\n"
		"paddd %%mm3, %%mm0\n"
		"paddd %%mm7, %%mm5\n"
		"\n"
		"movq %%mm0, 16(%3)\n"
		"movq %%mm5, 24(%3)\n"
		:
		: "r" (in), "r" (consts), "r" (&round_c), "r" (out),
			"i" (SBC_PROTO_FIXED8_SCALE)
		: "memory");
}

/*
 * Analyze 4 blocks of 4 subbands with the MMX filter. Blocks start at
 * x + 12, x + 8, x + 4 and x + 0 and alternate between the "odd" and
 * "even" tables; result rows are "out_stride" int32_t elements apart.
 */
static inline void sbc_analyze_4b_4s_mmx(int16_t *x, int32_t *out,
						int out_stride)
{
	/* Analyze blocks */
	sbc_analyze_four_mmx(x + 12, out, analysis_consts_fixed4_simd_odd);
	out += out_stride;
	sbc_analyze_four_mmx(x + 8, out, analysis_consts_fixed4_simd_even);
	out += out_stride;
	sbc_analyze_four_mmx(x + 4, out, analysis_consts_fixed4_simd_odd);
	out += out_stride;
	sbc_analyze_four_mmx(x + 0, out, analysis_consts_fixed4_simd_even);

	/* leave MMX mode once, after all four blocks are done */
	asm volatile ("emms\n");
}

/*
 * Analyze 4 blocks of 8 subbands with the MMX filter. Blocks start at
 * x + 24, x + 16, x + 8 and x + 0 and alternate between the "odd" and
 * "even" tables; result rows are "out_stride" int32_t elements apart.
 */
static inline void sbc_analyze_4b_8s_mmx(int16_t *x, int32_t *out,
						int out_stride)
{
	/* Analyze blocks */
	sbc_analyze_eight_mmx(x + 24, out, analysis_consts_fixed8_simd_odd);
	out += out_stride;
	sbc_analyze_eight_mmx(x + 16, out, analysis_consts_fixed8_simd_even);
	out += out_stride;
	sbc_analyze_eight_mmx(x + 8, out, analysis_consts_fixed8_simd_odd);
	out += out_stride;
	sbc_analyze_eight_mmx(x + 0, out, analysis_consts_fixed8_simd_even);

	/* leave MMX mode once, after all four blocks are done */
	asm volatile ("emms\n");
}

/*
 * Runtime check for MMX. Returns non-zero when MMX can be used: always
 * on amd64, otherwise bit 23 of EDX as reported by CPUID with EAX = 1.
 * When the EFLAGS ID bit cannot be toggled, CPUID is skipped; EDX then
 * holds the zero result of the preceding xor, so 0 is returned.
 */
static int check_mmx_support()
{
#ifdef __amd64__
	return 1; /* We assume that all 64-bit processors have MMX support */
#else
	int cpuid_feature_information;
	asm volatile (
		/* According to Intel manual, CPUID instruction is supported
		 * if the value of ID bit (bit 21) in EFLAGS can be modified */
		"pushf\n"
		"movl (%%esp), %0\n"
		"xorl $0x200000, (%%esp)\n" /* try to modify ID bit */
		"popf\n"
		"pushf\n"
		"xorl (%%esp), %0\n" /* check if ID bit changed */
		"jz 1f\n"
		"push %%eax\n"
		"push %%ebx\n"
		"push %%ecx\n"
		"mov $1, %%eax\n"
		"cpuid\n"
		"pop %%ecx\n"
		"pop %%ebx\n"
		"pop %%eax\n"
		"1:\n"
		"popf\n"
		: "=d" (cpuid_feature_information)
		:
		: "cc");
	return cpuid_feature_information & (1 << 23);
#endif
}

/*
 * Replace the two analysis function pointers in "state" with the MMX
 * versions when check_mmx_support() reports MMX; all other pointers
 * are left untouched.
 */
void sbc_init_primitives_mmx(struct sbc_encoder_state *state)
{
	if (check_mmx_support()) {
		state->sbc_analyze_4b_4s = sbc_analyze_4b_4s_mmx;
		state->sbc_analyze_4b_8s = sbc_analyze_4b_8s_mmx;
	}
}

#endif
/* File: src/modules/bluetooth/sbc_primitives_mmx.h */
/*
 *
 *  Bluetooth low-complexity, subband codec (SBC) library
 *
 *  Copyright (C) 2004-2009  Marcel Holtmann <marcel@holtmann.org>
 *  Copyright (C) 2004-2005  Henryk Ploetz <henryk@ploetzli.ch>
 *  Copyright (C) 2005-2006  Brad Midgley <bmidgley@xmission.com>
 *
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public
 *  License along with this library; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 *
 */

#ifndef __SBC_PRIMITIVES_MMX_H
#define __SBC_PRIMITIVES_MMX_H

#include "sbc_primitives.h"

/*
 * The MMX code is built only with a GNU compatible compiler targeting
 * i386 or amd64, without SBC_HIGH_PRECISION and with SCALE_OUT_BITS
 * equal to 15.
 */
#if defined(__GNUC__) && (defined(__i386__) || defined(__amd64__)) && \
		!defined(SBC_HIGH_PRECISION) && (SCALE_OUT_BITS == 15)

/* Tested by the code that calls or defines sbc_init_primitives_mmx() */
#define SBC_BUILD_WITH_MMX_SUPPORT

/* Install the MMX analysis functions into encoder_state if the CPU
 * supports MMX */
void sbc_init_primitives_mmx(struct sbc_encoder_state *encoder_state);

#endif

#endif
/* File: src/modules/bluetooth/sbc_primitives_neon.c */
/*
 *
 *  Bluetooth low-complexity, subband codec (SBC) library
 *
 *  Copyright (C) 2004-2009  Marcel Holtmann <marcel@holtmann.org>
 *  Copyright (C) 2004-2005  Henryk Ploetz <henryk@ploetzli.ch>
 *  Copyright (C) 2005-2006  Brad Midgley <bmidgley@xmission.com>
 *
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public
 *  License along with this library; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 *
 */

#include <stdint.h>
#include <limits.h>
#include "sbc.h"
#include "sbc_math.h"
#include "sbc_tables.h"

#include "sbc_primitives_neon.h"

/*
 * ARM NEON optimizations
 */

#ifdef SBC_BUILD_WITH_NEON_SUPPORT

/*
 * NEON analysis filter for one block in the 4 subbands configuration.
 *
 * asm operands: %0 = in and %1 = consts (both advanced by the
 * post-incrementing loads, hence "+r"), %2 = out,
 * %3 = SBC_PROTO_FIXED4_SCALE (immediate).
 *
 * Five groups of 8 samples and 8 coefficients are multiplied and
 * accumulated in q0/q1, pairwise added, and narrowed to 16 bits by a
 * rounding right shift of %3 bits. The result is combined with the
 * next 16 coefficients and four 32-bit values are stored to "out".
 * The loads use 64-bit alignment hints for "in" and 128-bit hints for
 * "consts"; the store uses a 128-bit hint for "out".
 */
static inline void _sbc_analyze_four_neon(const int16_t *in, int32_t *out,
							const FIXED_T *consts)
{
	/* TODO: merge even and odd cases (or even merge all four calls to this
	 * function) in order to have only aligned reads from 'in' array
	 * and reduce number of load instructions */
	asm volatile (
		"vld1.16 {d4, d5}, [%0, :64]!\n"
		"vld1.16 {d8, d9}, [%1, :128]!\n"

		"vmull.s16 q0, d4, d8\n"
		"vld1.16 {d6, d7}, [%0, :64]!\n"
		"vmull.s16 q1, d5, d9\n"
		"vld1.16 {d10, d11}, [%1, :128]!\n"

		"vmlal.s16 q0, d6, d10\n"
		"vld1.16 {d4, d5}, [%0, :64]!\n"
		"vmlal.s16 q1, d7, d11\n"
		"vld1.16 {d8, d9}, [%1, :128]!\n"

		"vmlal.s16 q0, d4, d8\n"
		"vld1.16 {d6, d7}, [%0, :64]!\n"
		"vmlal.s16 q1, d5, d9\n"
		"vld1.16 {d10, d11}, [%1, :128]!\n"

		"vmlal.s16 q0, d6, d10\n"
		"vld1.16 {d4, d5}, [%0, :64]!\n"
		"vmlal.s16 q1, d7, d11\n"
		"vld1.16 {d8, d9}, [%1, :128]!\n"

		"vmlal.s16 q0, d4, d8\n"
		"vmlal.s16 q1, d5, d9\n"

		"vpadd.s32 d0, d0, d1\n"
		"vpadd.s32 d1, d2, d3\n"

		"vrshrn.s32 d0, q0, %3\n"

		"vld1.16 {d2, d3, d4, d5}, [%1, :128]!\n"

		"vdup.i32 d1, d0[1]\n" /* TODO: can be eliminated */
		"vdup.i32 d0, d0[0]\n" /* TODO: can be eliminated */

		"vmull.s16 q3, d2, d0\n"
		"vmull.s16 q4, d3, d0\n"
		"vmlal.s16 q3, d4, d1\n"
		"vmlal.s16 q4, d5, d1\n"

		"vpadd.s32 d0, d6, d7\n" /* TODO: can be eliminated */
		"vpadd.s32 d1, d8, d9\n" /* TODO: can be eliminated */

		"vst1.32 {d0, d1}, [%2, :128]\n"
		: "+r" (in), "+r" (consts)
		: "r" (out),
			"i" (SBC_PROTO_FIXED4_SCALE)
		: "memory",
			"d0", "d1", "d2", "d3", "d4", "d5",
			"d6", "d7", "d8", "d9", "d10", "d11");
}

/*
 * NEON analysis filter for one block in the 8 subbands configuration.
 *
 * asm operands: %0 = in and %1 = consts (both advanced by the
 * post-incrementing loads, hence "+r"), %2 = out,
 * %3 = SBC_PROTO_FIXED8_SCALE (immediate).
 *
 * Five groups of 16 samples and 16 coefficients are multiplied and
 * accumulated in q6..q9, pairwise added, rounded and shifted right by
 * %3 bits and narrowed to 16 bits. The result is combined with the
 * next 64 coefficients and eight 32-bit values are stored to "out".
 */
static inline void _sbc_analyze_eight_neon(const int16_t *in, int32_t *out,
							const FIXED_T *consts)
{
	/* TODO: merge even and odd cases (or even merge all four calls to this
	 * function) in order to have only aligned reads from 'in' array
	 * and reduce number of load instructions */
	asm volatile (
		"vld1.16 {d4, d5}, [%0, :64]!\n"
		"vld1.16 {d8, d9}, [%1, :128]!\n"

		"vmull.s16 q6, d4, d8\n"
		"vld1.16 {d6, d7}, [%0, :64]!\n"
		"vmull.s16 q7, d5, d9\n"
		"vld1.16 {d10, d11}, [%1, :128]!\n"
		"vmull.s16 q8, d6, d10\n"
		"vld1.16 {d4, d5}, [%0, :64]!\n"
		"vmull.s16 q9, d7, d11\n"
		"vld1.16 {d8, d9}, [%1, :128]!\n"

		"vmlal.s16 q6, d4, d8\n"
		"vld1.16 {d6, d7}, [%0, :64]!\n"
		"vmlal.s16 q7, d5, d9\n"
		"vld1.16 {d10, d11}, [%1, :128]!\n"
		"vmlal.s16 q8, d6, d10\n"
		"vld1.16 {d4, d5}, [%0, :64]!\n"
		"vmlal.s16 q9, d7, d11\n"
		"vld1.16 {d8, d9}, [%1, :128]!\n"

		"vmlal.s16 q6, d4, d8\n"
		"vld1.16 {d6, d7}, [%0, :64]!\n"
		"vmlal.s16 q7, d5, d9\n"
		"vld1.16 {d10, d11}, [%1, :128]!\n"
		"vmlal.s16 q8, d6, d10\n"
		"vld1.16 {d4, d5}, [%0, :64]!\n"
		"vmlal.s16 q9, d7, d11\n"
		"vld1.16 {d8, d9}, [%1, :128]!\n"

		"vmlal.s16 q6, d4, d8\n"
		"vld1.16 {d6, d7}, [%0, :64]!\n"
		"vmlal.s16 q7, d5, d9\n"
		"vld1.16 {d10, d11}, [%1, :128]!\n"
		"vmlal.s16 q8, d6, d10\n"
		"vld1.16 {d4, d5}, [%0, :64]!\n"
		"vmlal.s16 q9, d7, d11\n"
		"vld1.16 {d8, d9}, [%1, :128]!\n"

		"vmlal.s16 q6, d4, d8\n"
		"vld1.16 {d6, d7}, [%0, :64]!\n"
		"vmlal.s16 q7, d5, d9\n"
		"vld1.16 {d10, d11}, [%1, :128]!\n"

		"vmlal.s16 q8, d6, d10\n"
		"vmlal.s16 q9, d7, d11\n"

		"vpadd.s32 d0, d12, d13\n"
		"vpadd.s32 d1, d14, d15\n"
		"vpadd.s32 d2, d16, d17\n"
		"vpadd.s32 d3, d18, d19\n"

		"vrshr.s32 q0, q0, %3\n"
		"vrshr.s32 q1, q1, %3\n"
		"vmovn.s32 d0, q0\n"
		"vmovn.s32 d1, q1\n"

		"vdup.i32 d3, d1[1]\n" /* TODO: can be eliminated */
		"vdup.i32 d2, d1[0]\n" /* TODO: can be eliminated */
		"vdup.i32 d1, d0[1]\n" /* TODO: can be eliminated */
		"vdup.i32 d0, d0[0]\n" /* TODO: can be eliminated */

		"vld1.16 {d4, d5}, [%1, :128]!\n"
		"vmull.s16 q6, d4, d0\n"
		"vld1.16 {d6, d7}, [%1, :128]!\n"
		"vmull.s16 q7, d5, d0\n"
		"vmull.s16 q8, d6, d0\n"
		"vmull.s16 q9, d7, d0\n"

		"vld1.16 {d4, d5}, [%1, :128]!\n"
		"vmlal.s16 q6, d4, d1\n"
		"vld1.16 {d6, d7}, [%1, :128]!\n"
		"vmlal.s16 q7, d5, d1\n"
		"vmlal.s16 q8, d6, d1\n"
		"vmlal.s16 q9, d7, d1\n"

		"vld1.16 {d4, d5}, [%1, :128]!\n"
		"vmlal.s16 q6, d4, d2\n"
		"vld1.16 {d6, d7}, [%1, :128]!\n"
		"vmlal.s16 q7, d5, d2\n"
		"vmlal.s16 q8, d6, d2\n"
		"vmlal.s16 q9, d7, d2\n"

		"vld1.16 {d4, d5}, [%1, :128]!\n"
		"vmlal.s16 q6, d4, d3\n"
		"vld1.16 {d6, d7}, [%1, :128]!\n"
		"vmlal.s16 q7, d5, d3\n"
		"vmlal.s16 q8, d6, d3\n"
		"vmlal.s16 q9, d7, d3\n"

		"vpadd.s32 d0, d12, d13\n" /* TODO: can be eliminated */
		"vpadd.s32 d1, d14, d15\n" /* TODO: can be eliminated */
		"vpadd.s32 d2, d16, d17\n" /* TODO: can be eliminated */
		"vpadd.s32 d3, d18, d19\n" /* TODO: can be eliminated */

		"vst1.32 {d0, d1, d2, d3}, [%2, :128]\n"
		: "+r" (in), "+r" (consts)
		: "r" (out),
			"i" (SBC_PROTO_FIXED8_SCALE)
		: "memory",
			"d0", "d1", "d2", "d3", "d4", "d5",
			"d6", "d7", "d8", "d9", "d10", "d11",
			"d12", "d13", "d14", "d15", "d16", "d17",
			"d18", "d19");
}

/*
 * Analyze 4 blocks of 4 subbands with the NEON filter. Blocks start at
 * x + 12, x + 8, x + 4 and x + 0 and alternate between the "odd" and
 * "even" tables; result rows are "out_stride" int32_t elements apart.
 */
static inline void sbc_analyze_4b_4s_neon(int16_t *x,
						int32_t *out, int out_stride)
{
	/* Analyze blocks */
	_sbc_analyze_four_neon(x + 12, out, analysis_consts_fixed4_simd_odd);
	out += out_stride;
	_sbc_analyze_four_neon(x + 8, out, analysis_consts_fixed4_simd_even);
	out += out_stride;
	_sbc_analyze_four_neon(x + 4, out, analysis_consts_fixed4_simd_odd);
	out += out_stride;
	_sbc_analyze_four_neon(x + 0, out, analysis_consts_fixed4_simd_even);
}

/*
 * Analyze 4 blocks of 8 subbands with the NEON filter. Blocks start at
 * x + 24, x + 16, x + 8 and x + 0 and alternate between the "odd" and
 * "even" tables; result rows are "out_stride" int32_t elements apart.
 */
static inline void sbc_analyze_4b_8s_neon(int16_t *x,
						int32_t *out, int out_stride)
{
	/* Analyze blocks */
	_sbc_analyze_eight_neon(x + 24, out, analysis_consts_fixed8_simd_odd);
	out += out_stride;
	_sbc_analyze_eight_neon(x + 16, out, analysis_consts_fixed8_simd_even);
	out += out_stride;
	_sbc_analyze_eight_neon(x + 8, out, analysis_consts_fixed8_simd_odd);
	out += out_stride;
	_sbc_analyze_eight_neon(x + 0, out, analysis_consts_fixed8_simd_even);
}

/*
 * Replace the two analysis function pointers in "state" with the NEON
 * versions. There is no runtime check here: building this code at all
 * already requires __ARM_NEON__ (see sbc_primitives_neon.h).
 */
void sbc_init_primitives_neon(struct sbc_encoder_state *state)
{
	state->sbc_analyze_4b_4s = sbc_analyze_4b_4s_neon;
	state->sbc_analyze_4b_8s = sbc_analyze_4b_8s_neon;
}

#endif
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef __SBC_PRIMITIVES_NEON_H +#define __SBC_PRIMITIVES_NEON_H + +#include "sbc_primitives.h" + +#if defined(__GNUC__) && defined(__ARM_NEON__) && \ + !defined(SBC_HIGH_PRECISION) && (SCALE_OUT_BITS == 15) + +#define SBC_BUILD_WITH_NEON_SUPPORT + +void sbc_init_primitives_neon(struct sbc_encoder_state *encoder_state); + +#endif + +#endif diff --git a/src/modules/bluetooth/sbc_tables.h b/src/modules/bluetooth/sbc_tables.h index f1dfe6c08..0057c73f7 100644 --- a/src/modules/bluetooth/sbc_tables.h +++ b/src/modules/bluetooth/sbc_tables.h @@ -157,33 +157,34 @@ static const int32_t synmatrix8[16][8] = { */ #define SBC_PROTO_FIXED4_SCALE \ ((sizeof(FIXED_T) * CHAR_BIT - 1) - SBC_FIXED_EXTRA_BITS + 1) -#define F(x) (FIXED_A) ((x * 2) * \ +#define F_PROTO4(x) (FIXED_A) ((x * 2) * \ ((FIXED_A) 1 << (sizeof(FIXED_T) * CHAR_BIT - 1)) + 0.5) +#define F(x) F_PROTO4(x) static const FIXED_T _sbc_proto_fixed4[40] = { - F(0.00000000E+00), F(5.36548976E-04), + F(0.00000000E+00), F(5.36548976E-04), -F(1.49188357E-03), F(2.73370904E-03), - F(3.83720193E-03), F(3.89205149E-03), - F(1.86581691E-03), F(3.06012286E-03), + F(3.83720193E-03), F(3.89205149E-03), + F(1.86581691E-03), F(3.06012286E-03), - F(1.09137620E-02), F(2.04385087E-02), + F(1.09137620E-02), F(2.04385087E-02), -F(2.88757392E-02), F(3.21939290E-02), - F(2.58767811E-02), F(6.13245186E-03), + F(2.58767811E-02), F(6.13245186E-03), -F(2.88217274E-02), F(7.76463494E-02), - F(1.35593274E-01), F(1.94987841E-01), + F(1.35593274E-01), F(1.94987841E-01), -F(2.46636662E-01), F(2.81828203E-01), - F(2.94315332E-01), F(2.81828203E-01), - F(2.46636662E-01), -F(1.94987841E-01), + F(2.94315332E-01), F(2.81828203E-01), + F(2.46636662E-01), -F(1.94987841E-01), -F(1.35593274E-01), -F(7.76463494E-02), - 
F(2.88217274E-02), F(6.13245186E-03), - F(2.58767811E-02), F(3.21939290E-02), - F(2.88757392E-02), -F(2.04385087E-02), + F(2.88217274E-02), F(6.13245186E-03), + F(2.58767811E-02), F(3.21939290E-02), + F(2.88757392E-02), -F(2.04385087E-02), -F(1.09137620E-02), -F(3.06012286E-03), -F(1.86581691E-03), F(3.89205149E-03), - F(3.83720193E-03), F(2.73370904E-03), - F(1.49188357E-03), -F(5.36548976E-04), + F(3.83720193E-03), F(2.73370904E-03), + F(1.49188357E-03), -F(5.36548976E-04), }; #undef F @@ -206,11 +207,12 @@ static const FIXED_T _sbc_proto_fixed4[40] = { */ #define SBC_COS_TABLE_FIXED4_SCALE \ ((sizeof(FIXED_T) * CHAR_BIT - 1) + SBC_FIXED_EXTRA_BITS) -#define F(x) (FIXED_A) ((x) * \ +#define F_COS4(x) (FIXED_A) ((x) * \ ((FIXED_A) 1 << (sizeof(FIXED_T) * CHAR_BIT - 1)) + 0.5) +#define F(x) F_COS4(x) static const FIXED_T cos_table_fixed_4[32] = { - F(0.7071067812), F(0.9238795325), -F(1.0000000000), F(0.9238795325), - F(0.7071067812), F(0.3826834324), F(0.0000000000), F(0.3826834324), + F(0.7071067812), F(0.9238795325), -F(1.0000000000), F(0.9238795325), + F(0.7071067812), F(0.3826834324), F(0.0000000000), F(0.3826834324), -F(0.7071067812), F(0.3826834324), -F(1.0000000000), F(0.3826834324), -F(0.7071067812), -F(0.9238795325), -F(0.0000000000), -F(0.9238795325), @@ -218,8 +220,8 @@ static const FIXED_T cos_table_fixed_4[32] = { -F(0.7071067812), -F(0.3826834324), -F(1.0000000000), -F(0.3826834324), -F(0.7071067812), F(0.9238795325), F(0.0000000000), F(0.9238795325), - F(0.7071067812), -F(0.9238795325), -F(1.0000000000), -F(0.9238795325), - F(0.7071067812), -F(0.3826834324), -F(0.0000000000), -F(0.3826834324), + F(0.7071067812), -F(0.9238795325), -F(1.0000000000), -F(0.9238795325), + F(0.7071067812), -F(0.3826834324), -F(0.0000000000), -F(0.3826834324), }; #undef F @@ -232,53 +234,54 @@ static const FIXED_T cos_table_fixed_4[32] = { * in order to compensate the same change applied to cos_table_fixed_8 */ #define SBC_PROTO_FIXED8_SCALE \ - ((sizeof(FIXED_T) * 
CHAR_BIT - 1) - SBC_FIXED_EXTRA_BITS + 2) -#define F(x) (FIXED_A) ((x * 4) * \ + ((sizeof(FIXED_T) * CHAR_BIT - 1) - SBC_FIXED_EXTRA_BITS + 1) +#define F_PROTO8(x) (FIXED_A) ((x * 2) * \ ((FIXED_A) 1 << (sizeof(FIXED_T) * CHAR_BIT - 1)) + 0.5) +#define F(x) F_PROTO8(x) static const FIXED_T _sbc_proto_fixed8[80] = { - F(0.00000000E+00), F(1.56575398E-04), - F(3.43256425E-04), F(5.54620202E-04), + F(0.00000000E+00), F(1.56575398E-04), + F(3.43256425E-04), F(5.54620202E-04), -F(8.23919506E-04), F(1.13992507E-03), - F(1.47640169E-03), F(1.78371725E-03), - F(2.01182542E-03), F(2.10371989E-03), - F(1.99454554E-03), F(1.61656283E-03), - F(9.02154502E-04), F(1.78805361E-04), - F(1.64973098E-03), F(3.49717454E-03), - - F(5.65949473E-03), F(8.02941163E-03), - F(1.04584443E-02), F(1.27472335E-02), + F(1.47640169E-03), F(1.78371725E-03), + F(2.01182542E-03), F(2.10371989E-03), + F(1.99454554E-03), F(1.61656283E-03), + F(9.02154502E-04), F(1.78805361E-04), + F(1.64973098E-03), F(3.49717454E-03), + + F(5.65949473E-03), F(8.02941163E-03), + F(1.04584443E-02), F(1.27472335E-02), -F(1.46525263E-02), F(1.59045603E-02), - F(1.62208471E-02), F(1.53184106E-02), - F(1.29371806E-02), F(8.85757540E-03), - F(2.92408442E-03), -F(4.91578024E-03), + F(1.62208471E-02), F(1.53184106E-02), + F(1.29371806E-02), F(8.85757540E-03), + F(2.92408442E-03), -F(4.91578024E-03), -F(1.46404076E-02), F(2.61098752E-02), - F(3.90751381E-02), F(5.31873032E-02), + F(3.90751381E-02), F(5.31873032E-02), - F(6.79989431E-02), F(8.29847578E-02), - F(9.75753918E-02), F(1.11196689E-01), + F(6.79989431E-02), F(8.29847578E-02), + F(9.75753918E-02), F(1.11196689E-01), -F(1.23264548E-01), F(1.33264415E-01), - F(1.40753505E-01), F(1.45389847E-01), - F(1.46955068E-01), F(1.45389847E-01), - F(1.40753505E-01), F(1.33264415E-01), - F(1.23264548E-01), -F(1.11196689E-01), + F(1.40753505E-01), F(1.45389847E-01), + F(1.46955068E-01), F(1.45389847E-01), + F(1.40753505E-01), F(1.33264415E-01), + F(1.23264548E-01), 
-F(1.11196689E-01), -F(9.75753918E-02), -F(8.29847578E-02), -F(6.79989431E-02), -F(5.31873032E-02), -F(3.90751381E-02), -F(2.61098752E-02), - F(1.46404076E-02), -F(4.91578024E-03), - F(2.92408442E-03), F(8.85757540E-03), - F(1.29371806E-02), F(1.53184106E-02), - F(1.62208471E-02), F(1.59045603E-02), - F(1.46525263E-02), -F(1.27472335E-02), + F(1.46404076E-02), -F(4.91578024E-03), + F(2.92408442E-03), F(8.85757540E-03), + F(1.29371806E-02), F(1.53184106E-02), + F(1.62208471E-02), F(1.59045603E-02), + F(1.46525263E-02), -F(1.27472335E-02), -F(1.04584443E-02), -F(8.02941163E-03), -F(5.65949473E-03), -F(3.49717454E-03), -F(1.64973098E-03), -F(1.78805361E-04), -F(9.02154502E-04), F(1.61656283E-03), - F(1.99454554E-03), F(2.10371989E-03), - F(2.01182542E-03), F(1.78371725E-03), - F(1.47640169E-03), F(1.13992507E-03), - F(8.23919506E-04), -F(5.54620202E-04), + F(1.99454554E-03), F(2.10371989E-03), + F(2.01182542E-03), F(1.78371725E-03), + F(1.47640169E-03), F(1.13992507E-03), + F(8.23919506E-04), -F(5.54620202E-04), -F(3.43256425E-04), -F(1.56575398E-04), }; #undef F @@ -301,13 +304,14 @@ static const FIXED_T _sbc_proto_fixed8[80] = { */ #define SBC_COS_TABLE_FIXED8_SCALE \ ((sizeof(FIXED_T) * CHAR_BIT - 1) + SBC_FIXED_EXTRA_BITS) -#define F(x) (FIXED_A) ((x) * \ +#define F_COS8(x) (FIXED_A) ((x) * \ ((FIXED_A) 1 << (sizeof(FIXED_T) * CHAR_BIT - 1)) + 0.5) +#define F(x) F_COS8(x) static const FIXED_T cos_table_fixed_8[128] = { - F(0.7071067812), F(0.8314696123), F(0.9238795325), F(0.9807852804), + F(0.7071067812), F(0.8314696123), F(0.9238795325), F(0.9807852804), -F(1.0000000000), F(0.9807852804), F(0.9238795325), F(0.8314696123), - F(0.7071067812), F(0.5555702330), F(0.3826834324), F(0.1950903220), - F(0.0000000000), F(0.1950903220), F(0.3826834324), F(0.5555702330), + F(0.7071067812), F(0.5555702330), F(0.3826834324), F(0.1950903220), + F(0.0000000000), F(0.1950903220), F(0.3826834324), F(0.5555702330), -F(0.7071067812), -F(0.1950903220), F(0.3826834324), 
F(0.8314696123), -F(1.0000000000), F(0.8314696123), F(0.3826834324), -F(0.1950903220), @@ -317,17 +321,17 @@ static const FIXED_T cos_table_fixed_8[128] = { -F(0.7071067812), -F(0.9807852804), -F(0.3826834324), F(0.5555702330), -F(1.0000000000), F(0.5555702330), -F(0.3826834324), -F(0.9807852804), -F(0.7071067812), F(0.1950903220), F(0.9238795325), F(0.8314696123), - F(0.0000000000), F(0.8314696123), F(0.9238795325), F(0.1950903220), + F(0.0000000000), F(0.8314696123), F(0.9238795325), F(0.1950903220), - F(0.7071067812), -F(0.5555702330), -F(0.9238795325), F(0.1950903220), + F(0.7071067812), -F(0.5555702330), -F(0.9238795325), F(0.1950903220), -F(1.0000000000), F(0.1950903220), -F(0.9238795325), -F(0.5555702330), - F(0.7071067812), F(0.8314696123), -F(0.3826834324), -F(0.9807852804), + F(0.7071067812), F(0.8314696123), -F(0.3826834324), -F(0.9807852804), -F(0.0000000000), -F(0.9807852804), -F(0.3826834324), F(0.8314696123), - F(0.7071067812), F(0.5555702330), -F(0.9238795325), -F(0.1950903220), + F(0.7071067812), F(0.5555702330), -F(0.9238795325), -F(0.1950903220), -F(1.0000000000), -F(0.1950903220), -F(0.9238795325), F(0.5555702330), - F(0.7071067812), -F(0.8314696123), -F(0.3826834324), F(0.9807852804), - F(0.0000000000), F(0.9807852804), -F(0.3826834324), -F(0.8314696123), + F(0.7071067812), -F(0.8314696123), -F(0.3826834324), F(0.9807852804), + F(0.0000000000), F(0.9807852804), -F(0.3826834324), -F(0.8314696123), -F(0.7071067812), F(0.9807852804), -F(0.3826834324), -F(0.5555702330), -F(1.0000000000), -F(0.5555702330), -F(0.3826834324), F(0.9807852804), @@ -339,9 +343,317 @@ static const FIXED_T cos_table_fixed_8[128] = { -F(0.7071067812), F(0.9807852804), -F(0.9238795325), F(0.5555702330), -F(0.0000000000), F(0.5555702330), -F(0.9238795325), F(0.9807852804), - F(0.7071067812), -F(0.8314696123), F(0.9238795325), -F(0.9807852804), + F(0.7071067812), -F(0.8314696123), F(0.9238795325), -F(0.9807852804), -F(1.0000000000), -F(0.9807852804), F(0.9238795325), 
-F(0.8314696123), - F(0.7071067812), -F(0.5555702330), F(0.3826834324), -F(0.1950903220), + F(0.7071067812), -F(0.5555702330), F(0.3826834324), -F(0.1950903220), -F(0.0000000000), -F(0.1950903220), F(0.3826834324), -F(0.5555702330), }; #undef F + +/* + * Enforce 16 byte alignment for the data, which is supposed to be used + * with SIMD optimized code. + */ + +#define SBC_ALIGN_BITS 4 +#define SBC_ALIGN_MASK ((1 << (SBC_ALIGN_BITS)) - 1) + +#ifdef __GNUC__ +#define SBC_ALIGNED __attribute__((aligned(1 << (SBC_ALIGN_BITS)))) +#else +#define SBC_ALIGNED +#endif + +/* + * Constant tables for the use in SIMD optimized analysis filters + * Each table consists of two parts: + * 1. reordered "proto" table + * 2. reordered "cos" table + * + * Due to non-symmetrical reordering, separate tables for "even" + * and "odd" cases are needed + */ + +static const FIXED_T SBC_ALIGNED analysis_consts_fixed4_simd_even[40 + 16] = { +#define C0 1.0932568993 +#define C1 1.3056875580 +#define C2 1.3056875580 +#define C3 1.6772280856 + +#define F(x) F_PROTO4(x) + F(0.00000000E+00 * C0), F(3.83720193E-03 * C0), + F(5.36548976E-04 * C1), F(2.73370904E-03 * C1), + F(3.06012286E-03 * C2), F(3.89205149E-03 * C2), + F(0.00000000E+00 * C3), -F(1.49188357E-03 * C3), + F(1.09137620E-02 * C0), F(2.58767811E-02 * C0), + F(2.04385087E-02 * C1), F(3.21939290E-02 * C1), + F(7.76463494E-02 * C2), F(6.13245186E-03 * C2), + F(0.00000000E+00 * C3), -F(2.88757392E-02 * C3), + F(1.35593274E-01 * C0), F(2.94315332E-01 * C0), + F(1.94987841E-01 * C1), F(2.81828203E-01 * C1), + -F(1.94987841E-01 * C2), F(2.81828203E-01 * C2), + F(0.00000000E+00 * C3), -F(2.46636662E-01 * C3), + -F(1.35593274E-01 * C0), F(2.58767811E-02 * C0), + -F(7.76463494E-02 * C1), F(6.13245186E-03 * C1), + -F(2.04385087E-02 * C2), F(3.21939290E-02 * C2), + F(0.00000000E+00 * C3), F(2.88217274E-02 * C3), + -F(1.09137620E-02 * C0), F(3.83720193E-03 * C0), + -F(3.06012286E-03 * C1), F(3.89205149E-03 * C1), + -F(5.36548976E-04 * C2), 
F(2.73370904E-03 * C2), + F(0.00000000E+00 * C3), -F(1.86581691E-03 * C3), +#undef F +#define F(x) F_COS4(x) + F(0.7071067812 / C0), F(0.9238795325 / C1), + -F(0.7071067812 / C0), F(0.3826834324 / C1), + -F(0.7071067812 / C0), -F(0.3826834324 / C1), + F(0.7071067812 / C0), -F(0.9238795325 / C1), + F(0.3826834324 / C2), -F(1.0000000000 / C3), + -F(0.9238795325 / C2), -F(1.0000000000 / C3), + F(0.9238795325 / C2), -F(1.0000000000 / C3), + -F(0.3826834324 / C2), -F(1.0000000000 / C3), +#undef F + +#undef C0 +#undef C1 +#undef C2 +#undef C3 +}; + +static const FIXED_T SBC_ALIGNED analysis_consts_fixed4_simd_odd[40 + 16] = { +#define C0 1.3056875580 +#define C1 1.6772280856 +#define C2 1.0932568993 +#define C3 1.3056875580 + +#define F(x) F_PROTO4(x) + F(2.73370904E-03 * C0), F(5.36548976E-04 * C0), + -F(1.49188357E-03 * C1), F(0.00000000E+00 * C1), + F(3.83720193E-03 * C2), F(1.09137620E-02 * C2), + F(3.89205149E-03 * C3), F(3.06012286E-03 * C3), + F(3.21939290E-02 * C0), F(2.04385087E-02 * C0), + -F(2.88757392E-02 * C1), F(0.00000000E+00 * C1), + F(2.58767811E-02 * C2), F(1.35593274E-01 * C2), + F(6.13245186E-03 * C3), F(7.76463494E-02 * C3), + F(2.81828203E-01 * C0), F(1.94987841E-01 * C0), + -F(2.46636662E-01 * C1), F(0.00000000E+00 * C1), + F(2.94315332E-01 * C2), -F(1.35593274E-01 * C2), + F(2.81828203E-01 * C3), -F(1.94987841E-01 * C3), + F(6.13245186E-03 * C0), -F(7.76463494E-02 * C0), + F(2.88217274E-02 * C1), F(0.00000000E+00 * C1), + F(2.58767811E-02 * C2), -F(1.09137620E-02 * C2), + F(3.21939290E-02 * C3), -F(2.04385087E-02 * C3), + F(3.89205149E-03 * C0), -F(3.06012286E-03 * C0), + -F(1.86581691E-03 * C1), F(0.00000000E+00 * C1), + F(3.83720193E-03 * C2), F(0.00000000E+00 * C2), + F(2.73370904E-03 * C3), -F(5.36548976E-04 * C3), +#undef F +#define F(x) F_COS4(x) + F(0.9238795325 / C0), -F(1.0000000000 / C1), + F(0.3826834324 / C0), -F(1.0000000000 / C1), + -F(0.3826834324 / C0), -F(1.0000000000 / C1), + -F(0.9238795325 / C0), -F(1.0000000000 / C1), + 
F(0.7071067812 / C2), F(0.3826834324 / C3), + -F(0.7071067812 / C2), -F(0.9238795325 / C3), + -F(0.7071067812 / C2), F(0.9238795325 / C3), + F(0.7071067812 / C2), -F(0.3826834324 / C3), +#undef F + +#undef C0 +#undef C1 +#undef C2 +#undef C3 +}; + +static const FIXED_T SBC_ALIGNED analysis_consts_fixed8_simd_even[80 + 64] = { +#define C0 2.7906148894 +#define C1 2.4270044280 +#define C2 2.8015616024 +#define C3 3.1710363741 +#define C4 2.5377944043 +#define C5 2.4270044280 +#define C6 2.8015616024 +#define C7 3.1710363741 + +#define F(x) F_PROTO8(x) + F(0.00000000E+00 * C0), F(2.01182542E-03 * C0), + F(1.56575398E-04 * C1), F(1.78371725E-03 * C1), + F(3.43256425E-04 * C2), F(1.47640169E-03 * C2), + F(5.54620202E-04 * C3), F(1.13992507E-03 * C3), + -F(8.23919506E-04 * C4), F(0.00000000E+00 * C4), + F(2.10371989E-03 * C5), F(3.49717454E-03 * C5), + F(1.99454554E-03 * C6), F(1.64973098E-03 * C6), + F(1.61656283E-03 * C7), F(1.78805361E-04 * C7), + F(5.65949473E-03 * C0), F(1.29371806E-02 * C0), + F(8.02941163E-03 * C1), F(1.53184106E-02 * C1), + F(1.04584443E-02 * C2), F(1.62208471E-02 * C2), + F(1.27472335E-02 * C3), F(1.59045603E-02 * C3), + -F(1.46525263E-02 * C4), F(0.00000000E+00 * C4), + F(8.85757540E-03 * C5), F(5.31873032E-02 * C5), + F(2.92408442E-03 * C6), F(3.90751381E-02 * C6), + -F(4.91578024E-03 * C7), F(2.61098752E-02 * C7), + F(6.79989431E-02 * C0), F(1.46955068E-01 * C0), + F(8.29847578E-02 * C1), F(1.45389847E-01 * C1), + F(9.75753918E-02 * C2), F(1.40753505E-01 * C2), + F(1.11196689E-01 * C3), F(1.33264415E-01 * C3), + -F(1.23264548E-01 * C4), F(0.00000000E+00 * C4), + F(1.45389847E-01 * C5), -F(8.29847578E-02 * C5), + F(1.40753505E-01 * C6), -F(9.75753918E-02 * C6), + F(1.33264415E-01 * C7), -F(1.11196689E-01 * C7), + -F(6.79989431E-02 * C0), F(1.29371806E-02 * C0), + -F(5.31873032E-02 * C1), F(8.85757540E-03 * C1), + -F(3.90751381E-02 * C2), F(2.92408442E-03 * C2), + -F(2.61098752E-02 * C3), -F(4.91578024E-03 * C3), + F(1.46404076E-02 * C4), 
F(0.00000000E+00 * C4), + F(1.53184106E-02 * C5), -F(8.02941163E-03 * C5), + F(1.62208471E-02 * C6), -F(1.04584443E-02 * C6), + F(1.59045603E-02 * C7), -F(1.27472335E-02 * C7), + -F(5.65949473E-03 * C0), F(2.01182542E-03 * C0), + -F(3.49717454E-03 * C1), F(2.10371989E-03 * C1), + -F(1.64973098E-03 * C2), F(1.99454554E-03 * C2), + -F(1.78805361E-04 * C3), F(1.61656283E-03 * C3), + -F(9.02154502E-04 * C4), F(0.00000000E+00 * C4), + F(1.78371725E-03 * C5), -F(1.56575398E-04 * C5), + F(1.47640169E-03 * C6), -F(3.43256425E-04 * C6), + F(1.13992507E-03 * C7), -F(5.54620202E-04 * C7), +#undef F +#define F(x) F_COS8(x) + F(0.7071067812 / C0), F(0.8314696123 / C1), + -F(0.7071067812 / C0), -F(0.1950903220 / C1), + -F(0.7071067812 / C0), -F(0.9807852804 / C1), + F(0.7071067812 / C0), -F(0.5555702330 / C1), + F(0.7071067812 / C0), F(0.5555702330 / C1), + -F(0.7071067812 / C0), F(0.9807852804 / C1), + -F(0.7071067812 / C0), F(0.1950903220 / C1), + F(0.7071067812 / C0), -F(0.8314696123 / C1), + F(0.9238795325 / C2), F(0.9807852804 / C3), + F(0.3826834324 / C2), F(0.8314696123 / C3), + -F(0.3826834324 / C2), F(0.5555702330 / C3), + -F(0.9238795325 / C2), F(0.1950903220 / C3), + -F(0.9238795325 / C2), -F(0.1950903220 / C3), + -F(0.3826834324 / C2), -F(0.5555702330 / C3), + F(0.3826834324 / C2), -F(0.8314696123 / C3), + F(0.9238795325 / C2), -F(0.9807852804 / C3), + -F(1.0000000000 / C4), F(0.5555702330 / C5), + -F(1.0000000000 / C4), -F(0.9807852804 / C5), + -F(1.0000000000 / C4), F(0.1950903220 / C5), + -F(1.0000000000 / C4), F(0.8314696123 / C5), + -F(1.0000000000 / C4), -F(0.8314696123 / C5), + -F(1.0000000000 / C4), -F(0.1950903220 / C5), + -F(1.0000000000 / C4), F(0.9807852804 / C5), + -F(1.0000000000 / C4), -F(0.5555702330 / C5), + F(0.3826834324 / C6), F(0.1950903220 / C7), + -F(0.9238795325 / C6), -F(0.5555702330 / C7), + F(0.9238795325 / C6), F(0.8314696123 / C7), + -F(0.3826834324 / C6), -F(0.9807852804 / C7), + -F(0.3826834324 / C6), F(0.9807852804 / C7), + 
F(0.9238795325 / C6), -F(0.8314696123 / C7), + -F(0.9238795325 / C6), F(0.5555702330 / C7), + F(0.3826834324 / C6), -F(0.1950903220 / C7), +#undef F + +#undef C0 +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +}; + +static const FIXED_T SBC_ALIGNED analysis_consts_fixed8_simd_odd[80 + 64] = { +#define C0 2.5377944043 +#define C1 2.4270044280 +#define C2 2.8015616024 +#define C3 3.1710363741 +#define C4 2.7906148894 +#define C5 2.4270044280 +#define C6 2.8015616024 +#define C7 3.1710363741 + +#define F(x) F_PROTO8(x) + F(0.00000000E+00 * C0), -F(8.23919506E-04 * C0), + F(1.56575398E-04 * C1), F(1.78371725E-03 * C1), + F(3.43256425E-04 * C2), F(1.47640169E-03 * C2), + F(5.54620202E-04 * C3), F(1.13992507E-03 * C3), + F(2.01182542E-03 * C4), F(5.65949473E-03 * C4), + F(2.10371989E-03 * C5), F(3.49717454E-03 * C5), + F(1.99454554E-03 * C6), F(1.64973098E-03 * C6), + F(1.61656283E-03 * C7), F(1.78805361E-04 * C7), + F(0.00000000E+00 * C0), -F(1.46525263E-02 * C0), + F(8.02941163E-03 * C1), F(1.53184106E-02 * C1), + F(1.04584443E-02 * C2), F(1.62208471E-02 * C2), + F(1.27472335E-02 * C3), F(1.59045603E-02 * C3), + F(1.29371806E-02 * C4), F(6.79989431E-02 * C4), + F(8.85757540E-03 * C5), F(5.31873032E-02 * C5), + F(2.92408442E-03 * C6), F(3.90751381E-02 * C6), + -F(4.91578024E-03 * C7), F(2.61098752E-02 * C7), + F(0.00000000E+00 * C0), -F(1.23264548E-01 * C0), + F(8.29847578E-02 * C1), F(1.45389847E-01 * C1), + F(9.75753918E-02 * C2), F(1.40753505E-01 * C2), + F(1.11196689E-01 * C3), F(1.33264415E-01 * C3), + F(1.46955068E-01 * C4), -F(6.79989431E-02 * C4), + F(1.45389847E-01 * C5), -F(8.29847578E-02 * C5), + F(1.40753505E-01 * C6), -F(9.75753918E-02 * C6), + F(1.33264415E-01 * C7), -F(1.11196689E-01 * C7), + F(0.00000000E+00 * C0), F(1.46404076E-02 * C0), + -F(5.31873032E-02 * C1), F(8.85757540E-03 * C1), + -F(3.90751381E-02 * C2), F(2.92408442E-03 * C2), + -F(2.61098752E-02 * C3), -F(4.91578024E-03 * C3), + F(1.29371806E-02 * C4), 
-F(5.65949473E-03 * C4), + F(1.53184106E-02 * C5), -F(8.02941163E-03 * C5), + F(1.62208471E-02 * C6), -F(1.04584443E-02 * C6), + F(1.59045603E-02 * C7), -F(1.27472335E-02 * C7), + F(0.00000000E+00 * C0), -F(9.02154502E-04 * C0), + -F(3.49717454E-03 * C1), F(2.10371989E-03 * C1), + -F(1.64973098E-03 * C2), F(1.99454554E-03 * C2), + -F(1.78805361E-04 * C3), F(1.61656283E-03 * C3), + F(2.01182542E-03 * C4), F(0.00000000E+00 * C4), + F(1.78371725E-03 * C5), -F(1.56575398E-04 * C5), + F(1.47640169E-03 * C6), -F(3.43256425E-04 * C6), + F(1.13992507E-03 * C7), -F(5.54620202E-04 * C7), +#undef F +#define F(x) F_COS8(x) + -F(1.0000000000 / C0), F(0.8314696123 / C1), + -F(1.0000000000 / C0), -F(0.1950903220 / C1), + -F(1.0000000000 / C0), -F(0.9807852804 / C1), + -F(1.0000000000 / C0), -F(0.5555702330 / C1), + -F(1.0000000000 / C0), F(0.5555702330 / C1), + -F(1.0000000000 / C0), F(0.9807852804 / C1), + -F(1.0000000000 / C0), F(0.1950903220 / C1), + -F(1.0000000000 / C0), -F(0.8314696123 / C1), + F(0.9238795325 / C2), F(0.9807852804 / C3), + F(0.3826834324 / C2), F(0.8314696123 / C3), + -F(0.3826834324 / C2), F(0.5555702330 / C3), + -F(0.9238795325 / C2), F(0.1950903220 / C3), + -F(0.9238795325 / C2), -F(0.1950903220 / C3), + -F(0.3826834324 / C2), -F(0.5555702330 / C3), + F(0.3826834324 / C2), -F(0.8314696123 / C3), + F(0.9238795325 / C2), -F(0.9807852804 / C3), + F(0.7071067812 / C4), F(0.5555702330 / C5), + -F(0.7071067812 / C4), -F(0.9807852804 / C5), + -F(0.7071067812 / C4), F(0.1950903220 / C5), + F(0.7071067812 / C4), F(0.8314696123 / C5), + F(0.7071067812 / C4), -F(0.8314696123 / C5), + -F(0.7071067812 / C4), -F(0.1950903220 / C5), + -F(0.7071067812 / C4), F(0.9807852804 / C5), + F(0.7071067812 / C4), -F(0.5555702330 / C5), + F(0.3826834324 / C6), F(0.1950903220 / C7), + -F(0.9238795325 / C6), -F(0.5555702330 / C7), + F(0.9238795325 / C6), F(0.8314696123 / C7), + -F(0.3826834324 / C6), -F(0.9807852804 / C7), + -F(0.3826834324 / C6), F(0.9807852804 / C7), + 
F(0.9238795325 / C6), -F(0.8314696123 / C7), + -F(0.9238795325 / C6), F(0.5555702330 / C7), + F(0.3826834324 / C6), -F(0.1950903220 / C7), +#undef F + +#undef C0 +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +}; |