diff options
-rw-r--r-- | ChangeLog | 8 | ||||
-rw-r--r-- | src/Makefile.am | 5 | ||||
-rw-r--r-- | src/arm_vfp_synt.S | 102 | ||||
-rw-r--r-- | src/mp3-c-synth.c | 13 |
4 files changed, 124 insertions, 4 deletions
@@ -1,3 +1,11 @@ +2011-11-20 Josep Torra <josep@fluendo.com> + + * src/Makefile.am: + * src/arm_vfp_synt.S: + * src/mp3-c-synth.c: (mp3_dewindow_output), (mp3_SubBandSynthesis): + Initial attempt to rewrite part of the subband synthesis in ARM/VFP + assembly. + 2011-11-18 Josep Torra <josep@fluendo.com> * src/mp3-c-synth.c: (mp3_dewindow_output), (mp3_SubBandSynthesis): diff --git a/src/Makefile.am b/src/Makefile.am index a51dc49..3637fe9 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -5,6 +5,10 @@ if USE_IPP SOURCE_FILES += mp3-ipp.c endif +if USE_ARM_VFP +SOURCE_FILES += arm_vfp_synt.S +endif + libgstflump3dec_la_SOURCES = \ flump3dec.c \ bitstream.c \ @@ -35,3 +39,4 @@ libgstflump3dec_la_LDFLAGS = $(GST_PLUGIN_LDFLAGS) $(CPU_TUNE_LDFLAGS) libgstflump3dec_la_CFLAGS = \ $(GST_ERROR) $(GST_CFLAGS) $(CPU_TUNE_CFLAGS) \ $(IPP_INCLUDES) $(LIBOIL_CFLAGS) +libgstflump3dec_la_CCASFLAGS = $(CPU_TUNE_CCASFLAGS) diff --git a/src/arm_vfp_synt.S b/src/arm_vfp_synt.S new file mode 100644 index 0000000..ada8329 --- /dev/null +++ b/src/arm_vfp_synt.S @@ -0,0 +1,102 @@ +/* + * FLUENDO S.A. + * Copyright (C) <2005 - 2011> <support@fluendo.com> + */ + +#if defined(__VFP_FP__) && !defined(__SOFTFP__) + +/* mp3_dewindow_output (gfloat *uvec, short *samples, const gfloat* window) + * + * uvec: $r0 | samples: $r1 | window: $r2 + * + * acc[0..31] = uvec[0..31] * window[0..31] + uvec[32..63] * window[32..63] + + * ... + * + uvec[448..479] * window[448..479] + uvec[480..511] * window[480..511] + * samples[0..31] = saturate_to_short(round(acc[0..31] * SCALE)) + * + * acc is kept in VFP registers (s24..s31, 8 values per pass); uvec is read but never stored to. + * lr: original fpscr + */ + .SCALE: + .word 1191182336 /* 0x47000000, i.e. 32768.0f */ + .C_0dot5: + .word 1056964608 /* 0x3f000000, i.e. 0.5f */ + +#define CONVERT_TO_INTEGER(v,d) \ + fcmpezs v ; \ + fmstat; \ + faddsgt d, v, s2; /* v > 0 ? d = v + 0.5 */ \ + fsubsle d, v, s2; /* v <= 0 ? 
d = v - 0.5 */ \ + ftosizs d, d + + .global mp3_dewindow_output ; + mp3_dewindow_output: + stmdb sp!, {r4-r8, fp, lr}; /* save core registers to stack; NOTE(review): s16-s31 are overwritten below without being saved - AAPCS treats them as callee-saved, confirm */ + fmrx lr, fpscr; /* read fpscr register into arm */ + mov fp, #7; + orr fp, lr, fp, lsl #16; /* set vector length to 8 (7 << 16 ORed into the saved fpscr value) */ + fmxr fpscr, fp; + mov fp, #4; /* main iterator: 4 passes, 8 samples written per pass */ + flds s1, .SCALE /* load SCALE constant */ + flds s2, .C_0dot5 /* load 0.5 constant */ + + mp3_dewindow_output_loop: + mov r8, r2; /* r8 = current window column base (&win[0] on the first pass) */ + mov r3, r0; /* r3 = current uvec column base (&uvec[0] on the first pass) */ + fldmias r2!, {s8-s15}; /* win[0..7], r2 advances by 8 floats */ + fldmias r0!, {s16-s23}; /* uvec[0..7], r0 advances by 8 floats */ + fmuls s24, s8, s16; /* s24..s31 = win[0..7] * uvec[0..7] */ + + mov ip, #15; /* 15 further rows to accumulate */ + mp3_dewindow_output_mac_loop: + add r8, r8, #128 /* r8 += 128 bytes: next window row (&win[32] on the first round) */ + add r3, r3, #128 /* r3 += 128 bytes: next uvec row (&uvec[32] on the first round) */ + fldmias r8, {s8-s15}; /* win[32..39] on the first round */ + fldmias r3, {s16-s23}; /* uvec[32..39] on the first round */ + fmacs s24, s8, s16; /* s24..s31 += win row * uvec row */ + subs ip, ip, #1; + bne mp3_dewindow_output_mac_loop; + + /* Scale result */ + fmuls s8, s24, s1; /* s8..s15 = s24..s31 * SCALE */ + + /* Write 4 samples */ + CONVERT_TO_INTEGER(s8,s4); + CONVERT_TO_INTEGER(s9,s5); + CONVERT_TO_INTEGER(s10,s6); + CONVERT_TO_INTEGER(s11,s7); + fmrrs r4, r5, {s4, s5} + fmrrs r6, r7, {s6, s7} + ssat r4, #16, r4 + ssat r5, #16, r5 + ssat r6, #16, r6 + ssat r7, #16, r7 + strh r4, [r1, #0]; + strh r5, [r1, #2]; + strh r6, [r1, #4]; + strh r7, [r1, #6]; + add r1, r1, #8; + + /* Write the next 4 samples */ + CONVERT_TO_INTEGER(s12,s4); + CONVERT_TO_INTEGER(s13,s5); + CONVERT_TO_INTEGER(s14,s6); + CONVERT_TO_INTEGER(s15,s7); + fmrrs r4, r5, {s4, s5} + fmrrs r6, r7, {s6, s7} + ssat r4, #16, r4 + ssat r5, #16, r5 + ssat r6, #16, r6 + ssat r7, #16, r7 + strh r4, [r1, #0]; + strh r5, [r1, #2]; + strh r6, [r1, #4]; + strh r7, [r1, #6]; + add r1, r1, #8; + + subs fp, fp, #1; + bne mp3_dewindow_output_loop; + + fmxr fpscr, lr; /* restore original fpscr */ + ldmia sp!, {r4-r8, fp, pc}; /* restore saved registers from stack and return */ + +#endif /* 
defined(__VFP_FP__) && !defined(__SOFTFP__) */ diff --git a/src/mp3-c-synth.c b/src/mp3-c-synth.c index e9ab181..0f37b33 100644 --- a/src/mp3-c-synth.c +++ b/src/mp3-c-synth.c @@ -48,15 +48,19 @@ static void MPG_DCT_32 (gfloat in[32], gfloat out[32]); static void MPG_DCT_16 (gfloat in[16], gfloat out[16]); static void MPG_DCT_8 (gfloat in[8], gfloat out[8]); +#if defined(__VFP_FP__) && !defined(__SOFTFP__) +extern void mp3_dewindow_output (gfloat *u_vec, short *samples, + gfloat* window); +#else static inline void -mp3_dewindow_output (gfloat *u_vec, short *samples) +mp3_dewindow_output (gfloat *u_vec, short *samples, gfloat* window) { gint i; gfloat *u_vec0; /* dewindowing; NOTE(review): the non-liboil loop below still indexes the global dewindow rather than the window parameter */ #ifdef USE_LIBOIL - oil_multiply_f32 (u_vec, u_vec, dewindow, HAN_SIZE); + oil_multiply_f32 (u_vec, u_vec, window, HAN_SIZE); #else for (i = 0; i < HAN_SIZE; i++) u_vec[i] *= dewindow[i]; @@ -110,6 +114,7 @@ mp3_dewindow_output (gfloat *u_vec, short *samples) } } } +#endif void mp3_SubBandSynthesis (mp3tl * tl ATTR_UNUSED, frame_params * fr_ps, @@ -169,9 +174,9 @@ mp3_SubBandSynthesis (mp3tl * tl ATTR_UNUSED, frame_params * fr_ps, } #endif - mp3_dewindow_output (u_vec, samples); - fr_ps->bufOffset[channel] = buf_offset; + + mp3_dewindow_output (u_vec, samples, (gfloat*) dewindow); } /* Synthesis matrixing variant which uses a 32 point DCT to compute |