新版FM音源プログラム (26)
gcc の「インライン・アセンブラ」で書いた ARMv6-M (Cortex-M0) 版の acc_calc_slot() 関数のリストを下に示します。 「中身」は armcc 版と同等です。
#include "slot.h"

/*
 * acc_calc_slot() -- gcc inline-assembly version for ARMv6-M (Cortex-M0).
 *
 * Processes FM operator slots two at a time (a modulator slot followed by
 * a carrier slot), accumulates their volume-weighted outputs into right and
 * left sums, then scales and saturates both sums to signed 16 bits.
 *
 * o_p      : pointer to the first op_prop_t of the slot array; each pass of
 *            the loop reads/updates o_p[0] (modulator) and o_p[1] (carrier)
 *            and then advances by two elements.
 * num_slot : number of slots to process.  The counter is decremented by 2
 *            per pass and tested AFTER the pass, so one slot pair is always
 *            processed, even if num_slot <= 0.
 *
 * Returns the packed result (acc_L << 16) | (acc_R & 0xffff).
 *
 * Register usage inside the asm:
 *   r0 = current slot pointer (packed result on exit)
 *   r1 = remaining slot count
 *   r2..r5 = scratch
 *   r6 = acc_R, r7 = acc_L
 *
 * The template refers to r0 and r1 by name, so the operands are pinned to
 * those registers with local register variables.  A plain "+r" constraint
 * would let the compiler pick any register for them.
 *
 * NOTE(review): r7 is normally the Thumb frame pointer; builds that keep a
 * frame pointer may reject the "r7" clobber -- confirm for your build flags.
 */
int32_t acc_calc_slot(op_prop_t *o_p, int num_slot)
{
    register op_prop_t *r0_io    __asm__("r0") = o_p;       /* in: slot ptr, out: packed L:R */
    register int        r1_count __asm__("r1") = num_slot;  /* loop counter                  */

    __asm__ __volatile__ (
        // Use UAL (Unified Assembler Language) syntax
        "   .syntax unified                     \n\t"
        "   movs    r6, #0                      \n\t"   // acc_R = 0
        "   movs    r7, r6                      \n\t"   // acc_L = 0
        "1:                                     \n\t"
        //
        // series FM modulator slot
        //
        "   ldr     r2, [r0, %[ph_inc]]         \n\t"   // r2 = ph_inc
        "   ldr     r3, [r0, %[ph_acc]]         \n\t"   // r3 = ph_acc
        "   ldr     r4, [r0, %[mod_in]]         \n\t"   // r4 = mod_in
        "   ldr     r5, [r0, %[stab_p]]         \n\t"   // r5 = stab_p
        "   adds    r2, r2, r3                  \n\t"   // r2 = ph_inc + ph_acc
        "   ldrh    r3, [r0, %[ind_mask]]       \n\t"   // r3 = ind_mask
        "   str     r2, [r0, %[ph_acc]]         \n\t"   // store updated ph_acc
        "   adds    r4, r4, r2                  \n\t"   // r4 = ph_acc + mod_in (phase)
        "   lsrs    r4, r4, %[acc_sft]          \n\t"   // r4 >>= (PH_ACC_FRAC_BITS-1)
        "   ands    r3, r3, r4                  \n\t"   // r3 = byte offset into table
        "   ldr     r2, [r0, %[ol_lin]]         \n\t"   // r2 = ol_lin
        "   ldrsh   r3, [r5, r3]                \n\t"   // r3 = *(int16_t *)(stab_p + r3)
        //
        // 2-tap FIR filter for feedback
        //
        // Thumb-1 ldrsh has no immediate-offset form, hence ldrh + sxth.
        "   ldrh    r5, [r0, %[op_out]]         \n\t"   // r5 = previous op_out
        "   sxth    r5, r5                      \n\t"   // sign-extend to 32 bits
        "   muls    r2, r3, r2                  \n\t"   // r2 = table value * ol_lin
        "   asrs    r4, r2, #15                 \n\t"   // r4 = r2 >> 15 (= out)
        "   ldrh    r3, [r0, %[mod_mul]]        \n\t"   // r3 = mod_mul
        "   strh    r4, [r0, %[op_out]]         \n\t"   // op_out = out
        "   adds    r5, r5, r4                  \n\t"   // r5 = out + prev_out
        "   muls    r3, r5, r3                  \n\t"   // r3 = (out + prev_out) * mod_mul
        "   str     r3, [r0, %[mod_in]]         \n\t"   // mod_in = r3 (used next call)
        //
        // series FM carrier slot (second element of the pair, offset S)
        //
        "   ldr     r2, [r0, %[ph_inc]+%c[S]]   \n\t"   // r2 = ph_inc
        "   ldr     r3, [r0, %[ph_acc]+%c[S]]   \n\t"   // r3 = ph_acc
        "   ldrh    r5, [r0, %[mod_mul]+%c[S]]  \n\t"   // r5 = mod_mul
        "   adds    r2, r2, r3                  \n\t"   // r2 = ph_inc + ph_acc
        "   str     r2, [r0, %[ph_acc]+%c[S]]   \n\t"   // store updated ph_acc
        "   muls    r5, r4, r5                  \n\t"   // r5 = modulator out * mod_mul
        "   adds    r2, r2, r5                  \n\t"   // r2 = ph_acc + out*mod_mul (phase)
        "   ldrh    r3, [r0, %[ind_mask]+%c[S]] \n\t"   // r3 = ind_mask
        "   ldr     r5, [r0, %[stab_p]+%c[S]]   \n\t"   // r5 = stab_p
        "   lsrs    r2, r2, %[acc_sft]          \n\t"   // r2 >>= (PH_ACC_FRAC_BITS-1)
        "   ands    r3, r2, r3                  \n\t"   // r3 = byte offset into table
        "   ldr     r2, [r0, %[ol_lin]+%c[S]]   \n\t"   // r2 = ol_lin
        "   ldrsh   r3, [r5, r3]                \n\t"   // r3 = *(int16_t *)(stab_p + r3)
        "   muls    r2, r3, r2                  \n\t"   // r2 = table value * ol_lin
        "   asrs    r2, r2, #15                 \n\t"   // r2 >>= 15 (= out)
        "   strh    r2, [r0, %[op_out]+%c[S]]   \n\t"   // op_out = out
        //
        // accumulator
        //
        // r0 = slot pointer, r1 = remaining slots
        // r2 = carrier out (op_out1), r4 = modulator out (op_out0)
        // r6 = acc_R, r7 = acc_L
        // r3, r5 = free
        //
        "   ldr     r3, [r0, %[R_vol]+%c[S]]    \n\t"   // r3 = R_vol0:R_vol1 (packed)
        "   lsrs    r5, r3, #16                 \n\t"   // r5 = R_vol0 (upper half)
        "   uxth    r3, r3                      \n\t"   // r3 = R_vol1 (lower half)
        "   muls    r3, r2, r3                  \n\t"   // r3 = op_out1 * R_vol1
        "   muls    r5, r4, r5                  \n\t"   // r5 = op_out0 * R_vol0
        "   adds    r6, r6, r3                  \n\t"   // acc_R += r3
        "   adds    r6, r6, r5                  \n\t"   // acc_R += r5
        "   ldr     r3, [r0, %[L_vol]+%c[S]]    \n\t"   // r3 = L_vol0:L_vol1 (packed)
        "   lsrs    r5, r3, #16                 \n\t"   // r5 = L_vol0 (upper half)
        "   uxth    r3, r3                      \n\t"   // r3 = L_vol1 (lower half)
        "   muls    r3, r2, r3                  \n\t"   // r3 = op_out1 * L_vol1
        "   muls    r5, r4, r5                  \n\t"   // r5 = op_out0 * L_vol0
        "   adds    r7, r7, r3                  \n\t"   // acc_L += r3
        "   adds    r7, r7, r5                  \n\t"   // acc_L += r5
        //
        "   adds    r0, r0, %[S]*2              \n\t"   // advance to next slot pair
        "   subs    r1, r1, #2                  \n\t"   // two slots done
        "   bgt     1b                          \n\t"   // loop while count > 0
        //
        // post scaling and saturate to 16-bit
        //
        "   ldr     r3, =0x7fff                 \n\t"   // r3 = 0x00007fff (upper limit)
        "   mvns    r4, r3                      \n\t"   // r4 = 0xffff8000 (lower limit)
        //
        // saturate acc_R
        "   asrs    r6, r6, %[vol_sft]          \n\t"   // acc_R >>= LR_VOL_SHIFT
        "   bmi     2f                          \n\t"   // negative -> check lower limit
        "   cmp     r6, r3                      \n\t"
        "   blt     3f                          \n\t"   // below upper limit: keep
        "   movs    r6, r3                      \n\t"   // clamp to 0x00007fff
        "   b       3f                          \n\t"
        "2:                                     \n\t"
        "   cmp     r6, r4                      \n\t"
        "   bge     3f                          \n\t"   // at/above lower limit: keep
        "   movs    r6, r4                      \n\t"   // clamp to 0xffff8000
        "3:                                     \n\t"
        // saturate acc_L
        "   asrs    r7, r7, %[vol_sft]          \n\t"   // acc_L >>= LR_VOL_SHIFT
        "   bmi     4f                          \n\t"   // negative -> check lower limit
        "   cmp     r7, r3                      \n\t"
        "   blt     5f                          \n\t"   // below upper limit: keep
        "   movs    r7, r3                      \n\t"   // clamp to 0x00007fff
        "   b       5f                          \n\t"
        "4:                                     \n\t"
        "   cmp     r7, r4                      \n\t"
        "   bge     5f                          \n\t"   // at/above lower limit: keep
        "   movs    r7, r4                      \n\t"   // clamp to 0xffff8000
        "5:                                     \n\t"
        // pack (acc_L:acc_R) into r0
        "   lsls    r7, r7, #16                 \n\t"   // acc_L into top halfword
        "   uxth    r0, r6                      \n\t"   // acc_R into bottom halfword
        "   orrs    r0, r0, r7                  \n\t"   // combine
        // read-write operands, pinned to r0 / r1 by the register variables
        : "+r" (r0_io),
          "+r" (r1_count)
        // immediate operands ("I" = constant usable as an instruction immediate)
        : [acc_sft]  "I" (PH_ACC_FRAC_BITS-1),
          [vol_sft]  "I" (LR_VOL_SHIFT),
          [ph_inc]   "I" (DEF_OFS(ph_inc)),
          [ph_acc]   "I" (DEF_OFS(ph_acc)),
          [mod_in]   "I" (DEF_OFS(mod_in)),
          [stab_p]   "I" (DEF_OFS(stab_p)),
          [ind_mask] "I" (DEF_OFS(ind_mask)),
          [op_out]   "I" (DEF_OFS(op_out)),
          [ol_lin]   "I" (DEF_OFS(ol_lin)),
          [mod_mul]  "I" (DEF_OFS(mod_mul)),
          [L_vol]    "I" (DEF_OFS(L_vol)),
          [R_vol]    "I" (DEF_OFS(R_vol)),
          [S]        "I" (sizeof(op_prop_t))
        // clobbered registers; "memory" because slot fields are written
        : "r2", "r3", "r4", "r5", "r6", "r7", "cc", "memory"
    ); // __asm__

    /* r0 now holds the packed (acc_L:acc_R) result, not a pointer. */
    return (int32_t) r0_io;
} // int32_t acc_calc_slot()
PSoC 4200 Prototyping Kit (CY8CKIT-049-42xx) での所要サイクル数の測定結果は下のようになっています。
// PSoC Creator 4.0 SP1 (gcc 4.9.3), NSLOT=64 // 80 cycle / 2slot (CY8C4245 (CM0) @ 24 MHz, Flash wait=0) // 83 cycle / 2slot (CY8C4245 (CM0) @ 48 MHz, Flash wait=1)