新版FM音源プログラム (28)
gcc の「インライン・アセンブラ」で書いた ARMv7E-M (Cortex-M4) 版の acc_calc_slot() 関数のリストを下に示します。 「中身」は armcc 版と同等です。
#include "slot.h"
#include <stddef.h>   // offsetof

// Byte offset of member x inside op_prop_t, used as an immediate in the asm.
#define DEF_OFS(x) (offsetof(op_prop_t, x))

//
// gcc inline assembly language function
//
// Processes FM operator slots two at a time (a "modulator" slot followed by
// a "carrier" slot, sizeof(op_prop_t) bytes apart), updating each slot's
// ph_acc / op_out / mod_in fields in place and accumulating the weighted
// outputs into a right and a left accumulator.
//
//   o_p      : pointer to the first slot of the first pair.
//   num_slot : number of slots to process; the counter is decremented by 2
//              per iteration and the loop is test-at-bottom, so one pair is
//              always processed even when num_slot <= 0.
//
// Returns a packed word: bits [15:0] hold the right accumulator and bits
// [31:16] the left accumulator, each arithmetically shifted right by
// LR_VOL_SHIFT and saturated to signed 16 bits.
//
// The asm template refers to r0 and r1 by name, so the two read/write
// operands are bound to exactly those registers with local register
// variables. A plain "+r" constraint lets the compiler choose any register
// (for example after inlining), which would silently break the code.
//
// NOTE(review): r7 is listed as a clobber; in Thumb code GCC uses r7 as the
// frame pointer and may reject this when a frame pointer is required
// (e.g. -O0 without -fomit-frame-pointer) -- confirm against build flags.
//
int32_t acc_calc_slot(op_prop_t *o_p, int num_slot)
{
  register op_prop_t *slot_p     __asm__("r0") = o_p;       // r0: slot pointer, later the result
  register int        slots_left __asm__("r1") = num_slot;  // r1: loop counter, later scratch

  __asm__ __volatile__ (
#if defined(__ARM_ARCH_7EM__)   // ARMv7E-M (Cortex-M4)
    // Use UAL (Unified Assembler Language) syntax
    " .syntax unified \n\t"
    " movs r7,#0 \n\t"                          // clear r7=acc_R
    " mov r8,r7 \n\t"                           // clear r8=acc_L
    "1: \n\t"
    //
    // series FM modulator slot
    //
    " ldr r2,[r0,%[ph_inc]] \n\t"               // r2=ph_inc
    " ldr r3,[r0,%[ph_acc]] \n\t"               // r3=ph_acc
    " ldr r4,[r0,%[mod_in]] \n\t"               // r4=mod_in
    " ldr r5,[r0,%[stab_p]] \n\t"               // r5=stab_p
    " adds r2, r2,r3 \n\t"                      // r2=ph_inc+ph_acc
    " ldrh r3,[r0,%[ind_mask]] \n\t"            // r3=ind_mask
    " str r2,[r0,%[ph_acc]] \n\t"               // update ph_acc
    " adds r4, r4,r2 \n\t"                      // r4=ph_inc+ph_acc+mod_in (=phh)
    " and r3, r3,r4,lsr %[acc_sft] \n\t"        // r3 & (r4 >> acc_sft) = byte offset into table
    " ldr r2,[r0,%[ol_lin]] \n\t"               // r2=ol_lin
    " ldrsh r3,[r5,r3] \n\t"                    // r3=*(int16_t *)(stab_p+r3)
    //
    // 2-tap FIR filter for feedback
    //
    " ldrsh r5,[r0,%[op_out]] \n\t"             // r5=op_out (=prev_out)
    " muls r2, r3,r2 \n\t"                      // r2=r3*ol_lin
    " asrs r4, r2,#15 \n\t"                     // r4=(r2>>15) (=out)
    " ldrh r3,[r0,%[mod_mul]] \n\t"             // r3=mod_mul
    " strh r4,[r0,%[op_out]] \n\t"              // op_out=out
    " adds r5, r5,r4 \n\t"                      // r5=out+prev_out
    " muls r3, r5,r3 \n\t"                      // r3=r5*mod_mul
    " str r3,[r0,%[mod_in]] \n\t"               // mod_in=r3
    //
    // series FM carrier slot (second slot of the pair, at offset S)
    //
    " ldr r2,[r0,%[ph_inc]+%c[S]] \n\t"         // r2=ph_inc
    " ldr r3,[r0,%[ph_acc]+%c[S]] \n\t"         // r3=ph_acc
    " ldrh r5,[r0,%[mod_mul]+%c[S]] \n\t"       // r5=mod_mul
    " adds r2, r2,r3 \n\t"                      // r2=ph_inc+ph_acc
    " str r2,[r0,%[ph_acc]+%c[S]] \n\t"         // update ph_acc
    " mla r2, r4, r5, r2 \n\t"                  // r2=(r4*r5)+r2 (r4 = modulator out)
    " ldrh r3,[r0,%[ind_mask]+%c[S]] \n\t"      // r3=ind_mask
    " ldr r5,[r0,%[stab_p]+%c[S]] \n\t"         // r5=stab_p
    " and r3, r3,r2,lsr %[acc_sft] \n\t"        // r3 & (r2 >> acc_sft)
    " ldr r2,[r0,%[ol_lin]+%c[S]] \n\t"         // r2=ol_lin
    " ldrsh r3,[r5,r3] \n\t"                    // r3=*(int16_t *)(stab_p+r3)
    " ldr r5,[r0,%[R_vol]+%c[S]] \n\t"          // r5=R volume (32-bit word; presumably two packed 16-bit gains)
    " ldr r6,[r0,%[L_vol]+%c[S]] \n\t"          // r6=L volume (32-bit word; presumably two packed 16-bit gains)
    " muls r2, r3,r2 \n\t"                      // r2=r3*ol_lin
    " asrs r2, r2,#15 \n\t"                     // r2>>=15 (=out)
    " bfi r2, r4,#16,#16 \n\t"                  // r2[31:16]=r4[15:0] (pack modulator out above carrier out)
    " str r2,[r0,%[op_out]+%c[S]] \n\t"         // op_out=packed word
    // acc_R += (op_out0*R_vol0) + (op_out1*R_vol1)
    " smlad r7, r2, r5, r7 \n\t"
    // acc_L += (op_out0*L_vol0) + (op_out1*L_vol1)
    " smlad r8, r2, r6, r8 \n\t"
    " adds r0, r0,%[S]*2 \n\t"                  // adv. to next slot pair
    " subs r1, r1,#2 \n\t"                      // decrement loop counter
    " bgt 1b \n\t"                              // more to do
    // post scaling and saturate to 16 bit
    " ssat r1, #16,r8,asr %[vol_sft] \n\t"      // L-ch
    " ssat r0, #16,r7,asr %[vol_sft] \n\t"      // R-ch
    // pack two 16-bit halfwords to single 32-bit word
    " bfi r0, r1,#16,#16 \n\t"                  // result[31:16]=acc_L[15:0]
#else
#error "acc_calc_slot(): only the ARMv7E-M (Cortex-M4) implementation is provided"
#endif
    // output reg list (both read/write, pinned to the registers the template names)
    : "+r" (slot_p),        // r0 = (op_prop_t *) o_p on entry, packed result on exit
      "+r" (slots_left)     // r1 = (int) num_slot on entry, scratch on exit
    // input parameter list ("I" for offset constant)
    : [acc_sft]  "I" (PH_ACC_FRAC_BITS-1),
      [vol_sft]  "I" (LR_VOL_SHIFT),
      [ph_inc]   "I" (DEF_OFS(ph_inc)),
      [ph_acc]   "I" (DEF_OFS(ph_acc)),
      [mod_in]   "I" (DEF_OFS(mod_in)),
      [stab_p]   "I" (DEF_OFS(stab_p)),
      [ind_mask] "I" (DEF_OFS(ind_mask)),
      [op_out]   "I" (DEF_OFS(op_out)),
      [ol_lin]   "I" (DEF_OFS(ol_lin)),
      [mod_mul]  "I" (DEF_OFS(mod_mul)),
      [L_vol]    "I" (DEF_OFS(L_vol)),
      [R_vol]    "I" (DEF_OFS(R_vol)),
      [S]        "I" (sizeof(op_prop_t))
    // clobber reg list
    : "r2", "r3", "r4", "r5", "r6", "r7", "r8", "cc", "memory"
  ); // __asm__

  // r0 no longer holds a pointer here: the asm overwrote it with the packed
  // L/R sample, which is handed back as the function result.
  return (int32_t) slot_p;
} // int32_t acc_calc_slot()
STM32F4-Discovery (STM32F407VGT6) および Nucleo-F303K8 (STM32F303K8T6) での所要サイクル数の測定結果は下のようになっています。
// STAB in SRAM
// Atollic TrueSTUDIO for STM32 v9.0.0 (gcc 6.3.1)
// 48 cycle / 2slot (STM32F407 (CM4) @ 30 MHz, Flash latency=0, NSLOT=256)
// 48 cycle / 2slot (STM32F407 (CM4) @ 168 MHz, Flash latency=5, NSLOT=256)
// 48 cycle / 2slot (STM32F303 (CM4) @ 24 MHz, Flash latency=0, NSLOT=64)
// 52 cycle / 2slot (STM32F303 (CM4) @ 64 MHz, Flash latency=2, NSLOT=64)
//
// STAB in flash
// Atollic TrueSTUDIO for STM32 v9.0.0 (gcc 6.3.1)
// 52 cycle / 2slot (STM32F407 (CM4) @ 30 MHz, Flash latency=0, NSLOT=256)
// 62 cycle / 2slot (STM32F407 (CM4) @ 168 MHz, Flash latency=5, NSLOT=256)
// 48 cycle / 2slot (STM32F303 (CM4) @ 24 MHz, Flash latency=0, NSLOT=64)
// 62 cycle / 2slot (STM32F303 (CM4) @ 64 MHz, Flash latency=2, NSLOT=64)