新版FM音源プログラム (27)

gcc の「インライン・アセンブラ」で書いた ARMv7-M (Cortex-M3) 版の acc_calc_slot() 関数のリストを下に示します。 「中身」は armcc 版と同等です。

#include "slot.h"
//
// gcc inline assembly language function (ARMv7-M / Thumb-2)
//
// Processes the operator slots starting at o_p in pairs: the first slot of
// each pair is the series-FM modulator, the second (at offset
// sizeof(op_prop_t)) is the carrier.  The outputs of both slots are weighted
// by their L/R volumes and accumulated into two 32-bit sums.
//
// o_p      : first op_prop_t of the slot array; the ph_acc, mod_in and
//            op_out fields of every processed slot are updated in place
// num_slot : slot count; it is decremented by 2 per iteration and tested at
//            the bottom of the loop, so one pair is always processed even
//            when num_slot <= 0, and an odd count is rounded up to a pair
//
// Returns a packed word: bits [31:16] hold the left accumulator and bits
// [15:0] the right accumulator, each shifted right by LR_VOL_SHIFT and
// saturated to a signed 16-bit value.
//
int32_t acc_calc_slot(op_prop_t *o_p, int num_slot)
{
  // The template below names r0 and r1 literally, so the operands must be
  // pinned to exactly those registers.  A plain "+r" constraint lets the
  // compiler choose any register (e.g. at -O0 or after inlining); local
  // register variables used as asm operands are guaranteed to be placed
  // in the named register.
  register uint32_t r0_io  __asm__("r0") = (uint32_t) o_p; // in: o_p, out: packed result
  register int      r1_cnt __asm__("r1") = num_slot;       // loop counter, then scratch

  __asm__ __volatile__ (
// Use UAL (Unified Assembler Language) syntax
  " .syntax  unified             \n\t"
  "  movs    r8,#0               \n\t" // clear r8=acc_R
  "  mov     r9,r8               \n\t" // clear r9=acc_L
  "1:                            \n\t"
//
//  series FM modulator slot
//
  "  ldr     r2,[r0,%[ph_inc]]   \n\t" // r2=ph_inc
  "  ldr     r3,[r0,%[ph_acc]]   \n\t" // r3=ph_acc
  "  ldr     r4,[r0,%[mod_in]]   \n\t" // r4=mod_in
  "  ldr     r5,[r0,%[stab_p]]   \n\t" // r5=stab_p
  "  adds    r2, r2,r3           \n\t" // r2=ph_inc+ph_acc
  "  ldrh    r3,[r0,%[ind_mask]] \n\t" // r3=ind_mask
  "  str     r2,[r0,%[ph_acc]]   \n\t" // update ph_acc
  "  adds    r4, r4,r2           \n\t" // r4=ph_inc+ph_acc+mod_in (=phh)
  "  ands    r3, r3,r4,lsr %[acc_sft] \n\t" // r3 & (r4 >> acc_sft) = byte offset into table
  "  ldr     r2,[r0,%[ol_lin]]   \n\t" // r2=ol_lin
  "  ldrsh   r3,[r5,r3]          \n\t" // r3=*(int16_t *)(stab_p+r3)
//
// 2-tap FIR filter for feedback
//
  "  ldrsh   r5,[r0,%[op_out]]   \n\t" // r5=op_out (=prev_out)
  "  muls    r2, r3,r2           \n\t" // r2=r3*ol_lin
  "  asrs    r4, r2,#15          \n\t" // r4=(r2 >> 15) (=out)
  "  ldrh    r3,[r0,%[mod_mul]]  \n\t" // r3=mod_mul
  "  strh    r4,[r0,%[op_out]]   \n\t" // op_out=out
  "  adds    r5, r5,r4           \n\t" // r5=out+prev_out
  "  muls    r3, r5,r3           \n\t" // r3=r5*mod_mul
  "  str     r3,[r0,%[mod_in]]   \n\t" // mod_in=r3
//
// series FM carrier slot (fields addressed at offset S = sizeof(op_prop_t))
//
  "  ldr     r2,[r0,%[ph_inc]+%c[S]]  \n\t" // r2=ph_inc
  "  ldr     r3,[r0,%[ph_acc]+%c[S]]  \n\t" // r3=ph_acc
  "  ldrh    r5,[r0,%[mod_mul]+%c[S]] \n\t" // r5=mod_mul
  "  adds    r2, r2,r3            \n\t" // r2=ph_inc+ph_acc
  "  str     r2,[r0,%[ph_acc]+%c[S]]  \n\t" // update ph_acc
  "  mla     r2, r4, r5, r2       \n\t" // r2+=(r4*r5): modulator out * mod_mul
  "  ldrh    r3,[r0,%[ind_mask]+%c[S]] \n\t" // r3=ind_mask
  "  ldr     r5,[r0,%[stab_p]+%c[S]]  \n\t" // r5=stab_p
  "  and     r3, r3,r2,lsr %[acc_sft] \n\t" // r3 & (r2 >> acc_sft) = byte offset into table
  "  ldr     r2,[r0,%[ol_lin]+%c[S]]  \n\t" // r2=ol_lin
  "  ldrsh   r3,[r5,r3]           \n\t" // r3=*(int16_t *)(stab_p+r3)
  "  muls    r2, r3,r2            \n\t" // r2=r3*ol_lin
  "  asrs    r2, r2,#15           \n\t" // r2>>=15 (=out)
  "  ldrh    r6,[r0,%[L_vol]+%c[S]]   \n\t" // r6=L_vol1
  "  ldrh    r7,[r0,%[L_vol]+2+%c[S]] \n\t" // r7=L_vol0
  "  ldrh    r3,[r0,%[R_vol]+%c[S]]   \n\t" // r3=R_vol1
  "  ldrh    r5,[r0,%[R_vol]+2+%c[S]] \n\t" // r5=R_vol0
  "  strh    r2,[r0,%[op_out]+%c[S]]  \n\t" // op_out=out
  "  mla     r8, r2, r3, r8       \n\t" // acc_R+=(op_out1*R_vol1)
  "  mla     r8, r4, r5, r8       \n\t" // acc_R+=(op_out0*R_vol0)
  "  mla     r9, r2, r6, r9       \n\t" // acc_L+=(op_out1*L_vol1)
  "  mla     r9, r4, r7, r9       \n\t" // acc_L+=(op_out0*L_vol0)
  "  adds    r0, r0,%[S]*2        \n\t" // adv. to next slot pair
  "  subs    r1, r1,#2            \n\t" // decr. loop counter
  "  bgt     1b                   \n\t" // more to do
// post scaling and saturate to 16 bit
  "  ssat    r1, #16, r9, asr %[vol_sft] \n\t" // L-ch
  "  ssat    r0, #16, r8, asr %[vol_sft] \n\t" // R-ch
// pack two halfwords (16 bit, 16 bit) to single word (32 bit)
  "  bfi     r0, r1,#16,#16       \n\t" // result[31:16]=acc_L[15:0]

// output reg list (both read and written by the template)
  : "+r" (r0_io),    // r0: slot pointer on entry, packed result on exit
    "+r" (r1_cnt)    // r1: remaining slot count, reused as L-ch scratch
// input parameter list ("I" for offset constant)
  : [acc_sft]  "I" (PH_ACC_FRAC_BITS-1),
    [vol_sft]  "I" (LR_VOL_SHIFT),
    [ph_inc]   "I" (DEF_OFS(ph_inc)),
    [ph_acc]   "I" (DEF_OFS(ph_acc)),
    [mod_in]   "I" (DEF_OFS(mod_in)),
    [stab_p]   "I" (DEF_OFS(stab_p)),
    [ind_mask] "I" (DEF_OFS(ind_mask)),
    [op_out]   "I" (DEF_OFS(op_out)),
    [ol_lin]   "I" (DEF_OFS(ol_lin)),
    [mod_mul]  "I" (DEF_OFS(mod_mul)),
    [L_vol]    "I" (DEF_OFS(L_vol)),
    [R_vol]    "I" (DEF_OFS(R_vol)),
    [S]        "I" (sizeof(op_prop_t))
// clobber reg list
  : "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9",
    "cc", "memory"
  );// __asm__
  return (int32_t) r0_io;
} // int32_t acc_calc_slot()

PSoC5LP Prototyping Kit (CY8CKIT-059) での所要サイクル数の測定結果は下のようになっています。

// STAB in SRAM
// PSoC Creator 4.0 SP1 (gcc 4.9.3), NSLOT=128
// 60 cycle / 2slot (CY8C5888 (CM3) @ 16 MHz, flash wait=0)
// 60 cycle / 2slot (CY8C5888 (CM3) @ 80 MHz, flash wait=4)

// STAB in flash
// PSoC Creator 4.0 SP1 (gcc 4.9.3), NSLOT=128
// 62 cycle / 2slot (CY8C5888 (CM3) @ 16 MHz, flash wait=0)
// 72 cycle / 2slot (CY8C5888 (CM3) @ 80 MHz, flash wait=4)