新版FM音源プログラム (26)

gcc の「インライン・アセンブラ」で書いた ARMv6-M (Cortex-M0) 版の acc_calc_slot() 関数のリストを下に示します。 「中身」は armcc 版と同等です。

#include "slot.h"
//
// gcc inline assembly language function
//
/*
 * acc_calc_slot - process op_prop_t entries two at a time in hand-written
 * Thumb assembly and return the saturated, packed left/right sums.
 *
 * Per loop pass (entry 0 at the current pointer, entry 1 at +sizeof(op_prop_t)):
 *   - entry 0: ph_acc += ph_inc; the phase (ph_acc + mod_in) is shifted right
 *     by acc_sft, masked with ind_mask and used as a byte offset into stab_p
 *     for a signed halfword load; out0 = (sample * ol_lin) >> 15 is stored in
 *     op_out, and mod_in is set to (out0 + previous op_out) * mod_mul.
 *   - entry 1: ph_acc += ph_inc; the phase is ph_acc + out0 * mod_mul (the
 *     mod_mul of entry 1); same table lookup; out1 is stored in op_out.
 *   - the 32-bit words read at R_vol / L_vol of entry 1 are split into a high
 *     halfword (multiplies out0) and a low halfword (multiplies out1) and the
 *     products are added to the right / left accumulators.
 * After the loop both accumulators are shifted right arithmetically by
 * vol_sft and clamped to [-0x8000, 0x7fff].
 *
 * o_p      : first entry to process.
 * num_slot : number of entries; decremented by 2 per pass.
 *            NOTE(review): the count is tested at the bottom of the loop, so
 *            one pair is processed even when num_slot <= 0 - callers should
 *            pass a positive (presumably even) count.
 *
 * Returns  : left sum in bits 31..16, right sum in bits 15..0.
 */
int32_t acc_calc_slot(op_prop_t *o_p, int num_slot)
{
  // The assembly text below names r0 and r1 literally, so the two in/out
  // operands are pinned to exactly those registers with local register
  // variables.  A bare "+r" constraint would leave the choice (and the
  // order) of registers to the compiler.
  register op_prop_t *slot_r0  __asm__("r0") = o_p;      // r0: entry pointer in, packed result out
  register int        count_r1 __asm__("r1") = num_slot; // r1: remaining entry count

  __asm__ __volatile__ (
// Use UAL (Unified Assembler Language) syntax
// NOTE(review): the syntax mode is not switched back at the end of this
// block - confirm the toolchain resets it for the code that follows.
  " .syntax  unified     \n\t"
  "  movs    r6,#0       \n\t" // clear r6=acc_R
  "  movs    r7,r6       \n\t" // clear r7=acc_L
  "1:                    \n\t"
//
//  series FM modulator slot
//
  "  ldr     r2,[r0,%[ph_inc]]   \n\t" // r2=ph_inc
  "  ldr     r3,[r0,%[ph_acc]]   \n\t" // r3=ph_acc
  "  ldr     r4,[r0,%[mod_in]]   \n\t" // r4=mod_in
  "  ldr     r5,[r0,%[stab_p]]   \n\t" // r5=stab_p
  "  adds    r2, r2,r3           \n\t" // r2=ph_inc+ph_acc
  "  ldrh    r3,[r0,%[ind_mask]] \n\t" // r3=ind_mask
  "  str     r2,[r0,%[ph_acc]]   \n\t" // update ph_acc
  "  adds    r4, r4,r2           \n\t" // r4=ph_inc+ph_acc+mod_in (= phh)
  "  lsrs    r4, r4,%[acc_sft]   \n\t" // r4 >>= acc_sft (logical)
  "  ands    r3, r3,r4           \n\t" // r3=ind_mask & r4 (byte offset)
  "  ldr     r2,[r0,%[ol_lin]]   \n\t" // r2=ol_lin
  "  ldrsh   r3,[r5,r3]          \n\t" // r3=*(int16_t *)(stab_p+r3)
//
// 2-tap FIR filter for feedback
//
  "  ldrh    r5,[r0,%[op_out]]   \n\t" // r5=op_out (=prev_out)
  "  sxth    r5, r5              \n\t" // sign extend
  "  muls    r2, r3,r2           \n\t" // r2=r3*ol_lin
  "  asrs    r4, r2,#15          \n\t" // r4=(r2 >> 15) (=out)
  "  ldrh    r3,[r0,%[mod_mul]]  \n\t" // r3=mod_mul
  "  strh    r4,[r0,%[op_out]]   \n\t" // op_out=out
  "  adds    r5, r5,r4           \n\t" // r5=out+prev_out
  "  muls    r3, r5,r3           \n\t" // r3=r5*mod_mul
  "  str     r3,[r0,%[mod_in]]   \n\t" // mod_in=r3
//
// series FM carrier slot
//
  "  ldr     r2,[r0,%[ph_inc]+%c[S]]  \n\t" // r2=ph_inc
  "  ldr     r3,[r0,%[ph_acc]+%c[S]]  \n\t" // r3=ph_acc
  "  ldrh    r5,[r0,%[mod_mul]+%c[S]] \n\t" // r5=mod_mul
  "  adds    r2, r2,r3            \n\t" // r2=ph_inc+ph_acc
  "  str     r2,[r0,%[ph_acc]+%c[S]]  \n\t" // update ph_acc
  "  muls    r5, r4,r5            \n\t" // r5=out*mod_mul
  "  adds    r2, r2,r5            \n\t" // r2=ph_inc+ph_acc+out*mod_mul (=phh)
  "  ldrh    r3,[r0,%[ind_mask]+%c[S]] \n\t" // r3=ind_mask
  "  ldr     r5,[r0,%[stab_p]+%c[S]]  \n\t" // r5=stab_p
  "  lsrs    r2, r2,%[acc_sft]    \n\t" // r2 >>= acc_sft (logical)
  "  ands    r3, r2,r3            \n\t" // r3=ind_mask & r2 (byte offset)
  "  ldr     r2,[r0,%[ol_lin]+%c[S]]  \n\t" // r2=ol_lin
  "  ldrsh   r3,[r5,r3]           \n\t" // r3=*(int16_t *)(stab_p+r3)
  "  muls    r2, r3,r2            \n\t" // r2=r3*ol_lin
  "  asrs    r2, r2,#15           \n\t" // r2 >>= 15 (=out)
  "  strh    r2,[r0,%[op_out]+%c[S]]  \n\t" // op_out=out
//
// accumulator
//
// r0 = *op_prop, r1 = num_slot
// r2 = op_out1,  r4 = op_out0
// r6 = acc_R,    r7 = acc_L
// r3, r5 = free
//
  "  ldr     r3,[r0,%[R_vol]+%c[S]] \n\t" // r3=R_vol0:R_vol1
  "  lsrs    r5, r3, #16            \n\t" // r5=R_vol0
  "  uxth    r3, r3                 \n\t" // r3=R_vol1
  "  muls    r3, r2,r3              \n\t" // r3*=op_out1
  "  muls    r5, r4,r5              \n\t" // r5*=op_out0
  "  adds    r6, r6,r3              \n\t" // acc_R+=r3
  "  adds    r6, r6,r5              \n\t" // acc_R+=r5
  "  ldr     r3,[r0,%[L_vol]+%c[S]] \n\t" // r3=L_vol0:L_vol1
  "  lsrs    r5, r3, #16            \n\t" // r5=L_vol0
  "  uxth    r3, r3                 \n\t" // r3=L_vol1
  "  muls    r3, r2,r3              \n\t" // r3*=op_out1
  "  muls    r5, r4,r5              \n\t" // r5*=op_out0
  "  adds    r7, r7,r3              \n\t" // acc_L+=r3
  "  adds    r7, r7,r5              \n\t" // acc_L+=r5
//
  "  adds    r0, r0,%[S]*2    \n\t" // adv. to next slot pair
  "  subs    r1, r1,#2        \n\t" // decr. loop counter
  "  bgt     1b               \n\t" // more to do
//
// post scaling and saturate to 16-bit
//
  "  ldr     r3,=0x7fff   \n\t" // r3=0x00007fff
  "  mvns    r4, r3       \n\t" // r4=0xffff8000
//                              // saturate acc_R
  "  asrs    r6, r6,%[vol_sft] \n\t" // acc_R post scaling
  "  bmi     2f           \n\t" // branch if negative
  "  cmp     r6, r3       \n\t" // compare to 0x00007fff
  "  blt     3f           \n\t" // no pos overflow
  "  movs    r6, r3       \n\t" // r6=0x00007fff
  "  b       3f           \n\t"
//
  "2:                     \n\t" // neg_r:
  "  cmp     r6, r4       \n\t" // compare to 0xffff8000
  "  bge     3f           \n\t" // no neg overflow
  "  movs    r6, r4       \n\t" // r6=0xffff8000
  "3:                     \n\t" // saturate acc_L
  "  asrs    r7, r7,%[vol_sft] \n\t" // acc_L post scaling
  "  bmi     4f           \n\t" // branch if negative
  "  cmp     r7, r3       \n\t" // compare to 0x00007fff
  "  blt     5f           \n\t" // no pos overflow
  "  movs    r7, r3       \n\t" // r7=0x00007fff
  "  b       5f           \n\t"
//
  "4:                     \n\t" // neg_l:
  "  cmp     r7, r4       \n\t" // compare to 0xffff8000
  "  bge     5f           \n\t" // no neg overflow
  "  movs    r7, r4       \n\t" // r7=0xffff8000
  "5:                     \n\t" // pack (acc_L:acc_R) to R0
  "  lsls    r7, r7, #16  \n\t" // acc_L in top halfword of r7
  "  uxth    r0, r6       \n\t" // acc_R in bottom halfword of r0
  "  orrs    r0, r0,r7    \n\t" // combine them

// in/out operands, both pinned to fixed registers by the declarations above
  : "+r" (slot_r0),  // r0 = entry pointer on entry, packed result on exit
    "+r" (count_r1)  // r1 = loop counter (modified by the loop)
// input parameter list ("I" for offset constant)
  : [acc_sft]  "I" (PH_ACC_FRAC_BITS-1),
    [vol_sft]  "I" (LR_VOL_SHIFT),
    [ph_inc]   "I" (DEF_OFS(ph_inc)),
    [ph_acc]   "I" (DEF_OFS(ph_acc)),
    [mod_in]   "I" (DEF_OFS(mod_in)),
    [stab_p]   "I" (DEF_OFS(stab_p)),
    [ind_mask] "I" (DEF_OFS(ind_mask)),
    [op_out]   "I" (DEF_OFS(op_out)),
    [ol_lin]   "I" (DEF_OFS(ol_lin)),
    [mod_mul]  "I" (DEF_OFS(mod_mul)),
    [L_vol]    "I" (DEF_OFS(L_vol)),
    [R_vol]    "I" (DEF_OFS(R_vol)),
    [S]        "I" (sizeof(op_prop_t))
// clobber reg list
  : "r2", "r3", "r4", "r5", "r6", "r7",
    "cc", "memory"
  );// __asm__
  // r0 no longer holds a pointer here: the asm block left the packed
  // (acc_L:acc_R) value in it.
  return((int32_t) slot_r0);
} // int32_t acc_calc_slot()

PSoC 4200 Prototyping Kit (CY8CKIT-049-42xx) での所要サイクル数の測定結果は下のようになっています。

// PSoC Creator 4.0 SP1 (gcc 4.9.3), NSLOT=64
// 80 cycle / 2slot (CY8C4245 (CM0) @ 24 MHz, Flash wait=0)
// 83 cycle / 2slot (CY8C4245 (CM0) @ 48 MHz, Flash wait=1)