新版FM音源プログラム (28)

gcc の「インライン・アセンブラ」で書いた ARMv7E-M (Cortex-M4) 版の acc_calc_slot() 関数のリストを下に示します。 「中身」は armcc 版と同等です。

#include "slot.h"
// Byte offset of `member` inside op_prop_t; an integer constant expression,
// so it can be passed as an immediate operand to the inline assembly below.
#define DEF_OFS(member) (offsetof(op_prop_t, member))

//
// gcc inline assembly language function
//
int32_t acc_calc_slot(op_prop_t *o_p, int num_slot)
{
  __asm__ __volatile__ (
#if defined(__ARM_ARCH_7EM__)     // ARMv7E-M (Cortex-M4)
// Use UAL (Unified Assembler Language) syntax    
  " .syntax  unified             \n\t"
  "  movs    r7,#0               \n\t" // clear r7=acc_R
  "  mov     r8,r7               \n\t" // clear r8=acc_L
  "1:                            \n\t"
//
//  series FM modulator slot
//
  "  ldr     r2,[r0,%[ph_inc]]   \n\t" // r2=ph_inc
  "  ldr     r3,[r0,%[ph_acc]]   \n\t" // r3=ph_acc
  "  ldr     r4,[r0,%[mod_in]]   \n\t" // r4=mod_in
  "  ldr     r5,[r0,%[stab_p]]   \n\t" // r5=stab_p
  "  adds    r2, r2,r3           \n\t" // r2=ph_inc+ph_acc
  "  ldrh    r3,[r0,%[ind_mask]] \n\t" // r3=ind_mask
  "  str     r2,[r0,%[ph_acc]]   \n\t" // update ph_acc
  "  adds    r4, r4,r2           \n\t" // r4=ph_inc+ph_acc+mod_in (=phh)
  "  and     r3, r3,r4,lsr %[acc_sft] \n\t" // r3 & (r4 >> 8)
  "  ldr     r2,[r0,%[ol_lin]]   \n\t" // r2=ol_lin
  "  ldrsh   r3,[r5,r3]          \n\t" // r3=*(int16_t *)(stab_p+r3)
//
// 2-tap FIR filter for feedback
//
  "  ldrsh   r5,[r0,%[op_out]]   \n\t" // r5=op_out (=prev_out)
  "  muls    r2, r3,r2           \n\t" // r2=r3*ol_lin
  "  asrs    r4, r2,#15          \n\t" // r4=(r2>>15) (=out)
  "  ldrh    r3,[r0,%[mod_mul]]  \n\t" // r3=mod_mul
  "  strh    r4,[r0,%[op_out]]   \n\t" // op_out=out
  "  adds    r5, r5,r4           \n\t" // r5=out+prev_out
  "  muls    r3, r5,r3           \n\t" // r3=r5*mod_mul
  "  str     r3,[r0,%[mod_in]]   \n\t" // mod_in=r3
//
// series FM carrier slot
//
  "  ldr     r2,[r0,%[ph_inc]+%c[S]]  \n\t" // r2=ph_inc
  "  ldr     r3,[r0,%[ph_acc]+%c[S]]  \n\t" // r3=ph_acc
  "  ldrh    r5,[r0,%[mod_mul]+%c[S]] \n\t" // r5=mod_mul
  "  adds    r2, r2,r3            \n\t" // r2=ph_inc+ph_acc
  "  str     r2,[r0,%[ph_acc]+%c[S]]  \n\t" // update ph_acc
  "  mla     r2, r4, r5, r2       \n\t" // r2=(r4*r5)+r2
  "  ldrh    r3,[r0,%[ind_mask]+%c[S]] \n\t" // r3=ind_mask
  "  ldr     r5,[r0,%[stab_p]+%c[S]]  \n\t" // r5=stab_p
  "  and     r3, r3,r2,lsr %[acc_sft] \n\t" // r3&(r2>>8)
  "  ldr     r2,[r0,%[ol_lin]+%c[S]]  \n\t" // r2=ol_lin
  "  ldrsh   r3,[r5,r3]           \n\t" // r3=*(int16_t *)(stab_p+r3)
  "  ldr     r5,[r0,%[R_vol]+%c[S]] \n\t" // r5=R volume
  "  ldr     r6,[r0,%[L_vol]+%c[S]] \n\t" // r6=L volume
  "  muls    r2, r3,r2            \n\t" // r2=r3*ol_lin
  "  asrs    r2, r2,#15           \n\t" // r2>>=15 (=out)
  "  bfi     r2, r4,#16,#16       \n\t" // r2[31:16]=r4[15:0]
  "  str     r2,[r0,%[op_out]+%c[S]]  \n\t" // op_out=out
// acc_R += (op_out0*R_vol0) + (op_out1*R_vol1)
  "  smlad   r7, r2, r5, r7           \n\t"
// acc_L += (op_out0*L_vol0) + (op_out1*L_vol1)
  "  smlad   r8, r2, r6, r8           \n\t"
  "  adds    r0, r0,%[S]*2    \n\t" // adv. to next slot pair
  "  subs    r1, r1,#2        \n\t" // decrement loop counter
  "  bgt     1b               \n\t" // more to do
// post scaling and saturate to 16 bit
  "  ssat    r1, #16,r8,asr %[vol_sft] \n\t" // L-ch
  "  ssat    r0, #16,r7,asr %[vol_sft] \n\t" // R-ch
// pack two 16-bit halfwords to single 32-bit word
  "  bfi     r0, r1,#16,#16   \n\t" // result[31:16]=acc_L[15:0]

// output reg list
  : "+r" (o_p),      // [arg1] = r0 = (op_prop_t *) o_p
    "+r" (num_slot)  // [arg2] = r1 = (int) num_slot
// input parameter list ("I" for offset constant)
  : [acc_sft]  "I" (PH_ACC_FRAC_BITS-1),
    [vol_sft]  "I" (LR_VOL_SHIFT),
    [ph_inc]   "I" (DEF_OFS(ph_inc)),
    [ph_acc]   "I" (DEF_OFS(ph_acc)),
    [mod_in]   "I" (DEF_OFS(mod_in)),
    [stab_p]   "I" (DEF_OFS(stab_p)),
    [ind_mask] "I" (DEF_OFS(ind_mask)),
    [op_out]   "I" (DEF_OFS(op_out)),
    [ol_lin]   "I" (DEF_OFS(ol_lin)),
    [mod_mul]  "I" (DEF_OFS(mod_mul)),
    [L_vol]    "I" (DEF_OFS(L_vol)),
    [R_vol]    "I" (DEF_OFS(R_vol)),
    [S]        "I" (sizeof(op_prop_t))
// clobber reg list
  : "r2", "r3", "r4", "r5", "r6", "r7", "r8",
    "cc", "memory"
  );// __asm__
  return((int32_t) o_p);
} // int32_t acc_calc_slot()

STM32F4-Discovery (STM32F407VGT6) および Nucleo-F303K8 (STM32F303K8T6) での所要サイクル数の測定結果は下のようになっています。

// STAB in SRAM
// Atollic TrueSTUDIO for STM32 v9.0.0 (gcc 6.3.1)
// 48 cycle / 2slot (STM32F407 (CM4) @  30 MHz, Flash latency=0, NSLOT=256)
// 48 cycle / 2slot (STM32F407 (CM4) @ 168 MHz, Flash latency=5, NSLOT=256)

// 48 cycle / 2slot (STM32F303 (CM4) @  24 MHz, Flash latency=0, NSLOT=64)
// 52 cycle / 2slot (STM32F303 (CM4) @  64 MHz, Flash latency=2, NSLOT=64)

// STAB in flash
// Atollic TrueSTUDIO for STM32 v9.0.0 (gcc 6.3.1)
// 52 cycle / 2slot (STM32F407 (CM4) @  30 MHz, Flash latency=0, NSLOT=256)
// 62 cycle / 2slot (STM32F407 (CM4) @ 168 MHz, Flash latency=5, NSLOT=256)

// 48 cycle / 2slot (STM32F303 (CM4) @  24 MHz, Flash latency=0, NSLOT=64)
// 62 cycle / 2slot (STM32F303 (CM4) @  64 MHz, Flash latency=2, NSLOT=64)