円周率の計算 (15) -- BBP 公式による計算 (4)

ATtiny10 版のプログラムができました。
全てアセンブラ記述で、プログラム・サイズは 860 バイト (430 ワード) です。 思ったより小さくなりました。
SRAM の使用量はデータ 22 バイト、スタック 6 バイトの計 28 バイトで、まだ 4 バイト余っています。
内蔵 8 MHz クロックを使い、PB2 (4 番ピン) から 38.4 kbps のシリアル信号を出力します。
実行時間は 1000 桁の出力で約 2 分、8191 桁の出力で約 3 時間 27 分です。
プログラム・サイズに余裕があったので、出力フォーマットを

  • 10 桁ごとに空白を入れ、1 行に 50 桁を出力、
  • 20 行ごと、つまり 1000 桁ごとに空行を入れる

ように変更しました。
出力をキャプチャした結果を下に示します。 (途中省略)

ATtiny10
Pi by BBP algorithm (8191 hex digits)
pi = 3.

243F6A8885 A308D31319 8A2E037073 44A4093822 299F31D008 
2EFA98EC4E 6C89452821 E638D01377 BE5466CF34 E90C6CC0AC 
29B7C97C50 DD3F84D5B5 B547091792 16D5D98979 FB1BD1310B 
A698DFB5AC 2FFD72DBD0 1ADFB7B8E1 AFED6A267E 96BA7C9045 
F12C7F9924 A19947B391 6CF70801F2 E2858EFC16 636920D871 
574E69A458 FEA3F4933D 7E0D95748F 728EB65871 8BCD588215 
4AEE7B54A4 1DC25A59B5 9C30D5392A F26013C5D1 B023286085 
F0CA417918 B8DB38EF8E 79DCB0603A 180E6C9E0E 8BB01E8A3E 
D71577C1BD 314B2778AF 2FDA55605C 60E65525F3 AA55AB9457 
48986263E8 144055CA39 6A2AAB10B6 B4CC5C3411 41E8CEA154 
86AF7C72E9 93B3EE1411 636FBC2A2B A9C55D7418 31F6CE5C3E 
169B87931E AFD6BA336C 24CF5C7A32 5381289586 773B8F4898 
6B4BB9AFC4 BFE81B6628 219361D809 CCFB21A991 487CAC605D 
EC8032EF84 5D5DE98575 B1DC262302 EB651B8823 893E81D396 
ACC50F6D6F F383F44239 2E0B4482A4 84200469C8 F04A9E1F9B 
5E21C66842 F6E96C9A67 0C9C61ABD3 88F06A51A0 D2D8542F68 
960FA728AB 5133A36EEF 0B6C137A3B E4BA3BF050 7EFB2A98A1 
F1651D39AF 017666CA59 3E82430E88 8CEE861945 6F9FB47D84 
A5C33B8B5E BEE06F75D8 85C1207340 1A449F56C1 6AA64ED3AA 
62363F7706 1BFEDF7242 9B023D37D0 D724D00A12 48DB0FEAD3 

. . . . < 1001 桁目から 8000 桁目まで省略 > . . . .

6E16369788 D273CCDE96 629281B949 D04C50901B 71C65614E6 
C6C7BD327A 140A45E1D0 06C3F27B9A C9AA53FD62 A80F00BB25 
BFE235BDD2 F671126905 B2040222B6 CBCF7CCD76 9C2B53113E 
C01640E3D3 38ABBD6025 47ADF0BA38 209CF746CE 75

出力桁数は 8192 桁ですが、最後の「75」が正しくは「76」なので、正しい桁数としては 8191 桁になります。
オブジェクトの HEX ファイルを下に示します。

:020000020000FC
:1000000020C000000D0A415474696E7931300D0A28
:1000100050692062792042425020616C676F72699A
:1000200074686D202838313931206865782064691A
:1000300067697473290D0A7069203D20332E0D0AFB
:10004000000008ED0CBF00E006BF0FE50DBF00E0AB
:100050000EBF0FE002B9129A0A9AE2E0F0E4608162
:1000600089B7860F89BF10E080E084AB80E085AB64
:10007000E2E0F0E04FD158D180E090E088A999A962
:10008000E0E4F0E008E011930A95E9F781E064E02C
:10009000BDD084E062E8BAD085E061E8B7D086E000
:1000A00061E8B4D0E8E4F0E024E0829141D184A397
:1000B00082958F7083AB84A38F7080AB839580AB68
:1000C000855089F480AB80E23CD183A3839583ABD8
:1000D000855049F483AB28D185A3839585AB8451A2
:1000E00011F485AB21D180A363A36295607F8F70EB
:1000F000862B84AB2A95C9F688A199A1885F9F4F6A
:1001000088A999A960E070E28617970708F4B8CF2C
:100110000BD1FFCFE0E4F0E0CAE4D0E032A33595A4
:10012000330F2FEF10F4622772278081861F81938F
:100130008081871F819306E0699137FD6227808166
:10014000861F81930A95C1F7089502C0841B950B01
:1001500084179507D8F720E030E014C060FF08C08E
:10016000280F391F18F02417350710F0241B350B02
:10017000880F991F18F08417950710F0841B950BB2
:10018000769567956117710749F7822F932F232B77
:10019000089501E18417950718F0841B950B889446
:1001A000661F771F0A9521F0881F991FB0F3F2CFC1
:1001B000609570950895CC27DD274130510711F1E6
:1001C000E82FF92FC1E0A0E1B0E002C0A41BB50BFD
:1001D000A417B507D8F713C0E0FF08C08C2F9D2FD8
:1001E0006A2F7B2FB2DFC82FD92F61F08A2F9B2F68
:1001F0006A2F7B2FAADFA82FB92FF695E795E11775
:10020000F10751F78C2F9D2FCD2B089583AB62AB57
:1002100040E050E040AB51AB2AC040A351A388A1BD
:1002200099A1841B950B440F551F440F551F440F74
:10023000551F03A3400F511FBEDF99F062A36770E3
:1002400070E083DF71F060E070E0C0E5D0E023E0B3
:10025000A0DF7A936A9360E070E02A95C9F799DF8E
:1002600059DF40A351A34F5F5F4F40AB51AB88A113
:1002700099A14817590788F248A159A140C040A345
:1002800051A3440F551F440F551F440F551F03A37F
:10029000400F511F4130510771F1EAE4F0E006E0F0
:1002A00011930A95E9F7F0A3E8A1FE1BE3E0EF2321
:1002B000EE0FEE0F90E070E060E082A38770E03018
:1002C00031F09695879577956795EA95D1F7FC700B
:1002D000F695C0E5D0E0CF1BD10B23E0F6952F1BA0
:1002E00050F039F056DF7A936A9360E070E02A9517
:1002F000C9F74FDF0FDF40A351A34F5F5F4F40AB04
:1003000051AB88A199A1805F9F4F4817590708F406
:10031000B6CF0895EE0FFF1FE050F04C8191803072
:1003200009F408950ED0FACF8DE00BD08AE009C011
:10033000682F829501D0862F8F708A3008F0895FF0
:10034000805D80959AE073E47A95F1F708F0129A4F
:0C03500008F4129886959A95B1F708956C
:00000001FF

プログラム・サイズに余裕があったので、シリアルのボーレート調節は、内蔵クロック自体を OSCCAL の値を調整して、本来の 8 MHz に近づけるようにしました。
上のオブジェクトのアドレス 0x0002 にデフォルトの OSCCAL 値に対して加算するオフセット値を (チェックサムも変更した上で) 書き込めば、ソースをいじらなくても調整できます。
アセンブラ・ソースを下に示します。
AVRStudio の AVR アセンブラ・プロジェクトを作成し、下のファイルをプロジェクトに登録してアセンブルします。
使用したのは AVRStudio 4.19 (Build 730) で、他の環境では試していません。
シリアル出力端子は、ソース・リストの「outport」定義部分を変更すれば変えることができます。

;
; piBBP10.asm : Pi calculation by BBP algorithm
;               (8191 hexadecimal digits)
;               for Atmel AVR ATtiny10
;               (Flash ROM 1KB, SRAM 32B)
;               using integer arithmetics only
;
;               based on "piqpr8.c" (2006-9-8)
;               by David H. Bailey
;
; 2012/10/24 created by pcm1723
; 
;       .include "tn10def.inc"
        .nolist
        .include "tn10def.inc"
        .list
 
;
; select output port for software serial TxD
; 
;       .set    outport = PORTB0  ; PB0 (pin 1)
;       .set    outport = PORTB1  ; PB1 (pin 3)
        .set    outport = PORTB2  ; PB2 (pin 4)
;       .set    outport = PORTB3  ; PB3 (pin 6)

        .dseg
;
; global variables
;
        .equ    N_digits = 8192
        .equ    N_step   = 8
        .equ    N_frac   = 4
;
sum:    .byte   (2*N_frac)      ; uint16_t sum[N_frac]
id:     .byte   2       ; uint16_t id
fract:  .byte   (2*N_frac-2)    ; uint16_t fract[N_frac-1]
k:      .byte   2       ; uint16_t k
coef:   .byte   1       ; uint8_t  coef
m:      .byte   1       ; uint8_t  m
h_cnt1: .byte   1       ; output column counter 1
h_cnt2: .byte   1       ; output column counter 2
;
        .cseg
        rjmp    reset_entry
;
OSCCAL_adj:
        .dw     0x00
;
banner_str:
        .db     0x0d,0x0a
        .db     'A','T','t','i','n','y','1','0'
        .db     0x0d,0x0a
        .db     'P','i',' ','b','y',' ','B','B','P',' '
        .db     'a','l','g','o','r','i','t','h','m', ' '
        .db     '(','8','1','9','1',' ','h','e','x',' '
        .db     'd','i','g','i','t','s',')',0x0d
        .db     0x0a,'p','i',' ','=',' ','3','.'
        .db     0x0d,0x0a,0,0
;
reset_entry:
;
; setup clock prescaler to 1/1 (8 MHz)
;
        ldi     r16,0xD8    ; protect signature
        out     CCP,r16     ; CCP <-- 0xD8
        ldi     r16,0x00    ; 0x00 = 1/1 
        out     CLKPSR,r16  ; set CLocK PreScaleR
;
; setup SP
;
        ldi     r16,low(RAMEND) ; RAMEND
        out     SPL,r16         ; set low byte of SP
        ldi     r16,high(RAMEND) ; RAMEND
        out     SPH,r16         ; set high byte of SP
;
; setup GPIO
;
        ldi     r16,0x0F
        out     PORTB,r16    ; enable pull up 
        sbi     PORTB,outport ; PBx = "H"
        sbi     DDRB,outport ; PBx = OUTPUT
;
; adjust OSCCAL value
;
        ldi     r30,low(2*OSCCAL_adj+MAPPED_FLASH_START)
        ldi     r31,high(2*OSCCAL_adj+MAPPED_FLASH_START)
        ld      r22,Z           ; get adjust value for OSCCAL
        in      r24,OSCCAL
        add     r24,r22         
        out     OSCCAL,r24      ; adjust OSCCAL
main:
        ldi     r17,0           ; zero register
        ldi     r24,0x00
        sts     h_cnt1,r24      ; initialize column counter 1, 0
        ldi     r24,0x00
        sts     h_cnt2,r24      ; inutialize column counter 2
        ldi     r30,low(banner_str)
        ldi     r31,high(banner_str)
        rcall   xmit_str        ; transmit banner stirng
        rcall   xmit_nl         ; CR/LF
; for (id = 0; id < N_digits; id += N_step)
        ldi     r24,0
        ldi     r25,0
        sts     id,r24
        sts     id+1,r25        ; id = 0
id_loop1:
;
; clear sum[]
;
        ldi     r30,low(sum)
        ldi     r31,high(sum)
        ldi     r16,(N_frac*2)
clr_sum1:
        st      Z+,r17
        dec     r16
        brne    clr_sum1
;
        ldi     r24,1           ; m    = 1
        ldi     r22,0x04        ; coef = 4, sub_flag = 0
        rcall   series          ; series(id, m, coef)
;
        ldi     r24,4           ; m    = 4
        ldi     r22,0x82        ; coef = 2, sub_flag = 1
        rcall   series          ; series(id, m, coef)
;
        ldi     r24,5           ; m    = 5
        ldi     r22,0x81        ; coef = 1, sub_flag = 1
        rcall   series          ; series(id, m, coef)
;
        ldi     r24,6           ; m    = 6
        ldi     r22,0x81        ; coef = 1, sub_flag = 1
        rcall   series          ; series(id, m, coef)
;
; now, result in sum[0]..sum[N_frac-1]
;
        ldi     r30,low(sum+2*N_frac)
        ldi     r31,high(sum+2*N_frac)  ; Z = (R31:R30) = &(sum[N_frac])
        ldi     r18,4                   ; set loop counter to 4, (4 * 2 = 8 hex digits)
hex_out1:
        ld      r24,--Z                 ; get data byte
        rcall   xmit_hex2               ; transmit 2 hex digits
;
; output formatting
;
        lds     r24,h_cnt1
        swap    r24
        andi    r24,0x0F
        sts     m,r24           ; unpack column counter1
;
        lds     r24,h_cnt1
        andi    r24,0x0F
        sts     k,r24           ; unpack column counter0
;
        inc     r24             ; (cnt0 + 1)
        sts     k,r24           ; update cnt0
        subi    r24,5           ; modulo 5
        brne    hex_out3
        sts     k,r24           ; clear cnt0
        ldi     r24,0x20
        rcall   xmit            ; output a space
;
        lds     r24,m           ; R24 = cnt1
        inc     r24             ; (cnt1 + 1)
        sts     m,r24           ; update cnt1
        subi    r24,5           ; modulo 5
        brne    hex_out3
        sts     m,r24           ; clear cnt1
        rcall   xmit_nl         ; output CR/LF
;
        lds     r24,h_cnt2      ; R24 = h_cnt2
        inc     r24             ; (h_cnt2 + 1)
        sts     h_cnt2,r24      ; update h_cnt2
        subi    r24,20          ; modulo 20
        brne    hex_out3
        sts     h_cnt2,r24      ; clear h_cnt2
        rcall   xmit_nl         ; output CR/LF
hex_out3:
;
; pack column counters
;
        lds     r24,k           ; R24 = cnt0
        lds     r22,m           ; R22 = cnt1
        swap    r22             ; lower nibble to upper nibble
        andi    r22,0xF0        ; upper nibble
        andi    r24,0x0F        ; lower nibble
        or      r24,r22         ; combine
        sts     h_cnt1,r24      ; update h_cnt1
        dec     r18                     ; decrement loop counter
        brne    hex_out1                ; more to do?
;
; loop condition check
;
        lds     r24,id
        lds     r25,id+1                ; (R25:R24) = id
        subi    r24,low(-N_step)
        sbci    r25,high(-N_step)       ; (R25:R24) + N_step
        sts     id,r24
        sts     id+1,r25                ; id += N_step
        ldi     r22,low(N_digits)
        ldi     r23,high(N_digits)
        cp      r24,r22
        cpc     r25,r23                 ; (id - N_digits)
        brsh    id_loop_end             ; loop finished
        rjmp    id_loop1                ; for (id = ...) loop
;
id_loop_end:
        rcall   xmit_nl                 ; transmit a newline
        rjmp    pc                      ; 
;
;
;        .include        "testprog.inc"
;
;
;
;---------------------------------------------------------------------------;
;
; Prototype : 
;
; void add_sub_frac( void );
;
; Least significant 16-bit word of fract left in R23:R22
;
; local variable:
;                 (uint8_t *) s_p = R31:R30 (Z,  sum[] pointer )
;                 (uint8_t *) f_p = R29:R28 (Y,  fract[] pointer)
;
;                 uint8_t ff = R22
;                              R19 : sub_flag
;                              R18 : const. 0xFF for 1's complement
;                              R16 : loop counter
;
add_sub_frac:
        ldi     r30,low(sum)
        ldi     r31,high(sum)   ; (R31:R30) = &(sum[0])
        ldi     r28,low(fract)
        ldi     r29,high(fract) ; (R29:R28) = &(fract[0])
;
        lds     r19,coef        ; b7 of coef is sub_flag
        asr     r19             ; copy b7
        lsl     r19             ; C = sub_flag, b7 = sub_flag
        ser     r18             ; R18 = 0xff
        brcc    asf_1           ; branch if not subtract
        eor     r22,r18         ; 1's complement for subtract
        eor     r23,r18         ; 1's complement for subtract
asf_1:
        ld      r24,Z           ; R24 = s_p[0]
        adc     r24,r22         ; add/sub fract
        st      Z+,r24          ; store result
        ld      r24,Z           ; R24 = s_p[1]
        adc     r24,r23         ; add/sub fract
        st      Z+,r24          ; store result
;
        ldi     r16,(2*N_frac-2) ; set loop counter
;
asf_2:
        ld      r22,Y+          ; ff (R22) = *(f_p++)
        sbrc    r19,7           ; test sub_flag, skip if not subtract
        eor     r22,r18         ; 1's complement for subtract
asf_3:
        ld      r24,Z           ; R24 = *s_p
        adc     r24,r22         ; (*f_p) + ff
        st      Z+,r24          ; (*f_p++) = result of add/sub
        dec     r16             ; decrement loop counter
        brne    asf_2           ; more to do? 
        ret
;
;---------------------------------------------------------------------------;
;
; Prototype : 
;
; uint16_t mulmod(uint16_t a, uint16_t b, uint16_t ak);
; argument:          R25:R24,    R23:R22,     R21:R20
;
; function result:   R25:R24
;
; local variable :   R19:R18
;

mulmod:
mm_1:
        rjmp	mm_3      	;
mm_2:
        sub     r24, r20
        sbc     r25, r21    ; a = a - ak
mm_3:
        cp  	r24, r20
        cpc 	r25, r21    ; a - ak
        brsh	mm_2         	; branch if SAME or HIGHER
        ldi 	r18, 0x00	; 
        ldi 	r19, 0x00	; r = 0
        rjmp	mm_7     	;

mm_4:
        sbrs	r22, 0  ; skip if (1 == b0)
        rjmp	mm_5   	; jump if (0 == b0)
        add 	r18, r24
        adc 	r19, r25    ; r = r + a
        brcs    mm_45     ; branch if carry occurred
        cp  	r18, r20    ;
        cpc 	r19, r21    ; r - ak
        brlo	mm_5      	; branch if LOWER
mm_45:
        sub  	r18, r20
        sbc 	r19, r21    ; r = r - ak
mm_5:
        lsl 	r24
        rol 	r25
        brcs    mm_55     ; branch if carry occurred
        cp  	r24, r20
        cpc 	r25, r21        ; a - ak
        brlo	mm_6    	; branch if LOWER
mm_55:
        sub 	r24, r20
        sbc 	r25, r21    ; a = a - ak
mm_6:
        lsr 	r23
        ror 	r22     ; b = b / 2
mm_7:
        cp  	r22, r17
        cpc 	r23, r17 ; b - 0
        brne	mm_4      ; branch if (0 != b)
        mov	r24, r18
        mov	r25, r19
        or      r18,r19         ; for zero test
        ret
;
;---------------------------------------------------------------------------;
;
; Prototype : 
;
; uint32_t divmod( uint32_t dividend,   uint16_t divisor);
; argument:          R25:R24:R23:R22,            R21:R20
;                dend3:dend2:dend1:dend0,     visor1:visor0
;
; function result:   R25:R24 = 16 bit remainder
;                  dend3:dend2
;
;                    R23:R22 = 16 bit quotient
;                  dend1:dend0
;
; work register:     R16 = loop counter
;                   lcnt
;
;
; register symbol name defines
;
        .def    dend3  = r25
        .def    dend2  = r24
        .def    dend1  = r23
        .def    dend0  = r22
        .def    visor1 = r21
        .def    visor0 = r20
        .def    lcnt   = r16
;
divmod:
        ldi     lcnt,17         ; set loop count to (16+1)
dm_1:
        cp      dend2,visor0
        cpc     dend3,visor1    ; (dend3:dend2) - (visor1:visor0)
        brcs    dm_2            ; branch if not subtractable
dm_15:
        sub     dend2,visor0
        sbc     dend3,visor1    ; (dend3:dend2) -= (visor1:visor0)
        clc                     ; force Carry cleared
dm_2:
        rol     dend0           ; shift-in  quotient (inverted)
        rol     dend1           ; shift-out lower part of dividend 
        dec     lcnt            ; decrement loop count
        breq    dm_3            ; more to do?
        rol     dend2           ; 
        rol     dend3           ; left shift partial remainder
        brcs    dm_15           ; force subtraction if Carry
        rjmp    dm_1            ; division loop
;
dm_3:
        com     dend0
        com     dend1           ; adjust quotient
        ret
;
;---------------------------------------------------------------------------;
;
; Prototype : 
;
; uint16_t expm ( uint16_t p,  dummy,   uint16_t ak )
; argument:          R25:R24,  R23:R22,     R21:R20
; move  to:          R31:R30
;
; function result:   R25:R24 
;
; work register:     R29:R28 = uint16_t r
;
;                    R27:R26 = uint16_t b
;
expm:
        clr     r28
        clr     r29     ; assume result = 0
        cpi     r20,1
        cpc     r21,r17 ; (ak1:ak0) - 1
        breq    ex_ret  ; (1 == ak) early return
        mov     r30,r24
        mov     r31,r25 ; argument "p" move to R31:R30
        ldi     r28,1   ; (r1:r0) = 1;
        ldi     r26,16  ;
        ldi     r27,0   ; (b1:b0) = 16;
        rjmp    ex_2    ; while loop
ex_1:
        sub     r26,r20
        sbc     r27,r21 ; (b1:b0) -= (ak1:ak0);
ex_2:
;                       ; while (b >= ak) ...
        cp      r26,r20
        cpc     r27,r21 ; (b1:b0) - (ak1:ak0)
        brsh    ex_1    ; branch if SAME or HIGHER
        rjmp    ex_5    ; while loop condition check
ex_3:
        sbrs    r30,0   ; if (0x01 & p)
        rjmp    ex_4    ; branch if not
;                       ; (1 == (0x01 & p))
;                       ; r = (r * b) mod ak
        mov     r24,r28
        mov     r25,r29 ; (r25:r24) = (r)
        mov     r22,r26
        mov     r23,r27 ; (r23:r22) = (b)
;                       ; (r21:r20) is already (ak)
        rcall   mulmod  ; mulmod(r, b, ak)
        mov     r28,r24
        mov     r29,r25 ; (r) = function result
        breq    ex_ret  ; return if result = 0
;
ex_4:                   ; b = (b * b) mod ak
        mov     r24,r26
        mov     r25,r27 ; (r25:r24) = (b)
        mov     r22,r26
        mov     r23,r27 ; (r23:r22) = (b)
;                       ; (r21:r20) is already (ak)
        rcall   mulmod  ; mulmod(b, b, ak)
        mov     r26,r24
        mov     r27,r25 ; (b) = function result
;
        lsr     r31
        ror     r30     ; p >>= 1;
ex_5:                   ; while (p)
        cp      r30,r17
        cpc     r31,r17 ; (p1:p0) - 0
        brne    ex_3    ; go to top of while loop
ex_ret:
        mov     r24,r28
        mov     r25,r29 ; return(r)
        or      r28,r29 ; for zero test
        ret
;---------------------------------------------------------------------------;
;
; Prototype : 
;
; void  series ( uint8_t m,  uint8_t coef )
; argument:            R24,           R22
;
; parameter in global variable:
;
;                  uint16_t id
;                  uint8_t  m     <-- R24 on entry
;                  uint8_t  coef  <-- R22 on entry
;
; grobal variable for local use
;              
;                  uint16_t k  = R21:R20
;
; local variable:
;                  uint16_t ak    = R21:R20 
;                 (uint8_t *) f_p = R29:R28 (Y,  fract[] pointer)
;                  loop counter, misc = R16  
;                 
series:
        sts     m,r24                   ; m = 1
        sts     coef,r22                ; coef = 4
;
; first term of the series
;
; for (k = 0; k < id; k++) {
        ldi     r20,0
        ldi     r21,0
        sts     k,r20
        sts     k+1,r21 ; k = 0
ser_1:
        rjmp    ser_5    ; for() condition check
ser_2:
        lds     r20,k
        lds     r21,k+1         ; (R21:R20) = k
        lds     r24,id
        lds     r25,id+1        ; (R25:R24) = id
        sub     r24,r20
        sbc     r25,r21         ; (R25:R24) = (id - k)
;
        lsl     r20
        rol     r21
        lsl     r20
        rol     r21
        lsl     r20
        rol     r21             ; 8 * k
        lds     r16,m           ; R16 = m
        add     r20,r16
        adc     r21,r17         ; (R21:R20) = (8 * k + m) = ak
        rcall   expm            ; t = (R25:R24) = expm((id - k), ak)
        breq    ser_45          ; skip calc if expm() result = 0
        lds     r22,coef
        andi    r22,0x07        ; extract coef (3 bits)
        ldi     r23,0           ; zero extend to uint16_t
        rcall   mulmod          ; (R25:R24) = mulmod(t, coef, ak)
        breq    ser_45          ; skip calc if mulmod() result = 0
        ldi     r22,0
        ldi     r23,0           ; low word of prem = (R23:R22) = 0
;                               ; prem = (R25:R24:R23:R22)
        ldi     r28,low(fract+(2*N_frac-2))
        ldi     r29,high(fract+(2*N_frac-2))    ; f_p = &(fract[N_frac-1]
        ldi     r18,(N_frac-1)  ; set loop counter to (N_frac-1) times
ser_3:
        rcall   divmod          ; (rem:quo) = (R25:R24:R23:R22) = divmod(prem, ak)
        st      -Y,r23
        st      -Y,r22          ; *(--f_p) = quotient = (R23:R22)
        ldi     r22,0
        ldi     r23,0           ; low word of prem = (R23:R22) = 0
        dec     r18             ; decrement loop count
        brne    ser_3           ; branch if not end of loop
        rcall   divmod          ; (rem:quo) = (R25:R24:R23:R22) = divmod(prem, ak)
;                               ; don't store the last 16-bit word (left in R23:R22)
ser_4:
        rcall   add_sub_frac    ; add_sub_frac()
;
; for-loop end condition test
;
ser_45:
        lds     r20,k
        lds     r21,k+1         ; (R21:R20) = k
        subi    r20,low(-1)
        sbci    r21,high(-1)    ; increment
        sts     k,r20
        sts     k+1,r21         ; k += 1
ser_5:
        lds     r24,id
        lds     r25,id+1
        cp      r20,r24
        cpc     r21,r25         ; (k - id)
        brlo    ser_2           ; branch if LOWER
;
; second term of the series
;
;  for (k = id; k < id + (N_frac * 4); k++){
        lds     r20,id
        lds     r21,id+1
        rjmp    ser_19    ; for() condition check
ser_7:
        lds     r20,k
        lds     r21,k+1         ; (R21:R20) = k
;
        lsl     r20
        rol     r21
        lsl     r20
        rol     r21
        lsl     r20
        rol     r21             ; 8 * k
        lds     r16,m           ; R16 = m
        add     r20,r16
        adc     r21,r17         ; (R21:R20) = (8 * k + m) = ak
        cpi     r20,1
        cpc     r21,r17
        breq    ser_13          ; skip calc if (1 == ak)
;
; clear fract[] buffer
;
        ldi     r30,low(fract)
        ldi     r31,high(fract)         ; Z = (R31:R30) = f_p = &(fract[0])
        ldi     r16,(2*N_frac-2)        ; set loop count
ser_8:
        st      Z+,r17                  ; clear
        dec     r16                     ; decrement loop counter
        brne    ser_8                   ; more to do?
;
        lds     r31,k           ; R31 = (uint8_t) k
        lds     r30,id          ; R30 = (uint8_t) id
        sub     r31,r30         ; R31 = (uint8_t)(k - id)
        ldi     r30,0x03        ; bit mask for b1,b0
        and     r30,r31         ; sft = R30 = (0x03 & (uint8_t)(k - id))
        lsl     r30
        lsl     r30             ; sft <<= 2
;
        ldi     r25,0
        ldi     r23,0
        ldi     r22,0           ; (R25:R24:R23:R22) = 000:coef:0000
        lds     r24,coef
        andi    r24,0x07        ; R24 = coef (sub_flag bit removed)
        cpi     r30,0
        breq    ser_10          ; branch if (0 == sft)
ser_9:
        lsr     r25
        ror     r24
        ror     r23
        ror     r22             ; (R25:R24:R23:R22) >>= 1
        dec     r30             ; decrement loop count
        brne    ser_9           ; more to do?
ser_10:
        andi    r31,0x0c        ; bit mask for b3,b2
        lsr     r31             ; adjust for byte offset
;
        ldi     r28,low(fract+(2*N_frac-2))
        ldi     r29,high(fract+(2*N_frac-2))    ; f_p = &(fract[N_frac-1]
        sub     r28,r31         ; 
        sbc     r29,r17         ; X = (R29:R28) -= (byte offset)
        ldi     r18,(N_frac-1)  ; set loop counter to (N_frac-1) times
        lsr     r31             ; adjust for word count
        sub     r18,r31         ; adjust loop count
        brlo    ser_13          ; if LOWER, skip calc at all
        breq    ser_12          ; Least Significant word calc only
ser_11:
        rcall   divmod          ; (rem:quo) = (R25:R24:R23:R22) = divmod(prem, ak)
        st      -Y,r23
        st      -Y,r22          ; *(--f_p) = quotient = (R23:R22)
        ldi     r22,0
        ldi     r23,0           ; low word of prem = (R23:R22) = 0
        dec     r18             ; decrement loop count
        brne    ser_11           ; branch if not end of loop
ser_12:
        rcall   divmod          ; (rem:quo) = (R25:R24:R23:R22) = divmod(prem, ak)
        rcall   add_sub_frac    ; add_sub_frac()
ser_13:
;
; for-loop end condition test
;
;  for (k = id; k < (id + (N_frac * 4)); k++){
        lds     r20,k
        lds     r21,k+1         ; (R21:R20) = k
        subi    r20,low(-1)
        sbci    r21,high(-1)    ; increment
ser_19:
        sts     k,r20
        sts     k+1,r21         ; k = (R21:R20)
        lds     r24,id
        lds     r25,id+1        ; (R25:R24) = id
        subi    r24,low(-4*N_frac)
        sbci    r25,high(-4*N_frac)     ; (R25:R24) + (4*N_frac)
        cp      r20,r24
        cpc     r21,r25         ; (k - (id + 4 * N_frac))
        brsh    ser_99          ; return if SAME or HIGHER
        rjmp    ser_7           ; branch if LOWER
;
ser_99:
        ret
;---------------------------------------------------------------------------;
;
; omit rcvr() func 
; and add xmit_nl(), xmit_hex1(), xmit_hex2(), xmit_str() func
;
; 2012/10/16 pcm1723
;
;---------------------------------------------------------------------------;
; Software implemented UART module                                          ;
; (C)ChaN, 2005 (http://elm-chan.org/)                                      ;
;---------------------------------------------------------------------------;
; Bit rate settings:
;
;            1MHz  2MHz  4MHz  6MHz  8MHz  10MHz  12MHz  16MHz  20MHz
;   2.4kbps   138     -     -     -     -      -      -      -      -
;   4.8kbps    68   138     -     -     -      -      -      -      -
;   9.6kbps    33    68   138   208     -      -      -      -      -
;  19.2kbps     -    33    68   102   138    173    208      -      -
;  38.4kbps     -     -    33    50    68     85    102    138    172
;  57.6kbps     -     -    21    33    44     56     68     91    114
; 115.2kbps     -     -     -     -    21     27     33     44     56

#define	BPS	68	; Bit delay. (see above table) 
#define	BIDIR	0	; 0:Separated Tx/Rx, 1:Shared Tx/Rx 

#define	OUT_1		sbi PORTB,outport	; Output 1 
#define	OUT_0		cbi PORTB,outport	; Output 0 


;---------------------------------------------------------------------------;
; Transmit null terminated string pointed by Z (R31:R30)
;
; Prototype: void xmit_nl( void );
;
xmit_str:
        lsl     r30
        rol     r31             ; convert word offset to byte offset
        subi    r30,low(-MAPPED_FLASH_START)
        sbci    r31,high(-MAPPED_FLASH_START)   ; adjust for PROGMEM in data space
xmit_str1:
        ld      r24,Z+          ; get a char
        cpi     r24,0           ; compare against 0
        brne    xmit_str2
        ret                     ; return if null found
xmit_str2:
        rcall   xmit            ; transmit a char
        rjmp    xmit_str1       ; loop
;
;---------------------------------------------------------------------------;
; Transmit a newline (CR/LF sequence)
;
; Prototype: void xmit_nl( void );
;
xmit_nl:
        ldi     r24,0x0D    ; CR
        rcall   xmit        ; transmit
        ldi     r24,0x0A    ; LF
        rjmp    xmit        ; transmit
;
;---------------------------------------------------------------------------;
; Transmit 2 hex digit in R24
;
; Prototype : 
;
; void xmit_hex2(uint8_t d);
;
xmit_hex2:
        mov     r22,r24         ; save copy of R24
        swap    r24             ; upper nibble to lower
        rcall   xmit_hex1       ; transmit hex 1 digit
        mov     r24,r22         ; restore R24
;
; fall into xmit_hex1()
;
;---------------------------------------------------------------------------;
; Transmit a hex digit in lower nibble of R24
;
; Prototype : 
;
; void xmit_hex1(uint8_t d) {
;   d &= 0x0F;
;   xmit( (9 < d) ? ('A' + d - 10) : ('0' + d));
; }
;

xmit_hex1:
        andi	r24, 0x0F	; extract lower nibble
        cpi 	r24, 10         ; 10
        brcs	xh1_1     	; branch if (10 > r24)
        subi	r24, -('A' - '0' - 10)
xh1_1:
        subi	r24, -('0')
;
; fall into xmit()
;
;---------------------------------------------------------------------------;
; Transmit a byte in serial format of N81
;
;Prototype: void xmit (uint8_t data);
;Size: 16 words

xmit:
;        ret

#if BIDIR
	ldi	r23, BPS-1	;Pre-idle time for bidirectional data line
xm_5:	dec	r23     	;
	brne	xm_5		;/
#endif

;	in	r16, SREG	;Save flags

	com	r24		;C = start bit
	ldi	r25, 10		;Bit counter
;	cli			;Start critical section

xm_1:	ldi	r23, BPS-1	;----- Bit transferring loop 
xm_2:	dec	r23     	;Wait for a bit time
	brne	xm_2		;/
	brcs	xm_3		;MISO = bit to be sent
	OUT_1			;
xm_3:	brcc	xm_4		;
	OUT_0			;/
xm_4:	lsr	r24     	;Get next bit into C
	dec	r25     	;All bits sent?
	brne	xm_1	     	;  no, coutinue

;	out	SREG, r16	;End of critical section
	ret