SH4 FIPR Optimizations: Difference between revisions

From dreamcast.wiki
Jump to navigation Jump to search
No edit summary
(Updated the function because it crashed before)
Line 71: Line 71:
__always_inline  
__always_inline  
float shz_pl_inner_loop(const float *d, const float *v1, const float *v2) {
float shz_pl_inner_loop(const float *d, const float *v1, const float *v2) {
    float fp_scratch[2];
const float *td = d;
    uint32_t int_scratch;
const float *tv1 = v1;
const float *tv2 = v2;
uint32_t stride;
float result;


    asm volatile(R"(
asm volatile(R"(
        ! Swap to back-bank so we don't need to clobber any FP regs.
! Swap to back-bank so we don't need to clobber any FP regs.
        frchg
frchg


        ! Load first vector into fv0 for first FIPR.
! s = 512 (stride: 128 floats * 4 bytes)
        xor    %[s], %[s]
mov     #2, %[s]
        fmov.s  @%[d]+, fr0
shll8  %[s]               ! 2 << 8 = 512
        add     #64, %[s]
        fmov.s  @%[d]+, fr1
        add    #64, %[s]
        fmov.s  @%[d]+, fr2
        add    #16, %[r]
        fmov.s  @%[d]+, fr3


        ! Load second vector into fv4 for first FIPR
! Load first vector into fv0 for first FIPR.
        fmov.s  @%[v1], fr4
fmov.s  @%[td]+, fr0      ! fr0 = d[0]
        add    %[s], %[v1]
fmov.s  @%[td]+, fr1      ! fr1 = d[1]
        fmov.s  @%[v2], fr5
fmov.s  @%[td]+, fr2      ! fr2 = d[2]
        add    %[s], %[v2]
fmov.s  @%[td]+, fr3      ! fr3 = d[3]
        fmov.s  @%[v1], fr6
        add    %[s], %[v1]
        fmov.s  @%[v2], fr7
        add    %[s], %[v2]


        ! Issue first FIPR
! Load second vector into fv4 for first FIPR
        fipr    fv0, fv4
fmov.s  @%[tv1], fr4      ! fr4 = v1[0]
        ! DO NOT SAVE THE RESULT YET
add    %[s], %[tv1]      ! tv1 -> v1[128]
fmov.s  @%[tv2], fr5      ! fr5 = v2[0]
add    %[s], %[tv2]      ! tv2 -> v2[128]
fmov.s  @%[tv1], fr6      ! fr6 = v1[128]
add    %[s], %[tv1]      ! tv1 -> v1[256]
fmov.s  @%[tv2], fr7      ! fr7 = v2[128]
add    %[s], %[tv2]      ! tv2 -> v2[256]


        ! Load first vector into fv8 for second FIPR.
! Issue first FIPR
        fmov.s  @%[d]+, fr8
fipr    fv0, fv4          ! fr7 = FIPR1 result
        fmov.s  @%[d]+, fr9
        fmov.s  @%[d]+, fr10
        fmov.s  @%[d]+, fr11


        ! Load second vector into fv12 for second FIPR.
! Load first vector into fv8 for second FIPR.
        fmov.s  @%[v1], fr12
fmov.s  @%[td]+, fr8      ! fr8  = d[4]
        add    %[s], %[v1]
fmov.s  @%[td]+, fr9      ! fr9  = d[5]
        fmov.s  @%[v2], fr13
fmov.s  @%[td]+, fr10      ! fr10 = d[6]
        add    %[s], %[v2]
fmov.s  @%[td]+, fr11      ! fr11 = d[7]
        fmov.s  @%[v1], fr14
        add    %[s], %[v1]
        fmov.s  @%[v2], fr15
        add    %[s], %[v2]


        ! Issue second FIPR
! Load second vector into fv12 for second FIPR.
        fipr    fv8, fv12
fmov.s  @%[tv1], fr12      ! fr12 = v1[256]
        ! Store result from FIRST FIPR now that it's ready
add    %[s], %[tv1]      ! tv1 -> v1[384]
        fmov.s  fr7, @-%[r]
fmov.s  @%[tv2], fr13      ! fr13 = v2[256]
add    %[s], %[tv2]      ! tv2 -> v2[384]
fmov.s  @%[tv1], fr14      ! fr14 = v1[384]
add    %[s], %[tv1]      ! tv1 -> v1[512]
fmov.s  @%[tv2], fr15      ! fr15 = v2[384]
add    %[s], %[tv2]      ! tv2 -> v2[512]


        ! Load first vector into fv0 for third FIPR
! Issue second FIPR
        fmov.s  @%[d]+, fr0
fipr    fv8, fv12          ! fr15 = FIPR2 result
        fmov.s  @%[d]+, fr1
fmov.s  fr7, @-r15         ! push FIPR1 result onto stack
        fmov.s  @%[d]+, fr2
         fmov.s  @%[d]+, fr3


        ! Load second vector into fv4 for third FIPR
! Load first vector into fv0 for third FIPR
        fmov.s  @%[v1], fr4
fmov.s  @%[td]+, fr0      ! fr0 = d[8]
        add    %[s], %[v1]
fmov.s  @%[td]+, fr1      ! fr1 = d[9]
        fmov.s  @%[v2], fr5
fmov.s  @%[td]+, fr2      ! fr2 = d[10]
        add    %[s], %[v2]
fmov.s  @%[td]+, fr3      ! fr3 = d[11]
        fmov.s  @%[v1], fr6
        add    %[s], %[v1]
        fmov.s  @%[v2], fr7
        add    %[s], %[v2]
       
        ! Issue third FIPR
        fipr    fv0, fv4
        ! Store result from SECOND FIPR now that it's ready.
        fmov.s  fr15, @-%[r]


        ! Load first vector into fv8 for fourth FIPR
! Load second vector into fv4 for third FIPR
        fmov.s  @%[d]+, fr8
fmov.s  @%[tv1], fr4      ! fr4 = v1[512]
        fmov.s  @%[d]+, fr9
add    %[s], %[tv1]      ! tv1 -> v1[640]
        fmov.s  @%[d]+, fr10
fmov.s  @%[tv2], fr5      ! fr5 = v2[512]
        fmov.s  @%[d]+, fr11
add    %[s], %[tv2]      ! tv2 -> v2[640]
fmov.s  @%[tv1], fr6      ! fr6 = v1[640]
add    %[s], %[tv1]      ! tv1 -> v1[768]
fmov.s  @%[tv2], fr7      ! fr7 = v2[640]
add    %[s], %[tv2]      ! tv2 -> v2[768]


        ! Load second vector into fv12 for fourth FIPR
! Issue third FIPR
        fmov.s  @%[v1], fr12
fipr    fv0, fv4          ! fr7 = FIPR3 result
        add    %[s], %[v1]
fmov.s  fr15, @-r15        ! push FIPR2 result onto stack
        fmov.s  @%[v2], fr13
        add    %[s], %[v2]
        fmov.s  @%[v1], fr14
        fmov.s  @%[v2], fr15


        ! Issue fourth FIPR
! Load first vector into fv8 for fourth FIPR
        fipr    fv8, fv12
fmov.s  @%[td]+, fr8      ! fr8  = d[12]
fmov.s  @%[td]+, fr9      ! fr9  = d[13]
fmov.s  @%[td]+, fr10      ! fr10 = d[14]
fmov.s  @%[td]+, fr11      ! fr11 = d[15]


        ! Add up results from previous FIPRs while we wait
! Load second vector into fv12 for fourth FIPR
        fmov.s  @%[r]+, fr0
fmov.s  @%[tv1], fr12      ! fr12 = v1[768]
        fmov.s  @%[r]+, fr1
add    %[s], %[tv1]      ! tv1 -> v1[896]
        fadd    fr1, fr0
fmov.s  @%[tv2], fr13      ! fr13 = v2[768]
        fadd    fr7, fr0
add    %[s], %[tv2]      ! tv2 -> v2[896]
        add    #-8, %[r]
fmov.s  @%[tv1], fr14      ! fr14 = v1[896]
fmov.s  @%[tv2], fr15      ! fr15 = v2[896]


        ! Add result from fourth FIPR now that it's ready
! Issue fourth FIPR
        fadd   fr15, fr0
fipr   fv8, fv12          ! fr15 = FIPR4 result


         ! Store final result
! Add up results from previous FIPRs while we wait
        fmov.s  fr0, @%[r]
fmov.s  @r15+, fr0         ! pop FIPR2 result
fmov.s  @r15+, fr1        ! pop FIPR1 result
fadd    fr1, fr0          ! fr0 = FIPR1 + FIPR2
fadd    fr7, fr0          ! fr0 += FIPR3


        ! Swap back to primary FP register bank
! Add result from fourth FIPR now that it's ready
        frchg
fadd    fr15, fr0          ! fr0 += FIPR4
    )"
    : [d] "+&r" (d), [v1] "+r" (v1), [v2] "+r" (v2),
      [r] "+r" (fp_scratch), [s] "=&r" (int_scratch),
      "=m" (*fp_scratch));


    return fp_scratch[0];
! Transfer result to primary bank via FPUL
flds    fr0, FPUL          ! secondary fr0 -> FPUL
frchg                      ! Switch back to primary FP bank
fsts    FPUL, %[result]    ! FPUL -> result register (primary bank)
)"
: [td] "+r" (td), [tv1] "+r" (tv1), [tv2] "+r" (tv2),
  [s] "=r" (stride), [result] "=f" (result)
:
: "memory");
 
return result;
}
}
</pre>
</pre>

Revision as of 18:40, 16 February 2026

Yo, guys. At like 1AM @ian micheal got me looking at pl_mpeg's audio decoder to see if I could see any potential gainz... So here is its innermost hottest audio synthesis loop:

for (int i = 32; i; --i) {
    float u;
    u = pl_fipr(d[0], d[1], d[2], d[3], v1[0], v2[0], v1[128], v2[128]);
    u += pl_fipr(d[4], d[5], d[6], d[7], v1[256], v2[256], v1[384], v2[384]);
    u += pl_fipr(d[8], d[9], d[10], d[11], v1[512], v2[512], v1[640], v2[640]);
    u += pl_fipr(d[12], d[13], d[14], d[15], v1[768], v2[768], v1[896], v2[896]);
    d += 32;
    v1++;
    v2++;
    *out++ = (short)((int)u >> 16);
}

Which... you'd think would be preeeetty efficient, right? 4 back-to-back FIPRs? I mean, it is hella gainzy compared to not using FIPR.

But there are two problems with back-to-back FIPR-y, I wanna teach anyone interested:

1) Very often one of the vector arguments stays constant between FIPR calls, but unfortunately the compiler is too dumb to not reload all 8 registers between calls regardless.

  • LUCKILY every argument to these FIPRs is unique so this is not applicable, but... very often that's a perf destroyer.

2) THE COMPILER CANNOT PIPELINE FIPR FOR SHIT.

  • VERY applicable here. You know what the ASM looks like for these FIPR calls? Something like this:
! Illustrative sequence for ONE non-pipelined FIPR call, written as a GCC
! inline-asm template (%[d], %[v1], %[v2], %[offset], %[result] are operands).

! load first vector arg into fv0 (nothing wrong with this)
fmov.s @%[d]+, fr0
fmov.s @%[d]+, fr1
fmov.s @%[d]+, fr2
fmov.s @%[d]+, fr3

! load second vector arg into fv4 (nothing wrong with this)
fmov.s @%[v1], fr4
add    %[offset], %[v1]   ! advance the v1 pointer register by the stride
fmov.s @%[v2], fr5
add    %[offset], %[v2]   ! advance the v2 pointer register by the stride
fmov.s @%[v1], fr6
fmov.s @%[v2], fr7

! issue actual FIPR calculation (result lands in fr7)
fipr fv0, fv4

! VERY NEXT INSTRUCTION TRIES TO STORE THE RESULT
fmov.s fr7, @%[result] ! PIPELINE STALL!!!!

Now this is very very bad. FIPR has 4-5 cycles of latency, so on every fucking call to FIPR, since the very next instruction tries to use the result before it's been calculated, the entire pipeline must stall waiting for the result... FOR EVERY FIPR CALL. So you're losing MASSIVE perf benefits there. The solution? You have to pipeline your FIPRs so that while the previous FIPR call is still calculating, you're loading up and issuing the next FIPR call.

So I wrote a new routine that replaces that inner loop body doing manually pipelined FIPR calls... This should be way better:

for (int i = 32; i; --i) {
#if 0 // Old FIPR path which didn't pipeline for shit.
    float u;
    u = pl_fipr(d[0], d[1], d[2], d[3], v1[0], v2[0], v1[128], v2[128]);
    u += pl_fipr(d[4], d[5], d[6], d[7], v1[256], v2[256], v1[384], v2[384]);
    u += pl_fipr(d[8], d[9], d[10], d[11], v1[512], v2[512], v1[640], v2[640]);
    u += pl_fipr(d[12], d[13], d[14], d[15], v1[768], v2[768], v1[896], v2[896]);
#else // New hand-written FIPR path with manual pipelining
    float u = shz_pl_inner_loop(d, v1, v2);
#endif
    d += 32;
    v1++;
    v2++;
    *out++ = (short)((int)u >> 16);
}

Where the new implementation is this inline ASM:

__always_inline 
float shz_pl_inner_loop(const float *d, const float *v1, const float *v2) {
	const float *td = d;
	const float *tv1 = v1;
	const float *tv2 = v2;
	uint32_t stride;
	float result;

	asm volatile(R"(
		! Swap to back-bank so we don't need to clobber any FP regs.
		frchg

		! s = 512 (stride: 128 floats * 4 bytes)
		mov     #2, %[s]
		shll8   %[s]               ! 2 << 8 = 512

		! Load first vector into fv0 for first FIPR.
		fmov.s  @%[td]+, fr0       ! fr0 = d[0]
		fmov.s  @%[td]+, fr1       ! fr1 = d[1]
		fmov.s  @%[td]+, fr2       ! fr2 = d[2]
		fmov.s  @%[td]+, fr3       ! fr3 = d[3]

		! Load second vector into fv4 for first FIPR
		fmov.s  @%[tv1], fr4       ! fr4 = v1[0]
		add     %[s], %[tv1]       ! tv1 -> v1[128]
		fmov.s  @%[tv2], fr5       ! fr5 = v2[0]
		add     %[s], %[tv2]       ! tv2 -> v2[128]
		fmov.s  @%[tv1], fr6       ! fr6 = v1[128]
		add     %[s], %[tv1]       ! tv1 -> v1[256]
		fmov.s  @%[tv2], fr7       ! fr7 = v2[128]
		add     %[s], %[tv2]       ! tv2 -> v2[256]

		! Issue first FIPR
		fipr    fv0, fv4           ! fr7 = FIPR1 result

		! Load first vector into fv8 for second FIPR.
		fmov.s  @%[td]+, fr8       ! fr8  = d[4]
		fmov.s  @%[td]+, fr9       ! fr9  = d[5]
		fmov.s  @%[td]+, fr10      ! fr10 = d[6]
		fmov.s  @%[td]+, fr11      ! fr11 = d[7]

		! Load second vector into fv12 for second FIPR.
		fmov.s  @%[tv1], fr12      ! fr12 = v1[256]
		add     %[s], %[tv1]       ! tv1 -> v1[384]
		fmov.s  @%[tv2], fr13      ! fr13 = v2[256]
		add     %[s], %[tv2]       ! tv2 -> v2[384]
		fmov.s  @%[tv1], fr14      ! fr14 = v1[384]
		add     %[s], %[tv1]       ! tv1 -> v1[512]
		fmov.s  @%[tv2], fr15      ! fr15 = v2[384]
		add     %[s], %[tv2]       ! tv2 -> v2[512]

		! Issue second FIPR
		fipr    fv8, fv12          ! fr15 = FIPR2 result
		fmov.s  fr7, @-r15         ! push FIPR1 result onto stack

		! Load first vector into fv0 for third FIPR
		fmov.s  @%[td]+, fr0       ! fr0 = d[8]
		fmov.s  @%[td]+, fr1       ! fr1 = d[9]
		fmov.s  @%[td]+, fr2       ! fr2 = d[10]
		fmov.s  @%[td]+, fr3       ! fr3 = d[11]

		! Load second vector into fv4 for third FIPR
		fmov.s  @%[tv1], fr4       ! fr4 = v1[512]
		add     %[s], %[tv1]       ! tv1 -> v1[640]
		fmov.s  @%[tv2], fr5       ! fr5 = v2[512]
		add     %[s], %[tv2]       ! tv2 -> v2[640]
		fmov.s  @%[tv1], fr6       ! fr6 = v1[640]
		add     %[s], %[tv1]       ! tv1 -> v1[768]
		fmov.s  @%[tv2], fr7       ! fr7 = v2[640]
		add     %[s], %[tv2]       ! tv2 -> v2[768]

		! Issue third FIPR
		fipr    fv0, fv4           ! fr7 = FIPR3 result
		fmov.s  fr15, @-r15        ! push FIPR2 result onto stack

		! Load first vector into fv8 for fourth FIPR
		fmov.s  @%[td]+, fr8       ! fr8  = d[12]
		fmov.s  @%[td]+, fr9       ! fr9  = d[13]
		fmov.s  @%[td]+, fr10      ! fr10 = d[14]
		fmov.s  @%[td]+, fr11      ! fr11 = d[15]

		! Load second vector into fv12 for fourth FIPR
		fmov.s  @%[tv1], fr12      ! fr12 = v1[768]
		add     %[s], %[tv1]       ! tv1 -> v1[896]
		fmov.s  @%[tv2], fr13      ! fr13 = v2[768]
		add     %[s], %[tv2]       ! tv2 -> v2[896]
		fmov.s  @%[tv1], fr14      ! fr14 = v1[896]
		fmov.s  @%[tv2], fr15      ! fr15 = v2[896]

		! Issue fourth FIPR
		fipr    fv8, fv12          ! fr15 = FIPR4 result

		! Add up results from previous FIPRs while we wait
		fmov.s  @r15+, fr0         ! pop FIPR2 result
		fmov.s  @r15+, fr1         ! pop FIPR1 result
		fadd    fr1, fr0           ! fr0 = FIPR1 + FIPR2
		fadd    fr7, fr0           ! fr0 += FIPR3

		! Add result from fourth FIPR now that it's ready
		fadd    fr15, fr0          ! fr0 += FIPR4

		! Transfer result to primary bank via FPUL
		flds    fr0, FPUL          ! secondary fr0 -> FPUL
		frchg                      ! Switch back to primary FP bank
		fsts    FPUL, %[result]    ! FPUL -> result register (primary bank)
	)"
	: [td] "+r" (td), [tv1] "+r" (tv1), [tv2] "+r" (tv2),
	  [s] "=r" (stride), [result] "=f" (result)
	:
	: "memory");

	return result;
}