SH4 FIPR Optimizations: Difference between revisions
GyroVorbis (talk | contribs) No edit summary |
No edit summary |
||
| (One intermediate revision by the same user not shown) | |||
| Line 71: | Line 71: | ||
__always_inline | __always_inline | ||
float shz_pl_inner_loop(const float *d, const float *v1, const float *v2) { | float shz_pl_inner_loop(const float *d, const float *v1, const float *v2) { | ||
float | const float *td = d; | ||
uint32_t | const float *tv1 = v1; | ||
const float *tv2 = v2; | |||
uint32_t stride; | |||
float result; | |||
asm volatile(R"( | asm volatile(R"( | ||
! Swap to back-bank so we don't need to clobber any FP regs. | ! Swap to back-bank so we don't need to clobber any FP regs. | ||
frchg | frchg | ||
! s = 512 (stride: 128 floats * 4 bytes) | |||
mov #2, %[s] | |||
shll8 %[s] ! 2 << 8 = 512 | |||
! Load first vector into fv0 for first FIPR. | ! Load first vector into fv0 for first FIPR. | ||
fmov.s @%[td]+, fr0 ! fr0 = d[0] | |||
fmov.s @%[ | fmov.s @%[td]+, fr1 ! fr1 = d[1] | ||
fmov.s @%[td]+, fr2 ! fr2 = d[2] | |||
fmov.s @%[ | fmov.s @%[td]+, fr3 ! fr3 = d[3] | ||
fmov.s @%[ | |||
fmov.s @%[ | |||
! Load second vector into fv4 for first FIPR | ! Load second vector into fv4 for first FIPR | ||
fmov.s @%[ | fmov.s @%[tv1], fr4 ! fr4 = v1[0] | ||
add %[s], %[v1] | add %[s], %[tv1] ! tv1 -> v1[128] | ||
fmov.s @%[ | fmov.s @%[tv2], fr5 ! fr5 = v2[0] | ||
add %[s], %[v2] | add %[s], %[tv2] ! tv2 -> v2[128] | ||
fmov.s @%[ | fmov.s @%[tv1], fr6 ! fr6 = v1[128] | ||
add %[s], %[v1] | add %[s], %[tv1] ! tv1 -> v1[256] | ||
fmov.s @%[ | fmov.s @%[tv2], fr7 ! fr7 = v2[128] | ||
add %[s], %[v2] | add %[s], %[tv2] ! tv2 -> v2[256] | ||
! Issue first FIPR | ! Issue first FIPR | ||
fipr fv0, fv4 | fipr fv0, fv4 ! fr7 = FIPR1 result | ||
! Load first vector into fv8 for second FIPR. | ! Load first vector into fv8 for second FIPR. | ||
fmov.s @%[ | fmov.s @%[td]+, fr8 ! fr8 = d[4] | ||
fmov.s @%[ | fmov.s @%[td]+, fr9 ! fr9 = d[5] | ||
fmov.s @%[ | fmov.s @%[td]+, fr10 ! fr10 = d[6] | ||
fmov.s @%[ | fmov.s @%[td]+, fr11 ! fr11 = d[7] | ||
! Load second vector into fv12 for second FIPR. | ! Load second vector into fv12 for second FIPR. | ||
fmov.s @%[ | fmov.s @%[tv1], fr12 ! fr12 = v1[256] | ||
add %[s], %[v1] | add %[s], %[tv1] ! tv1 -> v1[384] | ||
fmov.s @%[ | fmov.s @%[tv2], fr13 ! fr13 = v2[256] | ||
add %[s], %[v2] | add %[s], %[tv2] ! tv2 -> v2[384] | ||
fmov.s @%[ | fmov.s @%[tv1], fr14 ! fr14 = v1[384] | ||
add %[s], %[v1] | add %[s], %[tv1] ! tv1 -> v1[512] | ||
fmov.s @%[ | fmov.s @%[tv2], fr15 ! fr15 = v2[384] | ||
add %[s], %[v2] | add %[s], %[tv2] ! tv2 -> v2[512] | ||
! Issue second FIPR | ! Issue second FIPR | ||
fipr fv8, fv12 | fipr fv8, fv12 ! fr15 = FIPR2 result | ||
fmov.s fr7, @-r15 ! push FIPR1 result onto stack | |||
fmov.s fr7, @- | |||
! Load first vector into fv0 for third FIPR | ! Load first vector into fv0 for third FIPR | ||
fmov.s @%[ | fmov.s @%[td]+, fr0 ! fr0 = d[8] | ||
fmov.s @%[ | fmov.s @%[td]+, fr1 ! fr1 = d[9] | ||
fmov.s @%[ | fmov.s @%[td]+, fr2 ! fr2 = d[10] | ||
fmov.s @%[ | fmov.s @%[td]+, fr3 ! fr3 = d[11] | ||
! Load second vector into fv4 for third FIPR | ! Load second vector into fv4 for third FIPR | ||
fmov.s @%[ | fmov.s @%[tv1], fr4 ! fr4 = v1[512] | ||
add %[s], %[v1] | add %[s], %[tv1] ! tv1 -> v1[640] | ||
fmov.s @%[ | fmov.s @%[tv2], fr5 ! fr5 = v2[512] | ||
add %[s], %[v2] | add %[s], %[tv2] ! tv2 -> v2[640] | ||
fmov.s @%[ | fmov.s @%[tv1], fr6 ! fr6 = v1[640] | ||
add %[s], %[v1] | add %[s], %[tv1] ! tv1 -> v1[768] | ||
fmov.s @%[ | fmov.s @%[tv2], fr7 ! fr7 = v2[640] | ||
add %[s], %[v2] | add %[s], %[tv2] ! tv2 -> v2[768] | ||
! Issue third FIPR | ! Issue third FIPR | ||
fipr fv0, fv4 | fipr fv0, fv4 ! fr7 = FIPR3 result | ||
fmov.s fr15, @-r15 ! push FIPR2 result onto stack | |||
fmov.s fr15, @- | |||
! Load first vector into fv8 for fourth FIPR | ! Load first vector into fv8 for fourth FIPR | ||
fmov.s @%[ | fmov.s @%[td]+, fr8 ! fr8 = d[12] | ||
fmov.s @%[ | fmov.s @%[td]+, fr9 ! fr9 = d[13] | ||
fmov.s @%[ | fmov.s @%[td]+, fr10 ! fr10 = d[14] | ||
fmov.s @%[ | fmov.s @%[td]+, fr11 ! fr11 = d[15] | ||
! Load second vector into fv12 for fourth FIPR | ! Load second vector into fv12 for fourth FIPR | ||
fmov.s @%[ | fmov.s @%[tv1], fr12 ! fr12 = v1[768] | ||
add %[s], %[v1] | add %[s], %[tv1] ! tv1 -> v1[896] | ||
fmov.s @%[ | fmov.s @%[tv2], fr13 ! fr13 = v2[768] | ||
add %[s], %[v2] | add %[s], %[tv2] ! tv2 -> v2[896] | ||
fmov.s @%[ | fmov.s @%[tv1], fr14 ! fr14 = v1[896] | ||
fmov.s @%[ | fmov.s @%[tv2], fr15 ! fr15 = v2[896] | ||
! Issue fourth FIPR | ! Issue fourth FIPR | ||
fipr fv8, fv12 | fipr fv8, fv12 ! fr15 = FIPR4 result | ||
! Add up results from previous FIPRs while we wait | ! Add up results from previous FIPRs while we wait | ||
fmov.s @ | fmov.s @r15+, fr0 ! pop FIPR2 result | ||
fmov.s @ | fmov.s @r15+, fr1 ! pop FIPR1 result | ||
fadd fr1, fr0 | fadd fr1, fr0 ! fr0 = FIPR1 + FIPR2 | ||
fadd fr7, fr0 | fadd fr7, fr0 ! fr0 += FIPR3 | ||
! Add result from fourth FIPR now that it's ready | ! Add result from fourth FIPR now that it's ready | ||
fadd fr15, fr0 | fadd fr15, fr0 ! fr0 += FIPR4 | ||
! | ! Transfer result to primary bank via FPUL | ||
flds fr0, FPUL ! secondary fr0 -> FPUL | |||
frchg ! Switch back to primary FP bank | |||
fsts FPUL, %[result] ! FPUL -> result register (primary bank) | |||
)" | )" | ||
: [ | : [td] "+r" (td), [tv1] "+r" (tv1), [tv2] "+r" (tv2), | ||
[ | [s] "=r" (stride), [result] "=f" (result) | ||
: | |||
: "memory"); | |||
return | return result; | ||
} | } | ||
</pre> | </pre> | ||
Latest revision as of 18:42, 16 February 2026
Yo, guys. At like 1AM @ian micheal got me looking at pl_mpeg's audio decoder to see if I could see any potential gainz... So here is its innermost hottest audio synthesis loop:
for (int i = 32; i; --i) {
float u;
u = pl_fipr(d[0], d[1], d[2], d[3], v1[0], v2[0], v1[128], v2[128]);
u += pl_fipr(d[4], d[5], d[6], d[7], v1[256], v2[256], v1[384], v2[384]);
u += pl_fipr(d[8], d[9], d[10], d[11], v1[512], v2[512], v1[640], v2[640]);
u += pl_fipr(d[12], d[13], d[14], d[15], v1[768], v2[768], v1[896], v2[896]);
d += 32;
v1++;
v2++;
*out++ = (short)((int)u >> 16);
}
Which... you'd think would be preeeetty efficient, right? 4 back-to-back FIPRs? I mean, it is hella gainzy compared to not using FIPR.
But there are two problems with back-to-back FIPR-y, I wanna teach anyone interested:
1) Very often one of the vector arguments stays constant between FIPR calls, but unfortunately the compiler is too dumb to not reload all 8 registers between calls regardless.
- LUCKILY every argument to these FIPRs is unique so this is not applicable, but... very often that's a perf destroyer.
2) THE COMPILER CANNOT PIPELINE FIPR FOR SHIT.
- VERY applicable here. You know what the ASM looks like for these FIPR calls? Something like this:
! load first vector arg into fv0 (nothing wrong with this) fmov.s @%[d]+, fr0 fmov.s @%[d]+, fr1 fmov.s @%[d]+, fr2 fmov.s @%[d]+, fr3 ! load second vector arg into fv4 (nothing wrong with this) fmov.s @%[v1], fr4 add %[offset], %[v1] fmov.s @%[v2], fr5 add %[offset], %[v2] fmov.s @%[v1], fr6 fmov.s @%[v2], fr7 ! issue actual FIPR calculation fipr fv0, fv4 ! VERY NEXT INSTRUCTION TRY TO STORE THE RESULT fmov.s fr7, @%[result] ! PIPELINE STALL!!!!
Now this is very very bad. FIPR has 4-5 cycles of latency, so every fucking call to FIPR, since the very next instruction tries to use the result before it's been calculated, the entire pipeline must stall waiting for the result... FOR EVERY FIPR CALL. So you're losing MASSIVE perf benefits there. The solution? You have to pipeline your FIPRs so that while the previous FIPR call is still calculating, you're loading up and issuing the next FIPR call.
So I wrote a new routine that replaces that inner loop body doing manually pipelined FIPR calls... This should be way better:
for (int i = 32; i; --i) {
#if 0 // Old FIPR path which didn't pipeline for shit.
float u;
u = pl_fipr(d[0], d[1], d[2], d[3], v1[0], v2[0], v1[128], v2[128]);
u += pl_fipr(d[4], d[5], d[6], d[7], v1[256], v2[256], v1[384], v2[384]);
u += pl_fipr(d[8], d[9], d[10], d[11], v1[512], v2[512], v1[640], v2[640]);
u += pl_fipr(d[12], d[13], d[14], d[15], v1[768], v2[768], v1[896], v2[896]);
#else // New hand-written FIPR path with manual pipelining
float u = shz_pl_inner_loop(d, v1, v2);
#endif
d += 32;
v1++;
v2++;
*out++ = (short)((int)u >> 16);
}
Where the new implementation is this inline ASM:
__always_inline
/* One iteration of pl_mpeg's audio-synthesis inner loop: computes
 *   fipr(d[0..3],   {v1[0],   v2[0],   v1[128], v2[128]})
 * + fipr(d[4..7],   {v1[256], v2[256], v1[384], v2[384]})
 * + fipr(d[8..11],  {v1[512], v2[512], v1[640], v2[640]})
 * + fipr(d[12..15], {v1[768], v2[768], v1[896], v2[896]})
 * with the four FIPRs manually software-pipelined so loads for the next
 * FIPR overlap the 4-5 cycle latency of the previous one.
 *
 * d  : 16 consecutive filter coefficients (consumed with post-increment)
 * v1 : sample window, sampled every 128 floats (512-byte stride)
 * v2 : second sample window, same 128-float stride
 * Returns the summed dot product as a float.
 */
float shz_pl_inner_loop(const float *d, const float *v1, const float *v2) {
    /* Scratch copies so the asm can advance the pointers freely
     * without the caller's pointers appearing modified. */
    const float *td  = d;
    const float *tv1 = v1;
    const float *tv2 = v2;
    uint32_t stride;   /* set inside the asm: 512 bytes = 128 floats */
    float result;

    /* NOTE(review): the back FP bank is clobbered between the two FRCHGs.
     * That is safe only while the compiler never allocates the XF/XD
     * registers (true for the default single-precision ABI) — confirm if
     * building with double-precision FP enabled. The r15 pushes/pops are
     * balanced, and data always sits at-or-above r15, so the stack stays
     * valid even if an interrupt fires mid-sequence. */
    asm volatile(R"(
        ! Swap to back-bank so we don't need to clobber any FP regs.
        frchg
        ! s = 512 (stride: 128 floats * 4 bytes)
        mov     #2, %[s]
        shll8   %[s]                ! 2 << 8 = 512
        ! Load first vector into fv0 for first FIPR.
        fmov.s  @%[td]+, fr0        ! fr0 = d[0]
        fmov.s  @%[td]+, fr1        ! fr1 = d[1]
        fmov.s  @%[td]+, fr2        ! fr2 = d[2]
        fmov.s  @%[td]+, fr3        ! fr3 = d[3]
        ! Load second vector into fv4 for first FIPR
        fmov.s  @%[tv1], fr4        ! fr4 = v1[0]
        add     %[s], %[tv1]        ! tv1 -> v1[128]
        fmov.s  @%[tv2], fr5        ! fr5 = v2[0]
        add     %[s], %[tv2]        ! tv2 -> v2[128]
        fmov.s  @%[tv1], fr6        ! fr6 = v1[128]
        add     %[s], %[tv1]        ! tv1 -> v1[256]
        fmov.s  @%[tv2], fr7        ! fr7 = v2[128]
        add     %[s], %[tv2]        ! tv2 -> v2[256]
        ! Issue first FIPR
        fipr    fv0, fv4            ! fr7 = FIPR1 result
        ! Load first vector into fv8 for second FIPR.
        fmov.s  @%[td]+, fr8        ! fr8 = d[4]
        fmov.s  @%[td]+, fr9        ! fr9 = d[5]
        fmov.s  @%[td]+, fr10       ! fr10 = d[6]
        fmov.s  @%[td]+, fr11       ! fr11 = d[7]
        ! Load second vector into fv12 for second FIPR.
        fmov.s  @%[tv1], fr12       ! fr12 = v1[256]
        add     %[s], %[tv1]        ! tv1 -> v1[384]
        fmov.s  @%[tv2], fr13       ! fr13 = v2[256]
        add     %[s], %[tv2]        ! tv2 -> v2[384]
        fmov.s  @%[tv1], fr14       ! fr14 = v1[384]
        add     %[s], %[tv1]        ! tv1 -> v1[512]
        fmov.s  @%[tv2], fr15       ! fr15 = v2[384]
        add     %[s], %[tv2]        ! tv2 -> v2[512]
        ! Issue second FIPR
        fipr    fv8, fv12           ! fr15 = FIPR2 result
        fmov.s  fr7, @-r15          ! push FIPR1 result onto stack
        ! Load first vector into fv0 for third FIPR
        fmov.s  @%[td]+, fr0        ! fr0 = d[8]
        fmov.s  @%[td]+, fr1        ! fr1 = d[9]
        fmov.s  @%[td]+, fr2        ! fr2 = d[10]
        fmov.s  @%[td]+, fr3        ! fr3 = d[11]
        ! Load second vector into fv4 for third FIPR
        fmov.s  @%[tv1], fr4        ! fr4 = v1[512]
        add     %[s], %[tv1]        ! tv1 -> v1[640]
        fmov.s  @%[tv2], fr5        ! fr5 = v2[512]
        add     %[s], %[tv2]        ! tv2 -> v2[640]
        fmov.s  @%[tv1], fr6        ! fr6 = v1[640]
        add     %[s], %[tv1]        ! tv1 -> v1[768]
        fmov.s  @%[tv2], fr7        ! fr7 = v2[640]
        add     %[s], %[tv2]        ! tv2 -> v2[768]
        ! Issue third FIPR
        fipr    fv0, fv4            ! fr7 = FIPR3 result
        fmov.s  fr15, @-r15         ! push FIPR2 result onto stack
        ! Load first vector into fv8 for fourth FIPR
        fmov.s  @%[td]+, fr8        ! fr8 = d[12]
        fmov.s  @%[td]+, fr9        ! fr9 = d[13]
        fmov.s  @%[td]+, fr10       ! fr10 = d[14]
        fmov.s  @%[td]+, fr11       ! fr11 = d[15]
        ! Load second vector into fv12 for fourth FIPR
        fmov.s  @%[tv1], fr12       ! fr12 = v1[768]
        add     %[s], %[tv1]        ! tv1 -> v1[896]
        fmov.s  @%[tv2], fr13       ! fr13 = v2[768]
        add     %[s], %[tv2]        ! tv2 -> v2[896]
        fmov.s  @%[tv1], fr14       ! fr14 = v1[896]
        fmov.s  @%[tv2], fr15       ! fr15 = v2[896]
        ! Issue fourth FIPR
        fipr    fv8, fv12           ! fr15 = FIPR4 result
        ! Add up results from previous FIPRs while we wait
        fmov.s  @r15+, fr0          ! pop FIPR2 result
        fmov.s  @r15+, fr1          ! pop FIPR1 result
        fadd    fr1, fr0            ! fr0 = FIPR1 + FIPR2
        fadd    fr7, fr0            ! fr0 += FIPR3
        ! Add result from fourth FIPR now that it's ready
        fadd    fr15, fr0           ! fr0 += FIPR4
        ! Transfer result to primary bank via FPUL
        flds    fr0, FPUL           ! secondary fr0 -> FPUL
        frchg                       ! Switch back to primary FP bank
        fsts    FPUL, %[result]     ! FPUL -> result register (primary bank)
    )"
    : [td] "+r" (td), [tv1] "+r" (tv1), [tv2] "+r" (tv2),
      [s] "=r" (stride), [result] "=f" (result)
    :
    /* "fpul" clobber is required: the result is ferried between FP banks
     * through FPUL, and without declaring it the compiler may keep a live
     * int<->float conversion value there across this (inlined) asm. */
    : "fpul", "memory");

    return result;
}