I found the code you were talking about in a fork of mupen64plus.
It's hard to say whether the LUT is advantageous, because the results differ a lot among scenarios and CPUs. It's entirely possible that the code with the LUT performs better in a real-world application. Measuring CPU time by hand is difficult when the code accesses memory. A compromise is to benchmark with the LFENCE+RDTSCP instructions (note that the final P is important: without the P the instruction is different and cannot really be used to benchmark code; also, if your code only writes to memory use SFENCE, and if it does both reads and writes use MFENCE). This will give you different results from production code, because in a real-world application you won't find a load or store fence anywhere in similar circumstances, and the CPU would otherwise be free to apply every trick it has (RDTSCP ruins that). RDTSC (without the P) and no fences gives results closer to real-world usage, but it tells you nothing about the worst-case scenario, which makes it useless for comparisons.
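As an illustration, the harness could look something like this (my sketch; time_kernel and kernel are names I made up, and it assumes GCC's x86intrin.h):
Code: Select all
#include <stdint.h>
#include <x86intrin.h> /* __rdtscp, _mm_lfence */

/* One possible shape of the LFENCE+RDTSCP harness described above.
   kernel() stands for the code under test. LFENCE keeps surrounding
   instructions from drifting into the timed region, and RDTSCP waits
   for the preceding instructions to complete before sampling the TSC. */
static uint64_t time_kernel(void (*kernel)(void))
{
    unsigned aux;
    _mm_lfence();
    uint64_t t0 = __rdtscp(&aux);
    kernel();
    _mm_lfence();
    uint64_t t1 = __rdtscp(&aux);
    return t1 - t0;
}
Pin the thread to one core and take the minimum over many runs, otherwise the numbers are mostly noise.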
You're right: writing everything in assembly would save a lot of instructions and let you apply tricks the compiler may not see. On the other hand, if you ever need to change something... it's going to be a big pain.
I've found this code:
Code: Select all
case TEXEL_I4:
{
    uint8_t byteval, c;
    taddr = ((tbase << 4) + s) >> 1;            /* byte address of the 4-bit texel */
    taddr ^= ((t & 1) ? BYTE_XOR_DWORD_SWAP : BYTE_ADDR_XOR); /* odd lines are dword-swapped */
    byteval = g_gdp.tmem[taddr & 0xfff];
    c = (s & 1) ? (byteval & 0xf) : (byteval >> 4); /* odd s: low nibble, even s: high nibble */
    c |= (c << 4);                              /* replicate the nibble to 8 bits */
    color->r = c;
    color->g = c;
    color->b = c;
    color->a = c;
}
break;
break;
16beeb: c1 e1 04 shl $0x4,%ecx # tbase << 4
16beee: 83 e2 01 and $0x1,%edx
16bef1: 01 f1 add %esi,%ecx # tbase += s
16bef3: d1 e9 shr %ecx # tbase >>= 1
....
Along the way I also noted that these two lines are equivalent:
Code: Select all
taddr = ((tbase << 4) + s) >> 1;
taddr = ((tbase * 16) + s) / 2;
tbase can be overwritten because it isn't used later, but it could also have been kept untouched this way:
Code: Select all
shl $4, tbase
lea (tbase, s), taddr # sum and move to destination
shr $1, taddr
Possibly it can even be asm-inlined this way:
Code: Select all
shr $1, s
lea (s, tbase, 8), taddr # sum, shift and move to destination
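As a compilable sketch of that pseudo-code (the function name taddr_i4 is mine):
Code: Select all
static inline size_t taddr_i4(size_t tbase, size_t s)
{
    size_t taddr;
    /* ((tbase << 4) + s) >> 1 == tbase*8 + (s >> 1), since tbase<<4
       has its low bit clear */
    asm("shr $1, %[s] ;"                      /* s >>= 1 */
        "lea (%[s], %[tbase], 8), %[taddr] ;" /* taddr = s + tbase*8 */
        : [taddr] "=r" (taddr), [s] "+r" (s)
        : [tbase] "r" (tbase)
        : "cc");
    return taddr;
}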
Next, I see this code...
#define BYTE_XOR_DWORD_SWAP 4
#define WORD_XOR_DWORD_SWAP 2
taddr ^= ((t & 1) ? BYTE_XOR_DWORD_SWAP : BYTE_ADDR_XOR);
Code: Select all
size_t a0(size_t t)
{
size_t taddr = ((t & 1) ? 4 : 2);
return taddr;
}
{without the final XOR}
00000000004005c0 <a0>:
4005c0: 83 e7 01 and $0x1,%edi
4005c3: 48 83 ff 01 cmp $0x1,%rdi
4005c7: 48 19 c0 sbb %rax,%rax
4005ca: 48 83 e0 fe and $0xfffffffffffffffe,%rax
4005ce: 48 83 c0 04 add $0x4,%rax
4005d2: c3 retq
That's horrible code. The returned value can be XOR-ed with taddr later on.
There's a convenient coincidence here: 2 in binary is 010 and 4 is 100, and as always "t & 1" is the selector.
Code: Select all
mov $2, temp
and $1, t # t is either 0 or 1
lea (temp, t, 2), temp # temp+=t*2 (temp=2+t*2)
xor temp, taddr # xor with either 2 or 4
Code: Select all
size_t b0(size_t t)
{
size_t temp;
asm volatile(
"and $1, %[t] ;"
"lea 2(, %[t], 2), %[temp] ;"
: [temp] "=r" (temp), [t] "+r" (t)
:
: "cc"
);
return temp;
}
{without the final XOR}
00000000004005e0 <b0>:
4005e0: 48 83 e7 01 and $0x1,%rdi
4005e4: 48 8d 04 7d 02 00 00 lea 0x2(,%rdi,2),%rax
4005eb: 00
4005ec: c3 retq
The other version, where the defines have different values:
#define BYTE_XOR_DWORD_SWAP 7
#define WORD_XOR_DWORD_SWAP 3
taddr ^= ((t & 1) ? BYTE_XOR_DWORD_SWAP : BYTE_ADDR_XOR);
Code: Select all
size_t a1(size_t t)
{
size_t taddr = ((t & 1) ? 7 : 3);
return taddr;
}
{without the final XOR}
00000000004005f0 <a1>:
4005f0: 83 e7 01 and $0x1,%edi
4005f3: 48 83 ff 01 cmp $0x1,%rdi
4005f7: 48 19 c0 sbb %rax,%rax
4005fa: 48 83 e0 fc and $0xfffffffffffffffc,%rax
4005fe: 48 83 c0 07 add $0x7,%rax
400602: c3 retq
There is another convenient coincidence: 3 in binary is 011 and 7 is 111, and again "t & 1" is the selector.
Code: Select all
mov $3, temp
and $1, t # t is either 0 or 1
lea (temp, t, 4), temp # temp+=t*4 (temp=3+t*4)
xor temp, taddr # xor with either 3 or 7
In this case it's even possible to avoid clobbering "t", which produces much better code when the function is inlined.
Code: Select all
size_t b1(size_t t)
{
size_t temp;
asm volatile(
"lea 3(, %[t], 4), %[temp] ;"
"and $7, %[temp] ;"
: [temp] "=r" (temp)
: [t] "r" (t)
: "cc"
);
return temp;
}
{without the final XOR}
0000000000400610 <b1>:
400610: 48 8d 04 bd 03 00 00 lea 0x3(,%rdi,4),%rax
400617: 00
400618: 48 83 e0 07 and $0x7,%rax
40061c: c3 retq
The more of the code you write in assembly, the shorter the result will be: in my experience you usually save 20%-40% of the instructions a compiler would emit. In this case (the "fetch_texel" function), I notice that taddr is initialized to 0, so it can be used as an input for these LEAs, and also for the first LEA in my earlier version that generates the color "c" without the LUT. Now look at this function:
Code: Select all
static STRICTINLINE int32_t alpha_combiner_equation(int32_t a, int32_t b, int32_t c, int32_t d)
{
    a = special_9bit_exttable[a];   /* sign-extend 9-bit values via LUT */
    b = special_9bit_exttable[b];
    c = SIGNF(c, 9);                /* sign-extend 9 bits arithmetically */
    d = special_9bit_exttable[d];
    a = (((a - b) * c) + (d << 8) + 0x80) >> 8; /* (a-b)*c/256 + d, rounded */
    return (a & 0x1ff);             /* keep the low 9 bits */
}
Look also at this macro:
Code: Select all
#define SIGNF(x, numb) ((x) | -((x) & (1 << (numb - 1))))
....
40041c: 89 c2 mov %eax,%edx # always required
40041e: 81 e2 00 01 00 00 and $0x100,%edx
400424: f7 da neg %edx
400426: 09 c2 or %eax,%edx
That macro sign-extends x from numb bits (for example, SIGNF(0x100, 9) must give -256, i.e. 0xFFFFFF00) and expands to 4 instructions. It's possible to save one or two of them with inline assembly, using a GCC statement-expression macro. This version produces 3 instructions, or 2 when the value is replaced in place.
Code: Select all
#define SIGNF(x, numb) \
({ \
    typeof(x) _x = (x); \
    /* shift the numb-bit field's sign bit up to the top bit, */ \
    /* then arithmetic-shift it back down to sign-extend */ \
    asm("shl %[bits], %[temp] ;" \
        "sar %[bits], %[temp] ;" \
        : [temp] "+r" (_x) \
        : [bits] "n" (sizeof(_x)*8 - (numb)) \
        : "cc" \
    ); \
    _x; \
})
.....
400410: 89 c1 mov %eax,%ecx # not always required
400412: c1 e1 17 shl $0x17,%ecx
400415: c1 f9 17 sar $0x17,%ecx
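For comparison, the same shl/sar pair can usually be had from plain C without inline assembly (a sketch; SIGNF_C is my name, and it relies on GCC's arithmetic right shift of signed values):
Code: Select all
#include <stdint.h>

/* Sign-extend x from numb bits: shift the field's sign bit up to
   bit 31, then arithmetic-shift it back down. */
#define SIGNF_C(x, numb) \
    ((int32_t)((uint32_t)(x) << (32 - (numb))) >> (32 - (numb)))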
I see that you want to turn a, b, c, d into shorts. But from the code above, a, b, c, d are used as indexes into a LUT. It would be a better idea to use "(unsigned) int/long a, b, c, d...". Making everything 32 bits or even CPU-word wide, and unsigned where the values can't be negative, will result in better code. Also, on x86 all reads and writes of pointer-sized, pointer-aligned variables are atomic. Using shorts is advantageous only if you create 8 of them and load them all at once with movdqa, as in the sketch below. If you read or write values one by one, unsigned ints and unsigned longs will perform better than shorts.
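Something along these lines is the only pattern where shorts pay off (a sketch with SSE2 intrinsics; the names are mine):
Code: Select all
#include <stdint.h>
#include <emmintrin.h> /* SSE2 */

/* Eight 16-bit values fetched with one aligned 128-bit load (movdqa).
   Reading the same eight values one by one as shorts would be slower
   than using ints or longs. */
typedef struct { int16_t v[8]; } __attribute__((aligned(16))) i16x8;

static inline __m128i load_8_shorts(const i16x8 *p)
{
    return _mm_load_si128((const __m128i *)p->v);
}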
Another case from your following message:
Code: Select all
PAIRWRITE8(tempdword, tempbyte, (tempbyte & 1) ? 3 : 0);
Again, this is a case where the compiler does an awful job with ternaries: simply replacing the expression with -(tempbyte & 1) & 3 made a difference!
You can use inline assembly to reduce the instruction count even further. (tempbyte & 1) ? 3 : 0 basically means:
c &= 1; // c=0 or c=1
c = c+c+c; // 0+0+0=0 1+1+1=3
An inline macro for GCC:
Code: Select all
#define GET(tempbyte) \
({ \
typeof(tempbyte) temp; \
asm( \
"lea (%[mask], %[mask], 2), %[temp] ;" \
: [temp] "=r" (temp) \
: [mask] "r" (tempbyte & 1) \
); \
temp; \
})
....
PAIRWRITE8(tempdword, tempbyte, GET(tempbyte));
Look at how they translate for a standard function (input argument in RDI):
Code: Select all
return (tempbyte & 1) ? 3 : 0
0000000000400630 <tA>:
400630: 48 89 f8 mov %rdi,%rax
400633: 83 e0 01 and $0x1,%eax
400636: 48 f7 d8 neg %rax
400639: 83 e0 03 and $0x3,%eax
40063c: c3 retq
return -(tempbyte & 1) & 3
0000000000400640 <tB>:
400640: 48 89 f8 mov %rdi,%rax
400643: 83 e0 01 and $0x1,%eax
400646: 48 f7 d8 neg %rax
400649: 83 e0 03 and $0x3,%eax
40064c: c3 retq
return GET(tempbyte);
0000000000400650 <tC>:
400650: 83 e7 01 and $0x1,%edi
400653: 48 8d 04 7f lea (%rdi,%rdi,2),%rax
400657: c3 retq
No, always use LEA if possible. Also, that picture is wrong.