Discuss topics related to development here.
-
izy
- Posts: 25
- Joined: Tue Jun 02, 2015 11:34 am
Post
by izy » Tue Feb 16, 2016 12:59 pm
This is an eventfd-based barrier, only tested for compilation. I don't know if it performs better than pthread_barrier_wait, or even whether it actually works, since I cannot really test the code.
Code: Select all
diff -rpBcN cen64-304d711-old/device/device.c cen64-304d711-new/device/device.c
*** cen64-304d711-old/device/device.c 2016-07-10 21:39:42.000000000 +0000
--- cen64-304d711-new/device/device.c 2016-07-12 22:48:31.378655896 +0000
*************** CEN64_THREAD_RETURN_TYPE run_rcp_thread(
*** 173,178 ****
--- 173,181 ----
}
// Sync up with the VR4300 thread.
+ #ifdef __linux__
+ _izy_eventfd_barrier_wait();
+ #else
cen64_mutex_lock(&device->sync_mutex);
if (!device->other_thread_is_waiting) {
*************** CEN64_THREAD_RETURN_TYPE run_rcp_thread(
*** 185,190 ****
--- 188,194 ----
cen64_mutex_unlock(&device->sync_mutex);
cen64_cv_signal(&device->sync_cv);
}
+ #endif
}
return CEN64_THREAD_RETURN_VAL;
*************** CEN64_THREAD_RETURN_TYPE run_vr4300_thre
*** 207,212 ****
--- 211,219 ----
}
// Sync up with the RCP thread.
+ #ifdef __linux__
+ _izy_eventfd_barrier_wait();
+ #else
cen64_mutex_lock(&device->sync_mutex);
if (!device->other_thread_is_waiting) {
*************** CEN64_THREAD_RETURN_TYPE run_vr4300_thre
*** 219,224 ****
--- 226,232 ----
cen64_mutex_unlock(&device->sync_mutex);
cen64_cv_signal(&device->sync_cv);
}
+ #endif
}
return CEN64_THREAD_RETURN_VAL;
*************** CEN64_THREAD_RETURN_TYPE run_vr4300_thre
*** 228,233 ****
--- 236,247 ----
int device_multithread_spin(struct cen64_device *device) {
cen64_thread vr4300_thread;
+ #ifdef __linux__
+ if(_izy_eventfd_barrier_create() == -1) {
+ printf("Failed to create the synchronization eventfd.\n");
+ return 1;
+ }
+ #else
device->other_thread_is_waiting = false;
if (cen64_mutex_create(&device->sync_mutex)) {
*************** int device_multithread_spin(struct cen64
*** 240,258 ****
--- 254,281 ----
cen64_mutex_destroy(&device->sync_mutex);
return 1;
}
+ #endif
if (cen64_thread_create(&vr4300_thread, run_vr4300_thread, device)) {
printf("Failed to create the VR4300 thread.\n");
+ #ifdef __linux__
+ _izy_eventfd_barrier_destroy();
+ #else
cen64_cv_destroy(&device->sync_cv);
cen64_mutex_destroy(&device->sync_mutex);
+ #endif
return 1;
}
run_rcp_thread(device);
cen64_thread_join(&vr4300_thread);
+ #ifdef __linux__
+ _izy_eventfd_barrier_destroy();
+ #else
cen64_cv_destroy(&device->sync_cv);
cen64_mutex_destroy(&device->sync_mutex);
+ #endif
return 0;
}
diff -rpBcN cen64-304d711-old/device/device.h cen64-304d711-new/device/device.h
*** cen64-304d711-old/device/device.h 2016-07-10 21:39:42.000000000 +0000
--- cen64-304d711-new/device/device.h 2016-07-12 22:48:34.130621300 +0000
***************
*** 26,31 ****
--- 26,34 ----
#include "thread.h"
#include "vi/controller.h"
#include "vr4300/cpu.h"
+ #ifdef __linux__
+ #include "os/linux/eventfd.h"
+ #endif
// Only used when passed -nointerface.
extern bool device_exit_requested;
*************** struct cen64_device {
*** 47,55 ****
--- 50,63 ----
int debug_sfd;
bool multithread;
+
+ #ifdef __linux__
+
+ #else
bool other_thread_is_waiting;
cen64_mutex sync_mutex;
cen64_cv sync_cv;
+ #endif
bool running;
};
diff -rpBcN cen64-304d711-old/os/linux/eventfd.h cen64-304d711-new/os/linux/eventfd.h
*** cen64-304d711-old/os/linux/eventfd.h 1970-01-01 01:00:00.000000000 +0100
--- cen64-304d711-new/os/linux/eventfd.h 2016-07-12 23:02:45.927912757 +0000
***************
*** 0 ****
--- 1,45 ----
+ /* eventfd-based barrier - Copyright/copyleft 2016 by Izy */
+ #ifndef _IZY_LINUX_EVENTFD_BARRIER
+ #define _IZY_LINUX_EVENTFD_BARRIER
+ #include "common.h"
+ #include <unistd.h>
+ #include <sys/eventfd.h>
+
+ static int _izy_eventfd_fd[2];//struct
+ static inline int _izy_eventfd_barrier_create()
+ {
+ _izy_eventfd_fd[0] = eventfd(0, EFD_CLOEXEC);
+ if(_izy_eventfd_fd[0] < 0) return -1;
+ _izy_eventfd_fd[1] = eventfd(0, EFD_CLOEXEC);
+ if(_izy_eventfd_fd[1] < 0) return 0*close(_izy_eventfd_fd[0])-1;
+ return 0;
+ }
+
+ static inline void _izy_eventfd_barrier_destroy()
+ {
+ close(_izy_eventfd_fd[0]);
+ close(_izy_eventfd_fd[1]);
+ }
+
+ static inline void _izy_eventfd_barrier_wait()
+ {
+ static uint64_t rd; // global
+ static const uint64_t wr = 1; // global
+ static unsigned counter = 0;//struct
+ static size_t index = 0;//struct
+ int myfd;
+ myfd = _izy_eventfd_fd[index];
+ if(likely(__sync_xor_and_fetch(&counter, 1) == 0))
+ {
+ index ^= 1;
+ __asm__ __volatile__("" ::: "memory");
+ while(write(myfd, &wr, sizeof(uint64_t))<=0);
+ }
+ else
+ {
+ while(read(myfd, &rd, sizeof(uint64_t))<=0);
+ }
+ }
+
+ #endif
+
Last edited by
izy on Wed Jul 13, 2016 9:43 am, edited 1 time in total.
-
MarathonMan
- Site Admin
- Posts: 692
- Joined: Fri Oct 04, 2013 4:49 pm
Post
by MarathonMan » Wed Feb 17, 2016 3:56 am
I will look at this more closely when I'm not so tired (it's midnight here

), but the threading barriers are not at all a bottleneck AFAIK.
-
izy
- Posts: 25
- Joined: Tue Jun 02, 2015 11:34 am
Post
by izy » Wed Feb 17, 2016 12:08 pm
That sounds a little weird to me: a thread barrier is always a bottleneck — it is a bottleneck by definition (a "barrier").
I did a sort of speed test in cen64 this way and I get a noticeable VI/s difference
./cen64 -multithread -headless ./pifdata.bin ./mgc_2011.z64
but I don't know if that is the right way to test anything
-
Snowstorm64
- Posts: 303
- Joined: Sun Oct 20, 2013 8:22 pm
Post
by Snowstorm64 » Wed Feb 17, 2016 2:49 pm
I have tested the patch on Super Mario 64, Super Smash Bros and Conker's BFD, either with the -multithread option or without it. Result: I haven't noticed any performance differences, unfortunately.
OS: Debian GNU/Linux Jessie (8.0)
CPU: Intel i7 4770K @ 3.5 GHz
Build: AVX (compiled from git)
-
izy
- Posts: 25
- Joined: Tue Jun 02, 2015 11:34 am
Post
by izy » Thu Feb 18, 2016 8:24 am
I don't know if this is the correct way to test cen64, but using Super Smash Bros i get these values:
Code: Select all
./cen64.original -multithread -headless ./pifdata.bin ./ssb.z64
Using NTSC-U PIFROM
VI/s: 0.01
VI/s: 12.95
VI/s: 10.53
VI/s: 10.73
VI/s: 10.61
VI/s: 10.64
VI/s: 10.66
VI/s: 9.76
VI/s: 9.54
VI/s: 9.51
VI/s: 9.48
VI/s: 9.71
VI/s: 9.45
VI/s: 9.39
VI/s: 9.45
VI/s: 9.57
VI/s: 9.49
VI/s: 9.47
VI/s: 9.47
VI/s: 9.64
VI/s: 9.30
VI/s: 9.41
VI/s: 9.53
VI/s: 9.45
VI/s: 9.42
VI/s: 9.71
VI/s: 9.57
VI/s: 9.36
VI/s: 9.51
^C
./cen64.eventfd -multithread -headless ./pifdata.bin ./ssb.z64
Using NTSC-U PIFROM
VI/s: 0.01
VI/s: 13.36
VI/s: 11.61
VI/s: 11.07
VI/s: 11.61
VI/s: 11.73
VI/s: 11.74
VI/s: 10.46
VI/s: 10.06
VI/s: 9.95
VI/s: 10.01
VI/s: 10.71
VI/s: 10.05
VI/s: 10.04
VI/s: 10.05
VI/s: 10.05
VI/s: 10.05
VI/s: 10.06
VI/s: 10.06
VI/s: 10.07
VI/s: 10.04
VI/s: 10.07
VI/s: 10.05
VI/s: 10.05
VI/s: 10.06
VI/s: 10.05
VI/s: 10.05
VI/s: 10.06
VI/s: 10.07
VI/s: 10.05
VI/s: 10.06
VI/s: 10.07
^C
From a purely theoretical point of view the patch should be advantageous. Also, the patch gives me more or less +½ VI/s with this ROM.
-
Snowstorm64
- Posts: 303
- Joined: Sun Oct 20, 2013 8:22 pm
Post
by Snowstorm64 » Thu Feb 18, 2016 12:02 pm
CEN64 vanilla:
Code: Select all
./cen64 -multithread -headless ./pifdata.bin './Super Smash Bros. (USA).z64'
Using NTSC-U PIFROM
VI/s: 0.03
VI/s: 99.26
VI/s: 62.28
VI/s: 47.31
VI/s: 45.69
VI/s: 41.71
VI/s: 42.33
VI/s: 42.88
VI/s: 32.26
VI/s: 41.99
CEN64 with eventfd:
Code: Select all
./cen64 -multithread -headless ./pifdata.bin './Super Smash Bros. (USA).z64'
VI/s: 0.03
VI/s: 108.03
VI/s: 63.78
VI/s: 47.89
VI/s: 46.04
VI/s: 41.68
VI/s: 41.94
VI/s: 41.55
VI/s: 31.36
VI/s: 42.07
Maybe eventfd helps slower CPUs more than faster ones?
OS: Debian GNU/Linux Jessie (8.0)
CPU: Intel i7 4770K @ 3.5 GHz
Build: AVX (compiled from git)
-
izy
- Posts: 25
- Joined: Tue Jun 02, 2015 11:34 am
Post
by izy » Fri Feb 19, 2016 1:57 pm
The eventfd version performs better than vanilla according to your results — unless programs running in the background biased the results. You log sixty entries (sixty VI/s values). Paste them into a LibreOffice spreadsheet and sum all sixty VI-per-second entries to obtain a VI-per-minute value.
Code: Select all
Second(time) pthread (VI/s) eventfd (VI/s)
1 0.03 0.03
2 99.26 108.03
3 62.28 63.78
4 47.31 47.89
5 45.69 46.04
6 41.71 41.68
7 42.33 41.94
8 42.88 41.55
9 32.26 31.36
10 41.99 42.07
455.74 VI/10s 464.37 VI/10s
pthread average VI per sec = 455.74/10 = 45.574
eventfd average VI per sec = 464.37/10 = 46.437
-
Snowstorm64
- Posts: 303
- Joined: Sun Oct 20, 2013 8:22 pm
Post
by Snowstorm64 » Sun Feb 21, 2016 9:40 am
I have run two tests without any software in the background; one was done after restarting my PC. Well, there's probably an issue — I think the audio broke a while after the start, so I had to turn off OpenAL's complaints for both versions. I don't know what to say about these results:
Code: Select all
Test 1
Total VI (pthread): 2848.08 - Average VI/s: 47.47
Total VI (eventfd): 2815.46 - Average VI/s: 46.92
Test 2
Total VI (pthread): 2833.02 - Average VI/s: 47.22
Total VI (eventfd): 2726.66 - Average VI/s: 45.44
-
Attachments
-
- test.zip
- This board doesn't allow .ods extension, so I have to compress it in a .zip archive.
- (14.04 KiB) Downloaded 358 times
OS: Debian GNU/Linux Jessie (8.0)
CPU: Intel i7 4770K @ 3.5 GHz
Build: AVX (compiled from git)
-
izy
- Posts: 25
- Joined: Tue Jun 02, 2015 11:34 am
Post
by izy » Sun Feb 21, 2016 1:08 pm
You didn't notice that the two eventfd tests gave different results, as if they ran on different computers. Something has surely gone wrong. It is OK if you disable the audio. See what the results are if you run: taskset 1 ./cen64 -multithread... and with taskset 3.
-
Snowstorm64
- Posts: 303
- Joined: Sun Oct 20, 2013 8:22 pm
Post
by Snowstorm64 » Mon Feb 22, 2016 1:16 pm
I had noticed them, and I agree with you — it's strange... However, I have re-run the benchmark with taskset 1/3 set as you said, so here are the results in the attachment. I noticed that cen64 with the eventfd-based barrier and taskset 1 was acting strangely, so I repeated this test, but this time without -headless, and it looks like the VI/s counter was spinning like crazy at every break (between scenes) and CEN64 hung at the end of the SSB64 intro. This doesn't happen with taskset 3 or with pthread (with either taskset).
-
Attachments
-
- test-2.zip
- (14.24 KiB) Downloaded 371 times
OS: Debian GNU/Linux Jessie (8.0)
CPU: Intel i7 4770K @ 3.5 GHz
Build: AVX (compiled from git)
-
izy
- Posts: 25
- Joined: Tue Jun 02, 2015 11:34 am
Post
by izy » Mon Feb 22, 2016 8:05 pm
301 VI/s is only possible if the RCP thread spins in free running and the barrier is skipped altogether. I suggest you pack up your PC and throw it in the trash. Get a TV instead.
-
Snowstorm64
- Posts: 303
- Joined: Sun Oct 20, 2013 8:22 pm
Post
by Snowstorm64 » Tue Feb 23, 2016 7:17 pm
How did you know that I'm currently looking for a good CRT TV?!
Jokes aside, it's not a problem of mine nor of my computer (it's very good), so this bug must lie elsewhere.

OS: Debian GNU/Linux Jessie (8.0)
CPU: Intel i7 4770K @ 3.5 GHz
Build: AVX (compiled from git)
-
izy
- Posts: 25
- Joined: Tue Jun 02, 2015 11:34 am
Post
by izy » Sun Apr 17, 2016 10:50 am
Snowstorm64, would you mind testing this little benchmark program
https://issues.apache.org/jira/browse/TS-2137 ? Because your results are really hard to understand (and likely wrong). The pthreads APIs cannot be faster than an eventfd. At least an atomic operation and a futex word are required for anything in pthreads. A futex implies a system call. A mutex requires a futex. A condition variable requires another futex. One system call should be faster than two syscalls, which is why the futex operation
FUTEX_WAKE_OP was added: to avoid the double syscall on waiting for a CV — but it is still a more complex design. And it's even more complex because you lock a mutex first, which might issue a first WAIT; then you use the CV queue to wait again, as in any use of a mutex plus a CV. A single read/write syscall on an eventfd can replace the mix of FUTEX_WAIT/FUTEX_WAKE_OP syscalls, and an eventfd is a much simpler component. Eventfd's only handicap is that it requires a real file descriptor, and this is why pthreads don't use it.
-
MarathonMan
- Site Admin
- Posts: 692
- Joined: Fri Oct 04, 2013 4:49 pm
Post
by MarathonMan » Sun Apr 17, 2016 4:01 pm
It may be the case that you're profiling something that is not a hot path. In such cases, you would need a significant amount of samples to offset the variations in the data.
-
Snowstorm64
- Posts: 303
- Joined: Sun Oct 20, 2013 8:22 pm
Post
by Snowstorm64 » Sun Jun 12, 2016 4:33 pm
Sorry for being too late, but here are the results:
Code: Select all
$ time ./eventfd_test 1000000
Real: 1,70s
User: 0,71s
System: 12,86s
Percent: 796%
Cmd: ./eventfd_test 1000000
Code: Select all
$ time ./pthread_cond_test 1000000
Real: 4,85s
User: 4,82s
System: 31,41s
Percent: 747%
Cmd: ./pthread_cond_test 1000000
OS: Debian GNU/Linux Jessie (8.0)
CPU: Intel i7 4770K @ 3.5 GHz
Build: AVX (compiled from git)
-
MarathonMan
- Site Admin
- Posts: 692
- Joined: Fri Oct 04, 2013 4:49 pm
Post
by MarathonMan » Tue Jun 14, 2016 10:19 am
Snowstorm64 wrote:Sorry for being too late, but here are the results:
Code: Select all
$ time ./eventfd_test 1000000
Real: 1,70s
User: 0,71s
System: 12,86s
Percent: 796%
Cmd: ./eventfd_test 1000000
Code: Select all
$ time ./pthread_cond_test 1000000
Real: 4,85s
User: 4,82s
System: 31,41s
Percent: 747%
Cmd: ./pthread_cond_test 1000000
Those results seem to support my hypothesis:
It may be the case that you're profiling something that is not a hot path. In such cases, you would need a significant amount of samples to offset the variations in the data.
Syncing doesn't occur frequently enough for this to make a large performance difference.
-
izy
- Posts: 25
- Joined: Tue Jun 02, 2015 11:34 am
Post
by izy » Wed Jul 13, 2016 9:45 am
I have updated the patch with a new version that does not use a counter. It uses XOR to toggle between two eventfds. Although it requires two fds and is full of global variables, it makes a single system call per passage. Indeed, in the end it gives little advantage over pthread, since this doesn't seem to be a critical enough hot path in cen64 — as per MarathonMan.
Who is online
Users browsing this forum: No registered users and 2 guests