Commit eb1fccd5 authored by Siarhei Siamashka

New variants of block based C backwards copy

Because some processors are sensitive to the order of memory
accesses, add a few more variants of memory buffer backwards
copy which do sequential memory writes in the forward direction
inside each sub-block of a certain size. The most interesting
sizes of such sub-blocks are 32 and 64 bytes, because they match
the most frequently used CPU cache line sizes.
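
The idea, stripped of the unrolling, the temporaries and the volatile
destination pointer used by the real routines in the diff below, is
roughly the following (illustration only; the function name is made up):

    #include <stdint.h>

    /* Walk 32 byte blocks from the end of the buffer towards the
     * beginning, but copy the 8 byte words inside each block in the
     * forward direction. 'size' is in bytes and is assumed to be a
     * multiple of 32. */
    static void copy_backwards_bs32_sketch(int64_t *dst, const int64_t *src,
                                           int size)
    {
        int block, i;
        for (block = size / 8 - 4; block >= 0; block -= 4)
            for (i = 0; i < 4; i++)          /* forward within the block */
                dst[block + i] = src[block + i];
    }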

Example reports:

== ARM Cortex A7 ==
 C copy backwards                                     :    266.5 MB/s
 C copy backwards (32 byte blocks)                    :   1015.6 MB/s
 C copy backwards (64 byte blocks)                    :   1045.7 MB/s
 C copy                                               :   1033.3 MB/s

== ARM Cortex A15 ==
 C copy backwards                                     :   1438.5 MB/s
 C copy backwards (32 byte blocks)                    :   1497.5 MB/s
 C copy backwards (64 byte blocks)                    :   2643.2 MB/s
 C copy                                               :   2985.8 MB/s
parent ada1db8c
@@ -158,6 +158,14 @@ void bandwidth_bench(int64_t *dstbuf, int64_t *srcbuf, int64_t *tmpbuf,
                            indent_prefix, 0,
                            aligned_block_copy_backwards,
                            "C copy backwards");
+    bandwidth_bench_helper(dstbuf, srcbuf, tmpbuf, size, blocksize,
+                           indent_prefix, 0,
+                           aligned_block_copy_backwards_bs32,
+                           "C copy backwards (32 byte blocks)");
+    bandwidth_bench_helper(dstbuf, srcbuf, tmpbuf, size, blocksize,
+                           indent_prefix, 0,
+                           aligned_block_copy_backwards_bs64,
+                           "C copy backwards (64 byte blocks)");
     bandwidth_bench_helper(dstbuf, srcbuf, tmpbuf, size, blocksize,
                            indent_prefix, 0,
                            aligned_block_copy,
@@ -84,6 +84,76 @@ void aligned_block_copy_backwards(int64_t * __restrict dst_,
     }
 }
+
+/*
+ * Walk memory addresses in the backwards direction, but still
+ * copy each individual 32 byte block in the forward direction.
+ */
+void aligned_block_copy_backwards_bs32(int64_t * __restrict dst_,
+                                       int64_t * __restrict src,
+                                       int size)
+{
+    volatile int64_t *dst = dst_;
+    int64_t t1, t2, t3, t4;
+    src += size / 8 - 8;
+    dst += size / 8 - 8;
+    while ((size -= 64) >= 0)
+    {
+        t1 = src[4];
+        t2 = src[5];
+        t3 = src[6];
+        t4 = src[7];
+        dst[4] = t1;
+        dst[5] = t2;
+        dst[6] = t3;
+        dst[7] = t4;
+        t1 = src[0];
+        t2 = src[1];
+        t3 = src[2];
+        t4 = src[3];
+        dst[0] = t1;
+        dst[1] = t2;
+        dst[2] = t3;
+        dst[3] = t4;
+        src -= 8;
+        dst -= 8;
+    }
+}
+
+/*
+ * Walk memory addresses in the backwards direction, but still
+ * copy each individual 64 byte block in the forward direction.
+ */
+void aligned_block_copy_backwards_bs64(int64_t * __restrict dst_,
+                                       int64_t * __restrict src,
+                                       int size)
+{
+    volatile int64_t *dst = dst_;
+    int64_t t1, t2, t3, t4;
+    src += size / 8 - 8;
+    dst += size / 8 - 8;
+    while ((size -= 64) >= 0)
+    {
+        t1 = src[0];
+        t2 = src[1];
+        t3 = src[2];
+        t4 = src[3];
+        dst[0] = t1;
+        dst[1] = t2;
+        dst[2] = t3;
+        dst[3] = t4;
+        t1 = src[4];
+        t2 = src[5];
+        t3 = src[6];
+        t4 = src[7];
+        dst[4] = t1;
+        dst[5] = t2;
+        dst[6] = t3;
+        dst[7] = t4;
+        src -= 8;
+        dst -= 8;
+    }
+}
 void aligned_block_copy_pf32(int64_t * __restrict dst_,
                              int64_t * __restrict src,
                              int size)
@@ -36,6 +36,12 @@ void aligned_block_copy(int64_t * __restrict dst,
 void aligned_block_copy_backwards(int64_t * __restrict dst,
                                   int64_t * __restrict src,
                                   int size);
+void aligned_block_copy_backwards_bs32(int64_t * __restrict dst,
+                                       int64_t * __restrict src,
+                                       int size);
+void aligned_block_copy_backwards_bs64(int64_t * __restrict dst,
+                                       int64_t * __restrict src,
+                                       int size);
 void aligned_block_copy_pf32(int64_t * __restrict dst,
                              int64_t * __restrict src,
                              int size);
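
A minimal standalone usage sketch for one of the new routines (illustrative
only: the prototype is copied from the header change above, while the
allocation and buffer setup are assumptions, not part of this commit; the
program must be linked against the file that defines the routine):

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    /* Prototype as declared in the header diff above. */
    void aligned_block_copy_backwards_bs64(int64_t * __restrict dst,
                                           int64_t * __restrict src,
                                           int size);

    int main(void)
    {
        int size = 1 << 20;              /* 1 MiB, a multiple of 64 bytes */
        int64_t *src = malloc(size);
        int64_t *dst = malloc(size);
        int ok;

        if (!src || !dst)
            return 1;
        memset(src, 0x5a, size);
        /* Copies 'size' bytes in 64 byte blocks, walking from the end of
         * the buffers towards the beginning. */
        aligned_block_copy_backwards_bs64(dst, src, size);
        ok = memcmp(dst, src, size) == 0;
        free(src);
        free(dst);
        return ok ? 0 : 1;
    }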