[FFmpeg-devel] [PATCH] avfilter/vf_convolution: add 16-column operation for filter_column() to prepare for x86 SIMD.
chen
chenm003 at 163.com
Mon Dec 2 09:17:14 EET 2019
I have a little suggest on filter_column16(..) [the function]
Firstly, the function is confused with filter16_column(..)
Secondly, the function's algoritym based on row direction, it means reduced address calculate operators and less cache performance, cost of them may more than calculate cost.
For more clear, I give my toy in here, I verify my patch with cmdline in below
./ffmpeg -s 1280*720 -pix_fmt yuv420p -i ~/git/sister_720x1280.yuv -vf convolution="1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1/45:1/45:1/45:1/45:1:2:3:4:column:column:column:column" -an -vframes 2000 -benchmark -f null /dev/null
The result:
Origin version: utime=7.359s stime=0.138s rtime=1.664s
Song version: utime=5.320s stime=0.133s rtime=1.250s
My version: utime=2.930s stime=0.122s rtime=0.794s
My patch based on today head, I have also corrected Song's merge conflict.
************ Patch Start ********************
diff --git a/libavfilter/vf_convolution.c b/libavfilter/vf_convolution.c
index 5909fea..708732a 100644
--- a/libavfilter/vf_convolution.c
+++ b/libavfilter/vf_convolution.c
@@ -521,6 +521,61 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
continue;
}
+ if (mode == MATRIX_COLUMN && s->filter[plane] != filter_column){
+ for (y = slice_start; y < slice_end - 16; y+=16) {
+ const int xoff = (y - slice_start) * bpc;
+ const int yoff = radius * stride;
+ for (x = 0; x < radius; x++) {
+ const int xoff = (y - slice_start) * bpc;
+ const int yoff = x * stride;
+
+ s->setup[plane](radius, c, src, stride, x, width, y, height, bpc);
+ s->filter[plane](dst + yoff + xoff, 1, rdiv,
+ bias, matrix, c, 16, radius,
+ dstride, stride);
+ }
+ s->setup[plane](radius, c, src, stride, radius, width, y, height, bpc);
+ s->filter[plane](dst + yoff + xoff, sizew - 2 * radius,
+ rdiv, bias, matrix, c, 16, radius,
+ dstride, stride);
+ for (x = sizew - radius; x < sizew; x++) {
+ const int xoff = (y - slice_start) * bpc;
+ const int yoff = x * stride;
+
+ s->setup[plane](radius, c, src, stride, x, width, y, height, bpc);
+ s->filter[plane](dst + yoff + xoff, 1, rdiv,
+ bias, matrix, c, 16, radius,
+ dstride, stride);
+ }
+ }
+ if (y < slice_end){
+ const int xoff = (y - slice_start) * bpc;
+ const int yoff = radius * stride;
+ for (x = 0; x < radius; x++) {
+ const int xoff = (y - slice_start) * bpc;
+ const int yoff = x * stride;
+
+ s->setup[plane](radius, c, src, stride, x, width, y, height, bpc);
+ s->filter[plane](dst + yoff + xoff, 1, rdiv,
+ bias, matrix, c, slice_end - y, radius,
+ dstride, stride);
+ }
+ s->setup[plane](radius, c, src, stride, radius, width, y, height, bpc);
+ s->filter[plane](dst + yoff + xoff, sizew - 2 * radius,
+ rdiv, bias, matrix, c, slice_end - y, radius,
+ dstride, stride);
+ for (x = sizew - radius; x < sizew; x++) {
+ const int xoff = (y - slice_start) * bpc;
+ const int yoff = x * stride;
+
+ s->setup[plane](radius, c, src, stride, x, width, y, height, bpc);
+ s->filter[plane](dst + yoff + xoff, 1, rdiv,
+ bias, matrix, c, slice_end - y, radius,
+ dstride, stride);
+ }
+ }
+ }
+ else {
for (y = slice_start; y < slice_end; y++) {
const int xoff = mode == MATRIX_COLUMN ? (y - slice_start) * bpc : radius * bpc;
const int yoff = mode == MATRIX_COLUMN ? radius * stride : 0;
@@ -551,6 +606,7 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
dst += dstride;
}
}
+ }
return 0;
}
diff --git a/libavfilter/x86/vf_convolution_init.c b/libavfilter/x86/vf_convolution_init.c
index 5143240..fcc9ae8 100644
--- a/libavfilter/x86/vf_convolution_init.c
+++ b/libavfilter/x86/vf_convolution_init.c
@@ -29,6 +29,56 @@ void ff_filter_3x3_sse4(uint8_t *dst, int width,
const uint8_t *c[], int peak, int radius,
int dstride, int stride);
+static void filter_column16(uint8_t *dst, int height,
+ float rdiv, float bias, const int *const matrix,
+ const uint8_t *c[], int length, int radius,
+ int dstride, int stride)
+{
+ int y, off16;
+
+#if 1
+ #define __assume(cond) do { if (!(cond)) __builtin_unreachable(); } while (0)
+ assert(length <= 16);
+ __assume(length <= 16);
+ // NOTE: alignment to 64-bytes, so 16 of int can be fill into full of a cache line
+ int __attribute__ ((aligned(64))) sum[16];
+ for (y = 0; y < height; y++) {
+ int i;
+ memset(sum, 0, sizeof(sum));
+
+ for (i = 0; i < 2 * radius + 1; i++) {
+ for (off16 = 0; off16 < length; off16++){
+ sum[off16] += c[i][0 + y * stride + off16] * matrix[i];
+ }
+ }
+
+ for (off16 = 0; off16 < length; off16++){
+ sum[off16] = (int)(sum[off16] * rdiv + bias + 0.5f);
+ dst[off16] = av_clip_uint8(sum[off16]);
+ }
+ dst += dstride;
+
+ }
+ #undef __assume
+
+#else
+
+ assert(length <= 16);
+ for (y = 0; y < height; y++) {
+ for (off16 = 0; off16 < length; off16++){
+ int i, sum = 0;
+
+ for (i = 0; i < 2 * radius + 1; i++)
+ sum += c[i][0 + y * stride + off16] * matrix[i];
+
+ sum = (int)(sum * rdiv + bias + 0.5f);
+ dst[off16] = av_clip_uint8(sum);
+ }
+ dst += dstride;
+ }
+#endif
+}
+
av_cold void ff_convolution_init_x86(ConvolutionContext *s)
{
#if ARCH_X86_64
@@ -41,6 +91,8 @@ av_cold void ff_convolution_init_x86(ConvolutionContext *s)
s->filter[i] = ff_filter_3x3_sse4;
}
}
+ if (s->mode[i] == MATRIX_COLUMN)
+ s->filter[i] = filter_column16;
}
#endif
}
************ End ********************
At 2019-12-02 14:38:04, "Song, Ruiling" <ruiling.song at intel.com> wrote:
>> -----Original Message-----
>> From: ffmpeg-devel <ffmpeg-devel-bounces at ffmpeg.org> On Behalf Of
>> xujunzz at sjtu.edu.cn
>> Sent: Wednesday, November 27, 2019 10:56 PM
>> To: ffmpeg-devel at ffmpeg.org
>> Cc: xujunzz at sjtu.edu.cn
>> Subject: [FFmpeg-devel] [PATCH] avfilter/vf_convolution: add 16-column
>> operation for filter_column() to prepare for x86 SIMD.
>>
>> From: Xu Jun <xujunzz at sjtu.edu.cn>
>>
>> In order to add x86 SIMD for filter_column(), I write a C function which
>> processes 16 columns at a time.
>>
>> Signed-off-by: Xu Jun <xujunzz at sjtu.edu.cn>
>> ---
>> libavfilter/vf_convolution.c | 56 +++++++++++++++++++++++++++
>> libavfilter/x86/vf_convolution_init.c | 23 +++++++++++
>> 2 files changed, 79 insertions(+)
>>
>> diff --git a/libavfilter/vf_convolution.c b/libavfilter/vf_convolution.c
>> index d022f1a04a..5291415d48 100644
>> --- a/libavfilter/vf_convolution.c
>> +++ b/libavfilter/vf_convolution.c
>> @@ -520,6 +520,61 @@ static int filter_slice(AVFilterContext *ctx, void *arg,
>> int jobnr, int nb_jobs)
>> continue;
>> }
>>
>> + if (mode == MATRIX_COLUMN && s->filter[plane] != filter_column){
>> + for (y = slice_start; y < slice_end - 16; y+=16) {
>Please take care of the coding style there should be white-space between variables and operators.
>And also I think this piece of change make it harder to maintain, let's try to avoid code duplicate as much as we can.
>> + const int xoff = (y - slice_start) * bpc;
>> + const int yoff = radius * stride;
>> + for (x = 0; x < radius; x++) {
>> + const int xoff = (y - slice_start) * bpc;
>> + const int yoff = x * stride;
>> +
>> + s->setup[plane](radius, c, src, stride, x, width, y, height, bpc);
>> + s->filter[plane](dst + yoff + xoff, 1, rdiv,
>> + bias, matrix, c, 16, radius,
>> + dstride, stride);
>> + }
>> + s->setup[plane](radius, c, src, stride, radius, width, y, height, bpc);
>> + s->filter[plane](dst + yoff + xoff, sizew - 2 * radius,
>> + rdiv, bias, matrix, c, 16, radius,
>> + dstride, stride);
>> + for (x = sizew - radius; x < sizew; x++) {
>> + const int xoff = (y - slice_start) * bpc;
>> + const int yoff = x * stride;
>> +
>> + s->setup[plane](radius, c, src, stride, x, width, y, height, bpc);
>> + s->filter[plane](dst + yoff + xoff, 1, rdiv,
>> + bias, matrix, c, 16, radius,
>> + dstride, stride);
>> + }
>> + }
>> + if (y < slice_end){
>> + const int xoff = (y - slice_start) * bpc;
>> + const int yoff = radius * stride;
>> + for (x = 0; x < radius; x++) {
>> + const int xoff = (y - slice_start) * bpc;
>> + const int yoff = x * stride;
>> +
>> + s->setup[plane](radius, c, src, stride, x, width, y, height, bpc);
>> + s->filter[plane](dst + yoff + xoff, 1, rdiv,
>> + bias, matrix, c, slice_end - y, radius,
>> + dstride, stride);
>> + }
>> + s->setup[plane](radius, c, src, stride, radius, width, y, height, bpc);
>> + s->filter[plane](dst + yoff + xoff, sizew - 2 * radius,
>> + rdiv, bias, matrix, c, slice_end - y, radius,
>> + dstride, stride);
>> + for (x = sizew - radius; x < sizew; x++) {
>> + const int xoff = (y - slice_start) * bpc;
>> + const int yoff = x * stride;
>> +
>> + s->setup[plane](radius, c, src, stride, x, width, y, height, bpc);
>> + s->filter[plane](dst + yoff + xoff, 1, rdiv,
>> + bias, matrix, c, slice_end - y, radius,
>> + dstride, stride);
>> + }
>> + }
>> + }
>> + else {
>> for (y = slice_start; y < slice_end; y++) {
>> const int xoff = mode == MATRIX_COLUMN ? (y - slice_start) * bpc :
>> radius * bpc;
>> const int yoff = mode == MATRIX_COLUMN ? radius * stride : 0;
>> @@ -550,6 +605,7 @@ static int filter_slice(AVFilterContext *ctx, void *arg,
>> int jobnr, int nb_jobs)
>> dst += dstride;
>> }
>> }
>> + }
>>
>> return 0;
>> }
>> diff --git a/libavfilter/x86/vf_convolution_init.c
>> b/libavfilter/x86/vf_convolution_init.c
>> index d1e8c90ceb..6b1c2f0e9f 100644
>> --- a/libavfilter/x86/vf_convolution_init.c
>> +++ b/libavfilter/x86/vf_convolution_init.c
>> @@ -34,6 +34,27 @@ void ff_filter_row_sse4(uint8_t *dst, int width,
>> const uint8_t *c[], int peak, int radius,
>> int dstride, int stride);
>>
>This C code should not be in the x86-specific file.
>
>Ruiling
>> +static void filter_column16(uint8_t *dst, int height,
>> + float rdiv, float bias, const int *const matrix,
>> + const uint8_t *c[], int length, int radius,
>> + int dstride, int stride)
>> +{
>> + int y, off16;
>> +
>> + for (y = 0; y < height; y++) {
>> + for (off16 = 0; off16 < length; off16++){
>> + int i, sum = 0;
>> +
>> + for (i = 0; i < 2 * radius + 1; i++)
>> + sum += c[i][0 + y * stride + off16] * matrix[i];
>> +
>> + sum = (int)(sum * rdiv + bias + 0.5f);
>> + dst[off16] = av_clip_uint8(sum);
>> + }
>> + dst += dstride;
>> + }
>> +
>> +}
>>
>> av_cold void ff_convolution_init_x86(ConvolutionContext *s)
>> {
>> @@ -51,6 +72,8 @@ av_cold void
>> ff_convolution_init_x86(ConvolutionContext *s)
>> if (EXTERNAL_SSE4(cpu_flags))
>> s->filter[i] = ff_filter_row_sse4;
>> }
>> + if (s->mode[i] == MATRIX_COLUMN)
>> + s->filter[i] = filter_column16;
>> }
>> #endif
>> }
>> --
>> 2.17.1
>>
More information about the ffmpeg-devel
mailing list