[FFmpeg-devel] [PATCH] SSE2 version of vf_idet's filter_line()

Pascal Massimino pascal.massimino at gmail.com
Wed Sep 3 19:05:48 CEST 2014


Clément,

On Wed, Sep 3, 2014 at 6:19 PM, Clément Bœsch <u at pkh.me> wrote:

> On Wed, Sep 03, 2014 at 05:50:32PM +0200, Pascal Massimino wrote:
> [...]
> > removed this step in both mmx and sse2 version.
> >
> > -> new patch attached.
> >
> > /skal
>
> > From d2249b05b4a881ec3c9de8fc105b2a40c680a0ea Mon Sep 17 00:00:00 2001
> > From: skal <pascal.massimino at gmail.com>
> > Date: Wed, 3 Sep 2014 11:02:32 +0200
> > Subject: [PATCH] MMX/MMXEXT/SSE2 implementation of idet's filter_line()
> >
> > integration by Neil Birkbeck, with help from Vitor Sessak.
> > core SSE2 loop by Skal (pascal.massimino at gmail.com)
> > ---
> >  MAINTAINERS                    |   1 +
> >  libavfilter/vf_idet.c          |  38 ++----------
> >  libavfilter/vf_idet.h          |  58 ++++++++++++++++++
> >  libavfilter/x86/Makefile       |   2 +
> >  libavfilter/x86/vf_idet.asm    | 134
> +++++++++++++++++++++++++++++++++++++++++
> >  libavfilter/x86/vf_idet_init.c |  71 ++++++++++++++++++++++
> >  6 files changed, 272 insertions(+), 32 deletions(-)
> >  create mode 100644 libavfilter/vf_idet.h
> >  create mode 100644 libavfilter/x86/vf_idet.asm
> >  create mode 100644 libavfilter/x86/vf_idet_init.c
> >
> [...]
> > @@ -77,7 +48,7 @@ static const char *type2str(Type type)
> >      return NULL;
> >  }
> >
> > -static int filter_line_c(const uint8_t *a, const uint8_t *b, const
> uint8_t *c, int w)
> > +int ff_idet_filter_line_c(const uint8_t *a, const uint8_t *b, const
> uint8_t *c, int w)
>
> why do you need to export it?
>

it's used to finish the left-overs in mmx/mmxext/sse2. See FUNC_MAIN_DECL
macro.


>
> >  {
> >      int x;
> >      int ret=0;
> > @@ -271,7 +242,10 @@ static av_cold int init(AVFilterContext *ctx)
> >      idet->last_type = UNDETERMINED;
> >      memset(idet->history, UNDETERMINED, HIST_SIZE);
> >
> > -    idet->filter_line = filter_line_c;
> > +    idet->filter_line = ff_idet_filter_line_c;
> > +
> > +    if (ARCH_X86)
> > +      ff_idet_init_x86(idet);
>
> wrong indent
>
>
fixed


> [...]
> > diff --git a/libavfilter/x86/vf_idet.asm b/libavfilter/x86/vf_idet.asm
> > new file mode 100644
> > index 0000000..3fb265c
> > --- /dev/null
> > +++ b/libavfilter/x86/vf_idet.asm
> > @@ -0,0 +1,134 @@
> > +;;
> *****************************************************************************
> > +;; * x86-optimized functions for idet filter
> > +;; *
> > +;; * This file is part of FFmpeg.
> > +;; *
> > +;; * FFmpeg is free software; you can redistribute it and/or modify
> > +;; * it under the terms of the GNU General Public License as published
> by
> > +;; * the Free Software Foundation; either version 2 of the License, or
> > +;; * (at your option) any later version.
> > +;; *
> > +;; * FFmpeg is distributed in the hope that it will be useful,
> > +;; * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +;; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > +;; * GNU General Public License for more details.
> > +;; *
> > +;; * You should have received a copy of the GNU General Public License
> along
> > +;; * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
> > +;; * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
> > +;;
> ******************************************************************************
> > +
> > +%include "libavutil/x86/x86util.asm"
> > +
>
> > +SECTION .text
>
> SECTION_TEXT
>

fixed


>
> > +
> > +; Implementation that does 8-bytes at a time using single-word
> operations.
> > +%macro IDET_FILTER_LINE 0
> > +cglobal idet_filter_line, 4, 8, 6, a, b, c, width, index
> > +    xor       indexq, indexq
> > +%define   m_zero m7
> > +%define   m_sum  m6
> > +    pxor      m_sum, m_sum
> > +    pxor      m_zero, m_zero
> > +
> > +.loop:
>
> > +    movq      m0, [aq+indexq*1]
>
> movu
>
> > +    movq      m1, m0
> > +    punpcklbw m0, m_zero
> > +    punpckhbw m1, m_zero
>
> Here and later, you can do:
>
> punpckhbw m1, m0, m_zero
> punpcklbw m0, m_zero
>

done


>
> > +
> > +    movq      m3, [cq+indexq*1]
>
> movu
>
> > +    movq      m4, m3
>
> mova
>
> etc
>
> > +    punpcklbw m3, m_zero
> > +    punpckhbw m4, m_zero
> > +
> > +    paddsw    m0, m3
> > +    paddsw    m1, m4
> > +
> > +    movq      m3, [bq+indexq*1]
> > +    movq      m4, m3
> > +    punpcklbw m3, m_zero
> > +    punpckhbw m4, m_zero
> > +
> > +    paddw     m3, m3
> > +    paddw     m4, m4
> > +    psubsw    m0, m3
> > +    psubsw    m1, m4
> > +
>
> > +    ABS1      m0, m5
> > +    ABS1      m1, m5
>
> ABS2?
>

ABS2 requires the two tmp registers to be different (can't use m5 for both).


>
> > +    paddw     m0, m1
> > +    movq      m1, m0
> > +    punpcklwd m0, m_zero
> > +    punpckhwd m1, m_zero
> > +    paddd     m0, m1
> > +    paddd     m_sum, m0
> > +
> > +    add       indexq, 0x8
>
> > +    CMP       widthq, indexq
>
> Someone needs to confirm this, but I think you'll need to make width a
> ptrdiff_t and not an int
>

changed to widthd/indexd, that's enough.


>
> Also... stupid question but what's CMP?
>

it's equivalent to 'cmp DWORD' here iirc.



> > +    jg        .loop
> > +
> > +    movq      m0, m_sum
> > +    psrlq     m_sum, 0x20
> > +    paddq     m0, m_sum
> > +    movd      eax, m0
> > +    RET
> > +%endmacro
> > +
> > +%if ARCH_X86_32
> > +INIT_MMX mmxext
> > +IDET_FILTER_LINE
> > +
> > +INIT_MMX mmx
> > +IDET_FILTER_LINE
> > +%endif
> > +
> > +;; SSE2 8-bit implementation that does 16-bytes at a time:
> > +;;
>
> > +;; const int w2 = w >> 4
> > +;; for (int i = 0; i < w2, ++i) {
> > +;;   const __m128i A = _mm_loadu_si128(&p_a[i])
> > +;;   const __m128i B = _mm_loadu_si128(&p_b[i])
> > +;;   const __m128i C = _mm_loadu_si128(&p_c[i])
> > +;;   const __m128i ab = _mm_subs_epu8(A, B)
> > +;;   const __m128i ba = _mm_subs_epu8(B, A)
> > +;;   const __m128i bc = _mm_subs_epu8(B, C)
> > +;;   const __m128i cb = _mm_subs_epu8(C, B)
> > +;;   const __m128i s1 = _mm_sad_epu8(ab, bc)
> > +;;   const __m128i s2 = _mm_sad_epu8(ba, cb)
> > +;;   result1 = _mm_add_epi64(result1, s1)
> > +;;   result2 = _mm_add_epi64(result2, s2)
> > +;; }
>
> please drop this
>
>
done



> > +INIT_XMM sse2
> > +cglobal idet_filter_line, 4, 8, 6, a, b, c, width, index, total
> > +    xor       indexq, indexq
> > +    pxor      m0, m0
> > +    pxor      m1, m1
> > +
> > +.sse2_loop:
> > +    movdqu    m2, [bq+indexq*1]  ; B
> > +    movdqu    m3, [aq+indexq*1]  ; A
>
> movu
>

done


>
> > +    movdqa    m5, m2
> > +    movdqa    m6, m2
> > +    movdqa    m4, m3
>
> mova
>
>
done


> etc
>
> > +    psubusb   m5, m3             ; ba
>
> psubusb m5, m2, m3 and drop the previous mov
>

done


>
> > +
> > +    movdqu    m3, [cq+indexq*1]  ; C
> > +    add       indexq, 0x10
> > +    psubusb   m4, m2             ; ab
> > +    CMP       indexq, widthq
> > +
> > +    psubusb   m6, m3             ; bc
> > +    psubusb   m3, m2             ; cb
> > +
> > +    psadbw    m4, m6             ; |ab - bc|
> > +    paddq     m0, m4
> > +    psadbw    m5, m3             ; |ba - cb|
> > +    paddq     m1, m5
> > +    jl       .sse2_loop
> > +
> > +    paddq     m0, m1
> > +    movhlps   m1, m0
> > +    paddq     m0, m1
> > +    movd      eax, m0
> > +    RET
> > diff --git a/libavfilter/x86/vf_idet_init.c
> b/libavfilter/x86/vf_idet_init.c
> > new file mode 100644
> > index 0000000..98086ef
> > --- /dev/null
> > +++ b/libavfilter/x86/vf_idet_init.c
> > @@ -0,0 +1,71 @@
> > +/*
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > + * GNU General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU General Public License
> along
> > + * with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301 USA
> > + */
> > +
> > +#include "libavutil/attributes.h"
> > +#include "libavutil/cpu.h"
> > +#include "libavutil/mem.h"
> > +#include "libavutil/x86/asm.h"
> > +#include "libavutil/x86/cpu.h"
> > +#include "libavfilter/vf_idet.h"
> > +
> > +/* declares main callable idet_filter_line_{mmx,mmxext,sse2}() */
> > +#define FUNC_MAIN_DECL(KIND, SPAN)
> \
> > +int ff_idet_filter_line_##KIND(const uint8_t *a, const uint8_t *b,
> \
> > +                               const uint8_t *c, int w);
> \
> > +static int idet_filter_line_##KIND(const uint8_t *a, const uint8_t *b,
> \
> > +                                   const uint8_t *c, int w) {
>  \
> > +  int sum = 0;
> \
> > +  const int left_over = w % SPAN;
>  \
> > +  w -= left_over;
>  \
> > +  if (w > 0)
> \
> > +      sum += ff_idet_filter_line_##KIND(a, b, c, w);
> \
> > +  if (left_over > 0)
> \
> > +      sum += ff_idet_filter_line_c(a + w, b + w, c + w, left_over);
>  \
> > +  return sum;
>  \
> > +}
>
> wrong indent, 4 spaces
>

done eveywhere.


thanks! new patch attached.

/skal


>
> [...]
>
> --
> Clément B.
>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0001-MMX-MMXEXT-SSE2-implementation-of-idet-s-filter_line.patch
Type: text/x-patch
Size: 11946 bytes
Desc: not available
URL: <https://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20140903/e0027c98/attachment.bin>


More information about the ffmpeg-devel mailing list