[FFmpeg-devel] [RFC PATCH] avfilter/fastdeint: import simple cpu-optimized deinterlacing algorithms from VLC

Mon Sep 9 23:40:33 EEST 2019

On 9/9/2019 5:12 PM, Aman Gupta wrote:
> From: Aman Gupta <aman at tmm1.net>
> 
> These are simple algorithms which can be run efficiently
> on low-powered devices to produce deinterlaced images.
> 
> Signed-off-by: Aman Gupta <aman at tmm1.net>
> ---
>  doc/filters.texi                 |  27 ++
>  libavfilter/Makefile             |   1 +
>  libavfilter/aarch64/Makefile     |   1 +
>  libavfilter/aarch64/merge_neon.S |  98 ++++++
>  libavfilter/allfilters.c         |   1 +
>  libavfilter/arm/Makefile         |   3 +
>  libavfilter/arm/merge_armv6.S    |  70 ++++
>  libavfilter/arm/merge_neon.S     | 109 ++++++
>  libavfilter/vf_fastdeint.c       | 588 +++++++++++++++++++++++++++++++
>  9 files changed, 898 insertions(+)
>  create mode 100644 libavfilter/aarch64/merge_neon.S
>  create mode 100644 libavfilter/arm/Makefile
>  create mode 100644 libavfilter/arm/merge_armv6.S
>  create mode 100644 libavfilter/arm/merge_neon.S
>  create mode 100644 libavfilter/vf_fastdeint.c

The asm code should be split out into a separate patch.

> 
> diff --git a/doc/filters.texi b/doc/filters.texi
> index 6c81e1da40..55d9adeb81 100644
> --- a/doc/filters.texi
> +++ b/doc/filters.texi
> @@ -9796,6 +9796,33 @@ fade=t=in:st=5.5:d=0.5
>  
>  @end itemize
>  
> + at section fastdeint
> +Fast deinterlacing algorithms.
> +
> + at table @option
> + at item mode
> +Deinterlacing algorithm to use.
> +
> +It accepts the following values:
> + at table @samp
> + at item discard
> +Discard bottom frame.
> +
> + at item mean
> +Half resolution blender.
> +
> + at item blend
> +Full resolution blender.
> +
> + at item bob
> +Bob doubler.
> +
> + at item linear
> +Bob doubler with linear interpolation.
> + at end table
> +
> + at end table
> +
>  @section fftdnoiz
>  Denoise frames using 3D FFT (frequency domain filtering).
>  
> diff --git a/libavfilter/Makefile b/libavfilter/Makefile
> index 3ef4191d9a..a2b3566ec0 100644
> --- a/libavfilter/Makefile
> +++ b/libavfilter/Makefile
> @@ -234,6 +234,7 @@ OBJS-$(CONFIG_EROSION_OPENCL_FILTER)         += vf_neighbor_opencl.o opencl.o \
>                                                  opencl/neighbor.o
>  OBJS-$(CONFIG_EXTRACTPLANES_FILTER)          += vf_extractplanes.o
>  OBJS-$(CONFIG_FADE_FILTER)                   += vf_fade.o
> +OBJS-$(CONFIG_FASTDEINT_FILTER)              += vf_fastdeint.o
>  OBJS-$(CONFIG_FFTDNOIZ_FILTER)               += vf_fftdnoiz.o
>  OBJS-$(CONFIG_FFTFILT_FILTER)                += vf_fftfilt.o
>  OBJS-$(CONFIG_FIELD_FILTER)                  += vf_field.o
> diff --git a/libavfilter/aarch64/Makefile b/libavfilter/aarch64/Makefile
> index b58daa3a3f..2b0ad92893 100644
> --- a/libavfilter/aarch64/Makefile
> +++ b/libavfilter/aarch64/Makefile
> @@ -1,3 +1,4 @@
>  OBJS-$(CONFIG_NLMEANS_FILTER)                += aarch64/vf_nlmeans_init.o
>  
> +NEON-OBJS-$(CONFIG_FASTDEINT_FILTER)         += aarch64/merge_neon.o
>  NEON-OBJS-$(CONFIG_NLMEANS_FILTER)           += aarch64/vf_nlmeans_neon.o
> diff --git a/libavfilter/aarch64/merge_neon.S b/libavfilter/aarch64/merge_neon.S
> new file mode 100644
> index 0000000000..62377331a4
> --- /dev/null
> +++ b/libavfilter/aarch64/merge_neon.S
> @@ -0,0 +1,98 @@
> +/*
> + * Copyright (c) 2009-2016 Rémi Denis-Courmont, Janne Grunau, VLC authors
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/aarch64/asm.S"
> +
> +#define dest x0
> +#define src1 x1
> +#define src2 x2
> +#define size x3
> +
> +        .align 2
> +        // NOTE: Offset and pitch must be multiple of 16-bytes.
> +function ff_merge8_neon, export=1
> +        ands            x5, size, #~63
> +        b.eq            2f
> +        mov             x10, #64
> +        add             x11, src1, #32
> +        add             x12, src2, #32
> +1:
> +        ld1             {v0.16b,v1.16b}, [src1], x10
> +        ld1             {v4.16b,v5.16b}, [src2], x10
> +        ld1             {v2.16b,v3.16b}, [x11], x10
> +        uhadd           v0.16b, v0.16b, v4.16b
> +        ld1             {v6.16b,v7.16b}, [x12], x10
> +        subs            x5, x5, #64
> +        uhadd           v1.16b, v1.16b, v5.16b
> +        uhadd           v2.16b, v2.16b, v6.16b
> +        uhadd           v3.16b, v3.16b, v7.16b
> +        st1             {v0.16b,v1.16b}, [dest], #32
> +        st1             {v2.16b,v3.16b}, [dest], #32
> +        b.gt            1b
> +2:
> +        tbz             size, #5,  3f
> +        ld1             {v0.16b,v1.16b}, [src1], #32
> +        ld1             {v4.16b,v5.16b}, [src2], #32
> +        uhadd           v0.16b, v0.16b, v4.16b
> +        uhadd           v1.16b, v1.16b, v5.16b
> +        st1             {v0.16b,v1.16b}, [dest], #32
> +3:
> +        tbz             size, #4, 4f
> +        ld1             {v0.16b}, [src1]
> +        ld1             {v4.16b}, [src2]
> +        uhadd           v0.16b, v0.16b, v4.16b
> +        st1             {v0.16b}, [dest]
> +4:
> +        ret
> +endfunc
> +
> +        .align 2
> +function ff_merge16_neon, export=1
> +        ands            x5, size, #~63
> +        b.eq            2f
> +1:
> +        ld1             {v0.8h,v1.8h}, [src1], #32
> +        ld1             {v4.8h,v5.8h}, [src2], #32
> +        ld1             {v2.8h,v3.8h}, [src1], #32
> +        uhadd           v0.8h, v0.8h, v4.8h
> +        ld1             {v6.8h,v7.8h}, [src2], #32
> +        uhadd           v1.8h, v1.8h, v5.8h
> +        uhadd           v2.8h, v2.8h, v6.8h
> +        uhadd           v3.8h, v3.8h, v7.8h
> +        st1             {v0.8h,v1.8h}, [dest], #32
> +        st1             {v2.8h,v3.8h}, [dest], #32
> +        subs            x5, x5, #64
> +        b.gt            1b
> +2:
> +        tbz             size, #5, 3f
> +        ld1             {v0.8h,v1.8h}, [src1], #32
> +        ld1             {v4.8h,v5.8h}, [src2], #32
> +        uhadd           v0.8h, v0.8h, v4.8h
> +        uhadd           v1.8h, v1.8h, v5.8h
> +        st1             {v0.8h,v1.8h}, [dest], #32
> +3:
> +        tbz             size, #4,  4f
> +        ld1             {v0.8h}, [src1]
> +        ld1             {v4.8h}, [src2]
> +        uhadd           v0.8h, v0.8h,v4.8h
> +        st1             {v0.8h}, [dest]
> +4:
> +        ret
> +endfunc
> diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
> index b675c688ee..6631af2ffe 100644
> --- a/libavfilter/allfilters.c
> +++ b/libavfilter/allfilters.c
> @@ -219,6 +219,7 @@ extern AVFilter ff_vf_erosion;
>  extern AVFilter ff_vf_erosion_opencl;
>  extern AVFilter ff_vf_extractplanes;
>  extern AVFilter ff_vf_fade;
> +extern AVFilter ff_vf_fastdeint;
>  extern AVFilter ff_vf_fftdnoiz;
>  extern AVFilter ff_vf_fftfilt;
>  extern AVFilter ff_vf_field;
> diff --git a/libavfilter/arm/Makefile b/libavfilter/arm/Makefile
> new file mode 100644
> index 0000000000..c92d62fac9
> --- /dev/null
> +++ b/libavfilter/arm/Makefile
> @@ -0,0 +1,3 @@
> +ARMV6-OBJS-$(CONFIG_FASTDEINT_FILTER)  += arm/merge_armv6.o
> +
> +NEON-OBJS-$(CONFIG_FASTDEINT_FILTER)   += arm/merge_neon.o
> diff --git a/libavfilter/arm/merge_armv6.S b/libavfilter/arm/merge_armv6.S
> new file mode 100644
> index 0000000000..9b551c2c6c
> --- /dev/null
> +++ b/libavfilter/arm/merge_armv6.S
> @@ -0,0 +1,70 @@
> +/*
> + * Copyright (c) 2009-2012 Rémi Denis-Courmont, VLC authors
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/arm/asm.S"
> +
> +#define dest r0
> +#define src1 r1
> +#define src2 r2
> +#define size r3
> +
> +        .align 2
> +function ff_merge8_armv6, export=1
> +        push            {r4-r9,lr}
> +1:
> +        pld             [src1, #64]
> +        ldm             src1!, {r4-r5}
> +        pld             [src2, #64]
> +        ldm             src2!, {r8-r9}
> +        subs            size, size, #16
> +        uhadd8          r4, r4, r8
> +        ldm             src1!, {r6-r7}
> +        uhadd8          r5, r5, r9
> +        ldm             src2!, {ip,lr}
> +        uhadd8          r6, r6, ip
> +        stm             dest!, {r4-r5}
> +        uhadd8          r7, r7, lr
> +        stm             dest!, {r6-r7}
> +        it              eq
> +        popeq           {r4-r9,pc}
> +        b               1b
> +endfunc
> +
> +        .align 2
> +function ff_merge16_armv6, export=1
> +        push            {r4-r9,lr}
> +1:
> +        pld             [src1, #64]
> +        ldm             src1!, {r4-r5}
> +        pld             [src2, #64]
> +        ldm             src2!, {r8-r9}
> +        subs            size, size, #16
> +        uhadd16         r4, r4, r8
> +        ldm             src1!, {r6-r7}
> +        uhadd16         r5, r5, r9
> +        ldm             src2!, {ip,lr}
> +        uhadd16         r6, r6, ip
> +        stm             dest!, {r4-r5}
> +        uhadd16         r7, r7, lr
> +        stm             dest!, {r6-r7}
> +        it              eq
> +        popeq           {r4-r9,pc}
> +        b               1b
> +endfunc
> \ No newline at end of file

This shouldn't happen; the file should end with a newline.

> diff --git a/libavfilter/arm/merge_neon.S b/libavfilter/arm/merge_neon.S
> new file mode 100644
> index 0000000000..ae36cf3ca9
> --- /dev/null
> +++ b/libavfilter/arm/merge_neon.S
> @@ -0,0 +1,109 @@
> +/*
> + * Copyright (c) 2009-2012 Rémi Denis-Courmont, VLC authors
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/arm/asm.S"
> +
> +#define dest r0
> +#define src1 r1
> +#define src2 r2
> +#define size r3
> +
> +        .align 2
> +        @ NOTE: Offset and pitch must be multiple of 16-bytes.
> +function ff_merge8_neon, export=1
> +        cmp             size, #64
> +        blo             2f
> +1:
> +        pld             [src1, #64]
> +        vld1.u8         {q0-q1}, [src1,:128]!
> +        pld             [src2, #64]
> +        vld1.u8         {q8-q9}, [src2,:128]!
> +        vhadd.u8        q0, q0, q8
> +        sub             size, size, #64
> +        vld1.u8         {q2-q3}, [src1,:128]!
> +        vhadd.u8        q1, q1, q9
> +        vld1.u8         {q10-q11}, [src2,:128]!
> +        vhadd.u8        q2, q2, q10
> +        cmp             size, #64
> +        vhadd.u8        q3, q3, q11
> +        vst1.u8         {q0-q1}, [dest,:128]!
> +        vst1.u8         {q2-q3}, [dest,:128]!
> +        bhs             1b
> +2:
> +        cmp             size, #32
> +        blo             3f
> +        vld1.u8         {q0-q1}, [src1,:128]!
> +        sub             size, size, #32
> +        vld1.u8         {q8-q9}, [src2,:128]!
> +        vhadd.u8        q0, q0, q8
> +        vhadd.u8        q1, q1, q9
> +        vst1.u8         {q0-q1}, [dest,:128]!
> +3:
> +        cmp             size, #16
> +        it              lo
> +        bxlo            lr
> +        vld1.u8         {q0}, [src1,:128]!
> +        sub             size, size, #16
> +        vld1.u8         {q8}, [src2,:128]!
> +        vhadd.u8        q0, q0, q8
> +        vst1.u8         {q0}, [dest,:128]!
> +        bx              lr
> +endfunc
> +
> +        .align 2
> +function ff_merge16_neon, export=1
> +        cmp             size, #64
> +        blo             2f
> +1:
> +        pld             [src1, #64]
> +        vld1.u16        {q0-q1}, [src1,:128]!
> +        pld             [src2, #64]
> +        vld1.u16        {q8-q9}, [src2,:128]!
> +        vhadd.u16       q0, q0, q8
> +        sub             size, size, #64
> +        vld1.u16        {q2-q3}, [src1,:128]!
> +        vhadd.u16       q1, q1, q9
> +        vld1.u16        {q10-q11}, [src2,:128]!
> +        vhadd.u16       q2, q2, q10
> +        cmp             size, #64
> +        vhadd.u16       q3, q3, q11
> +        vst1.u16        {q0-q1}, [dest,:128]!
> +        vst1.u16        {q2-q3}, [dest,:128]!
> +        bhs             1b
> +2:
> +        cmp             size, #32
> +        blo             3f
> +        vld1.u16        {q0-q1}, [src1,:128]!
> +        sub             size, size, #32
> +        vld1.u16        {q8-q9}, [src2,:128]!
> +        vhadd.u16       q0, q0, q8
> +        vhadd.u16       q1, q1, q9
> +        vst1.u16        {q0-q1}, [dest,:128]!
> +3:
> +        cmp             size, #16
> +        it              lo
> +        bxlo            lr
> +        vld1.u16        {q0}, [src1,:128]!
> +        sub             size, size, #16
> +        vld1.u16        {q8}, [src2,:128]!
> +        vhadd.u16       q0, q0, q8
> +        vst1.u16        {q0}, [dest,:128]!
> +        bx              lr
> +endfunc
> \ No newline at end of file
> diff --git a/libavfilter/vf_fastdeint.c b/libavfilter/vf_fastdeint.c
> new file mode 100644
> index 0000000000..5ddd8be392
> --- /dev/null
> +++ b/libavfilter/vf_fastdeint.c
> @@ -0,0 +1,588 @@
> +/*
> + * Copyright (C) 2015 Aman Gupta <aman at tmm1.net>
> + *               2000-2011 VLC authors and VideoLAN
> + *
> + * Author: Sam Hocevar <sam at zoy.org>
> + *         Damien Lucas <nitrox at videolan.org>
> + *         Laurent Aimar <fenrir at videolan.org>
> + *         Sigmund Augdal Helberg <sigmunau at videolan.org>
> + *
> + * These algorithms are derived from the VLC project's
> + * modules/video_filter/deinterlace/algo_basic.c
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/avassert.h"
> +#include "libavutil/cpu.h"
> +#include "libavutil/common.h"
> +#include "libavutil/opt.h"
> +#include "libavutil/pixdesc.h"
> +#include "libavutil/imgutils.h"
> +#include "libavutil/timestamp.h"
> +#include "avfilter.h"
> +#include "formats.h"
> +#include "internal.h"
> +#include "video.h"
> +
> +enum Mode {
> +  MODE_DISCARD,
> +  MODE_MEAN,
> +  MODE_BLEND,
> +  MODE_BOB,
> +  MODE_LINEAR,
> +  MODE_MAX,
> +};
> +
> +typedef void (*merge_fn)(void *dst, const void *src1, const void *src2, size_t len);
> +
> +typedef struct FastDeintContext {
> +    const AVClass *class;
> +    merge_fn merge;
> +    int merge_size;
> +    int merge_aligned;
> +    AVFrame *cur, *next;
> +    enum Mode mode;
> +    int eof;
> +} FastDeintContext;
> +
> +static void merge8_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, size_t bytes)
> +{
> +    for (; bytes > 0; bytes--)
> +        *dst++ = ( *src1++ + *src2++ ) >> 1;
> +}
> +
> +static void merge16_c(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, size_t bytes)
> +{
> +    for (size_t words = bytes / 2; words > 0; words--)
> +        *dst++ = ( *src1++ + *src2++ ) >> 1;
> +}
> +
> +static void merge8_unaligned(FastDeintContext *s, uint8_t *dst, const uint8_t *src1, const uint8_t *src2, size_t bytes)
> +{
> +    if (s->merge_aligned) {
> +        size_t remainder = bytes % 16;
> +        if (remainder > 0) {
> +            merge8_c(dst, src1, src2, remainder);
> +            bytes -= remainder;
> +            dst += remainder;
> +            src1 += remainder;
> +            src2 += remainder;
> +        }
> +    }
> +    s->merge(dst, src1, src2, bytes);
> +}
> +
> +static void merge16_unaligned(FastDeintContext *s, uint16_t *dst, const uint16_t *src1, const uint16_t *src2, size_t bytes)
> +{
> +    if (s->merge_aligned) {
> +        size_t words = bytes / 2;
> +        size_t remainder = words % 8;
> +        if (remainder > 0) {
> +            merge16_c(dst, src1, src2, remainder);
> +            words -= remainder;
> +            dst += remainder;
> +            src1 += remainder;
> +            src2 += remainder;
> +        }
> +    }
> +    s->merge(dst, src1, src2, bytes);
> +}
> +
> +static void merge_unaligned(FastDeintContext *s, void *dst, const void *src1, const void *src2, size_t bytes)
> +{
> +    if (s->merge_size == 16)
> +        merge16_unaligned(s, dst, src1, src2, bytes);
> +    else
> +        merge8_unaligned(s, dst, src1, src2, bytes);
> +}
> +
> +#if HAVE_SSE2_INLINE && defined(__x86_64__)

No inline asm. This code needs to be ported to nasm syntax.

Also, no arch-specific code should be present in arch-agnostic source
files, beyond calls to init() functions.

> +static void merge8_sse2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, size_t bytes)
> +{
> +    for(; bytes > 0 && ((uintptr_t)src1 & 15); bytes--)
> +        *dst++ = ( *src1++ + *src2++ ) >> 1;
> +
> +    for (; bytes >= 16; bytes -= 16) {
> +        __asm__  __volatile__( "movdqu %2,%%xmm1;"
> +                               "pavgb %1, %%xmm1;"
> +                               "movdqu %%xmm1, %0" :"=m" (*dst):
> +                                                 "m" (*src1),
> +                                                 "m" (*src2) : "xmm1" );
> +        dst += 16;
> +        src1 += 16;
> +        src2 += 16;
> +    }
> +
> +    if (bytes > 0) {
> +        merge8_c(dst, src1, src2, bytes);
> +    }
> +}
> +static void merge16_sse2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, size_t bytes)
> +{
> +    size_t words = bytes / 2;
> +
> +    for(; words > 0 && ((uintptr_t)src1 & 15); words--)
> +        *dst++ = ( *src1++ + *src2++ ) >> 1;
> +
> +    for (; words >= 8; words -= 8) {
> +        __asm__  __volatile__( "movdqu %2,%%xmm1;"
> +                               "pavgw %1, %%xmm1;"
> +                               "movdqu %%xmm1, %0" :"=m" (*dst):
> +                                                 "m" (*src1),
> +                                                 "m" (*src2) : "xmm1" );
> +        dst += 8;
> +        src1 += 8;
> +        src2 += 8;
> +    }
> +
> +    if (words > 0) {
> +        merge16_c(dst, src1, src2, words * 2);
> +    }
> +}
> +#define merge8 merge8_sse2
> +#define merge16 merge16_sse2
> +#else
> +#define merge8 merge8_c
> +#define merge16 merge16_c
> +#endif
> +
> +static void render_image_single(FastDeintContext *s, AVFrame *out, AVFrame *frame)
> +{
> +    int i, planes_nb = 0;
> +    enum Mode mode = s->mode;
> +    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(out->format);
> +
> +    for (i = 0; i < desc->nb_components; i++)
> +        planes_nb = FFMAX(planes_nb, desc->comp[i].plane + 1);
> +
> +    for (i = 0; i < planes_nb; i++) {
> +        int height, bwidth;
> +        int dst_linesize, src_linesize;
> +        const uint8_t *src;
> +        uint8_t *dst;
> +
> +        bwidth = av_image_get_linesize(out->format, out->width, i);
> +        if (bwidth < 0) {
> +            av_log(s, AV_LOG_ERROR, "av_image_get_linesize failed\n");
> +            return;
> +        }
> +
> +        height = out->height;
> +        if (i == 1 || i == 2) {
> +            height = FF_CEIL_RSHIFT(out->height, desc->log2_chroma_h);
> +        }
> +
> +        src = frame->data[i];
> +        dst = out->data[i];
> +        dst_linesize = out->linesize[i];
> +        src_linesize = frame->linesize[i];
> +
> +        if (mode == MODE_BLEND) {
> +            // Copy first line
> +            memcpy(dst, src, bwidth);
> +            dst += dst_linesize;
> +            height--;
> +        }
> +
> +        // Merge remaining lines
> +        for (; height > 0; height--) {
> +            if (mode == MODE_DISCARD)
> +                memcpy(dst, src, bwidth);
> +            else
> +                merge_unaligned(s, dst, src, src + src_linesize, bwidth);
> +            dst += dst_linesize;
> +            src += src_linesize;
> +            if (mode == MODE_MEAN || mode == MODE_DISCARD) {
> +                src += src_linesize;
> +                height--;
> +            }
> +        }
> +    }
> +    if (mode != MODE_DISCARD)
> +        emms_c();
> +}
> +
> +static void render_image_doubler(FastDeintContext *s, AVFrame *out, AVFrame *frame, int field)
> +{
> +    int i, planes_nb = 0;
> +    enum Mode mode = s->mode;
> +    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(out->format);
> +
> +    for (i = 0; i < desc->nb_components; i++)
> +        planes_nb = FFMAX(planes_nb, desc->comp[i].plane + 1);
> +
> +    for (i = 0; i < planes_nb; i++) {
> +        int height, bwidth;
> +        int dst_linesize, src_linesize;
> +        const uint8_t *src;
> +        uint8_t *dst;
> +
> +        bwidth = av_image_get_linesize(out->format, out->width, i);
> +        if (bwidth < 0) {
> +            av_log(s, AV_LOG_ERROR, "av_image_get_linesize failed\n");
> +            return;
> +        }
> +        height = out->height;
> +        if (i == 1 || i == 2) {
> +            height = FF_CEIL_RSHIFT(out->height, desc->log2_chroma_h);
> +        }
> +
> +        src = frame->data[i];
> +        dst = out->data[i];
> +        src_linesize = frame->linesize[i];
> +        dst_linesize = out->linesize[i];
> +
> +        // For BOTTOM field we need to add the first line
> +        if (field == 1) {
> +            memcpy(dst, src, bwidth);
> +            dst += dst_linesize;
> +            src += src_linesize;
> +            height--;
> +        }
> +
> +        height -= 2;
> +
> +        for (; height > 0; height-=2) {
> +            memcpy(dst, src, bwidth);
> +            dst += dst_linesize;
> +
> +            if (mode == MODE_LINEAR)
> +                merge_unaligned(s, dst, src, src + 2 * src_linesize, bwidth);
> +            else
> +                memcpy(dst, src, bwidth);
> +            dst += dst_linesize;
> +
> +            src += src_linesize * 2;
> +        }
> +
> +        memcpy(dst, src, bwidth);
> +
> +        // For TOP field we need to add the last line
> +        if (field == 0)
> +        {
> +            dst += dst_linesize;
> +            src += src_linesize;
> +            memcpy(dst, src, bwidth);
> +        }
> +    }
> +    if (mode == MODE_LINEAR)
> +        emms_c();
> +}
> +
> +static int filter_frame_single(AVFilterLink *link, AVFrame *frame)
> +{
> +    AVFilterContext *ctx = link->dst;
> +    AVFrame *out;
> +    FastDeintContext *s = ctx->priv;
> +
> +    if (!frame->interlaced_frame) {
> +        return ff_filter_frame(ctx->outputs[0], frame);
> +    }
> +
> +    out = ff_get_video_buffer(ctx->outputs[0], link->w, link->h);
> +    if (!out) {
> +        av_frame_free(&frame);
> +        return AVERROR(ENOMEM);
> +    }
> +
> +    av_frame_copy_props(out, frame);
> +    out->interlaced_frame = 0;
> +    render_image_single(s, out, frame);
> +
> +    av_frame_free(&frame);
> +    return ff_filter_frame(ctx->outputs[0], out);
> +}
> +
> +static AVFrame *copy_frame(AVFilterLink *link, AVFrame *frame)
> +{
> +    AVFilterContext *ctx = link->dst;
> +    AVFrame *out;
> +
> +    if (frame->format == AV_PIX_FMT_VIDEOTOOLBOX)
> +        out = av_frame_alloc();
> +    else
> +        out = ff_get_video_buffer(ctx->outputs[0], link->w, link->h);
> +
> +    if (!out)
> +        return NULL;
> +
> +    av_frame_copy_props(out, frame);
> +    return out;
> +}
> +
> +static int filter_frame_double(AVFilterLink *link, AVFrame *in)
> +{
> +    AVFilterContext *ctx = link->dst;
> +    FastDeintContext *s = ctx->priv;
> +    AVFrame *frame, *out, *out2;
> +    int tff, ret;
> +
> +    s->cur = s->next;
> +    s->next = in;
> +
> +    if (!s->cur) {
> +        return 0;
> +    }
> +
> +    frame = s->cur;
> +
> +    if (!frame->interlaced_frame) {
> +        if (frame->pts != AV_NOPTS_VALUE)
> +            frame->pts *= 2;
> +        s->cur = NULL;
> +        return ff_filter_frame(ctx->outputs[0], frame);
> +    }
> +
> +    tff = frame->top_field_first;
> +    out = copy_frame(link, frame);
> +    if (!out) {
> +        av_frame_free(&frame);
> +        s->cur = NULL;
> +        return AVERROR(ENOMEM);
> +    }
> +
> +    out->interlaced_frame = 0;
> +    if (out->pts != AV_NOPTS_VALUE)
> +        out->pts = out->pts * 2;
> +    render_image_doubler(s, out, frame, !tff);
> +
> +    ret = ff_filter_frame(ctx->outputs[0], out);
> +    if (ret < 0) {
> +        av_frame_free(&frame);
> +        s->cur = NULL;
> +        return ret;
> +    }
> +
> +    out2 = copy_frame(link, frame);
> +    if (!out2) {
> +        av_frame_free(&frame);
> +        s->cur = NULL;
> +        return AVERROR(ENOMEM);
> +    }
> +
> +    out2->interlaced_frame = 0;
> +    av_frame_remove_side_data(out2, AV_FRAME_DATA_A53_CC);
> +    if (out2->pts != AV_NOPTS_VALUE) {
> +        out2->pts = frame->pts + s->next->pts;
> +    }
> +    render_image_doubler(s, out2, frame, tff);
> +
> +    av_frame_free(&frame);
> +    s->cur = NULL;
> +
> +    return ff_filter_frame(ctx->outputs[0], out2);
> +}
> +
> +static int filter_frame(AVFilterLink *link, AVFrame *frame)
> +{
> +    AVFilterContext *ctx = link->dst;
> +    FastDeintContext *s = ctx->priv;
> +
> +    av_assert0(frame);
> +
> +    if (s->mode == MODE_LINEAR || s->mode == MODE_BOB) {
> +        return filter_frame_double(link, frame);
> +    } else {
> +        return filter_frame_single(link, frame);
> +    }
> +}
> +
> +static av_cold void uninit(AVFilterContext *ctx)
> +{
> +    FastDeintContext *s = ctx->priv;
> +    av_frame_free(&s->cur);
> +    av_frame_free(&s->next);
> +}
> +
> +static int query_formats(AVFilterContext *ctx)
> +{
> +    static const enum AVPixelFormat pix_fmts[] = {
> +        AV_PIX_FMT_YUV420P,
> +        AV_PIX_FMT_YUV422P,
> +        AV_PIX_FMT_YUV444P,
> +        AV_PIX_FMT_YUV410P,
> +        AV_PIX_FMT_YUV411P,
> +        AV_PIX_FMT_GRAY8,
> +        AV_PIX_FMT_YUVJ420P,
> +        AV_PIX_FMT_YUVJ422P,
> +        AV_PIX_FMT_YUVJ444P,
> +        AV_PIX_FMT_GRAY16,
> +        AV_PIX_FMT_YUV440P,
> +        AV_PIX_FMT_YUVJ440P,
> +        AV_PIX_FMT_YUV420P9,
> +        AV_PIX_FMT_YUV422P9,
> +        AV_PIX_FMT_YUV444P9,
> +        AV_PIX_FMT_YUV420P10,
> +        AV_PIX_FMT_YUV422P10,
> +        AV_PIX_FMT_YUV444P10,
> +        AV_PIX_FMT_YUV420P12,
> +        AV_PIX_FMT_YUV422P12,
> +        AV_PIX_FMT_YUV444P12,
> +        AV_PIX_FMT_YUV420P14,
> +        AV_PIX_FMT_YUV422P14,
> +        AV_PIX_FMT_YUV444P14,
> +        AV_PIX_FMT_YUV420P16,
> +        AV_PIX_FMT_YUV422P16,
> +        AV_PIX_FMT_YUV444P16,
> +        AV_PIX_FMT_YUVA420P,
> +        AV_PIX_FMT_YUVA422P,
> +        AV_PIX_FMT_YUVA444P,
> +        AV_PIX_FMT_GBRP,
> +        AV_PIX_FMT_GBRP9,
> +        AV_PIX_FMT_GBRP10,
> +        AV_PIX_FMT_GBRP12,
> +        AV_PIX_FMT_GBRP14,
> +        AV_PIX_FMT_GBRP16,
> +        AV_PIX_FMT_GBRAP,
> +        AV_PIX_FMT_NONE
> +    };
> +
> +    AVFilterFormats *fmts_list = ff_make_format_list(pix_fmts);
> +    if (!fmts_list)
> +        return AVERROR(ENOMEM);
> +    return ff_set_common_formats(ctx, fmts_list);
> +}
> +
> +#if ARCH_ARM
> +#include "libavutil/arm/cpu.h"
> +#endif
> +#if ARCH_AARCH64
> +#include "libavutil/aarch64/cpu.h"
> +#endif
> +#if ARCH_AARCH64 || ARCH_ARM
> +void ff_merge8_neon(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, size_t bytes);
> +void ff_merge16_neon(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, size_t bytes);
> +void ff_merge8_armv6(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, size_t bytes);
> +void ff_merge16_armv6(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, size_t bytes);
> +#endif
> +
> +static int config_props(AVFilterLink *link)
> +{
> +    AVFilterContext *ctx = link->src;
> +    FastDeintContext *s = ctx->priv;
> +    const AVPixFmtDescriptor *pix;
> +#if ARCH_AARCH64 || ARCH_ARM
> +    int cpu_flags = av_get_cpu_flags();
> +#endif
> +
> +    link->w = link->src->inputs[0]->w;
> +    link->h = link->src->inputs[0]->h;
> +    link->time_base  = link->src->inputs[0]->time_base;
> +    link->frame_rate = link->src->inputs[0]->frame_rate;
> +    link->sample_aspect_ratio = link->src->inputs[0]->sample_aspect_ratio;
> +
> +    if (s->mode == MODE_MEAN || s->mode == MODE_DISCARD) {
> +        link->h /= 2;
> +        link->sample_aspect_ratio = av_mul_q(link->sample_aspect_ratio, av_make_q(1, 2));
> +    }
> +    if (s->mode == MODE_LINEAR || s->mode == MODE_BOB) {
> +        link->time_base  = av_mul_q(link->time_base,  av_make_q(1, 2));
> +        link->frame_rate = av_mul_q(link->frame_rate, av_make_q(2, 1));
> +    }
> +
> +    pix = av_pix_fmt_desc_get(link->format);
> +    s->merge_size = (pix->comp[0].depth > 8) ? 16 : 8;
> +    s->merge = s->merge_size == 16 ? (merge_fn)merge16 : (merge_fn)merge8;
> +
> +#if ARCH_ARM
> +    if (have_armv6(cpu_flags)) {
> +        s->merge = s->merge_size == 16 ? (merge_fn)ff_merge16_armv6 : (merge_fn)ff_merge8_armv6;
> +        s->merge_aligned = 1;
> +    }
> +#endif
> +#if ARCH_AARCH64 || ARCH_ARM
> +    if (have_neon(cpu_flags)) {
> +        s->merge = s->merge_size == 16 ? (merge_fn)ff_merge16_neon : (merge_fn)ff_merge8_neon;
> +        s->merge_aligned = 1;
> +    }
> +#endif

As I mentioned above, this kind of initialization and any function
prototypes should be added to init files in the respective arch folders.

In here you should only call the init() functions, which will set the above.
See how other filters do it, like tinterlace.

> +
> +    return 0;
> +}
> +
> +static int request_frame(AVFilterLink *link)
> +{
> +    AVFilterContext *ctx = link->src;
> +    FastDeintContext *s = ctx->priv;
> +    int ret;
> +
> +    if (s->eof)
> +        return AVERROR_EOF;
> +
> +    ret = ff_request_frame(ctx->inputs[0]);
> +
> +    if (ret == AVERROR_EOF && s->cur) {
> +        AVFrame *next = av_frame_clone(s->next);
> +        if (!next)
> +            return AVERROR(ENOMEM);
> +
> +        next->pts = s->next->pts * 2 - s->cur->pts;
> +        filter_frame(ctx->inputs[0], next);
> +        s->eof = 1;
> +    } else if (ret < 0) {
> +        return ret;
> +    }
> +
> +    return 0;
> +}
> +
> +#define OFFSET(x) offsetof(FastDeintContext, x)
> +#define FLAGS AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_FILTERING_PARAM
> +
> +#define CONST(name, help, val, unit) { name, help, 0, AV_OPT_TYPE_CONST, {.i64=val}, INT_MIN, INT_MAX, FLAGS, unit }
> +
> +static const AVOption fastdeint_options[] = {
> +    { "mode", "specify the deinterlacing mode", OFFSET(mode), AV_OPT_TYPE_INT, {.i64=MODE_BLEND}, 0, MODE_MAX-1, FLAGS, "mode" },
> +    CONST("discard", "discard bottom frame", MODE_DISCARD, "mode"),
> +    CONST("mean", "half resolution blender", MODE_MEAN, "mode"),
> +    CONST("blend", "full resolution blender", MODE_BLEND, "mode"),
> +    CONST("bob", "bob doubler", MODE_BOB, "mode"),
> +    CONST("linear", "bob doubler with linear interpolation", MODE_LINEAR, "mode"),
> +
> +    { NULL }
> +};
> +
> +AVFILTER_DEFINE_CLASS(fastdeint);
> +
> +static const AVFilterPad fastdeint_inputs[] = {
> +    {
> +        .name          = "default",
> +        .type          = AVMEDIA_TYPE_VIDEO,
> +        .filter_frame  = filter_frame,
> +    },
> +    { NULL }
> +};
> +
> +static const AVFilterPad fastdeint_outputs[] = {
> +    {
> +        .name          = "default",
> +        .type          = AVMEDIA_TYPE_VIDEO,
> +        .config_props  = config_props,
> +        .request_frame = request_frame
> +    },
> +    { NULL }
> +};
> +
> +AVFilter ff_vf_fastdeint = {
> +    .name          = "fastdeint",
> +    .description   = NULL_IF_CONFIG_SMALL("fast deinterlacing algorithms"),
> +    .priv_size     = sizeof(FastDeintContext),
> +    .priv_class    = &fastdeint_class,
> +    .uninit        = uninit,
> +    .query_formats = query_formats,
> +    .inputs        = fastdeint_inputs,
> +    .outputs       = fastdeint_outputs,
> +};
>