[FFmpeg-devel] [WIP] add sse4 flac lpc encoder

James Darnley james.darnley at gmail.com
Mon Feb 3 02:18:00 CET 2014


This is a rather hacked-together patch that adds an SSE4 version of the FLAC
LPC encoder for 16-bit samples, flac_lpc_encode_c_16().  It does work correctly.

I have been using gprof to measure the time taken in functions.

> Each sample counts as 0.01 seconds.
>   %   cumulative   self              self     total           
>  time   seconds   seconds    calls  ms/call  ms/call  name    
Original code:
>  43.94     19.45    19.45                             flac_lpc_encode_c_16
This patch:
>  25.74     17.10     8.54                             ff_flac_enc_lpc_16_sse4

The fraction of total time is down from nearly half to just over a
quarter.  The time reported by `time` is also about 12 seconds lower.

Original: 0m52.318s
Patch:    0m40.198s

These tests were done at compression level 8, which skews the time
spent in these functions in my favour.

I already see that I can use 4 more xmm regs to unroll the loop more.
-------------- next part --------------
From 4c8c95931aa39cf6189b7efd504134ea080b8952 Mon Sep 17 00:00:00 2001
From: James Darnley <james.darnley at gmail.com>
Date: Sun, 2 Feb 2014 17:07:41 +0100
Subject: [PATCH 1/3] WIP add sse4 flac lpc encoder

---
 libavcodec/flacdsp.c        |   25 ++++++++++++++-
 libavcodec/x86/Makefile     |    2 +
 libavcodec/x86/flac_dsp.asm |   71 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 97 insertions(+), 1 deletions(-)
 create mode 100644 libavcodec/x86/flac_dsp.asm

diff --git a/libavcodec/flacdsp.c b/libavcodec/flacdsp.c
index 02eba3e..8fae578 100644
--- a/libavcodec/flacdsp.c
+++ b/libavcodec/flacdsp.c
@@ -26,7 +26,6 @@
 #define SAMPLE_SIZE 16
 #define PLANAR 0
 #include "flacdsp_template.c"
-#include "flacdsp_lpc_template.c"
 
 #undef  PLANAR
 #define PLANAR 1
@@ -43,6 +42,30 @@
 #define PLANAR 1
 #include "flacdsp_template.c"
 
+void ff_flac_enc_lpc_16_sse4(int32_t *, const int32_t *, const int32_t *, int, int, int);
+
+static void flac_lpc_encode_c_16(int32_t *res, const int32_t *smp, int len,
+                                    int order, const int32_t *coefs, int shift)
+{
+    int i;
+    for (i = 0; i < order; i++)
+        res[i] = smp[i];
+    /*for (i = order; i < len; i += 2) {
+        int j;
+        int s  = smp[i];
+        int32_t p0 = 0, p1 = 0;
+        for (j = 0; j < order; j++) {
+            int c = coefs[j];
+            p1   += (c*s);
+            s     = smp[i-j-1];
+            p0   += (c*s);
+        }
+        res[i  ] = smp[i  ] - (p0 >> shift);
+        res[i+1] = smp[i+1] - (p1 >> shift);
+    }*/
+    ff_flac_enc_lpc_16_sse4(res+order, smp+order, coefs, len-order, order, shift);
+}
+
 static void flac_lpc_16_c(int32_t *decoded, const int coeffs[32],
                           int pred_order, int qlevel, int len)
 {
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index dddaae1..5c69e3e 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -107,3 +107,5 @@ YASM-OBJS-$(CONFIG_VP9_DECODER)        += x86/vp9itxfm.o                \
                                           x86/vp9lpf.o                  \
                                           x86/vp9mc.o
 YASM-OBJS-$(CONFIG_WEBP_DECODER)       += x86/vp8dsp.o
+
+YASM-OBJS-$(CONFIG_FLAC_ENCODER) += x86/flac_dsp.o
diff --git a/libavcodec/x86/flac_dsp.asm b/libavcodec/x86/flac_dsp.asm
new file mode 100644
index 0000000..5a9a24c
--- /dev/null
+++ b/libavcodec/x86/flac_dsp.asm
@@ -0,0 +1,71 @@
+;*****************************************************************************
+;* FLAC DSP functions
+;*
+;* Copyright (c) 2014 James Darnley <james.darnley at gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License along
+;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+; The C code for 16-bit samples simplifies to this
+
+;for (i = order; i < len; i++) {
+;    int j;
+;    int32_t p = 0;
+;    for (j = 0; j < order; j++) {
+;        int c = coefs[j];
+;        int s = smp[i-j-1];
+;        p   += (c*s);
+;    }
+;    res[i] = smp[i] - (p >> shift);
+;}
+
+INIT_XMM sse4
+; void ff_flac_enc_lpc_16_sse4(int32_t *res, const int32_t *smp,
+;                              const int32_t *coefs, int len, int order, int shift)
+; res and smp point at the first sample to predict.  Four residuals are stored
+; per pass, so if len is not a multiple of 4 up to 3 extra elements are touched.
+cglobal flac_enc_lpc_16, 5, 7, 4, 0, res, smp, coefs, len, order, posj, negj
+
+movd m3, r5m ; shift, fetched before posj (which lives in r5) is first written
+.loop_len:
+    pxor m0,    m0                 ; p = 0 in each of the 4 lanes
+    xor  posjd, posjd              ;  j = 0, walks coefs forwards
+    xor  negjd, negjd              ; -j = 0, walks smp backwards
+    .loop_order:
+        movd   m2, [coefsq+posjq*4] ; c = coefs[j]
+        SPLATD m2
+        movu   m1, [smpq+negjq*4-4] ; s = smp[i-j-1]
+        pmulld m1,  m2
+        paddd  m0,  m1              ; p += c * s
+
+        add posjd, 1
+        sub negjq, 1
+        cmp posjd, orderd
+    jne .loop_order
+
+    psrad m0, m3                   ; p >>= shift
+    movu  m1, [smpq]
+    psubd m1, m0                   ; smp[i] - p
+    movu  [resq], m1               ; res[i] = smp[i] - (p >> shift)
+
+    add resq, mmsize
+    add smpq, mmsize
+    sub lend, mmsize/4             ; 4 samples done; dword op because len is an int
+jg .loop_len
+RET
-- 
1.7.9

-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 683 bytes
Desc: OpenPGP digital signature
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20140203/4c0d5b8d/attachment.asc>


More information about the ffmpeg-devel mailing list