[FFmpeg-cvslog] Merge commit 'ea7ee4b4e381e0fa731458de0cbf740430eeb013'
Clément Bœsch
git at videolan.org
Wed Apr 26 17:25:01 EEST 2017
ffmpeg | branch: master | Clément Bœsch <u at pkh.me> | Wed Apr 26 16:21:00 2017 +0200| [172b0e2e88832822632841e8e0d3794f974cbc93] | committer: Clément Bœsch
Merge commit 'ea7ee4b4e381e0fa731458de0cbf740430eeb013'
* commit 'ea7ee4b4e381e0fa731458de0cbf740430eeb013':
ppc: Centralize compiler-specific altivec.h #include handling in one place
Merged-by: Clément Bœsch <u at pkh.me>
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=172b0e2e88832822632841e8e0d3794f974cbc93
---
libavcodec/ppc/audiodsp.c | 5 +----
libavcodec/ppc/blockdsp.c | 7 +++----
libavcodec/ppc/fdctdsp.c | 6 +++---
libavcodec/ppc/fft_init.c | 1 -
libavcodec/ppc/fft_vsx.c | 1 -
libavcodec/ppc/fft_vsx.h | 1 -
libavcodec/ppc/h264chroma_init.c | 3 ++-
libavcodec/ppc/h264chroma_template.c | 1 -
libavcodec/ppc/h264dsp.c | 1 -
libavcodec/ppc/h264qpel.c | 4 +++-
libavcodec/ppc/h264qpel_template.c | 1 -
libavcodec/ppc/hevcdsp.c | 5 +----
libavcodec/ppc/hpeldsp_altivec.c | 7 ++-----
libavcodec/ppc/idctdsp.c | 9 ++++-----
libavcodec/ppc/lossless_audiodsp_altivec.c | 6 ++----
libavcodec/ppc/lossless_videodsp_altivec.c | 5 +----
libavcodec/ppc/me_cmp.c | 5 +----
libavcodec/ppc/mpegvideo_altivec.c | 3 ++-
libavcodec/ppc/mpegvideodsp.c | 2 +-
libavcodec/ppc/mpegvideoencdsp.c | 6 ++----
libavcodec/ppc/pixblockdsp.c | 5 +----
libavcodec/ppc/svq1enc_altivec.c | 9 +++------
libavcodec/ppc/vc1dsp_altivec.c | 3 ++-
libavcodec/ppc/vorbisdsp_altivec.c | 6 +++---
libavcodec/ppc/vp3dsp_altivec.c | 3 ++-
libavcodec/ppc/vp8dsp_altivec.c | 4 +++-
libpostproc/postprocess.c | 5 +----
libswscale/swscale_internal.h | 6 +-----
28 files changed, 44 insertions(+), 76 deletions(-)
diff --git a/libavcodec/ppc/audiodsp.c b/libavcodec/ppc/audiodsp.c
index 4ee3da42d2..2e374737bd 100644
--- a/libavcodec/ppc/audiodsp.c
+++ b/libavcodec/ppc/audiodsp.c
@@ -24,15 +24,12 @@
*/
#include "config.h"
-#if HAVE_ALTIVEC_H
-#include <altivec.h>
-#endif
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
-#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
+
#include "libavcodec/audiodsp.h"
#if HAVE_ALTIVEC
diff --git a/libavcodec/ppc/blockdsp.c b/libavcodec/ppc/blockdsp.c
index 45c492ab3b..d89b77e088 100644
--- a/libavcodec/ppc/blockdsp.c
+++ b/libavcodec/ppc/blockdsp.c
@@ -21,16 +21,15 @@
*/
#include "config.h"
-#if HAVE_ALTIVEC_H
-#include <altivec.h>
-#endif
+
#include <string.h>
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/ppc/cpu.h"
-#include "libavutil/ppc/types_altivec.h"
+#include "libavutil/ppc/util_altivec.h"
+
#include "libavcodec/blockdsp.h"
/* ***** WARNING ***** WARNING ***** WARNING ***** */
diff --git a/libavcodec/ppc/fdctdsp.c b/libavcodec/ppc/fdctdsp.c
index 6659046f98..4ab516c6b3 100644
--- a/libavcodec/ppc/fdctdsp.c
+++ b/libavcodec/ppc/fdctdsp.c
@@ -19,14 +19,14 @@
*/
#include "config.h"
-#if HAVE_ALTIVEC_H
-#include <altivec.h>
-#endif
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
+#include "libavutil/ppc/util_altivec.h"
+
#include "libavcodec/fdctdsp.h"
+
#include "fdct.h"
#if HAVE_ALTIVEC
diff --git a/libavcodec/ppc/fft_init.c b/libavcodec/ppc/fft_init.c
index cbeaf98952..57d7c80ea4 100644
--- a/libavcodec/ppc/fft_init.c
+++ b/libavcodec/ppc/fft_init.c
@@ -23,7 +23,6 @@
#include "config.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
-#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/fft.h"
diff --git a/libavcodec/ppc/fft_vsx.c b/libavcodec/ppc/fft_vsx.c
index e92975f74e..c365fa1380 100644
--- a/libavcodec/ppc/fft_vsx.c
+++ b/libavcodec/ppc/fft_vsx.c
@@ -25,7 +25,6 @@
#include "config.h"
#include "libavutil/cpu.h"
-#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/fft.h"
#include "libavcodec/fft-internal.h"
diff --git a/libavcodec/ppc/fft_vsx.h b/libavcodec/ppc/fft_vsx.h
index a85475d160..1e44031aa5 100644
--- a/libavcodec/ppc/fft_vsx.h
+++ b/libavcodec/ppc/fft_vsx.h
@@ -27,7 +27,6 @@
#include "config.h"
#include "libavutil/cpu.h"
-#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/fft.h"
#include "libavcodec/fft-internal.h"
diff --git a/libavcodec/ppc/h264chroma_init.c b/libavcodec/ppc/h264chroma_init.c
index 876efeca09..bd0d213bdc 100644
--- a/libavcodec/ppc/h264chroma_init.c
+++ b/libavcodec/ppc/h264chroma_init.c
@@ -19,12 +19,13 @@
*/
#include "config.h"
+
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/intreadwrite.h"
#include "libavutil/ppc/cpu.h"
-#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
+
#include "libavcodec/h264chroma.h"
#if HAVE_ALTIVEC
diff --git a/libavcodec/ppc/h264chroma_template.c b/libavcodec/ppc/h264chroma_template.c
index d9b2a619e4..8f43e5dee1 100644
--- a/libavcodec/ppc/h264chroma_template.c
+++ b/libavcodec/ppc/h264chroma_template.c
@@ -19,7 +19,6 @@
*/
#include "libavutil/mem.h"
-#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
/* this code assume that stride % 16 == 0 */
diff --git a/libavcodec/ppc/h264dsp.c b/libavcodec/ppc/h264dsp.c
index 22a8d4117b..e84a058d04 100644
--- a/libavcodec/ppc/h264dsp.c
+++ b/libavcodec/ppc/h264dsp.c
@@ -28,7 +28,6 @@
#include "libavutil/intreadwrite.h"
#include "libavutil/mem.h"
#include "libavutil/ppc/cpu.h"
-#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/h264dec.h"
diff --git a/libavcodec/ppc/h264qpel.c b/libavcodec/ppc/h264qpel.c
index 575f504d32..bef421fa4f 100644
--- a/libavcodec/ppc/h264qpel.c
+++ b/libavcodec/ppc/h264qpel.c
@@ -19,13 +19,15 @@
*/
#include "config.h"
+
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/intreadwrite.h"
#include "libavutil/ppc/cpu.h"
-#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
+
#include "libavcodec/h264qpel.h"
+
#include "hpeldsp_altivec.h"
#if HAVE_ALTIVEC
diff --git a/libavcodec/ppc/h264qpel_template.c b/libavcodec/ppc/h264qpel_template.c
index 2f25e74840..304604c63d 100644
--- a/libavcodec/ppc/h264qpel_template.c
+++ b/libavcodec/ppc/h264qpel_template.c
@@ -25,7 +25,6 @@
#include "libavutil/avassert.h"
#include "libavutil/mem.h"
-#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#define ASSERT_ALIGNED(ptr) av_assert2(!((uintptr_t)ptr&0x0000000F));
diff --git a/libavcodec/ppc/hevcdsp.c b/libavcodec/ppc/hevcdsp.c
index 120362bebf..4b1037d792 100644
--- a/libavcodec/ppc/hevcdsp.c
+++ b/libavcodec/ppc/hevcdsp.c
@@ -20,13 +20,10 @@
*/
#include "config.h"
-#if HAVE_ALTIVEC_H
-#include <altivec.h>
-#endif
+#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
-#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/hevcdsp.h"
diff --git a/libavcodec/ppc/hpeldsp_altivec.c b/libavcodec/ppc/hpeldsp_altivec.c
index 87a1f05b6a..4f19521860 100644
--- a/libavcodec/ppc/hpeldsp_altivec.c
+++ b/libavcodec/ppc/hpeldsp_altivec.c
@@ -22,16 +22,13 @@
#include "config.h"
-#if HAVE_ALTIVEC_H
-#include <altivec.h>
-#endif
-
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
-#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
+
#include "libavcodec/hpeldsp.h"
+
#include "hpeldsp_altivec.h"
#if HAVE_ALTIVEC
diff --git a/libavcodec/ppc/idctdsp.c b/libavcodec/ppc/idctdsp.c
index f1b42470fb..29f625a01c 100644
--- a/libavcodec/ppc/idctdsp.c
+++ b/libavcodec/ppc/idctdsp.c
@@ -30,17 +30,16 @@
* IDCT function itself was to factor out the partial transposition, and to
* perform a full transpose at the end of the function. */
+#include "config.h"
+
#include <stdlib.h>
#include <string.h>
-#include "config.h"
-#if HAVE_ALTIVEC_H
-#include <altivec.h>
-#endif
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
-#include "libavutil/ppc/types_altivec.h"
+#include "libavutil/ppc/util_altivec.h"
+
#include "libavcodec/idctdsp.h"
#if HAVE_ALTIVEC
diff --git a/libavcodec/ppc/lossless_audiodsp_altivec.c b/libavcodec/ppc/lossless_audiodsp_altivec.c
index bdec25223d..298e6c38a0 100644
--- a/libavcodec/ppc/lossless_audiodsp_altivec.c
+++ b/libavcodec/ppc/lossless_audiodsp_altivec.c
@@ -19,14 +19,12 @@
*/
#include "config.h"
-#if HAVE_ALTIVEC_H
-#include <altivec.h>
-#endif
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
-#include "libavutil/ppc/types_altivec.h"
+#include "libavutil/ppc/util_altivec.h"
+
#include "libavcodec/lossless_audiodsp.h"
#if HAVE_BIGENDIAN
diff --git a/libavcodec/ppc/lossless_videodsp_altivec.c b/libavcodec/ppc/lossless_videodsp_altivec.c
index 16dd99f8d7..980f85b166 100644
--- a/libavcodec/ppc/lossless_videodsp_altivec.c
+++ b/libavcodec/ppc/lossless_videodsp_altivec.c
@@ -21,15 +21,12 @@
*/
#include "config.h"
-#if HAVE_ALTIVEC_H
-#include <altivec.h>
-#endif
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
-#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
+
#include "libavcodec/lossless_videodsp.h"
#if HAVE_ALTIVEC
diff --git a/libavcodec/ppc/me_cmp.c b/libavcodec/ppc/me_cmp.c
index 9f75ed256a..17f9a4f016 100644
--- a/libavcodec/ppc/me_cmp.c
+++ b/libavcodec/ppc/me_cmp.c
@@ -21,15 +21,12 @@
*/
#include "config.h"
-#if HAVE_ALTIVEC_H
-#include <altivec.h>
-#endif
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
-#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
+
#include "libavcodec/avcodec.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/me_cmp.h"
diff --git a/libavcodec/ppc/mpegvideo_altivec.c b/libavcodec/ppc/mpegvideo_altivec.c
index 1b6bda6c36..2c6ff9165b 100644
--- a/libavcodec/ppc/mpegvideo_altivec.c
+++ b/libavcodec/ppc/mpegvideo_altivec.c
@@ -25,11 +25,12 @@
#include <stdio.h>
#include "config.h"
+
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
-#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
+
#include "libavcodec/mpegvideo.h"
#if HAVE_ALTIVEC
diff --git a/libavcodec/ppc/mpegvideodsp.c b/libavcodec/ppc/mpegvideodsp.c
index 021933255b..990a974a4e 100644
--- a/libavcodec/ppc/mpegvideodsp.c
+++ b/libavcodec/ppc/mpegvideodsp.c
@@ -23,8 +23,8 @@
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/ppc/cpu.h"
-#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
+
#include "libavcodec/mpegvideodsp.h"
#if HAVE_ALTIVEC
diff --git a/libavcodec/ppc/mpegvideoencdsp.c b/libavcodec/ppc/mpegvideoencdsp.c
index 3e6765ce15..b96487bf81 100644
--- a/libavcodec/ppc/mpegvideoencdsp.c
+++ b/libavcodec/ppc/mpegvideoencdsp.c
@@ -17,16 +17,14 @@
*/
#include "config.h"
+
#include <stdint.h>
-#if HAVE_ALTIVEC_H
-#include <altivec.h>
-#endif
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
-#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
+
#include "libavcodec/mpegvideoencdsp.h"
#if HAVE_ALTIVEC
diff --git a/libavcodec/ppc/pixblockdsp.c b/libavcodec/ppc/pixblockdsp.c
index f5ac8509f0..01d14b4124 100644
--- a/libavcodec/ppc/pixblockdsp.c
+++ b/libavcodec/ppc/pixblockdsp.c
@@ -21,15 +21,12 @@
*/
#include "config.h"
-#if HAVE_ALTIVEC_H
-#include <altivec.h>
-#endif
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
-#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
+
#include "libavcodec/avcodec.h"
#include "libavcodec/pixblockdsp.h"
diff --git a/libavcodec/ppc/svq1enc_altivec.c b/libavcodec/ppc/svq1enc_altivec.c
index 4e25e253f6..f63f086602 100644
--- a/libavcodec/ppc/svq1enc_altivec.c
+++ b/libavcodec/ppc/svq1enc_altivec.c
@@ -18,18 +18,15 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
-#include <stdint.h>
-
#include "config.h"
-#if HAVE_ALTIVEC_H
-#include <altivec.h>
-#endif
+
+#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
-#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
+
#include "libavcodec/svq1enc.h"
#if HAVE_ALTIVEC
diff --git a/libavcodec/ppc/vc1dsp_altivec.c b/libavcodec/ppc/vc1dsp_altivec.c
index 83d537f0c1..bbadb2aaee 100644
--- a/libavcodec/ppc/vc1dsp_altivec.c
+++ b/libavcodec/ppc/vc1dsp_altivec.c
@@ -20,11 +20,12 @@
*/
#include "config.h"
+
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
-#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
+
#include "libavcodec/vc1dsp.h"
#if HAVE_ALTIVEC
diff --git a/libavcodec/ppc/vorbisdsp_altivec.c b/libavcodec/ppc/vorbisdsp_altivec.c
index d7557c815b..4dabf2dc7d 100644
--- a/libavcodec/ppc/vorbisdsp_altivec.c
+++ b/libavcodec/ppc/vorbisdsp_altivec.c
@@ -19,12 +19,12 @@
*/
#include "config.h"
-#if HAVE_ALTIVEC_H
-#include <altivec.h>
-#endif
+
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
+#include "libavutil/ppc/util_altivec.h"
+
#include "libavcodec/vorbisdsp.h"
#if HAVE_ALTIVEC
diff --git a/libavcodec/ppc/vp3dsp_altivec.c b/libavcodec/ppc/vp3dsp_altivec.c
index d2231d090a..a9a48d145b 100644
--- a/libavcodec/ppc/vp3dsp_altivec.c
+++ b/libavcodec/ppc/vp3dsp_altivec.c
@@ -21,11 +21,12 @@
#include <string.h>
#include "config.h"
+
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
-#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
+
#include "libavcodec/vp3dsp.h"
#if HAVE_ALTIVEC
diff --git a/libavcodec/ppc/vp8dsp_altivec.c b/libavcodec/ppc/vp8dsp_altivec.c
index 23e4ace7da..31201ed2d8 100644
--- a/libavcodec/ppc/vp8dsp_altivec.c
+++ b/libavcodec/ppc/vp8dsp_altivec.c
@@ -21,12 +21,14 @@
*/
#include "config.h"
+
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/ppc/cpu.h"
-#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
+
#include "libavcodec/vp8dsp.h"
+
#include "hpeldsp_altivec.h"
#if HAVE_ALTIVEC
diff --git a/libpostproc/postprocess.c b/libpostproc/postprocess.c
index 1dc719cf93..6aa4ace337 100644
--- a/libpostproc/postprocess.c
+++ b/libpostproc/postprocess.c
@@ -89,6 +89,7 @@ try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
#include "postprocess.h"
#include "postprocess_internal.h"
#include "libavutil/avstring.h"
+#include "libavutil/ppc/util_altivec.h"
#include "libavutil/ffversion.h"
const char postproc_ffversion[] = "FFmpeg version " FFMPEG_VERSION;
@@ -110,10 +111,6 @@ const char *postproc_license(void)
return LICENSE_PREFIX FFMPEG_LICENSE + sizeof(LICENSE_PREFIX) - 1;
}
-#if HAVE_ALTIVEC_H
-#include <altivec.h>
-#endif
-
#define GET_MODE_BUFFER_SIZE 500
#define OPTIONS_ARRAY_SIZE 10
#define BLOCK_SIZE 8
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 84d5bee5ff..0f51df95d7 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -22,11 +22,6 @@
#define SWSCALE_SWSCALE_INTERNAL_H
#include "config.h"
-
-#if HAVE_ALTIVEC_H
-#include <altivec.h>
-#endif
-
#include "version.h"
#include "libavutil/avassert.h"
@@ -36,6 +31,7 @@
#include "libavutil/log.h"
#include "libavutil/pixfmt.h"
#include "libavutil/pixdesc.h"
+#include "libavutil/ppc/util_altivec.h"
#define STR(s) AV_TOSTRING(s) // AV_STRINGIFY is too long
======================================================================
diff --cc libavcodec/ppc/audiodsp.c
index 4ee3da42d2,371e0d1e2e..2e374737bd
--- a/libavcodec/ppc/audiodsp.c
+++ b/libavcodec/ppc/audiodsp.c
@@@ -31,11 -28,11 +28,11 @@@
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
- #include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
+
#include "libavcodec/audiodsp.h"
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2,
int order)
diff --cc libavcodec/ppc/fdctdsp.c
index 6659046f98,36d4b4e4ba..4ab516c6b3
--- a/libavcodec/ppc/fdctdsp.c
+++ b/libavcodec/ppc/fdctdsp.c
@@@ -26,10 -23,13 +23,13 @@@
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
+ #include "libavutil/ppc/util_altivec.h"
+
#include "libavcodec/fdctdsp.h"
+
#include "fdct.h"
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
#define vs16(v) ((vector signed short) (v))
#define vs32(v) ((vector signed int) (v))
diff --cc libavcodec/ppc/fft_init.c
index cbeaf98952,56eafb91be..57d7c80ea4
--- a/libavcodec/ppc/fft_init.c
+++ b/libavcodec/ppc/fft_init.c
@@@ -21,133 -17,13 +21,132 @@@
*/
#include "config.h"
-
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
- #include "libavutil/ppc/types_altivec.h"
-
+#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/fft.h"
+/**
+ * Do a complex FFT with the parameters defined in ff_fft_init().
+ * The input data must be permuted before with s->revtab table.
+ * No 1.0 / sqrt(n) normalization is done.
+ * AltiVec-enabled:
+ * This code assumes that the 'z' pointer is 16 bytes-aligned.
+ * It also assumes all FFTComplex are 8 bytes-aligned pairs of floats.
+ */
+
+#if HAVE_VSX
+#include "fft_vsx.h"
+#else
+void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z);
void ff_fft_calc_interleave_altivec(FFTContext *s, FFTComplex *z);
+#endif
+
+#if HAVE_GNU_AS && HAVE_ALTIVEC
+static void imdct_half_altivec(FFTContext *s, FFTSample *output, const FFTSample *input)
+{
+ int j, k;
+ int n = 1 << s->mdct_bits;
+ int n4 = n >> 2;
+ int n8 = n >> 3;
+ int n32 = n >> 5;
+ const uint16_t *revtabj = s->revtab;
+ const uint16_t *revtabk = s->revtab+n4;
+ const vec_f *tcos = (const vec_f*)(s->tcos+n8);
+ const vec_f *tsin = (const vec_f*)(s->tsin+n8);
+ const vec_f *pin = (const vec_f*)(input+n4);
+ vec_f *pout = (vec_f*)(output+n4);
+
+ /* pre rotation */
+ k = n32-1;
+ do {
+ vec_f cos,sin,cos0,sin0,cos1,sin1,re,im,r0,i0,r1,i1,a,b,c,d;
+#define CMULA(p,o0,o1,o2,o3)\
+ a = pin[ k*2+p]; /* { z[k].re, z[k].im, z[k+1].re, z[k+1].im } */\
+ b = pin[-k*2-p-1]; /* { z[-k-2].re, z[-k-2].im, z[-k-1].re, z[-k-1].im } */\
+ re = vec_perm(a, b, vcprm(0,2,s0,s2)); /* { z[k].re, z[k+1].re, z[-k-2].re, z[-k-1].re } */\
+ im = vec_perm(a, b, vcprm(s3,s1,3,1)); /* { z[-k-1].im, z[-k-2].im, z[k+1].im, z[k].im } */\
+ cos = vec_perm(cos0, cos1, vcprm(o0,o1,s##o2,s##o3)); /* { cos[k], cos[k+1], cos[-k-2], cos[-k-1] } */\
+ sin = vec_perm(sin0, sin1, vcprm(o0,o1,s##o2,s##o3));\
+ r##p = im*cos - re*sin;\
+ i##p = re*cos + im*sin;
+#define STORE2(v,dst)\
+ j = dst;\
+ vec_ste(v, 0, output+j*2);\
+ vec_ste(v, 4, output+j*2);
+#define STORE8(p)\
+ a = vec_perm(r##p, i##p, vcprm(0,s0,0,s0));\
+ b = vec_perm(r##p, i##p, vcprm(1,s1,1,s1));\
+ c = vec_perm(r##p, i##p, vcprm(2,s2,2,s2));\
+ d = vec_perm(r##p, i##p, vcprm(3,s3,3,s3));\
+ STORE2(a, revtabk[ p*2-4]);\
+ STORE2(b, revtabk[ p*2-3]);\
+ STORE2(c, revtabj[-p*2+2]);\
+ STORE2(d, revtabj[-p*2+3]);
+
+ cos0 = tcos[k];
+ sin0 = tsin[k];
+ cos1 = tcos[-k-1];
+ sin1 = tsin[-k-1];
+ CMULA(0, 0,1,2,3);
+ CMULA(1, 2,3,0,1);
+ STORE8(0);
+ STORE8(1);
+ revtabj += 4;
+ revtabk -= 4;
+ k--;
+ } while(k >= 0);
+
+#if HAVE_VSX
+ ff_fft_calc_vsx(s, (FFTComplex*)output);
+#else
+ ff_fft_calc_altivec(s, (FFTComplex*)output);
+#endif
+
+ /* post rotation + reordering */
+ j = -n32;
+ k = n32-1;
+ do {
+ vec_f cos,sin,re,im,a,b,c,d;
+#define CMULB(d0,d1,o)\
+ re = pout[o*2];\
+ im = pout[o*2+1];\
+ cos = tcos[o];\
+ sin = tsin[o];\
+ d0 = im*sin - re*cos;\
+ d1 = re*sin + im*cos;
+
+ CMULB(a,b,j);
+ CMULB(c,d,k);
+ pout[2*j] = vec_perm(a, d, vcprm(0,s3,1,s2));
+ pout[2*j+1] = vec_perm(a, d, vcprm(2,s1,3,s0));
+ pout[2*k] = vec_perm(c, b, vcprm(0,s3,1,s2));
+ pout[2*k+1] = vec_perm(c, b, vcprm(2,s1,3,s0));
+ j++;
+ k--;
+ } while(k >= 0);
+}
+
+static void imdct_calc_altivec(FFTContext *s, FFTSample *output, const FFTSample *input)
+{
+ int k;
+ int n = 1 << s->mdct_bits;
+ int n4 = n >> 2;
+ int n16 = n >> 4;
+ vec_u32 sign = {1U<<31,1U<<31,1U<<31,1U<<31};
+ vec_u32 *p0 = (vec_u32*)(output+n4);
+ vec_u32 *p1 = (vec_u32*)(output+n4*3);
+
+ imdct_half_altivec(s, output + n4, input);
+
+ for (k = 0; k < n16; k++) {
+ vec_u32 a = p0[k] ^ sign;
+ vec_u32 b = p1[-k-1];
+ p0[-k-1] = vec_perm(a, a, vcprm(3,2,1,0));
+ p1[k] = vec_perm(b, b, vcprm(3,2,1,0));
+ }
+}
+#endif /* HAVE_GNU_AS && HAVE_ALTIVEC && HAVE_BIGENDIAN */
av_cold void ff_fft_init_ppc(FFTContext *s)
{
diff --cc libavcodec/ppc/fft_vsx.c
index e92975f74e,0000000000..c365fa1380
mode 100644,000000..100644
--- a/libavcodec/ppc/fft_vsx.c
+++ b/libavcodec/ppc/fft_vsx.c
@@@ -1,227 -1,0 +1,226 @@@
+/*
+ * FFT transform, optimized with VSX built-in functions
+ * Copyright (c) 2014 Rong Yan
+ *
+ * This algorithm (though not any of the implementation details) is
+ * based on libdjbfft by D. J. Bernstein.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+#include "config.h"
+#include "libavutil/cpu.h"
- #include "libavutil/ppc/types_altivec.h"
+#include "libavutil/ppc/util_altivec.h"
+#include "libavcodec/fft.h"
+#include "libavcodec/fft-internal.h"
+#include "fft_vsx.h"
+
+#if HAVE_VSX
+
+static void fft32_vsx_interleave(FFTComplex *z)
+{
+ fft16_vsx_interleave(z);
+ fft8_vsx_interleave(z+16);
+ fft8_vsx_interleave(z+24);
+ pass_vsx_interleave(z,ff_cos_32,4);
+}
+
+static void fft64_vsx_interleave(FFTComplex *z)
+{
+ fft32_vsx_interleave(z);
+ fft16_vsx_interleave(z+32);
+ fft16_vsx_interleave(z+48);
+ pass_vsx_interleave(z,ff_cos_64, 8);
+}
+static void fft128_vsx_interleave(FFTComplex *z)
+{
+ fft64_vsx_interleave(z);
+ fft32_vsx_interleave(z+64);
+ fft32_vsx_interleave(z+96);
+ pass_vsx_interleave(z,ff_cos_128,16);
+}
+static void fft256_vsx_interleave(FFTComplex *z)
+{
+ fft128_vsx_interleave(z);
+ fft64_vsx_interleave(z+128);
+ fft64_vsx_interleave(z+192);
+ pass_vsx_interleave(z,ff_cos_256,32);
+}
+static void fft512_vsx_interleave(FFTComplex *z)
+{
+ fft256_vsx_interleave(z);
+ fft128_vsx_interleave(z+256);
+ fft128_vsx_interleave(z+384);
+ pass_vsx_interleave(z,ff_cos_512,64);
+}
+static void fft1024_vsx_interleave(FFTComplex *z)
+{
+ fft512_vsx_interleave(z);
+ fft256_vsx_interleave(z+512);
+ fft256_vsx_interleave(z+768);
+ pass_vsx_interleave(z,ff_cos_1024,128);
+
+}
+static void fft2048_vsx_interleave(FFTComplex *z)
+{
+ fft1024_vsx_interleave(z);
+ fft512_vsx_interleave(z+1024);
+ fft512_vsx_interleave(z+1536);
+ pass_vsx_interleave(z,ff_cos_2048,256);
+}
+static void fft4096_vsx_interleave(FFTComplex *z)
+{
+ fft2048_vsx_interleave(z);
+ fft1024_vsx_interleave(z+2048);
+ fft1024_vsx_interleave(z+3072);
+ pass_vsx_interleave(z,ff_cos_4096, 512);
+}
+static void fft8192_vsx_interleave(FFTComplex *z)
+{
+ fft4096_vsx_interleave(z);
+ fft2048_vsx_interleave(z+4096);
+ fft2048_vsx_interleave(z+6144);
+ pass_vsx_interleave(z,ff_cos_8192,1024);
+}
+static void fft16384_vsx_interleave(FFTComplex *z)
+{
+ fft8192_vsx_interleave(z);
+ fft4096_vsx_interleave(z+8192);
+ fft4096_vsx_interleave(z+12288);
+ pass_vsx_interleave(z,ff_cos_16384,2048);
+}
+static void fft32768_vsx_interleave(FFTComplex *z)
+{
+ fft16384_vsx_interleave(z);
+ fft8192_vsx_interleave(z+16384);
+ fft8192_vsx_interleave(z+24576);
+ pass_vsx_interleave(z,ff_cos_32768,4096);
+}
+static void fft65536_vsx_interleave(FFTComplex *z)
+{
+ fft32768_vsx_interleave(z);
+ fft16384_vsx_interleave(z+32768);
+ fft16384_vsx_interleave(z+49152);
+ pass_vsx_interleave(z,ff_cos_65536,8192);
+}
+
+static void fft32_vsx(FFTComplex *z)
+{
+ fft16_vsx(z);
+ fft8_vsx(z+16);
+ fft8_vsx(z+24);
+ pass_vsx(z,ff_cos_32,4);
+}
+
+static void fft64_vsx(FFTComplex *z)
+{
+ fft32_vsx(z);
+ fft16_vsx(z+32);
+ fft16_vsx(z+48);
+ pass_vsx(z,ff_cos_64, 8);
+}
+static void fft128_vsx(FFTComplex *z)
+{
+ fft64_vsx(z);
+ fft32_vsx(z+64);
+ fft32_vsx(z+96);
+ pass_vsx(z,ff_cos_128,16);
+}
+static void fft256_vsx(FFTComplex *z)
+{
+ fft128_vsx(z);
+ fft64_vsx(z+128);
+ fft64_vsx(z+192);
+ pass_vsx(z,ff_cos_256,32);
+}
+static void fft512_vsx(FFTComplex *z)
+{
+ fft256_vsx(z);
+ fft128_vsx(z+256);
+ fft128_vsx(z+384);
+ pass_vsx(z,ff_cos_512,64);
+}
+static void fft1024_vsx(FFTComplex *z)
+{
+ fft512_vsx(z);
+ fft256_vsx(z+512);
+ fft256_vsx(z+768);
+ pass_vsx(z,ff_cos_1024,128);
+
+}
+static void fft2048_vsx(FFTComplex *z)
+{
+ fft1024_vsx(z);
+ fft512_vsx(z+1024);
+ fft512_vsx(z+1536);
+ pass_vsx(z,ff_cos_2048,256);
+}
+static void fft4096_vsx(FFTComplex *z)
+{
+ fft2048_vsx(z);
+ fft1024_vsx(z+2048);
+ fft1024_vsx(z+3072);
+ pass_vsx(z,ff_cos_4096, 512);
+}
+static void fft8192_vsx(FFTComplex *z)
+{
+ fft4096_vsx(z);
+ fft2048_vsx(z+4096);
+ fft2048_vsx(z+6144);
+ pass_vsx(z,ff_cos_8192,1024);
+}
+static void fft16384_vsx(FFTComplex *z)
+{
+ fft8192_vsx(z);
+ fft4096_vsx(z+8192);
+ fft4096_vsx(z+12288);
+ pass_vsx(z,ff_cos_16384,2048);
+}
+static void fft32768_vsx(FFTComplex *z)
+{
+ fft16384_vsx(z);
+ fft8192_vsx(z+16384);
+ fft8192_vsx(z+24576);
+ pass_vsx(z,ff_cos_32768,4096);
+}
+static void fft65536_vsx(FFTComplex *z)
+{
+ fft32768_vsx(z);
+ fft16384_vsx(z+32768);
+ fft16384_vsx(z+49152);
+ pass_vsx(z,ff_cos_65536,8192);
+}
+
+static void (* const fft_dispatch_vsx[])(FFTComplex*) = {
+ fft4_vsx, fft8_vsx, fft16_vsx, fft32_vsx, fft64_vsx, fft128_vsx, fft256_vsx, fft512_vsx, fft1024_vsx,
+ fft2048_vsx, fft4096_vsx, fft8192_vsx, fft16384_vsx, fft32768_vsx, fft65536_vsx,
+};
+static void (* const fft_dispatch_vsx_interleave[])(FFTComplex*) = {
+ fft4_vsx_interleave, fft8_vsx_interleave, fft16_vsx_interleave, fft32_vsx_interleave, fft64_vsx_interleave,
+ fft128_vsx_interleave, fft256_vsx_interleave, fft512_vsx_interleave, fft1024_vsx_interleave,
+ fft2048_vsx_interleave, fft4096_vsx_interleave, fft8192_vsx_interleave, fft16384_vsx_interleave, fft32768_vsx_interleave, fft65536_vsx_interleave,
+};
+void ff_fft_calc_interleave_vsx(FFTContext *s, FFTComplex *z)
+{
+ fft_dispatch_vsx_interleave[s->nbits-2](z);
+}
+void ff_fft_calc_vsx(FFTContext *s, FFTComplex *z)
+{
+ fft_dispatch_vsx[s->nbits-2](z);
+}
+#endif /* HAVE_VSX */
diff --cc libavcodec/ppc/fft_vsx.h
index a85475d160,0000000000..1e44031aa5
mode 100644,000000..100644
--- a/libavcodec/ppc/fft_vsx.h
+++ b/libavcodec/ppc/fft_vsx.h
@@@ -1,830 -1,0 +1,829 @@@
+#ifndef AVCODEC_PPC_FFT_VSX_H
+#define AVCODEC_PPC_FFT_VSX_H
+/*
+ * FFT transform, optimized with VSX built-in functions
+ * Copyright (c) 2014 Rong Yan Copyright (c) 2009 Loren Merritt
+ *
+ * This algorithm (though not any of the implementation details) is
+ * based on libdjbfft by D. J. Bernstein, and fft_altivec_s.S.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+#include "config.h"
+#include "libavutil/cpu.h"
- #include "libavutil/ppc/types_altivec.h"
+#include "libavutil/ppc/util_altivec.h"
+#include "libavcodec/fft.h"
+#include "libavcodec/fft-internal.h"
+
+#if HAVE_VSX
+
+void ff_fft_calc_interleave_vsx(FFTContext *s, FFTComplex *z);
+void ff_fft_calc_vsx(FFTContext *s, FFTComplex *z);
+
+
+#define byte_2complex (2*sizeof(FFTComplex))
+#define byte_4complex (4*sizeof(FFTComplex))
+#define byte_6complex (6*sizeof(FFTComplex))
+#define byte_8complex (8*sizeof(FFTComplex))
+#define byte_10complex (10*sizeof(FFTComplex))
+#define byte_12complex (12*sizeof(FFTComplex))
+#define byte_14complex (14*sizeof(FFTComplex))
+
+inline static void pass_vsx_interleave(FFTComplex *z, const FFTSample *wre, unsigned int n)
+{
+ int o1 = n<<1;
+ int o2 = n<<2;
+ int o3 = o1+o2;
+ int i1, i2, i3;
+ FFTSample* out = (FFTSample*)z;
+ const FFTSample *wim = wre+o1;
+ vec_f vz0, vzo1, vzo2, vzo3;
+ vec_f x0, x1, x2, x3;
+ vec_f x4, x5, x6, x7;
+ vec_f x8, x9, x10, x11;
+ vec_f x12, x13, x14, x15;
+ vec_f x16, x17, x18, x19;
+ vec_f x20, x21, x22, x23;
+ vec_f vz0plus1, vzo1plus1, vzo2plus1, vzo3plus1;
+ vec_f y0, y1, y2, y3;
+ vec_f y4, y5, y8, y9;
+ vec_f y10, y13, y14, y15;
+ vec_f y16, y17, y18, y19;
+ vec_f y20, y21, y22, y23;
+ vec_f wr1, wi1, wr0, wi0;
+ vec_f wr2, wi2, wr3, wi3;
+ vec_f xmulwi0, xmulwi1, ymulwi2, ymulwi3;
+
+ n = n-2;
+ i1 = o1*sizeof(FFTComplex);
+ i2 = o2*sizeof(FFTComplex);
+ i3 = o3*sizeof(FFTComplex);
+ vzo2 = vec_ld(i2, &(out[0])); // zo2.r zo2.i z(o2+1).r z(o2+1).i
+ vzo2plus1 = vec_ld(i2+16, &(out[0]));
+ vzo3 = vec_ld(i3, &(out[0])); // zo3.r zo3.i z(o3+1).r z(o3+1).i
+ vzo3plus1 = vec_ld(i3+16, &(out[0]));
+ vz0 = vec_ld(0, &(out[0])); // z0.r z0.i z1.r z1.i
+ vz0plus1 = vec_ld(16, &(out[0]));
+ vzo1 = vec_ld(i1, &(out[0])); // zo1.r zo1.i z(o1+1).r z(o1+1).i
+ vzo1plus1 = vec_ld(i1+16, &(out[0]));
+
+ x0 = vec_add(vzo2, vzo3);
+ x1 = vec_sub(vzo2, vzo3);
+ y0 = vec_add(vzo2plus1, vzo3plus1);
+ y1 = vec_sub(vzo2plus1, vzo3plus1);
+
+ wr1 = vec_splats(wre[1]);
+ wi1 = vec_splats(wim[-1]);
+ wi2 = vec_splats(wim[-2]);
+ wi3 = vec_splats(wim[-3]);
+ wr2 = vec_splats(wre[2]);
+ wr3 = vec_splats(wre[3]);
+
+ x2 = vec_perm(x0, x1, vcprm(2,s2,3,s3));
+ x3 = vec_perm(x0, x1, vcprm(s3,3,s2,2));
+
+ y4 = vec_perm(y0, y1, vcprm(s1,1,s0,0));
+ y5 = vec_perm(y0, y1, vcprm(s3,3,s2,2));
+ y2 = vec_perm(y0, y1, vcprm(0,s0,1,s1));
+ y3 = vec_perm(y0, y1, vcprm(2,s2,3,s3));
+
+ ymulwi2 = vec_mul(y4, wi2);
+ ymulwi3 = vec_mul(y5, wi3);
+ x4 = vec_mul(x2, wr1);
+ x5 = vec_mul(x3, wi1);
+ y8 = vec_madd(y2, wr2, ymulwi2);
+ y9 = vec_msub(y2, wr2, ymulwi2);
+ x6 = vec_add(x4, x5);
+ x7 = vec_sub(x4, x5);
+ y13 = vec_madd(y3, wr3, ymulwi3);
+ y14 = vec_msub(y3, wr3, ymulwi3);
+
+ x8 = vec_perm(x6, x7, vcprm(0,1,s2,s3));
+ y10 = vec_perm(y8, y9, vcprm(0,1,s2,s3));
+ y15 = vec_perm(y13, y14, vcprm(0,1,s2,s3));
+
+ x9 = vec_perm(x0, x8, vcprm(0,1,s0,s2));
+ x10 = vec_perm(x1, x8, vcprm(1,0,s3,s1));
+
+ y16 = vec_perm(y10, y15, vcprm(0,2,s0,s2));
+ y17 = vec_perm(y10, y15, vcprm(3,1,s3,s1));
+
+ x11 = vec_add(vz0, x9);
+ x12 = vec_sub(vz0, x9);
+ x13 = vec_add(vzo1, x10);
+ x14 = vec_sub(vzo1, x10);
+
+ y18 = vec_add(vz0plus1, y16);
+ y19 = vec_sub(vz0plus1, y16);
+ y20 = vec_add(vzo1plus1, y17);
+ y21 = vec_sub(vzo1plus1, y17);
+
+ x15 = vec_perm(x13, x14, vcprm(0,s1,2,s3));
+ x16 = vec_perm(x13, x14, vcprm(s0,1,s2,3));
+ y22 = vec_perm(y20, y21, vcprm(0,s1,2,s3));
+ y23 = vec_perm(y20, y21, vcprm(s0,1,s2,3));
+
+
+ vec_st(x11, 0, &(out[0]));
+ vec_st(y18, 16, &(out[0]));
+ vec_st(x15, i1, &(out[0]));
+ vec_st(y22, i1+16, &(out[0]));
+ vec_st(x12, i2, &(out[0]));
+ vec_st(y19, i2+16, &(out[0]));
+ vec_st(x16, i3, &(out[0]));
+ vec_st(y23, i3+16, &(out[0]));
+
+ do {
+ out += 8;
+ wre += 4;
+ wim -= 4;
+ wr0 = vec_splats(wre[0]);
+ wr1 = vec_splats(wre[1]);
+ wi0 = vec_splats(wim[0]);
+ wi1 = vec_splats(wim[-1]);
+
+ wr2 = vec_splats(wre[2]);
+ wr3 = vec_splats(wre[3]);
+ wi2 = vec_splats(wim[-2]);
+ wi3 = vec_splats(wim[-3]);
+
+ vzo2 = vec_ld(i2, &(out[0])); // zo2.r zo2.i z(o2+1).r z(o2+1).i
+ vzo2plus1 = vec_ld(i2+16, &(out[0]));
+ vzo3 = vec_ld(i3, &(out[0])); // zo3.r zo3.i z(o3+1).r z(o3+1).i
+ vzo3plus1 = vec_ld(i3+16, &(out[0]));
+ vz0 = vec_ld(0, &(out[0])); // z0.r z0.i z1.r z1.i
+ vz0plus1 = vec_ld(16, &(out[0]));
+ vzo1 = vec_ld(i1, &(out[0])); // zo1.r zo1.i z(o1+1).r z(o1+1).i
+ vzo1plus1 = vec_ld(i1+16, &(out[0]));
+
+ x0 = vec_add(vzo2, vzo3);
+ x1 = vec_sub(vzo2, vzo3);
+
+ y0 = vec_add(vzo2plus1, vzo3plus1);
+ y1 = vec_sub(vzo2plus1, vzo3plus1);
+
+ x4 = vec_perm(x0, x1, vcprm(s1,1,s0,0));
+ x5 = vec_perm(x0, x1, vcprm(s3,3,s2,2));
+ x2 = vec_perm(x0, x1, vcprm(0,s0,1,s1));
+ x3 = vec_perm(x0, x1, vcprm(2,s2,3,s3));
+
+ y2 = vec_perm(y0, y1, vcprm(0,s0,1,s1));
+ y3 = vec_perm(y0, y1, vcprm(2,s2,3,s3));
+ xmulwi0 = vec_mul(x4, wi0);
+ xmulwi1 = vec_mul(x5, wi1);
+
+ y4 = vec_perm(y0, y1, vcprm(s1,1,s0,0));
+ y5 = vec_perm(y0, y1, vcprm(s3,3,s2,2));
+
+ x8 = vec_madd(x2, wr0, xmulwi0);
+ x9 = vec_msub(x2, wr0, xmulwi0);
+ ymulwi2 = vec_mul(y4, wi2);
+ ymulwi3 = vec_mul(y5, wi3);
+
+ x13 = vec_madd(x3, wr1, xmulwi1);
+ x14 = vec_msub(x3, wr1, xmulwi1);
+
+ y8 = vec_madd(y2, wr2, ymulwi2);
+ y9 = vec_msub(y2, wr2, ymulwi2);
+ y13 = vec_madd(y3, wr3, ymulwi3);
+ y14 = vec_msub(y3, wr3, ymulwi3);
+
+ x10 = vec_perm(x8, x9, vcprm(0,1,s2,s3));
+ x15 = vec_perm(x13, x14, vcprm(0,1,s2,s3));
+
+ y10 = vec_perm(y8, y9, vcprm(0,1,s2,s3));
+ y15 = vec_perm(y13, y14, vcprm(0,1,s2,s3));
+
+ x16 = vec_perm(x10, x15, vcprm(0,2,s0,s2));
+ x17 = vec_perm(x10, x15, vcprm(3,1,s3,s1));
+
+ y16 = vec_perm(y10, y15, vcprm(0,2,s0,s2));
+ y17 = vec_perm(y10, y15, vcprm(3,1,s3,s1));
+
+ x18 = vec_add(vz0, x16);
+ x19 = vec_sub(vz0, x16);
+ x20 = vec_add(vzo1, x17);
+ x21 = vec_sub(vzo1, x17);
+
+ y18 = vec_add(vz0plus1, y16);
+ y19 = vec_sub(vz0plus1, y16);
+ y20 = vec_add(vzo1plus1, y17);
+ y21 = vec_sub(vzo1plus1, y17);
+
+ x22 = vec_perm(x20, x21, vcprm(0,s1,2,s3));
+ x23 = vec_perm(x20, x21, vcprm(s0,1,s2,3));
+
+ y22 = vec_perm(y20, y21, vcprm(0,s1,2,s3));
+ y23 = vec_perm(y20, y21, vcprm(s0,1,s2,3));
+
+ vec_st(x18, 0, &(out[0]));
+ vec_st(y18, 16, &(out[0]));
+ vec_st(x22, i1, &(out[0]));
+ vec_st(y22, i1+16, &(out[0]));
+ vec_st(x19, i2, &(out[0]));
+ vec_st(y19, i2+16, &(out[0]));
+ vec_st(x23, i3, &(out[0]));
+ vec_st(y23, i3+16, &(out[0]));
+ } while (n-=2);
+}
+
+inline static void fft2_vsx_interleave(FFTComplex *z)
+{
+ FFTSample r1, i1;
+
+ r1 = z[0].re - z[1].re;
+ z[0].re += z[1].re;
+ z[1].re = r1;
+
+ i1 = z[0].im - z[1].im;
+ z[0].im += z[1].im;
+ z[1].im = i1;
+ }
+
+inline static void fft4_vsx_interleave(FFTComplex *z)
+{
+ vec_f a, b, c, d;
+ float* out= (float*)z;
+ a = vec_ld(0, &(out[0]));
+ b = vec_ld(byte_2complex, &(out[0]));
+
+ c = vec_perm(a, b, vcprm(0,1,s2,s1));
+ d = vec_perm(a, b, vcprm(2,3,s0,s3));
+ a = vec_add(c, d);
+ b = vec_sub(c, d);
+
+ c = vec_perm(a, b, vcprm(0,1,s0,s1));
+ d = vec_perm(a, b, vcprm(2,3,s3,s2));
+
+ a = vec_add(c, d);
+ b = vec_sub(c, d);
+ vec_st(a, 0, &(out[0]));
+ vec_st(b, byte_2complex, &(out[0]));
+}
+
+inline static void fft8_vsx_interleave(FFTComplex *z)
+{
+ vec_f vz0, vz1, vz2, vz3;
+ vec_f x0, x1, x2, x3;
+ vec_f x4, x5, x6, x7;
+ vec_f x8, x9, x10, x11;
+ vec_f x12, x13, x14, x15;
+ vec_f x16, x17, x18, x19;
+ vec_f x20, x21, x22, x23;
+ vec_f x24, x25, x26, x27;
+ vec_f x28, x29, x30, x31;
+ vec_f x32, x33, x34;
+
+ float* out= (float*)z;
+ vec_f vc1 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
+
+ vz0 = vec_ld(0, &(out[0]));
+ vz1 = vec_ld(byte_2complex, &(out[0]));
+ vz2 = vec_ld(byte_4complex, &(out[0]));
+ vz3 = vec_ld(byte_6complex, &(out[0]));
+
+ x0 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
+ x1 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
+ x2 = vec_perm(vz2, vz3, vcprm(2,1,s0,s1));
+ x3 = vec_perm(vz2, vz3, vcprm(0,3,s2,s3));
+
+ x4 = vec_add(x0, x1);
+ x5 = vec_sub(x0, x1);
+ x6 = vec_add(x2, x3);
+ x7 = vec_sub(x2, x3);
+
+ x8 = vec_perm(x4, x5, vcprm(0,1,s0,s1));
+ x9 = vec_perm(x4, x5, vcprm(2,3,s3,s2));
+ x10 = vec_perm(x6, x7, vcprm(2,1,s2,s1));
+ x11 = vec_perm(x6, x7, vcprm(0,3,s0,s3));
+
+ x12 = vec_add(x8, x9);
+ x13 = vec_sub(x8, x9);
+ x14 = vec_add(x10, x11);
+ x15 = vec_sub(x10, x11);
+ x16 = vec_perm(x12, x13, vcprm(0,s0,1,s1));
+ x17 = vec_perm(x14, x15, vcprm(0,s0,1,s1));
+ x18 = vec_perm(x16, x17, vcprm(s0,s3,s2,s1));
+ x19 = vec_add(x16, x18); // z0.r z2.r z0.i z2.i
+ x20 = vec_sub(x16, x18); // z4.r z6.r z4.i z6.i
+
+ x21 = vec_perm(x12, x13, vcprm(2,s2,3,s3));
+ x22 = vec_perm(x14, x15, vcprm(2,3,s2,s3));
+ x23 = vec_perm(x14, x15, vcprm(3,2,s3,s2));
+ x24 = vec_add(x22, x23);
+ x25 = vec_sub(x22, x23);
+ x26 = vec_mul( vec_perm(x24, x25, vcprm(2,s2,0,s0)), vc1);
+
+ x27 = vec_add(x21, x26); // z1.r z7.r z1.i z3.i
+ x28 = vec_sub(x21, x26); //z5.r z3.r z5.i z7.i
+
+ x29 = vec_perm(x19, x27, vcprm(0,2,s0,s2)); // z0.r z0.i z1.r z1.i
+ x30 = vec_perm(x19, x27, vcprm(1,3,s1,s3)); // z2.r z2.i z7.r z3.i
+ x31 = vec_perm(x20, x28, vcprm(0,2,s0,s2)); // z4.r z4.i z5.r z5.i
+ x32 = vec_perm(x20, x28, vcprm(1,3,s1,s3)); // z6.r z6.i z3.r z7.i
+ x33 = vec_perm(x30, x32, vcprm(0,1,s2,3)); // z2.r z2.i z3.r z3.i
+ x34 = vec_perm(x30, x32, vcprm(s0,s1,2,s3)); // z6.r z6.i z7.r z7.i
+
+ vec_st(x29, 0, &(out[0]));
+ vec_st(x33, byte_2complex, &(out[0]));
+ vec_st(x31, byte_4complex, &(out[0]));
+ vec_st(x34, byte_6complex, &(out[0]));
+}
+
+inline static void fft16_vsx_interleave(FFTComplex *z)
+{
+ float* out= (float*)z;
+ vec_f vc0 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
+ vec_f vc1 = {ff_cos_16[1], ff_cos_16[1], ff_cos_16[1], ff_cos_16[1]};
+ vec_f vc2 = {ff_cos_16[3], ff_cos_16[3], ff_cos_16[3], ff_cos_16[3]};
+ vec_f vz0, vz1, vz2, vz3;
+ vec_f vz4, vz5, vz6, vz7;
+ vec_f x0, x1, x2, x3;
+ vec_f x4, x5, x6, x7;
+ vec_f x8, x9, x10, x11;
+ vec_f x12, x13, x14, x15;
+ vec_f x16, x17, x18, x19;
+ vec_f x20, x21, x22, x23;
+ vec_f x24, x25, x26, x27;
+ vec_f x28, x29, x30, x31;
+ vec_f x32, x33, x34, x35;
+ vec_f x36, x37, x38, x39;
+ vec_f x40, x41, x42, x43;
+ vec_f x44, x45, x46, x47;
+ vec_f x48, x49, x50, x51;
+ vec_f x52, x53, x54, x55;
+ vec_f x56, x57, x58, x59;
+ vec_f x60, x61, x62, x63;
+ vec_f x64, x65, x66, x67;
+ vec_f x68, x69, x70, x71;
+ vec_f x72, x73, x74, x75;
+ vec_f x76, x77, x78, x79;
+ vec_f x80, x81, x82, x83;
+ vec_f x84, x85, x86;
+
+ vz0 = vec_ld(0, &(out[0]));
+ vz1 = vec_ld(byte_2complex, &(out[0]));
+ vz2 = vec_ld(byte_4complex, &(out[0]));
+ vz3 = vec_ld(byte_6complex, &(out[0]));
+ vz4 = vec_ld(byte_8complex, &(out[0]));
+ vz5 = vec_ld(byte_10complex, &(out[0]));
+ vz6 = vec_ld(byte_12complex, &(out[0]));
+ vz7 = vec_ld(byte_14complex, &(out[0]));
+
+ x0 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
+ x1 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
+ x2 = vec_perm(vz2, vz3, vcprm(0,1,s0,s1));
+ x3 = vec_perm(vz2, vz3, vcprm(2,3,s2,s3));
+
+ x4 = vec_perm(vz4, vz5, vcprm(0,1,s2,s1));
+ x5 = vec_perm(vz4, vz5, vcprm(2,3,s0,s3));
+ x6 = vec_perm(vz6, vz7, vcprm(0,1,s2,s1));
+ x7 = vec_perm(vz6, vz7, vcprm(2,3,s0,s3));
+
+ x8 = vec_add(x0, x1);
+ x9 = vec_sub(x0, x1);
+ x10 = vec_add(x2, x3);
+ x11 = vec_sub(x2, x3);
+
+ x12 = vec_add(x4, x5);
+ x13 = vec_sub(x4, x5);
+ x14 = vec_add(x6, x7);
+ x15 = vec_sub(x6, x7);
+
+ x16 = vec_perm(x8, x9, vcprm(0,1,s0,s1));
+ x17 = vec_perm(x8, x9, vcprm(2,3,s3,s2));
+ x18 = vec_perm(x10, x11, vcprm(2,1,s1,s2));
+ x19 = vec_perm(x10, x11, vcprm(0,3,s0,s3));
+ x20 = vec_perm(x12, x14, vcprm(0,1,s0, s1));
+ x21 = vec_perm(x12, x14, vcprm(2,3,s2,s3));
+ x22 = vec_perm(x13, x15, vcprm(0,1,s0,s1));
+ x23 = vec_perm(x13, x15, vcprm(3,2,s3,s2));
+
+ x24 = vec_add(x16, x17);
+ x25 = vec_sub(x16, x17);
+ x26 = vec_add(x18, x19);
+ x27 = vec_sub(x18, x19);
+ x28 = vec_add(x20, x21);
+ x29 = vec_sub(x20, x21);
+ x30 = vec_add(x22, x23);
+ x31 = vec_sub(x22, x23);
+
+ x32 = vec_add(x24, x26);
+ x33 = vec_sub(x24, x26);
+ x34 = vec_perm(x32, x33, vcprm(0,1,s0,s1));
+
+ x35 = vec_perm(x28, x29, vcprm(2,1,s1,s2));
+ x36 = vec_perm(x28, x29, vcprm(0,3,s0,s3));
+ x37 = vec_add(x35, x36);
+ x38 = vec_sub(x35, x36);
+ x39 = vec_perm(x37, x38, vcprm(0,1,s1,s0));
+
+ x40 = vec_perm(x27, x38, vcprm(3,2,s2,s3));
+ x41 = vec_perm(x26, x37, vcprm(2,3,s3,s2));
+ x42 = vec_add(x40, x41);
+ x43 = vec_sub(x40, x41);
+ x44 = vec_mul(x42, vc0);
+ x45 = vec_mul(x43, vc0);
+
+ x46 = vec_add(x34, x39); // z0.r z0.i z4.r z4.i
+ x47 = vec_sub(x34, x39); // z8.r z8.i z12.r z12.i
+
+ x48 = vec_perm(x30, x31, vcprm(2,1,s1,s2));
+ x49 = vec_perm(x30, x31, vcprm(0,3,s3,s0));
+ x50 = vec_add(x48, x49);
+ x51 = vec_sub(x48, x49);
+ x52 = vec_mul(x50, vc1);
+ x53 = vec_mul(x50, vc2);
+ x54 = vec_mul(x51, vc1);
+ x55 = vec_mul(x51, vc2);
+
+ x56 = vec_perm(x24, x25, vcprm(2,3,s2,s3));
+ x57 = vec_perm(x44, x45, vcprm(0,1,s1,s0));
+ x58 = vec_add(x56, x57);
+ x59 = vec_sub(x56, x57);
+
+ x60 = vec_perm(x54, x55, vcprm(1,0,3,2));
+ x61 = vec_perm(x54, x55, vcprm(s1,s0,s3,s2));
+ x62 = vec_add(x52, x61);
+ x63 = vec_sub(x52, x61);
+ x64 = vec_add(x60, x53);
+ x65 = vec_sub(x60, x53);
+ x66 = vec_perm(x62, x64, vcprm(0,1,s3,s2));
+ x67 = vec_perm(x63, x65, vcprm(s0,s1,3,2));
+
+ x68 = vec_add(x58, x66); // z1.r z1.i z3.r z3.i
+ x69 = vec_sub(x58, x66); // z9.r z9.i z11.r z11.i
+ x70 = vec_add(x59, x67); // z5.r z5.i z15.r z15.i
+ x71 = vec_sub(x59, x67); // z13.r z13.i z7.r z7.i
+
+ x72 = vec_perm(x25, x27, vcprm(s1,s0,s2,s3));
+ x73 = vec_add(x25, x72);
+ x74 = vec_sub(x25, x72);
+ x75 = vec_perm(x73, x74, vcprm(0,1,s0,s1));
+ x76 = vec_perm(x44, x45, vcprm(3,2,s2,s3));
+ x77 = vec_add(x75, x76); // z2.r z2.i z6.r z6.i
+ x78 = vec_sub(x75, x76); // z10.r z10.i z14.r z14.i
+
+ x79 = vec_perm(x46, x68, vcprm(0,1,s0,s1)); // z0.r z0.i z1.r z1.i
+ x80 = vec_perm(x77, x68, vcprm(0,1,s2,s3)); // z2.r z2.i z3.r z3.i
+ x81 = vec_perm(x46, x70, vcprm(2,3,s0,s1)); // z4.r z4.i z5.r z5.i
+ x82 = vec_perm(x71, x77, vcprm(s2,s3,2,3)); // z6.r z6.i z7.r z7.i
+ vec_st(x79, 0, &(out[0]));
+ vec_st(x80, byte_2complex, &(out[0]));
+ vec_st(x81, byte_4complex, &(out[0]));
+ vec_st(x82, byte_6complex, &(out[0]));
+ x83 = vec_perm(x47, x69, vcprm(0,1,s0,s1)); // z8.r z8.i z9.r z9.i
+ x84 = vec_perm(x78, x69, vcprm(0,1,s2,s3)); // z10.r z10.i z11.r z11.i
+ x85 = vec_perm(x47, x71, vcprm(2,3,s0,s1)); // z12.r z12.i z13.r z13.i
+ x86 = vec_perm(x70, x78, vcprm(s2,s3,2,3)); // z14.r z14.i z15.r z15.i
+ vec_st(x83, byte_8complex, &(out[0]));
+ vec_st(x84, byte_10complex, &(out[0]));
+ vec_st(x85, byte_12complex, &(out[0]));
+ vec_st(x86, byte_14complex, &(out[0]));
+}
+
+inline static void fft4_vsx(FFTComplex *z)
+{
+ vec_f a, b, c, d;
+ float* out= (float*)z;
+ a = vec_ld(0, &(out[0]));
+ b = vec_ld(byte_2complex, &(out[0]));
+
+ c = vec_perm(a, b, vcprm(0,1,s2,s1));
+ d = vec_perm(a, b, vcprm(2,3,s0,s3));
+ a = vec_add(c, d);
+ b = vec_sub(c, d);
+
+ c = vec_perm(a,b, vcprm(0,s0,1,s1));
+ d = vec_perm(a, b, vcprm(2,s3,3,s2));
+
+ a = vec_add(c, d);
+ b = vec_sub(c, d);
+
+ c = vec_perm(a, b, vcprm(0,1,s0,s1));
+ d = vec_perm(a, b, vcprm(2,3,s2,s3));
+
+ vec_st(c, 0, &(out[0]));
+ vec_st(d, byte_2complex, &(out[0]));
+ return;
+}
+
+inline static void fft8_vsx(FFTComplex *z)
+{
+ vec_f vz0, vz1, vz2, vz3;
+ vec_f vz4, vz5, vz6, vz7, vz8;
+
+ float* out= (float*)z;
+ vec_f vc0 = {0.0, 0.0, 0.0, 0.0};
+ vec_f vc1 = {-sqrthalf, sqrthalf, sqrthalf, -sqrthalf};
+ vec_f vc2 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
+
+ vz0 = vec_ld(0, &(out[0]));
+ vz1 = vec_ld(byte_2complex, &(out[0]));
+ vz2 = vec_ld(byte_4complex, &(out[0]));
+ vz3 = vec_ld(byte_6complex, &(out[0]));
+
+ vz6 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
+ vz7 = vec_perm(vz2, vz3, vcprm(2,s2,3,s3));
+ vz4 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
+ vz5 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
+
+ vz2 = vec_add(vz6, vz7);
+ vz3 = vec_sub(vz6, vz7);
+ vz8 = vec_perm(vz3, vz3, vcprm(2,3,0,1));
+
+ vz0 = vec_add(vz4, vz5);
+ vz1 = vec_sub(vz4, vz5);
+
+ vz3 = vec_madd(vz3, vc1, vc0);
+ vz3 = vec_madd(vz8, vc2, vz3);
+
+ vz4 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
+ vz5 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
+ vz6 = vec_perm(vz2, vz3, vcprm(1,2,s3,s0));
+ vz7 = vec_perm(vz2, vz3, vcprm(0,3,s2,s1));
+
+ vz0 = vec_add(vz4, vz5);
+ vz1 = vec_sub(vz4, vz5);
+ vz2 = vec_add(vz6, vz7);
+ vz3 = vec_sub(vz6, vz7);
+
+ vz4 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
+ vz5 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));
+ vz6 = vec_perm(vz2, vz3, vcprm(0,2,s1,s3));
+ vz7 = vec_perm(vz2, vz3, vcprm(1,3,s0,s2));
+
+
+ vz2 = vec_sub(vz4, vz6);
+ vz3 = vec_sub(vz5, vz7);
+
+ vz0 = vec_add(vz4, vz6);
+ vz1 = vec_add(vz5, vz7);
+
+ vec_st(vz0, 0, &(out[0]));
+ vec_st(vz1, byte_2complex, &(out[0]));
+ vec_st(vz2, byte_4complex, &(out[0]));
+ vec_st(vz3, byte_6complex, &(out[0]));
+ return;
+}
+
+inline static void fft16_vsx(FFTComplex *z)
+{
+ float* out= (float*)z;
+ vec_f vc0 = {0.0, 0.0, 0.0, 0.0};
+ vec_f vc1 = {-sqrthalf, sqrthalf, sqrthalf, -sqrthalf};
+ vec_f vc2 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
+ vec_f vc3 = {1.0, 0.92387953, sqrthalf, 0.38268343};
+ vec_f vc4 = {0.0, 0.38268343, sqrthalf, 0.92387953};
+ vec_f vc5 = {-0.0, -0.38268343, -sqrthalf, -0.92387953};
+
+ vec_f vz0, vz1, vz2, vz3;
+ vec_f vz4, vz5, vz6, vz7;
+ vec_f vz8, vz9, vz10, vz11;
+ vec_f vz12, vz13;
+
+ vz0 = vec_ld(byte_8complex, &(out[0]));
+ vz1 = vec_ld(byte_10complex, &(out[0]));
+ vz2 = vec_ld(byte_12complex, &(out[0]));
+ vz3 = vec_ld(byte_14complex, &(out[0]));
+
+ vz4 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
+ vz5 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
+ vz6 = vec_perm(vz2, vz3, vcprm(0,1,s2,s1));
+ vz7 = vec_perm(vz2, vz3, vcprm(2,3,s0,s3));
+
+ vz0 = vec_add(vz4, vz5);
+ vz1= vec_sub(vz4, vz5);
+ vz2 = vec_add(vz6, vz7);
+ vz3 = vec_sub(vz6, vz7);
+
+ vz4 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
+ vz5 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
+ vz6 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
+ vz7 = vec_perm(vz2, vz3, vcprm(2,s3,3,s2));
+
+ vz0 = vec_add(vz4, vz5);
+ vz1 = vec_sub(vz4, vz5);
+ vz2 = vec_add(vz6, vz7);
+ vz3 = vec_sub(vz6, vz7);
+
+ vz4 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
+ vz5 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));
+
+ vz6 = vec_perm(vz2, vz3, vcprm(0,1,s0,s1));
+ vz7 = vec_perm(vz2, vz3, vcprm(2,3,s2,s3));
+
+ vz0 = vec_ld(0, &(out[0]));
+ vz1 = vec_ld(byte_2complex, &(out[0]));
+ vz2 = vec_ld(byte_4complex, &(out[0]));
+ vz3 = vec_ld(byte_6complex, &(out[0]));
+ vz10 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
+ vz11 = vec_perm(vz2, vz3, vcprm(2,s2,3,s3));
+ vz8 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
+ vz9 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
+
+ vz2 = vec_add(vz10, vz11);
+ vz3 = vec_sub(vz10, vz11);
+ vz12 = vec_perm(vz3, vz3, vcprm(2,3,0,1));
+ vz0 = vec_add(vz8, vz9);
+ vz1 = vec_sub(vz8, vz9);
+
+ vz3 = vec_madd(vz3, vc1, vc0);
+ vz3 = vec_madd(vz12, vc2, vz3);
+ vz8 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
+ vz9 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
+ vz10 = vec_perm(vz2, vz3, vcprm(1,2,s3,s0));
+ vz11 = vec_perm(vz2, vz3, vcprm(0,3,s2,s1));
+
+ vz0 = vec_add(vz8, vz9);
+ vz1 = vec_sub(vz8, vz9);
+ vz2 = vec_add(vz10, vz11);
+ vz3 = vec_sub(vz10, vz11);
+
+ vz8 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
+ vz9 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));
+ vz10 = vec_perm(vz2, vz3, vcprm(0,2,s1,s3));
+ vz11 = vec_perm(vz2, vz3, vcprm(1,3,s0,s2));
+
+ vz2 = vec_sub(vz8, vz10);
+ vz3 = vec_sub(vz9, vz11);
+ vz0 = vec_add(vz8, vz10);
+ vz1 = vec_add(vz9, vz11);
+
+ vz8 = vec_madd(vz4, vc3, vc0);
+ vz9 = vec_madd(vz5, vc3, vc0);
+ vz10 = vec_madd(vz6, vc3, vc0);
+ vz11 = vec_madd(vz7, vc3, vc0);
+
+ vz8 = vec_madd(vz5, vc4, vz8);
+ vz9 = vec_madd(vz4, vc5, vz9);
+ vz10 = vec_madd(vz7, vc5, vz10);
+ vz11 = vec_madd(vz6, vc4, vz11);
+
+ vz12 = vec_sub(vz10, vz8);
+ vz10 = vec_add(vz10, vz8);
+
+ vz13 = vec_sub(vz9, vz11);
+ vz11 = vec_add(vz9, vz11);
+
+ vz4 = vec_sub(vz0, vz10);
+ vz0 = vec_add(vz0, vz10);
+
+ vz7= vec_sub(vz3, vz12);
+ vz3= vec_add(vz3, vz12);
+
+ vz5 = vec_sub(vz1, vz11);
+ vz1 = vec_add(vz1, vz11);
+
+ vz6 = vec_sub(vz2, vz13);
+ vz2 = vec_add(vz2, vz13);
+
+ vec_st(vz0, 0, &(out[0]));
+ vec_st(vz1, byte_2complex, &(out[0]));
+ vec_st(vz2, byte_4complex, &(out[0]));
+ vec_st(vz3, byte_6complex, &(out[0]));
+ vec_st(vz4, byte_8complex, &(out[0]));
+ vec_st(vz5, byte_10complex, &(out[0]));
+ vec_st(vz6, byte_12complex, &(out[0]));
+ vec_st(vz7, byte_14complex, &(out[0]));
+ return;
+
+}
+inline static void pass_vsx(FFTComplex * z, const FFTSample * wre, unsigned int n)
+{
+ int o1 = n<<1;
+ int o2 = n<<2;
+ int o3 = o1+o2;
+ int i1, i2, i3;
+ FFTSample* out = (FFTSample*)z;
+ const FFTSample *wim = wre+o1;
+ vec_f v0, v1, v2, v3;
+ vec_f v4, v5, v6, v7;
+ vec_f v8, v9, v10, v11;
+ vec_f v12, v13;
+
+ n = n-2;
+ i1 = o1*sizeof(FFTComplex);
+ i2 = o2*sizeof(FFTComplex);
+ i3 = o3*sizeof(FFTComplex);
+
+ v8 = vec_ld(0, &(wre[0]));
+ v10 = vec_ld(0, &(wim[0]));
+ v9 = vec_ld(0, &(wim[-4]));
+ v9 = vec_perm(v9, v10, vcprm(s0,3,2,1));
+
+ v4 = vec_ld(i2, &(out[0]));
+ v5 = vec_ld(i2+16, &(out[0]));
+ v6 = vec_ld(i3, &(out[0]));
+ v7 = vec_ld(i3+16, &(out[0]));
+ v10 = vec_mul(v4, v8); // r2*wre
+ v11 = vec_mul(v5, v8); // i2*wre
+ v12 = vec_mul(v6, v8); // r3*wre
+ v13 = vec_mul(v7, v8); // i3*wre
+
+ v0 = vec_ld(0, &(out[0])); // r0
+ v3 = vec_ld(i1+16, &(out[0])); // i1
+ v10 = vec_madd(v5, v9, v10); // r2*wim
+ v11 = vec_nmsub(v4, v9, v11); // i2*wim
+ v12 = vec_nmsub(v7, v9, v12); // r3*wim
+ v13 = vec_madd(v6, v9, v13); // i3*wim
+
+ v1 = vec_ld(16, &(out[0])); // i0
+ v2 = vec_ld(i1, &(out[0])); // r1
+ v8 = vec_sub(v12, v10);
+ v12 = vec_add(v12, v10);
+ v9 = vec_sub(v11, v13);
+ v13 = vec_add(v11, v13);
+ v4 = vec_sub(v0, v12);
+ v0 = vec_add(v0, v12);
+ v7 = vec_sub(v3, v8);
+ v3 = vec_add(v3, v8);
+
+ vec_st(v0, 0, &(out[0])); // r0
+ vec_st(v3, i1+16, &(out[0])); // i1
+ vec_st(v4, i2, &(out[0])); // r2
+ vec_st(v7, i3+16, &(out[0]));// i3
+
+ v5 = vec_sub(v1, v13);
+ v1 = vec_add(v1, v13);
+ v6 = vec_sub(v2, v9);
+ v2 = vec_add(v2, v9);
+
+ vec_st(v1, 16, &(out[0])); // i0
+ vec_st(v2, i1, &(out[0])); // r1
+ vec_st(v5, i2+16, &(out[0])); // i2
+ vec_st(v6, i3, &(out[0])); // r3
+
+ do {
+ out += 8;
+ wre += 4;
+ wim -= 4;
+
+ v8 = vec_ld(0, &(wre[0]));
+ v10 = vec_ld(0, &(wim[0]));
+ v9 = vec_ld(0, &(wim[-4]));
+ v9 = vec_perm(v9, v10, vcprm(s0,3,2,1));
+
+ v4 = vec_ld(i2, &(out[0])); // r2
+ v5 = vec_ld(i2+16, &(out[0])); // i2
+ v6 = vec_ld(i3, &(out[0])); // r3
+ v7 = vec_ld(i3+16, &(out[0]));// i3
+ v10 = vec_mul(v4, v8); // r2*wre
+ v11 = vec_mul(v5, v8); // i2*wre
+ v12 = vec_mul(v6, v8); // r3*wre
+ v13 = vec_mul(v7, v8); // i3*wre
+
+ v0 = vec_ld(0, &(out[0])); // r0
+ v3 = vec_ld(i1+16, &(out[0])); // i1
+ v10 = vec_madd(v5, v9, v10); // r2*wim
+ v11 = vec_nmsub(v4, v9, v11); // i2*wim
+ v12 = vec_nmsub(v7, v9, v12); // r3*wim
+ v13 = vec_madd(v6, v9, v13); // i3*wim
+
+ v1 = vec_ld(16, &(out[0])); // i0
+ v2 = vec_ld(i1, &(out[0])); // r1
+ v8 = vec_sub(v12, v10);
+ v12 = vec_add(v12, v10);
+ v9 = vec_sub(v11, v13);
+ v13 = vec_add(v11, v13);
+ v4 = vec_sub(v0, v12);
+ v0 = vec_add(v0, v12);
+ v7 = vec_sub(v3, v8);
+ v3 = vec_add(v3, v8);
+
+ vec_st(v0, 0, &(out[0])); // r0
+ vec_st(v3, i1+16, &(out[0])); // i1
+ vec_st(v4, i2, &(out[0])); // r2
+ vec_st(v7, i3+16, &(out[0])); // i3
+
+ v5 = vec_sub(v1, v13);
+ v1 = vec_add(v1, v13);
+ v6 = vec_sub(v2, v9);
+ v2 = vec_add(v2, v9);
+
+ vec_st(v1, 16, &(out[0])); // i0
+ vec_st(v2, i1, &(out[0])); // r1
+ vec_st(v5, i2+16, &(out[0])); // i2
+ vec_st(v6, i3, &(out[0])); // r3
+ } while (n-=2);
+}
+
+#endif
+
+#endif /* AVCODEC_PPC_FFT_VSX_H */
diff --cc libavcodec/ppc/h264chroma_init.c
index 876efeca09,f8392c2ee2..bd0d213bdc
--- a/libavcodec/ppc/h264chroma_init.c
+++ b/libavcodec/ppc/h264chroma_init.c
@@@ -23,11 -24,11 +24,11 @@@
#include "libavutil/cpu.h"
#include "libavutil/intreadwrite.h"
#include "libavutil/ppc/cpu.h"
- #include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
+
#include "libavcodec/h264chroma.h"
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
#define PUT_OP_U8_ALTIVEC(d, s, dst) d = s
#define AVG_OP_U8_ALTIVEC(d, s, dst) d = vec_avg(dst, s)
diff --cc libavcodec/ppc/h264chroma_template.c
index d9b2a619e4,daa7652128..8f43e5dee1
--- a/libavcodec/ppc/h264chroma_template.c
+++ b/libavcodec/ppc/h264chroma_template.c
@@@ -19,8 -19,6 +19,7 @@@
*/
#include "libavutil/mem.h"
- #include "libavutil/ppc/types_altivec.h"
+#include "libavutil/ppc/util_altivec.h"
/* this code assume that stride % 16 == 0 */
diff --cc libavcodec/ppc/h264qpel.c
index 575f504d32,5da09bf46e..bef421fa4f
--- a/libavcodec/ppc/h264qpel.c
+++ b/libavcodec/ppc/h264qpel.c
@@@ -23,12 -24,13 +24,13 @@@
#include "libavutil/cpu.h"
#include "libavutil/intreadwrite.h"
#include "libavutil/ppc/cpu.h"
- #include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
+
#include "libavcodec/h264qpel.h"
+
#include "hpeldsp_altivec.h"
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
#define PUT_OP_U8_ALTIVEC(d, s, dst) d = s
#define AVG_OP_U8_ALTIVEC(d, s, dst) d = vec_avg(dst, s)
diff --cc libavcodec/ppc/h264qpel_template.c
index 2f25e74840,6de063a719..304604c63d
--- a/libavcodec/ppc/h264qpel_template.c
+++ b/libavcodec/ppc/h264qpel_template.c
@@@ -18,87 -18,13 +18,86 @@@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
-#include "libavutil/mem.h"
+#include "config.h"
+#if HAVE_UNISTD_H
+#include <unistd.h>
+#endif
-#ifdef DEBUG
-#define ASSERT_ALIGNED(ptr) assert(((unsigned long)ptr&0x0000000F));
+#include "libavutil/avassert.h"
+#include "libavutil/mem.h"
- #include "libavutil/ppc/types_altivec.h"
+#include "libavutil/ppc/util_altivec.h"
+
+#define ASSERT_ALIGNED(ptr) av_assert2(!((uintptr_t)ptr&0x0000000F));
+
+#if HAVE_BIGENDIAN
+#define load_alignment(s, ali, pm2, pm1, pp0, pp1, pp2, pp3){\
+ vec_u8 srcR1 = vec_ld(-2, s);\
+ vec_u8 srcR2 = vec_ld(14, s);\
+ switch (ali) {\
+ default: {\
+ srcM2 = vec_perm(srcR1, srcR2, pm2);\
+ srcM1 = vec_perm(srcR1, srcR2, pm1);\
+ srcP0 = vec_perm(srcR1, srcR2, pp0);\
+ srcP1 = vec_perm(srcR1, srcR2, pp1);\
+ srcP2 = vec_perm(srcR1, srcR2, pp2);\
+ srcP3 = vec_perm(srcR1, srcR2, pp3);\
+ } break;\
+ case 11: {\
+ srcM2 = vec_perm(srcR1, srcR2, pm2);\
+ srcM1 = vec_perm(srcR1, srcR2, pm1);\
+ srcP0 = vec_perm(srcR1, srcR2, pp0);\
+ srcP1 = vec_perm(srcR1, srcR2, pp1);\
+ srcP2 = vec_perm(srcR1, srcR2, pp2);\
+ srcP3 = srcR2;\
+ } break;\
+ case 12: {\
+ vec_u8 srcR3 = vec_ld(30, s);\
+ srcM2 = vec_perm(srcR1, srcR2, pm2);\
+ srcM1 = vec_perm(srcR1, srcR2, pm1);\
+ srcP0 = vec_perm(srcR1, srcR2, pp0);\
+ srcP1 = vec_perm(srcR1, srcR2, pp1);\
+ srcP2 = srcR2;\
+ srcP3 = vec_perm(srcR2, srcR3, pp3);\
+ } break;\
+ case 13: {\
+ vec_u8 srcR3 = vec_ld(30, s);\
+ srcM2 = vec_perm(srcR1, srcR2, pm2);\
+ srcM1 = vec_perm(srcR1, srcR2, pm1);\
+ srcP0 = vec_perm(srcR1, srcR2, pp0);\
+ srcP1 = srcR2;\
+ srcP2 = vec_perm(srcR2, srcR3, pp2);\
+ srcP3 = vec_perm(srcR2, srcR3, pp3);\
+ } break;\
+ case 14: {\
+ vec_u8 srcR3 = vec_ld(30, s);\
+ srcM2 = vec_perm(srcR1, srcR2, pm2);\
+ srcM1 = vec_perm(srcR1, srcR2, pm1);\
+ srcP0 = srcR2;\
+ srcP1 = vec_perm(srcR2, srcR3, pp1);\
+ srcP2 = vec_perm(srcR2, srcR3, pp2);\
+ srcP3 = vec_perm(srcR2, srcR3, pp3);\
+ } break;\
+ case 15: {\
+ vec_u8 srcR3 = vec_ld(30, s);\
+ srcM2 = vec_perm(srcR1, srcR2, pm2);\
+ srcM1 = srcR2;\
+ srcP0 = vec_perm(srcR2, srcR3, pp0);\
+ srcP1 = vec_perm(srcR2, srcR3, pp1);\
+ srcP2 = vec_perm(srcR2, srcR3, pp2);\
+ srcP3 = vec_perm(srcR2, srcR3, pp3);\
+ } break;\
+ }\
+ }
#else
-#define ASSERT_ALIGNED(ptr) ;
-#endif
+#define load_alignment(s, ali, pm2, pm1, pp0, pp1, pp2, pp3){\
+ srcM2 = vec_vsx_ld(-2, s);\
+ srcM1 = vec_vsx_ld(-1, s);\
+ srcP0 = vec_vsx_ld(0, s);\
+ srcP1 = vec_vsx_ld(1, s);\
+ srcP2 = vec_vsx_ld(2, s);\
+ srcP3 = vec_vsx_ld(3, s);\
+ }
+#endif /* HAVE_BIGENDIAN */
/* this code assume stride % 16 == 0 */
#ifdef PREFIX_h264_qpel16_h_lowpass_altivec
diff --cc libavcodec/ppc/hpeldsp_altivec.c
index 87a1f05b6a,405b91841e..4f19521860
--- a/libavcodec/ppc/hpeldsp_altivec.c
+++ b/libavcodec/ppc/hpeldsp_altivec.c
@@@ -25,16 -25,13 +25,13 @@@
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
- #include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
+
#include "libavcodec/hpeldsp.h"
+
#include "hpeldsp_altivec.h"
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
/* next one assumes that ((line_size % 16) == 0) */
void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
diff --cc libavcodec/ppc/idctdsp.c
index f1b42470fb,dc22e15269..29f625a01c
--- a/libavcodec/ppc/idctdsp.c
+++ b/libavcodec/ppc/idctdsp.c
@@@ -40,10 -38,11 +38,11 @@@
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
- #include "libavutil/ppc/types_altivec.h"
+ #include "libavutil/ppc/util_altivec.h"
+
#include "libavcodec/idctdsp.h"
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
#define IDCT_HALF \
/* 1st stage */ \
diff --cc libavcodec/ppc/lossless_audiodsp_altivec.c
index bdec25223d,0000000000..298e6c38a0
mode 100644,000000..100644
--- a/libavcodec/ppc/lossless_audiodsp_altivec.c
+++ b/libavcodec/ppc/lossless_audiodsp_altivec.c
@@@ -1,93 -1,0 +1,91 @@@
+/*
+ * Copyright (c) 2007 Luca Barbato <lu_zero at gentoo.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
- #if HAVE_ALTIVEC_H
- #include <altivec.h>
- #endif
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/ppc/cpu.h"
- #include "libavutil/ppc/types_altivec.h"
++#include "libavutil/ppc/util_altivec.h"
++
+#include "libavcodec/lossless_audiodsp.h"
+
+#if HAVE_BIGENDIAN
+#define GET_T(tt0,tt1,src,a,b){ \
+ a = vec_ld(16, src); \
+ tt0 = vec_perm(b, a, align); \
+ b = vec_ld(32, src); \
+ tt1 = vec_perm(a, b, align); \
+ }
+#else
+#define GET_T(tt0,tt1,src,a,b){ \
+ tt0 = vec_vsx_ld(0, src); \
+ tt1 = vec_vsx_ld(16, src); \
+ }
+#endif
+
+#if HAVE_ALTIVEC
+static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1,
+ const int16_t *v2,
+ const int16_t *v3,
+ int order, int mul)
+{
+ LOAD_ZERO;
+ vec_s16 *pv1 = (vec_s16 *) v1;
+ register vec_s16 muls = { mul, mul, mul, mul, mul, mul, mul, mul };
+ register vec_s16 t0, t1, i0, i1, i4, i2, i3;
+ register vec_s32 res = zero_s32v;
+#if HAVE_BIGENDIAN
+ register vec_u8 align = vec_lvsl(0, v2);
+ i2 = vec_ld(0, v2);
+ i3 = vec_ld(0, v3);
+#endif
+ int32_t ires;
+
+ order >>= 4;
+ do {
+ GET_T(t0,t1,v2,i1,i2);
+ i0 = pv1[0];
+ i1 = pv1[1];
+ res = vec_msum(t0, i0, res);
+ res = vec_msum(t1, i1, res);
+ GET_T(t0,t1,v3,i4,i3);
+ pv1[0] = vec_mladd(t0, muls, i0);
+ pv1[1] = vec_mladd(t1, muls, i1);
+ pv1 += 2;
+ v2 += 16;
+ v3 += 16;
+ } while (--order);
+ res = vec_splat(vec_sums(res, zero_s32v), 3);
+ vec_ste(res, 0, &ires);
+
+ return ires;
+}
+#endif /* HAVE_ALTIVEC */
+
+av_cold void ff_llauddsp_init_ppc(LLAudDSPContext *c)
+{
+#if HAVE_ALTIVEC
+ if (!PPC_ALTIVEC(av_get_cpu_flags()))
+ return;
+
+ c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_altivec;
+#endif /* HAVE_ALTIVEC */
+}
diff --cc libavcodec/ppc/lossless_videodsp_altivec.c
index 16dd99f8d7,0000000000..980f85b166
mode 100644,000000..100644
--- a/libavcodec/ppc/lossless_videodsp_altivec.c
+++ b/libavcodec/ppc/lossless_videodsp_altivec.c
@@@ -1,62 -1,0 +1,59 @@@
+/*
+ * Copyright (c) 2002 Brian Foley
+ * Copyright (c) 2002 Dieter Shirley
+ * Copyright (c) 2003-2004 Romain Dolbeau <romain at dolbeau.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
- #if HAVE_ALTIVEC_H
- #include <altivec.h>
- #endif
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/ppc/cpu.h"
- #include "libavutil/ppc/types_altivec.h"
+#include "libavutil/ppc/util_altivec.h"
++
+#include "libavcodec/lossless_videodsp.h"
+
+#if HAVE_ALTIVEC
+static void add_bytes_altivec(uint8_t *dst, uint8_t *src, ptrdiff_t w)
+{
+ register int i;
+ register vector unsigned char vdst, vsrc;
+
+ /* dst and src are 16 bytes-aligned (guaranteed). */
+ for (i = 0; i + 15 < w; i += 16) {
+ vdst = vec_ld(i, (unsigned char *) dst);
+ vsrc = vec_ld(i, (unsigned char *) src);
+ vdst = vec_add(vsrc, vdst);
+ vec_st(vdst, i, (unsigned char *) dst);
+ }
+ /* If w is not a multiple of 16. */
+ for (; i < w; i++)
+ dst[i] = src[i];
+}
+#endif /* HAVE_ALTIVEC */
+
+av_cold void ff_llviddsp_init_ppc(LLVidDSPContext *c)
+{
+#if HAVE_ALTIVEC
+ if (!PPC_ALTIVEC(av_get_cpu_flags()))
+ return;
+
+ c->add_bytes = add_bytes_altivec;
+#endif /* HAVE_ALTIVEC */
+}
diff --cc libavcodec/ppc/mpegvideo_altivec.c
index 1b6bda6c36,89e15a4a7f..2c6ff9165b
--- a/libavcodec/ppc/mpegvideo_altivec.c
+++ b/libavcodec/ppc/mpegvideo_altivec.c
@@@ -28,11 -29,11 +29,11 @@@
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
- #include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
+
#include "libavcodec/mpegvideo.h"
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
/* AltiVec version of dct_unquantize_h263
this code assumes `block' is 16 bytes-aligned */
diff --cc libavcodec/ppc/mpegvideodsp.c
index 021933255b,44ae126774..990a974a4e
--- a/libavcodec/ppc/mpegvideodsp.c
+++ b/libavcodec/ppc/mpegvideodsp.c
@@@ -23,11 -23,11 +23,11 @@@
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/ppc/cpu.h"
- #include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
+
#include "libavcodec/mpegvideodsp.h"
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
/* AltiVec-enhanced gmc1. ATM this code assumes stride is a multiple of 8
* to preserve proper dst alignment. */
static void gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */,
diff --cc libavcodec/ppc/mpegvideoencdsp.c
index 3e6765ce15,d11f05bf1e..b96487bf81
--- a/libavcodec/ppc/mpegvideoencdsp.c
+++ b/libavcodec/ppc/mpegvideoencdsp.c
@@@ -25,40 -23,12 +23,40 @@@
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
- #include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
+
#include "libavcodec/mpegvideoencdsp.h"
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
+#if HAVE_VSX
+static int pix_norm1_altivec(uint8_t *pix, int line_size)
+{
+ int i, s = 0;
+ const vector unsigned int zero =
+ (const vector unsigned int) vec_splat_u32(0);
+ vector unsigned int sv = (vector unsigned int) vec_splat_u32(0);
+ vector signed int sum;
+
+ for (i = 0; i < 16; i++) {
+ /* Read the potentially unaligned pixels. */
+ //vector unsigned char pixl = vec_ld(0, pix);
+ //vector unsigned char pixr = vec_ld(15, pix);
+ //vector unsigned char pixv = vec_perm(pixl, pixr, perm);
+ vector unsigned char pixv = vec_vsx_ld(0, pix);
+
+ /* Square the values, and add them to our sum. */
+ sv = vec_msum(pixv, pixv, sv);
+
+ pix += line_size;
+ }
+ /* Sum up the four partial sums, and put the result into s. */
+ sum = vec_sums((vector signed int) sv, (vector signed int) zero);
+ sum = vec_splat(sum, 3);
+ vec_ste(sum, 0, &s);
+ return s;
+}
+#else
static int pix_norm1_altivec(uint8_t *pix, int line_size)
{
int i, s = 0;
diff --cc libavcodec/ppc/svq1enc_altivec.c
index 4e25e253f6,e155f885cd..f63f086602
--- a/libavcodec/ppc/svq1enc_altivec.c
+++ b/libavcodec/ppc/svq1enc_altivec.c
@@@ -28,11 -25,11 +25,11 @@@
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
- #include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
+
#include "libavcodec/svq1enc.h"
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2,
int size)
{
diff --cc libavcodec/ppc/vc1dsp_altivec.c
index 83d537f0c1,fc82502358..bbadb2aaee
--- a/libavcodec/ppc/vc1dsp_altivec.c
+++ b/libavcodec/ppc/vc1dsp_altivec.c
@@@ -23,11 -24,11 +24,11 @@@
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
- #include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
+
#include "libavcodec/vc1dsp.h"
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
// main steps of 8x8 transform
#define STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_rnd) \
diff --cc libavcodec/ppc/vorbisdsp_altivec.c
index d7557c815b,52c29527ba..4dabf2dc7d
--- a/libavcodec/ppc/vorbisdsp_altivec.c
+++ b/libavcodec/ppc/vorbisdsp_altivec.c
@@@ -25,9 -23,11 +23,11 @@@
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
+ #include "libavutil/ppc/util_altivec.h"
+
#include "libavcodec/vorbisdsp.h"
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
static void vorbis_inverse_coupling_altivec(float *mag, float *ang,
intptr_t blocksize)
{
diff --cc libavcodec/ppc/vp3dsp_altivec.c
index d2231d090a,2b7cc9d503..a9a48d145b
--- a/libavcodec/ppc/vp3dsp_altivec.c
+++ b/libavcodec/ppc/vp3dsp_altivec.c
@@@ -24,11 -25,11 +25,11 @@@
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
- #include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
+
#include "libavcodec/vp3dsp.h"
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
static const vec_s16 constants =
{0, 64277, 60547, 54491, 46341, 36410, 25080, 12785};
diff --cc libavcodec/ppc/vp8dsp_altivec.c
index 23e4ace7da,6857e6b6a2..31201ed2d8
--- a/libavcodec/ppc/vp8dsp_altivec.c
+++ b/libavcodec/ppc/vp8dsp_altivec.c
@@@ -24,12 -25,13 +25,13 @@@
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/ppc/cpu.h"
- #include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
+
#include "libavcodec/vp8dsp.h"
+
#include "hpeldsp_altivec.h"
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
#define REPT4(...) { __VA_ARGS__, __VA_ARGS__, __VA_ARGS__, __VA_ARGS__ }
// h subpel filter uses msum to multiply+add 4 pixel taps at once
diff --cc libpostproc/postprocess.c
index 1dc719cf93,0000000000..6aa4ace337
mode 100644,000000..100644
--- a/libpostproc/postprocess.c
+++ b/libpostproc/postprocess.c
@@@ -1,1044 -1,0 +1,1041 @@@
+/*
+ * Copyright (C) 2001-2003 Michael Niedermayer (michaelni at gmx.at)
+ *
+ * AltiVec optimizations (C) 2004 Romain Dolbeau <romain at dolbeau.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * postprocessing.
+ */
+
+/*
+ C MMX MMX2 3DNow AltiVec
+isVertDC Ec Ec Ec
+isVertMinMaxOk Ec Ec Ec
+doVertLowPass E e e Ec
+doVertDefFilter Ec Ec e e Ec
+isHorizDC Ec Ec Ec
+isHorizMinMaxOk a E Ec
+doHorizLowPass E e e Ec
+doHorizDefFilter Ec Ec e e Ec
+do_a_deblock Ec E Ec E
+deRing E e e* Ecp
+Vertical RKAlgo1 E a a
+Horizontal RKAlgo1 a a
+Vertical X1# a E E
+Horizontal X1# a E E
+LinIpolDeinterlace e E E*
+CubicIpolDeinterlace a e e*
+LinBlendDeinterlace e E E*
+MedianDeinterlace# E Ec Ec
+TempDeNoiser# E e e Ec
+
+* I do not have a 3DNow! CPU -> it is untested, but no one said it does not work so it seems to work
+# more or less selfinvented filters so the exactness is not too meaningful
+E = Exact implementation
+e = almost exact implementation (slightly different rounding,...)
+a = alternative / approximate impl
+c = checked against the other implementations (-vo md5)
+p = partially optimized, still some work to do
+*/
+
+/*
+TODO:
+reduce the time wasted on the mem transfer
+unroll stuff if instructions depend too much on the prior one
+move YScale thing to the end instead of fixing QP
+write a faster and higher quality deblocking filter :)
+make the mainloop more flexible (variable number of blocks at once
+ (the if/else stuff per block is slowing things down)
+compare the quality & speed of all filters
+split this huge file
+optimize c versions
+try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
+...
+*/
+
+//Changelog: use git log
+
+#include "config.h"
+#include "libavutil/avutil.h"
+#include "libavutil/avassert.h"
+#include "libavutil/intreadwrite.h"
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+//#undef HAVE_MMXEXT_INLINE
+//#define HAVE_AMD3DNOW_INLINE
+//#undef HAVE_MMX_INLINE
+//#undef ARCH_X86
+//#define DEBUG_BRIGHTNESS
+#include "postprocess.h"
+#include "postprocess_internal.h"
+#include "libavutil/avstring.h"
++#include "libavutil/ppc/util_altivec.h"
+
+#include "libavutil/ffversion.h"
+const char postproc_ffversion[] = "FFmpeg version " FFMPEG_VERSION;
+
+unsigned postproc_version(void)
+{
+ av_assert0(LIBPOSTPROC_VERSION_MICRO >= 100);
+ return LIBPOSTPROC_VERSION_INT;
+}
+
+const char *postproc_configuration(void)
+{
+ return FFMPEG_CONFIGURATION;
+}
+
+const char *postproc_license(void)
+{
+#define LICENSE_PREFIX "libpostproc license: "
+ return LICENSE_PREFIX FFMPEG_LICENSE + sizeof(LICENSE_PREFIX) - 1;
+}
+
- #if HAVE_ALTIVEC_H
- #include <altivec.h>
- #endif
-
+#define GET_MODE_BUFFER_SIZE 500
+#define OPTIONS_ARRAY_SIZE 10
+#define BLOCK_SIZE 8
+#define TEMP_STRIDE 8
+//#define NUM_BLOCKS_AT_ONCE 16 //not used yet
+
+#if ARCH_X86 && HAVE_INLINE_ASM
+DECLARE_ASM_CONST(8, uint64_t, w05)= 0x0005000500050005LL;
+DECLARE_ASM_CONST(8, uint64_t, w04)= 0x0004000400040004LL;
+DECLARE_ASM_CONST(8, uint64_t, w20)= 0x0020002000200020LL;
+DECLARE_ASM_CONST(8, uint64_t, b00)= 0x0000000000000000LL;
+DECLARE_ASM_CONST(8, uint64_t, b01)= 0x0101010101010101LL;
+DECLARE_ASM_CONST(8, uint64_t, b02)= 0x0202020202020202LL;
+DECLARE_ASM_CONST(8, uint64_t, b08)= 0x0808080808080808LL;
+DECLARE_ASM_CONST(8, uint64_t, b80)= 0x8080808080808080LL;
+#endif
+
+DECLARE_ASM_CONST(8, int, deringThreshold)= 20;
+
+
+static const struct PPFilter filters[]=
+{
+ {"hb", "hdeblock", 1, 1, 3, H_DEBLOCK},
+ {"vb", "vdeblock", 1, 2, 4, V_DEBLOCK},
+/* {"hr", "rkhdeblock", 1, 1, 3, H_RK1_FILTER},
+ {"vr", "rkvdeblock", 1, 2, 4, V_RK1_FILTER},*/
+ {"h1", "x1hdeblock", 1, 1, 3, H_X1_FILTER},
+ {"v1", "x1vdeblock", 1, 2, 4, V_X1_FILTER},
+ {"ha", "ahdeblock", 1, 1, 3, H_A_DEBLOCK},
+ {"va", "avdeblock", 1, 2, 4, V_A_DEBLOCK},
+ {"dr", "dering", 1, 5, 6, DERING},
+ {"al", "autolevels", 0, 1, 2, LEVEL_FIX},
+ {"lb", "linblenddeint", 1, 1, 4, LINEAR_BLEND_DEINT_FILTER},
+ {"li", "linipoldeint", 1, 1, 4, LINEAR_IPOL_DEINT_FILTER},
+ {"ci", "cubicipoldeint", 1, 1, 4, CUBIC_IPOL_DEINT_FILTER},
+ {"md", "mediandeint", 1, 1, 4, MEDIAN_DEINT_FILTER},
+ {"fd", "ffmpegdeint", 1, 1, 4, FFMPEG_DEINT_FILTER},
+ {"l5", "lowpass5", 1, 1, 4, LOWPASS5_DEINT_FILTER},
+ {"tn", "tmpnoise", 1, 7, 8, TEMP_NOISE_FILTER},
+ {"fq", "forcequant", 1, 0, 0, FORCE_QUANT},
+ {"be", "bitexact", 1, 0, 0, BITEXACT},
+ {"vi", "visualize", 1, 0, 0, VISUALIZE},
+ {NULL, NULL,0,0,0,0} //End Marker
+};
+
+static const char * const replaceTable[]=
+{
+ "default", "hb:a,vb:a,dr:a",
+ "de", "hb:a,vb:a,dr:a",
+ "fast", "h1:a,v1:a,dr:a",
+ "fa", "h1:a,v1:a,dr:a",
+ "ac", "ha:a:128:7,va:a,dr:a",
+ NULL //End Marker
+};
+
+/* The horizontal functions exist only in C because the MMX
+ * code is faster with vertical filters and transposing. */
+
+/**
+ * Check if the given 8x8 Block is mostly "flat"
+ */
+static inline int isHorizDC_C(const uint8_t src[], int stride, const PPContext *c)
+{
+ int numEq= 0;
+ int y;
+ const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
+ const int dcThreshold= dcOffset*2 + 1;
+
+ for(y=0; y<BLOCK_SIZE; y++){
+ numEq += ((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold;
+ numEq += ((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold;
+ numEq += ((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold;
+ numEq += ((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold;
+ numEq += ((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold;
+ numEq += ((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold;
+ numEq += ((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold;
+ src+= stride;
+ }
+ return numEq > c->ppMode.flatnessThreshold;
+}
+
+/**
+ * Check if the middle 8x8 Block in the given 8x16 block is flat
+ */
+static inline int isVertDC_C(const uint8_t src[], int stride, const PPContext *c)
+{
+ int numEq= 0;
+ int y;
+ const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
+ const int dcThreshold= dcOffset*2 + 1;
+
+ src+= stride*4; // src points to begin of the 8x8 Block
+ for(y=0; y<BLOCK_SIZE-1; y++){
+ numEq += ((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold;
+ numEq += ((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold;
+ numEq += ((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold;
+ numEq += ((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold;
+ numEq += ((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold;
+ numEq += ((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold;
+ numEq += ((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold;
+ numEq += ((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold;
+ src+= stride;
+ }
+ return numEq > c->ppMode.flatnessThreshold;
+}
+
+static inline int isHorizMinMaxOk_C(const uint8_t src[], int stride, int QP)
+{
+ int i;
+ for(i=0; i<2; i++){
+ if((unsigned)(src[0] - src[5] + 2*QP) > 4*QP) return 0;
+ src += stride;
+ if((unsigned)(src[2] - src[7] + 2*QP) > 4*QP) return 0;
+ src += stride;
+ if((unsigned)(src[4] - src[1] + 2*QP) > 4*QP) return 0;
+ src += stride;
+ if((unsigned)(src[6] - src[3] + 2*QP) > 4*QP) return 0;
+ src += stride;
+ }
+ return 1;
+}
+
+static inline int isVertMinMaxOk_C(const uint8_t src[], int stride, int QP)
+{
+ int x;
+ src+= stride*4;
+ for(x=0; x<BLOCK_SIZE; x+=4){
+ if((unsigned)(src[ x + 0*stride] - src[ x + 5*stride] + 2*QP) > 4*QP) return 0;
+ if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0;
+ if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0;
+ if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0;
+ }
+ return 1;
+}
+
+static inline int horizClassify_C(const uint8_t src[], int stride, const PPContext *c)
+{
+ if( isHorizDC_C(src, stride, c) ){
+ return isHorizMinMaxOk_C(src, stride, c->QP);
+ }else{
+ return 2;
+ }
+}
+
+static inline int vertClassify_C(const uint8_t src[], int stride, const PPContext *c)
+{
+ if( isVertDC_C(src, stride, c) ){
+ return isVertMinMaxOk_C(src, stride, c->QP);
+ }else{
+ return 2;
+ }
+}
+
+static inline void doHorizDefFilter_C(uint8_t dst[], int stride, const PPContext *c)
+{
+ int y;
+ for(y=0; y<BLOCK_SIZE; y++){
+ const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
+
+ if(FFABS(middleEnergy) < 8*c->QP){
+ const int q=(dst[3] - dst[4])/2;
+ const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
+ const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
+
+ int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
+ d= FFMAX(d, 0);
+
+ d= (5*d + 32) >> 6;
+ d*= FFSIGN(-middleEnergy);
+
+ if(q>0)
+ {
+ d = FFMAX(d, 0);
+ d = FFMIN(d, q);
+ }
+ else
+ {
+ d = FFMIN(d, 0);
+ d = FFMAX(d, q);
+ }
+
+ dst[3]-= d;
+ dst[4]+= d;
+ }
+ dst+= stride;
+ }
+}
+
+/**
+ * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
+ * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
+ */
+static inline void doHorizLowPass_C(uint8_t dst[], int stride, const PPContext *c)
+{
+ int y;
+ for(y=0; y<BLOCK_SIZE; y++){
+ const int first= FFABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0];
+ const int last= FFABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7];
+
+ int sums[10];
+ sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4;
+ sums[1] = sums[0] - first + dst[3];
+ sums[2] = sums[1] - first + dst[4];
+ sums[3] = sums[2] - first + dst[5];
+ sums[4] = sums[3] - first + dst[6];
+ sums[5] = sums[4] - dst[0] + dst[7];
+ sums[6] = sums[5] - dst[1] + last;
+ sums[7] = sums[6] - dst[2] + last;
+ sums[8] = sums[7] - dst[3] + last;
+ sums[9] = sums[8] - dst[4] + last;
+
+ dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4;
+ dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4;
+ dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4;
+ dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4;
+ dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4;
+ dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4;
+ dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4;
+ dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4;
+
+ dst+= stride;
+ }
+}
+
+/**
+ * Experimental Filter 1 (Horizontal)
+ * will not damage linear gradients
+ * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
+ * can only smooth blocks at the expected locations (it cannot smooth them if they did move)
+ * MMX2 version does correct clipping C version does not
+ * not identical with the vertical one
+ */
+static inline void horizX1Filter(uint8_t *src, int stride, int QP)
+{
+ int y;
+ static uint64_t lut[256];
+ if(!lut[255])
+ {
+ int i;
+ for(i=0; i<256; i++)
+ {
+ int v= i < 128 ? 2*i : 2*(i-256);
+/*
+//Simulate 112242211 9-Tap filter
+ uint64_t a= (v/16) & 0xFF;
+ uint64_t b= (v/8) & 0xFF;
+ uint64_t c= (v/4) & 0xFF;
+ uint64_t d= (3*v/8) & 0xFF;
+*/
+//Simulate piecewise linear interpolation
+ uint64_t a= (v/16) & 0xFF;
+ uint64_t b= (v*3/16) & 0xFF;
+ uint64_t c= (v*5/16) & 0xFF;
+ uint64_t d= (7*v/16) & 0xFF;
+ uint64_t A= (0x100 - a)&0xFF;
+ uint64_t B= (0x100 - b)&0xFF;
+ uint64_t C= (0x100 - c)&0xFF;
+ uint64_t D= (0x100 - c)&0xFF;
+
+ lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
+ (D<<24) | (C<<16) | (B<<8) | (A);
+ //lut[i] = (v<<32) | (v<<24);
+ }
+ }
+
+ for(y=0; y<BLOCK_SIZE; y++){
+ int a= src[1] - src[2];
+ int b= src[3] - src[4];
+ int c= src[5] - src[6];
+
+ int d= FFMAX(FFABS(b) - (FFABS(a) + FFABS(c))/2, 0);
+
+ if(d < QP){
+ int v = d * FFSIGN(-b);
+
+ src[1] +=v/8;
+ src[2] +=v/4;
+ src[3] +=3*v/8;
+ src[4] -=3*v/8;
+ src[5] -=v/4;
+ src[6] -=v/8;
+ }
+ src+=stride;
+ }
+}
+
+/**
+ * accurate deblock filter
+ */
+static av_always_inline void do_a_deblock_C(uint8_t *src, int step,
+ int stride, const PPContext *c, int mode)
+{
+ int y;
+ const int QP= c->QP;
+ const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
+ const int dcThreshold= dcOffset*2 + 1;
+//START_TIMER
+ src+= step*4; // src points to begin of the 8x8 Block
+ for(y=0; y<8; y++){
+ int numEq= 0;
+
+ numEq += ((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold;
+ numEq += ((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold;
+ numEq += ((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold;
+ numEq += ((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold;
+ numEq += ((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold;
+ numEq += ((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold;
+ numEq += ((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold;
+ numEq += ((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold;
+ numEq += ((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold;
+ if(numEq > c->ppMode.flatnessThreshold){
+ int min, max, x;
+
+ if(src[0] > src[step]){
+ max= src[0];
+ min= src[step];
+ }else{
+ max= src[step];
+ min= src[0];
+ }
+ for(x=2; x<8; x+=2){
+ if(src[x*step] > src[(x+1)*step]){
+ if(src[x *step] > max) max= src[ x *step];
+ if(src[(x+1)*step] < min) min= src[(x+1)*step];
+ }else{
+ if(src[(x+1)*step] > max) max= src[(x+1)*step];
+ if(src[ x *step] < min) min= src[ x *step];
+ }
+ }
+ if(max-min < 2*QP){
+ const int first= FFABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0];
+ const int last= FFABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step];
+
+ int sums[10];
+ sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4;
+ sums[1] = sums[0] - first + src[3*step];
+ sums[2] = sums[1] - first + src[4*step];
+ sums[3] = sums[2] - first + src[5*step];
+ sums[4] = sums[3] - first + src[6*step];
+ sums[5] = sums[4] - src[0*step] + src[7*step];
+ sums[6] = sums[5] - src[1*step] + last;
+ sums[7] = sums[6] - src[2*step] + last;
+ sums[8] = sums[7] - src[3*step] + last;
+ sums[9] = sums[8] - src[4*step] + last;
+
+ if (mode & VISUALIZE) {
+ src[0*step] =
+ src[1*step] =
+ src[2*step] =
+ src[3*step] =
+ src[4*step] =
+ src[5*step] =
+ src[6*step] =
+ src[7*step] = 128;
+ }
+ src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4;
+ src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4;
+ src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4;
+ src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4;
+ src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4;
+ src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4;
+ src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4;
+ src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4;
+ }
+ }else{
+ const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]);
+
+ if(FFABS(middleEnergy) < 8*QP){
+ const int q=(src[3*step] - src[4*step])/2;
+ const int leftEnergy= 5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]);
+ const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]);
+
+ int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
+ d= FFMAX(d, 0);
+
+ d= (5*d + 32) >> 6;
+ d*= FFSIGN(-middleEnergy);
+
+ if(q>0){
+ d = FFMAX(d, 0);
+ d = FFMIN(d, q);
+ }else{
+ d = FFMIN(d, 0);
+ d = FFMAX(d, q);
+ }
+
+ if ((mode & VISUALIZE) && d) {
+ d= (d < 0) ? 32 : -32;
+ src[3*step]= av_clip_uint8(src[3*step] - d);
+ src[4*step]= av_clip_uint8(src[4*step] + d);
+ d = 0;
+ }
+
+ src[3*step]-= d;
+ src[4*step]+= d;
+ }
+ }
+
+ src += stride;
+ }
+/*if(step==16){
+ STOP_TIMER("step16")
+}else{
+ STOP_TIMER("stepX")
+}*/
+}
+
+//Note: we have C, MMX, MMX2, 3DNOW version there is no 3DNOW+MMX2 one
+//Plain C versions
+//we always compile C for testing which needs bitexactness
+#define TEMPLATE_PP_C 1
+#include "postprocess_template.c"
+
+#if HAVE_ALTIVEC
+# define TEMPLATE_PP_ALTIVEC 1
+# include "postprocess_altivec_template.c"
+# include "postprocess_template.c"
+#endif
+
+#if ARCH_X86 && HAVE_INLINE_ASM
+# if CONFIG_RUNTIME_CPUDETECT
+# define TEMPLATE_PP_MMX 1
+# include "postprocess_template.c"
+# define TEMPLATE_PP_MMXEXT 1
+# include "postprocess_template.c"
+# define TEMPLATE_PP_3DNOW 1
+# include "postprocess_template.c"
+# define TEMPLATE_PP_SSE2 1
+# include "postprocess_template.c"
+# else
+# if HAVE_SSE2_INLINE
+# define TEMPLATE_PP_SSE2 1
+# include "postprocess_template.c"
+# elif HAVE_MMXEXT_INLINE
+# define TEMPLATE_PP_MMXEXT 1
+# include "postprocess_template.c"
+# elif HAVE_AMD3DNOW_INLINE
+# define TEMPLATE_PP_3DNOW 1
+# include "postprocess_template.c"
+# elif HAVE_MMX_INLINE
+# define TEMPLATE_PP_MMX 1
+# include "postprocess_template.c"
+# endif
+# endif
+#endif
+
+typedef void (*pp_fn)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
+ const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2);
+
+static inline void postProcess(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
+ const QP_STORE_T QPs[], int QPStride, int isColor, pp_mode *vm, pp_context *vc)
+{
+ pp_fn pp = postProcess_C;
+ PPContext *c= (PPContext *)vc;
+ PPMode *ppMode= (PPMode *)vm;
+ c->ppMode= *ppMode; //FIXME
+
+ if (!(ppMode->lumMode & BITEXACT)) {
+#if CONFIG_RUNTIME_CPUDETECT
+#if ARCH_X86 && HAVE_INLINE_ASM
+ // ordered per speed fastest first
+ if (c->cpuCaps & AV_CPU_FLAG_SSE2) pp = postProcess_SSE2;
+ else if (c->cpuCaps & AV_CPU_FLAG_MMXEXT) pp = postProcess_MMX2;
+ else if (c->cpuCaps & AV_CPU_FLAG_3DNOW) pp = postProcess_3DNow;
+ else if (c->cpuCaps & AV_CPU_FLAG_MMX) pp = postProcess_MMX;
+#elif HAVE_ALTIVEC
+ if (c->cpuCaps & AV_CPU_FLAG_ALTIVEC) pp = postProcess_altivec;
+#endif
+#else /* CONFIG_RUNTIME_CPUDETECT */
+#if HAVE_SSE2_INLINE
+ pp = postProcess_SSE2;
+#elif HAVE_MMXEXT_INLINE
+ pp = postProcess_MMX2;
+#elif HAVE_AMD3DNOW_INLINE
+ pp = postProcess_3DNow;
+#elif HAVE_MMX_INLINE
+ pp = postProcess_MMX;
+#elif HAVE_ALTIVEC
+ pp = postProcess_altivec;
+#endif
+#endif /* !CONFIG_RUNTIME_CPUDETECT */
+ }
+
+ pp(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
+}
+
+/* -pp Command line Help
+*/
+const char pp_help[] =
+"Available postprocessing filters:\n"
+"Filters Options\n"
+"short long name short long option Description\n"
+"* * a autoq CPU power dependent enabler\n"
+" c chrom chrominance filtering enabled\n"
+" y nochrom chrominance filtering disabled\n"
+" n noluma luma filtering disabled\n"
+"hb hdeblock (2 threshold) horizontal deblocking filter\n"
+" 1. difference factor: default=32, higher -> more deblocking\n"
+" 2. flatness threshold: default=39, lower -> more deblocking\n"
+" the h & v deblocking filters share these\n"
+" so you can't set different thresholds for h / v\n"
+"vb vdeblock (2 threshold) vertical deblocking filter\n"
+"ha hadeblock (2 threshold) horizontal deblocking filter\n"
+"va vadeblock (2 threshold) vertical deblocking filter\n"
+"h1 x1hdeblock experimental h deblock filter 1\n"
+"v1 x1vdeblock experimental v deblock filter 1\n"
+"dr dering deringing filter\n"
+"al autolevels automatic brightness / contrast\n"
+" f fullyrange stretch luminance to (0..255)\n"
+"lb linblenddeint linear blend deinterlacer\n"
+"li linipoldeint linear interpolating deinterlace\n"
+"ci cubicipoldeint cubic interpolating deinterlacer\n"
+"md mediandeint median deinterlacer\n"
+"fd ffmpegdeint ffmpeg deinterlacer\n"
+"l5 lowpass5 FIR lowpass deinterlacer\n"
+"de default hb:a,vb:a,dr:a\n"
+"fa fast h1:a,v1:a,dr:a\n"
+"ac ha:a:128:7,va:a,dr:a\n"
+"tn tmpnoise (3 threshold) temporal noise reducer\n"
+" 1. <= 2. <= 3. larger -> stronger filtering\n"
+"fq forceQuant <quantizer> force quantizer\n"
+"Usage:\n"
+"<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
+"long form example:\n"
+"vdeblock:autoq/hdeblock:autoq/linblenddeint default,-vdeblock\n"
+"short form example:\n"
+"vb:a/hb:a/lb de,-vb\n"
+"more examples:\n"
+"tn:64:128:256\n"
+"\n"
+;
+
+pp_mode *pp_get_mode_by_name_and_quality(const char *name, int quality)
+{
+ char temp[GET_MODE_BUFFER_SIZE];
+ char *p= temp;
+ static const char filterDelimiters[] = ",/";
+ static const char optionDelimiters[] = ":|";
+ struct PPMode *ppMode;
+ char *filterToken;
+
+ if (!name) {
+ av_log(NULL, AV_LOG_ERROR, "pp: Missing argument\n");
+ return NULL;
+ }
+
+ if (!strcmp(name, "help")) {
+ const char *p;
+ for (p = pp_help; strchr(p, '\n'); p = strchr(p, '\n') + 1) {
+ av_strlcpy(temp, p, FFMIN(sizeof(temp), strchr(p, '\n') - p + 2));
+ av_log(NULL, AV_LOG_INFO, "%s", temp);
+ }
+ return NULL;
+ }
+
+ ppMode= av_malloc(sizeof(PPMode));
+ if (!ppMode)
+ return NULL;
+
+ ppMode->lumMode= 0;
+ ppMode->chromMode= 0;
+ ppMode->maxTmpNoise[0]= 700;
+ ppMode->maxTmpNoise[1]= 1500;
+ ppMode->maxTmpNoise[2]= 3000;
+ ppMode->maxAllowedY= 234;
+ ppMode->minAllowedY= 16;
+ ppMode->baseDcDiff= 256/8;
+ ppMode->flatnessThreshold= 56-16-1;
+ ppMode->maxClippedThreshold= (AVRational){1,100};
+ ppMode->error=0;
+
+ memset(temp, 0, GET_MODE_BUFFER_SIZE);
+ av_strlcpy(temp, name, GET_MODE_BUFFER_SIZE - 1);
+
+ av_log(NULL, AV_LOG_DEBUG, "pp: %s\n", name);
+
+ for(;;){
+ const char *filterName;
+ int q= 1000000; //PP_QUALITY_MAX;
+ int chrom=-1;
+ int luma=-1;
+ const char *option;
+ const char *options[OPTIONS_ARRAY_SIZE];
+ int i;
+ int filterNameOk=0;
+ int numOfUnknownOptions=0;
+ int enable=1; //does the user want us to enabled or disabled the filter
+ char *tokstate;
+
+ filterToken= av_strtok(p, filterDelimiters, &tokstate);
+ if(!filterToken) break;
+ p+= strlen(filterToken) + 1; // p points to next filterToken
+ filterName= av_strtok(filterToken, optionDelimiters, &tokstate);
+ if (!filterName) {
+ ppMode->error++;
+ break;
+ }
+ av_log(NULL, AV_LOG_DEBUG, "pp: %s::%s\n", filterToken, filterName);
+
+ if(*filterName == '-'){
+ enable=0;
+ filterName++;
+ }
+
+ for(;;){ //for all options
+ option= av_strtok(NULL, optionDelimiters, &tokstate);
+ if(!option) break;
+
+ av_log(NULL, AV_LOG_DEBUG, "pp: option: %s\n", option);
+ if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
+ else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
+ else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
+ else if(!strcmp("noluma", option) || !strcmp("n", option)) luma=0;
+ else{
+ options[numOfUnknownOptions] = option;
+ numOfUnknownOptions++;
+ }
+ if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
+ }
+ options[numOfUnknownOptions] = NULL;
+
+ /* replace stuff from the replace Table */
+ for(i=0; replaceTable[2*i]; i++){
+ if(!strcmp(replaceTable[2*i], filterName)){
+ size_t newlen = strlen(replaceTable[2*i + 1]);
+ int plen;
+ int spaceLeft;
+
+ p--, *p=',';
+
+ plen= strlen(p);
+ spaceLeft= p - temp + plen;
+ if(spaceLeft + newlen >= GET_MODE_BUFFER_SIZE - 1){
+ ppMode->error++;
+ break;
+ }
+ memmove(p + newlen, p, plen+1);
+ memcpy(p, replaceTable[2*i + 1], newlen);
+ filterNameOk=1;
+ }
+ }
+
+ for(i=0; filters[i].shortName; i++){
+ if( !strcmp(filters[i].longName, filterName)
+ || !strcmp(filters[i].shortName, filterName)){
+ ppMode->lumMode &= ~filters[i].mask;
+ ppMode->chromMode &= ~filters[i].mask;
+
+ filterNameOk=1;
+ if(!enable) break; // user wants to disable it
+
+ if(q >= filters[i].minLumQuality && luma)
+ ppMode->lumMode|= filters[i].mask;
+ if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
+ if(q >= filters[i].minChromQuality)
+ ppMode->chromMode|= filters[i].mask;
+
+ if(filters[i].mask == LEVEL_FIX){
+ int o;
+ ppMode->minAllowedY= 16;
+ ppMode->maxAllowedY= 234;
+ for(o=0; options[o]; o++){
+ if( !strcmp(options[o],"fullyrange")
+ ||!strcmp(options[o],"f")){
+ ppMode->minAllowedY= 0;
+ ppMode->maxAllowedY= 255;
+ numOfUnknownOptions--;
+ }
+ }
+ }
+ else if(filters[i].mask == TEMP_NOISE_FILTER)
+ {
+ int o;
+ int numOfNoises=0;
+
+ for(o=0; options[o]; o++){
+ char *tail;
+ ppMode->maxTmpNoise[numOfNoises]=
+ strtol(options[o], &tail, 0);
+ if(tail!=options[o]){
+ numOfNoises++;
+ numOfUnknownOptions--;
+ if(numOfNoises >= 3) break;
+ }
+ }
+ }
+ else if(filters[i].mask == V_DEBLOCK || filters[i].mask == H_DEBLOCK
+ || filters[i].mask == V_A_DEBLOCK || filters[i].mask == H_A_DEBLOCK){
+ int o;
+
+ for(o=0; options[o] && o<2; o++){
+ char *tail;
+ int val= strtol(options[o], &tail, 0);
+ if(tail==options[o]) break;
+
+ numOfUnknownOptions--;
+ if(o==0) ppMode->baseDcDiff= val;
+ else ppMode->flatnessThreshold= val;
+ }
+ }
+ else if(filters[i].mask == FORCE_QUANT){
+ int o;
+ ppMode->forcedQuant= 15;
+
+ for(o=0; options[o] && o<1; o++){
+ char *tail;
+ int val= strtol(options[o], &tail, 0);
+ if(tail==options[o]) break;
+
+ numOfUnknownOptions--;
+ ppMode->forcedQuant= val;
+ }
+ }
+ }
+ }
+ if(!filterNameOk) ppMode->error++;
+ ppMode->error += numOfUnknownOptions;
+ }
+
+ av_log(NULL, AV_LOG_DEBUG, "pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode);
+ if(ppMode->error){
+ av_log(NULL, AV_LOG_ERROR, "%d errors in postprocess string \"%s\"\n", ppMode->error, name);
+ av_free(ppMode);
+ return NULL;
+ }
+ return ppMode;
+}
+
+/* Frees a pp_mode previously returned by pp_get_mode_by_name_and_quality().
+ * av_free(NULL) is a no-op, so passing NULL is safe. */
+void pp_free_mode(pp_mode *mode){
+ av_free(mode);
+}
+
+/* Replaces *p with a freshly allocated, zero-initialized buffer of 'size'
+ * bytes, releasing the previous buffer (which may be NULL) first.
+ * NOTE(review): av_mallocz() failure leaves *p == NULL and is not reported;
+ * callers here do not check for that. */
+static void reallocAlign(void **p, int size){
+ av_free(*p);
+ *p= av_mallocz(size);
+}
+
+/* (Re)allocates every per-context scratch buffer for the given frame
+ * geometry and records the new stride/qpStride in the context. All buffers
+ * are zero-initialized via reallocAlign(); the Y histogram is then seeded
+ * with a flat prior so early frames have usable statistics. */
+static void reallocBuffers(PPContext *c, int width, int height, int stride, int qpStride){
+ // macroblock grid dimensions, rounding partial 16x16 blocks up
+ int mbWidth = (width+15)>>4;
+ int mbHeight= (height+15)>>4;
+ int i;
+
+ c->stride= stride;
+ c->qpStride= qpStride;
+
+ reallocAlign((void **)&c->tempDst, stride*24+32);
+ reallocAlign((void **)&c->tempSrc, stride*24);
+ reallocAlign((void **)&c->tempBlocks, 2*16*8);
+ reallocAlign((void **)&c->yHistogram, 256*sizeof(uint64_t));
+ // flat histogram prior: total pixel count spread evenly over 256 bins
+ for(i=0; i<256; i++)
+ c->yHistogram[i]= width*height/64*15/256;
+
+ // one temporal-blur buffer pair per plane (Y, U, V)
+ for(i=0; i<3; i++){
+ //Note: The +17*1024 is just there so I do not have to worry about r/w over the end.
+ reallocAlign((void **)&c->tempBlurred[i], stride*mbHeight*16 + 17*1024);
+ reallocAlign((void **)&c->tempBlurredPast[i], 256*((height+7)&(~7))/2 + 17*1024);//FIXME size
+ }
+
+ reallocAlign((void **)&c->deintTemp, 2*width+32);
+ reallocAlign((void **)&c->nonBQPTable, qpStride*mbHeight*sizeof(QP_STORE_T));
+ reallocAlign((void **)&c->stdQPTable, qpStride*mbHeight*sizeof(QP_STORE_T));
+ reallocAlign((void **)&c->forcedQPTable, mbWidth*sizeof(QP_STORE_T));
+}
+
+/* AVClass item_name callback: identifies this context in av_log() output. */
+static const char * context_to_name(void * ptr) {
+ return "postproc";
+}
+
+/* Minimal AVClass so a PPContext* can be passed to av_log() as first arg. */
+static const AVClass av_codec_context_class = { "Postproc", context_to_name, NULL };
+
+/* Allocates and initializes a postprocessing context for frames of the
+ * given dimensions. cpuCaps carries both optional format bits (PP_FORMAT:
+ * chroma subsampling factors) and CPU capability flags, which are either
+ * auto-detected (PP_CPU_CAPS_AUTO) or translated from the public PP_CPU_*
+ * bits into AV_CPU_FLAG_* values. Returns NULL on allocation failure.
+ * Free with pp_free_context(). */
+av_cold pp_context *pp_get_context(int width, int height, int cpuCaps){
+ PPContext *c= av_mallocz(sizeof(PPContext));
+ int stride= FFALIGN(width, 16); //assumed / will realloc if needed
+ int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed
+
+ if (!c)
+ return NULL;
+
+ c->av_class = &av_codec_context_class;
+ if(cpuCaps&PP_FORMAT){
+ // caller-supplied chroma subsampling: h in bits 0-1, v in bits 4-5
+ c->hChromaSubSample= cpuCaps&0x3;
+ c->vChromaSubSample= (cpuCaps>>4)&0x3;
+ }else{
+ // default to 4:2:0 subsampling
+ c->hChromaSubSample= 1;
+ c->vChromaSubSample= 1;
+ }
+ if (cpuCaps & PP_CPU_CAPS_AUTO) {
+ c->cpuCaps = av_get_cpu_flags();
+ } else {
+ // map public PP_CPU_CAPS_* bits onto internal AV_CPU_FLAG_* values
+ c->cpuCaps = 0;
+ if (cpuCaps & PP_CPU_CAPS_MMX) c->cpuCaps |= AV_CPU_FLAG_MMX;
+ if (cpuCaps & PP_CPU_CAPS_MMX2) c->cpuCaps |= AV_CPU_FLAG_MMXEXT;
+ if (cpuCaps & PP_CPU_CAPS_3DNOW) c->cpuCaps |= AV_CPU_FLAG_3DNOW;
+ if (cpuCaps & PP_CPU_CAPS_ALTIVEC) c->cpuCaps |= AV_CPU_FLAG_ALTIVEC;
+ }
+
+ reallocBuffers(c, width, height, stride, qpStride);
+
+ // no frame processed yet; used to detect the first call
+ c->frameNum=-1;
+
+ return c;
+}
+
+/* Releases a context created by pp_get_context(): frees every scratch
+ * buffer, scrubs the struct (so dangling pointers cannot be reused), then
+ * frees the context itself.
+ * NOTE(review): passing NULL would crash on the member accesses below —
+ * callers must not pass NULL. */
+av_cold void pp_free_context(void *vc){
+ PPContext *c = (PPContext*)vc;
+ int i;
+
+ for(i=0; i<FF_ARRAY_ELEMS(c->tempBlurred); i++)
+ av_free(c->tempBlurred[i]);
+ for(i=0; i<FF_ARRAY_ELEMS(c->tempBlurredPast); i++)
+ av_free(c->tempBlurredPast[i]);
+
+ av_free(c->tempBlocks);
+ av_free(c->yHistogram);
+ av_free(c->tempDst);
+ av_free(c->tempSrc);
+ av_free(c->deintTemp);
+ av_free(c->stdQPTable);
+ av_free(c->nonBQPTable);
+ av_free(c->forcedQPTable);
+
+ // poison the struct before freeing to catch use-after-free earlier
+ memset(c, 0, sizeof(PPContext));
+
+ av_free(c);
+}
+
+/* Public entry point: postprocesses one frame from src[] into dst[]
+ * (planar YUV, 3 planes). QP_store/QPStride supply per-macroblock
+ * quantizers; vm is a pp_mode from pp_get_mode_by_name_and_quality() and
+ * vc a context from pp_get_context(). pict_type carries the frame type in
+ * its low 3 bits plus the optional PP_PICT_TYPE_QP2 flag (MPEG-4 style
+ * quantizers that must be halved before use). */
+void pp_postprocess(const uint8_t * src[3], const int srcStride[3],
+ uint8_t * dst[3], const int dstStride[3],
+ int width, int height,
+ const QP_STORE_T *QP_store, int QPStride,
+ pp_mode *vm, void *vc, int pict_type)
+{
+ int mbWidth = (width+15)>>4;
+ int mbHeight= (height+15)>>4;
+ PPMode *mode = vm;
+ PPContext *c = vc;
+ int minStride= FFMAX(FFABS(srcStride[0]), FFABS(dstStride[0]));
+ int absQPStride = FFABS(QPStride);
+
+ // c->stride and c->QPStride are always positive
+ if(c->stride < minStride || c->qpStride < absQPStride)
+ reallocBuffers(c, width, height,
+ FFMAX(minStride, c->stride),
+ FFMAX(c->qpStride, absQPStride));
+
+ // no QP table supplied (or quantizer forced): synthesize a one-row table
+ if(!QP_store || (mode->lumMode & FORCE_QUANT)){
+ int i;
+ QP_store= c->forcedQPTable;
+ absQPStride = QPStride = 0; // stride 0: every MB row reuses the same row
+ if(mode->lumMode & FORCE_QUANT)
+ for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= mode->forcedQuant;
+ else
+ for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= 1;
+ }
+
+ // MPEG-4 style quantizers: halve each QP (4 bytes at a time, then tail)
+ if(pict_type & PP_PICT_TYPE_QP2){
+ int i;
+ const int count= FFMAX(mbHeight * absQPStride, mbWidth);
+ for(i=0; i<(count>>2); i++){
+ AV_WN32(c->stdQPTable + (i<<2), AV_RN32(QP_store + (i<<2)) >> 1 & 0x7F7F7F7F);
+ }
+ for(i<<=2; i<count; i++){
+ c->stdQPTable[i] = QP_store[i]>>1;
+ }
+ QP_store= c->stdQPTable;
+ QPStride= absQPStride;
+ }
+
+ // disabled debug dump of the QP table
+ if(0){
+ int x,y;
+ for(y=0; y<mbHeight; y++){
+ for(x=0; x<mbWidth; x++){
+ av_log(c, AV_LOG_INFO, "%2d ", QP_store[x + y*QPStride]);
+ }
+ av_log(c, AV_LOG_INFO, "\n");
+ }
+ av_log(c, AV_LOG_INFO, "\n");
+ }
+
+ // for non-B frames (type != 3), cache QPs masked to 6 bits for later use
+ if((pict_type&7)!=3){
+ if (QPStride >= 0){
+ int i;
+ const int count= FFMAX(mbHeight * QPStride, mbWidth);
+ for(i=0; i<(count>>2); i++){
+ AV_WN32(c->nonBQPTable + (i<<2), AV_RN32(QP_store + (i<<2)) & 0x3F3F3F3F);
+ }
+ for(i<<=2; i<count; i++){
+ c->nonBQPTable[i] = QP_store[i] & 0x3F;
+ }
+ } else {
+ // negative stride: copy row by row into a positively-strided table
+ int i,j;
+ for(i=0; i<mbHeight; i++) {
+ for(j=0; j<absQPStride; j++) {
+ c->nonBQPTable[i*absQPStride+j] = QP_store[i*QPStride+j] & 0x3F;
+ }
+ }
+ }
+ }
+
+ av_log(c, AV_LOG_DEBUG, "using npp filters 0x%X/0x%X\n",
+ mode->lumMode, mode->chromMode);
+
+ // luma plane (isColor=0)
+ postProcess(src[0], srcStride[0], dst[0], dstStride[0],
+ width, height, QP_store, QPStride, 0, mode, c);
+
+ // luma-only input/output: nothing more to do
+ if (!(src[1] && src[2] && dst[1] && dst[2]))
+ return;
+
+ // switch to chroma plane dimensions
+ width = (width )>>c->hChromaSubSample;
+ height = (height)>>c->vChromaSubSample;
+
+ if(mode->chromMode){
+ postProcess(src[1], srcStride[1], dst[1], dstStride[1],
+ width, height, QP_store, QPStride, 1, mode, c);
+ postProcess(src[2], srcStride[2], dst[2], dstStride[2],
+ width, height, QP_store, QPStride, 2, mode, c);
+ }
+ else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2]){
+ // chroma filtering disabled: pass the planes through unchanged
+ linecpy(dst[1], src[1], height, srcStride[1]);
+ linecpy(dst[2], src[2], height, srcStride[2]);
+ }else{
+ // mismatched strides: copy width bytes per line individually
+ int y;
+ for(y=0; y<height; y++){
+ memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width);
+ memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width);
+ }
+ }
+}
diff --cc libswscale/swscale_internal.h
index 84d5bee5ff,adfe1708e1..0f51df95d7
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@@ -22,12 -22,6 +22,7 @@@
#define SWSCALE_SWSCALE_INTERNAL_H
#include "config.h"
-
- #if HAVE_ALTIVEC_H
- #include <altivec.h>
- #endif
-
+#include "version.h"
#include "libavutil/avassert.h"
#include "libavutil/avutil.h"
More information about the ffmpeg-cvslog
mailing list