diff libavcodec/x86/h264dsp_mmx.c @ 2:897f711a7157

rearrange to work with autoconf
author Nina Engelhardt <nengel@mailbox.tu-berlin.de>
date Tue, 25 Sep 2012 15:55:33 +0200
parents
children
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/libavcodec/x86/h264dsp_mmx.c	Tue Sep 25 15:55:33 2012 +0200
     1.3 @@ -0,0 +1,1741 @@
     1.4 +/*
     1.5 + * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
     1.6 + *
     1.7 + * This file is part of FFmpeg.
     1.8 + *
     1.9 + * FFmpeg is free software; you can redistribute it and/or
    1.10 + * modify it under the terms of the GNU Lesser General Public
    1.11 + * License as published by the Free Software Foundation; either
    1.12 + * version 2.1 of the License, or (at your option) any later version.
    1.13 + *
    1.14 + * FFmpeg is distributed in the hope that it will be useful,
    1.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
    1.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    1.17 + * Lesser General Public License for more details.
    1.18 + *
    1.19 + * You should have received a copy of the GNU Lesser General Public
    1.20 + * License along with FFmpeg; if not, write to the Free Software
    1.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
    1.22 + */
    1.23 +
    1.24 +#include "dsputil_mmx.h"
    1.25 +
    1.26 +DECLARE_ALIGNED(8, static const uint64_t, ff_pb_3_1  ) = 0x0103010301030103ULL; /* bytes 3,1,3,1,... starting from the least significant byte */
    1.27 +DECLARE_ALIGNED(8, static const uint64_t, ff_pb_7_3  ) = 0x0307030703070307ULL; /* bytes 7,3,7,3,... starting from the least significant byte */
    1.28 +
    1.29 +/***********************************/
    1.30 +/* IDCT */
    1.31 +
    1.32 +#define SUMSUB_BADC( a, b, c, d ) /* per 16-bit lane: a = a+b, b = b-a(old); c = c+d, d = d-c(old) */ \
    1.33 +    "paddw "#b", "#a" \n\t" /* a = a + b */\
    1.34 +    "paddw "#d", "#c" \n\t" /* c = c + d */\
    1.35 +    "paddw "#b", "#b" \n\t" /* b = 2*b */\
    1.36 +    "paddw "#d", "#d" \n\t" /* d = 2*d */\
    1.37 +    "psubw "#a", "#b" \n\t" /* b = 2*b - (a+b) = b - a(old) */\
    1.38 +    "psubw "#c", "#d" \n\t"
    1.39 +
    1.40 +#define SUMSUBD2_AB( a, b, t ) /* per 16-bit lane: b = a + (b>>1), a = (a>>1) - b(old); t is scratch */ \
    1.41 +    "movq  "#b", "#t" \n\t" /* t = b(old) */\
    1.42 +    "psraw  $1 , "#b" \n\t"\
    1.43 +    "paddw "#a", "#b" \n\t" /* b = a + (b>>1) */\
    1.44 +    "psraw  $1 , "#a" \n\t"\
    1.45 +    "psubw "#t", "#a" \n\t"
    1.46 +
    1.47 +#define IDCT4_1D( s02, s13, d02, d13, t ) /* one 1-D pass of the 4-point IDCT; t is scratch; SUMSUB_BA is defined outside this chunk */ \
    1.48 +    SUMSUB_BA  ( s02, d02 )\
    1.49 +    SUMSUBD2_AB( s13, d13, t )\
    1.50 +    SUMSUB_BADC( d13, s02, s13, d02 )
    1.51 +
    1.52 +#define STORE_DIFF_4P( p, t, z ) /* 4 pixels at (%0) += (p>>6), saturated to 0..255; t is scratch, z must hold zero */ \
    1.53 +    "psraw      $6,     "#p" \n\t"\
    1.54 +    "movd       (%0),   "#t" \n\t" /* load 4 destination bytes */\
    1.55 +    "punpcklbw "#z",    "#t" \n\t" /* widen them to 16-bit words */\
    1.56 +    "paddsw    "#t",    "#p" \n\t"\
    1.57 +    "packuswb  "#z",    "#p" \n\t" /* back to bytes with unsigned saturation */\
    1.58 +    "movd      "#p",    (%0) \n\t"
    1.59 +
    1.60 +static void ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
    1.61 +{
    1.62 +    /* Load dct coeffs (32 bytes from block); the "memory" clobber tells the compiler that *block is read here */
    1.63 +    __asm__ volatile(
    1.64 +        "movq   (%0), %%mm0 \n\t"
    1.65 +        "movq  8(%0), %%mm1 \n\t"
    1.66 +        "movq 16(%0), %%mm2 \n\t"
    1.67 +        "movq 24(%0), %%mm3 \n\t"
    1.68 +    :: "r"(block) : "memory" );
    1.69 +
    1.70 +    __asm__ volatile(
    1.71 +        /* mm1=s02+s13  mm2=s02-s13  mm4=d02+d13  mm0=d02-d13 */
    1.72 +        IDCT4_1D( %%mm2, %%mm1, %%mm0, %%mm3, %%mm4 )
    1.73 +
    1.74 +        "movq      %0,    %%mm6 \n\t" /* mm6 = ff_pw_32 */
    1.75 +        /* in: 1,4,0,2  out: 1,2,3,0 */
    1.76 +        TRANSPOSE4( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 )
    1.77 +
    1.78 +        "paddw     %%mm6, %%mm3 \n\t" /* add ff_pw_32 ahead of the >>6 done in STORE_DIFF_4P */
    1.79 +
    1.80 +        /* mm2=s02+s13  mm3=s02-s13  mm4=d02+d13  mm1=d02-d13 */
    1.81 +        IDCT4_1D( %%mm4, %%mm2, %%mm3, %%mm0, %%mm1 )
    1.82 +
    1.83 +        "pxor %%mm7, %%mm7    \n\t" /* mm7 = 0, the zero register STORE_DIFF_4P needs */
    1.84 +    :: "m"(ff_pw_32));
    1.85 +
    1.86 +    __asm__ volatile(
    1.87 +    STORE_DIFF_4P( %%mm0, %%mm1, %%mm7)
    1.88 +        "add %1, %0             \n\t" /* advance dst by one row */
    1.89 +    STORE_DIFF_4P( %%mm2, %%mm1, %%mm7)
    1.90 +        "add %1, %0             \n\t"
    1.91 +    STORE_DIFF_4P( %%mm3, %%mm1, %%mm7)
    1.92 +        "add %1, %0             \n\t"
    1.93 +    STORE_DIFF_4P( %%mm4, %%mm1, %%mm7)
    1.94 +        : "+r"(dst)
    1.95 +        : "r" ((x86_reg)stride)
    1.96 +        : "memory" ); /* four rows of dst are loaded and stored through %0 */
    1.97 +}
    1.98 +
    1.99 +static inline void h264_idct8_1d(int16_t *block)
   1.100 +{ /* 8-point 1-D transform over 4 int16 lanes; rows are read at byte offsets 0,16,...,112 and the results stay in mm0-mm7 */
   1.101 +    __asm__ volatile(
   1.102 +        "movq 112(%0), %%mm7  \n\t" /* odd rows: 7, 5, 3, 1 */
   1.103 +        "movq  80(%0), %%mm0  \n\t"
   1.104 +        "movq  48(%0), %%mm3  \n\t"
   1.105 +        "movq  16(%0), %%mm5  \n\t"
   1.106 +
   1.107 +        "movq   %%mm0, %%mm4  \n\t"
   1.108 +        "movq   %%mm5, %%mm1  \n\t"
   1.109 +        "psraw  $1,    %%mm4  \n\t"
   1.110 +        "psraw  $1,    %%mm1  \n\t"
   1.111 +        "paddw  %%mm0, %%mm4  \n\t"
   1.112 +        "paddw  %%mm5, %%mm1  \n\t"
   1.113 +        "paddw  %%mm7, %%mm4  \n\t"
   1.114 +        "paddw  %%mm0, %%mm1  \n\t"
   1.115 +        "psubw  %%mm5, %%mm4  \n\t"
   1.116 +        "paddw  %%mm3, %%mm1  \n\t"
   1.117 +
   1.118 +        "psubw  %%mm3, %%mm5  \n\t"
   1.119 +        "psubw  %%mm3, %%mm0  \n\t"
   1.120 +        "paddw  %%mm7, %%mm5  \n\t"
   1.121 +        "psubw  %%mm7, %%mm0  \n\t"
   1.122 +        "psraw  $1,    %%mm3  \n\t"
   1.123 +        "psraw  $1,    %%mm7  \n\t"
   1.124 +        "psubw  %%mm3, %%mm5  \n\t"
   1.125 +        "psubw  %%mm7, %%mm0  \n\t"
   1.126 +
   1.127 +        "movq   %%mm4, %%mm3  \n\t"
   1.128 +        "movq   %%mm1, %%mm7  \n\t"
   1.129 +        "psraw  $2,    %%mm1  \n\t"
   1.130 +        "psraw  $2,    %%mm3  \n\t"
   1.131 +        "paddw  %%mm5, %%mm3  \n\t"
   1.132 +        "psraw  $2,    %%mm5  \n\t"
   1.133 +        "paddw  %%mm0, %%mm1  \n\t"
   1.134 +        "psraw  $2,    %%mm0  \n\t"
   1.135 +        "psubw  %%mm4, %%mm5  \n\t"
   1.136 +        "psubw  %%mm0, %%mm7  \n\t"
   1.137 +
   1.138 +        "movq  32(%0), %%mm2  \n\t" /* even rows 2 and 6 */
   1.139 +        "movq  96(%0), %%mm6  \n\t"
   1.140 +        "movq   %%mm2, %%mm4  \n\t"
   1.141 +        "movq   %%mm6, %%mm0  \n\t"
   1.142 +        "psraw  $1,    %%mm4  \n\t"
   1.143 +        "psraw  $1,    %%mm6  \n\t"
   1.144 +        "psubw  %%mm0, %%mm4  \n\t"
   1.145 +        "paddw  %%mm2, %%mm6  \n\t"
   1.146 +
   1.147 +        "movq    (%0), %%mm2  \n\t" /* even rows 0 and 4 */
   1.148 +        "movq  64(%0), %%mm0  \n\t"
   1.149 +        SUMSUB_BA( %%mm0, %%mm2 )
   1.150 +        SUMSUB_BA( %%mm6, %%mm0 )
   1.151 +        SUMSUB_BA( %%mm4, %%mm2 )
   1.152 +        SUMSUB_BA( %%mm7, %%mm6 )
   1.153 +        SUMSUB_BA( %%mm5, %%mm4 )
   1.154 +        SUMSUB_BA( %%mm3, %%mm2 )
   1.155 +        SUMSUB_BA( %%mm1, %%mm0 )
   1.156 +        :: "r"(block)
   1.157 +        : "memory" ); /* *block is read here; without the clobber a preceding C store to it could be moved past this asm */
   1.158 +}
   1.159 +
   1.160 +static void ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
   1.161 +{
   1.162 +    int i;
   1.163 +    DECLARE_ALIGNED(8, int16_t, b2)[64]; /* receives the transposed first pass, then the shifted second-pass result */
   1.164 +
   1.165 +    block[0] += 32; /* modifies the caller's block; 32 is half of the 64 divided out by the psraw $6 below */
   1.166 +
   1.167 +    for(i=0; i<2; i++){ /* first pass: each iteration handles 4 of the 8 int16 lanes of block */
   1.168 +        DECLARE_ALIGNED(8, uint64_t, tmp); /* spill slot for mm7 */
   1.169 +
   1.170 +        h264_idct8_1d(block+4*i); /* leaves its eight result rows in mm0-mm7 */
   1.171 +
   1.172 +        __asm__ volatile(
   1.173 +            "movq   %%mm7,    %0   \n\t" /* save mm7 so it can serve as transpose scratch */
   1.174 +            TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 )
   1.175 +            "movq   %%mm0,  8(%1)  \n\t"
   1.176 +            "movq   %%mm6, 24(%1)  \n\t"
   1.177 +            "movq   %%mm7, 40(%1)  \n\t"
   1.178 +            "movq   %%mm4, 56(%1)  \n\t"
   1.179 +            "movq    %0,    %%mm7  \n\t" /* restore the saved row */
   1.180 +            TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 )
   1.181 +            "movq   %%mm7,   (%1)  \n\t"
   1.182 +            "movq   %%mm1, 16(%1)  \n\t"
   1.183 +            "movq   %%mm0, 32(%1)  \n\t"
   1.184 +            "movq   %%mm3, 48(%1)  \n\t"
   1.185 +            : "=m"(tmp)
   1.186 +            : "r"(b2+32*i)
   1.187 +            : "memory"
   1.188 +        );
   1.189 +    }
   1.190 +
   1.191 +    for(i=0; i<2; i++){ /* second pass over b2, again 4 lanes per iteration */
   1.192 +        h264_idct8_1d(b2+4*i);
   1.193 +
   1.194 +        __asm__ volatile(
   1.195 +            "psraw     $6, %%mm7  \n\t" /* scale all eight result rows down by 64 */
   1.196 +            "psraw     $6, %%mm6  \n\t"
   1.197 +            "psraw     $6, %%mm5  \n\t"
   1.198 +            "psraw     $6, %%mm4  \n\t"
   1.199 +            "psraw     $6, %%mm3  \n\t"
   1.200 +            "psraw     $6, %%mm2  \n\t"
   1.201 +            "psraw     $6, %%mm1  \n\t"
   1.202 +            "psraw     $6, %%mm0  \n\t"
   1.203 +
   1.204 +            "movq   %%mm7,    (%0)  \n\t" /* write the rows back into b2 in place */
   1.205 +            "movq   %%mm5,  16(%0)  \n\t"
   1.206 +            "movq   %%mm3,  32(%0)  \n\t"
   1.207 +            "movq   %%mm1,  48(%0)  \n\t"
   1.208 +            "movq   %%mm0,  64(%0)  \n\t"
   1.209 +            "movq   %%mm2,  80(%0)  \n\t"
   1.210 +            "movq   %%mm4,  96(%0)  \n\t"
   1.211 +            "movq   %%mm6, 112(%0)  \n\t"
   1.212 +            :: "r"(b2+4*i)
   1.213 +            : "memory"
   1.214 +        );
   1.215 +    }
   1.216 +
   1.217 +    add_pixels_clamped_mmx(b2, dst, stride); /* defined elsewhere; presumably adds b2 to the 8x8 area at dst with clamping */
   1.218 +}
   1.219 +
   1.220 +#define STORE_DIFF_8P( p, d, t, z ) /* 8 pixels at d += (p>>6), saturated to 0..255; t is scratch, z must hold zero */\
   1.221 +        "movq       "#d", "#t" \n" /* load 8 destination bytes */\
   1.222 +        "psraw       $6,  "#p" \n"\
   1.223 +        "punpcklbw  "#z", "#t" \n" /* widen them to 16-bit words */\
   1.224 +        "paddsw     "#t", "#p" \n"\
   1.225 +        "packuswb   "#p", "#p" \n" /* back to bytes with unsigned saturation */\
   1.226 +        "movq       "#p", "#d" \n"
   1.227 +
   1.228 +#define H264_IDCT8_1D_SSE2(a,b,c,d,e,f,g,h) /* 8-point 1-D pass; a and e are overwritten before being read, rows 0 and 4 come from 0x00(%1)/0x40(%1) */\
   1.229 +        "movdqa     "#c", "#a" \n"\
   1.230 +        "movdqa     "#g", "#e" \n"\
   1.231 +        "psraw       $1,  "#c" \n"\
   1.232 +        "psraw       $1,  "#g" \n"\
   1.233 +        "psubw      "#e", "#c" \n"\
   1.234 +        "paddw      "#a", "#g" \n"\
   1.235 +        "movdqa     "#b", "#e" \n"\
   1.236 +        "psraw       $1,  "#e" \n"\
   1.237 +        "paddw      "#b", "#e" \n"\
   1.238 +        "paddw      "#d", "#e" \n"\
   1.239 +        "paddw      "#f", "#e" \n"\
   1.240 +        "movdqa     "#f", "#a" \n"\
   1.241 +        "psraw       $1,  "#a" \n"\
   1.242 +        "paddw      "#f", "#a" \n"\
   1.243 +        "paddw      "#h", "#a" \n"\
   1.244 +        "psubw      "#b", "#a" \n"\
   1.245 +        "psubw      "#d", "#b" \n"\
   1.246 +        "psubw      "#d", "#f" \n"\
   1.247 +        "paddw      "#h", "#b" \n"\
   1.248 +        "psubw      "#h", "#f" \n"\
   1.249 +        "psraw       $1,  "#d" \n"\
   1.250 +        "psraw       $1,  "#h" \n"\
   1.251 +        "psubw      "#d", "#b" \n"\
   1.252 +        "psubw      "#h", "#f" \n"\
   1.253 +        "movdqa     "#e", "#d" \n"\
   1.254 +        "movdqa     "#a", "#h" \n"\
   1.255 +        "psraw       $2,  "#d" \n"\
   1.256 +        "psraw       $2,  "#h" \n"\
   1.257 +        "paddw      "#f", "#d" \n"\
   1.258 +        "paddw      "#b", "#h" \n"\
   1.259 +        "psraw       $2,  "#f" \n"\
   1.260 +        "psraw       $2,  "#b" \n"\
   1.261 +        "psubw      "#f", "#e" \n"\
   1.262 +        "psubw      "#a", "#b" \n"\
   1.263 +        "movdqa 0x00(%1), "#a" \n" /* row 0 is fetched only now, once a is free */\
   1.264 +        "movdqa 0x40(%1), "#f" \n" /* row 4 likewise, into f */\
   1.265 +        SUMSUB_BA(f, a)\
   1.266 +        SUMSUB_BA(g, f)\
   1.267 +        SUMSUB_BA(c, a)\
   1.268 +        SUMSUB_BA(e, g)\
   1.269 +        SUMSUB_BA(b, c)\
   1.270 +        SUMSUB_BA(h, a)\
   1.271 +        SUMSUB_BA(d, f)
   1.272 +
   1.273 +static void ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
   1.274 +{
   1.275 +    __asm__ volatile(
   1.276 +        "movdqa   0x10(%1), %%xmm1 \n" /* rows 1,2,3,5,6,7; rows 0 and 4 are loaded inside H264_IDCT8_1D_SSE2 */
   1.277 +        "movdqa   0x20(%1), %%xmm2 \n"
   1.278 +        "movdqa   0x30(%1), %%xmm3 \n"
   1.279 +        "movdqa   0x50(%1), %%xmm5 \n"
   1.280 +        "movdqa   0x60(%1), %%xmm6 \n"
   1.281 +        "movdqa   0x70(%1), %%xmm7 \n"
   1.282 +        H264_IDCT8_1D_SSE2(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)
   1.283 +        TRANSPOSE8(%%xmm4, %%xmm1, %%xmm7, %%xmm3, %%xmm5, %%xmm0, %%xmm2, %%xmm6, (%1))
   1.284 +        "paddw          %4, %%xmm4 \n" /* add ff_pw_32 ahead of the >>6 done in STORE_DIFF_8P */
   1.285 +        "movdqa     %%xmm4, 0x00(%1) \n" /* park rows 0 and 4 in block for the second pass to reload */
   1.286 +        "movdqa     %%xmm2, 0x40(%1) \n"
   1.287 +        H264_IDCT8_1D_SSE2(%%xmm4, %%xmm0, %%xmm6, %%xmm3, %%xmm2, %%xmm5, %%xmm7, %%xmm1)
   1.288 +        "movdqa     %%xmm6, 0x60(%1) \n" /* spill two result rows so xmm6/xmm7 can be scratch and zero */
   1.289 +        "movdqa     %%xmm7, 0x70(%1) \n"
   1.290 +        "pxor       %%xmm7, %%xmm7 \n"
   1.291 +        STORE_DIFF_8P(%%xmm2, (%0),      %%xmm6, %%xmm7)
   1.292 +        STORE_DIFF_8P(%%xmm0, (%0,%2),   %%xmm6, %%xmm7)
   1.293 +        STORE_DIFF_8P(%%xmm1, (%0,%2,2), %%xmm6, %%xmm7)
   1.294 +        STORE_DIFF_8P(%%xmm3, (%0,%3),   %%xmm6, %%xmm7)
   1.295 +        "lea     (%0,%2,4), %0 \n" /* advance dst by four rows */
   1.296 +        STORE_DIFF_8P(%%xmm5, (%0),      %%xmm6, %%xmm7)
   1.297 +        STORE_DIFF_8P(%%xmm4, (%0,%2),   %%xmm6, %%xmm7)
   1.298 +        "movdqa   0x60(%1), %%xmm0 \n" /* reload the two spilled rows */
   1.299 +        "movdqa   0x70(%1), %%xmm1 \n"
   1.300 +        STORE_DIFF_8P(%%xmm0, (%0,%2,2), %%xmm6, %%xmm7)
   1.301 +        STORE_DIFF_8P(%%xmm1, (%0,%3),   %%xmm6, %%xmm7)
   1.302 +        :"+r"(dst)
   1.303 +        :"r"(block), "r"((x86_reg)stride), "r"((x86_reg)3L*stride), "m"(ff_pw_32)
   1.304 +        :"memory" ); /* block is overwritten as scratch and eight rows of dst are rewritten */
   1.305 +}
   1.306 +
   1.307 +static void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
   1.308 +{
   1.309 +    int dc = (block[0] + 32) >> 6; /* only coefficient 0 is used; +32 rounds the division by 64 */
   1.310 +    __asm__ volatile(
   1.311 +        "movd          %0, %%mm0 \n\t"
   1.312 +        "pshufw $0, %%mm0, %%mm0 \n\t" /* broadcast dc to all four words */
   1.313 +        "pxor       %%mm1, %%mm1 \n\t"
   1.314 +        "psubw      %%mm0, %%mm1 \n\t" /* mm1 = -dc */
   1.315 +        "packuswb   %%mm0, %%mm0 \n\t" /* mm0 bytes = dc saturated to 0..255 (0 when dc is negative) */
   1.316 +        "packuswb   %%mm1, %%mm1 \n\t" /* mm1 bytes = -dc saturated to 0..255 (0 when dc is positive) */
   1.317 +        ::"r"(dc)
   1.318 +    );
   1.319 +    __asm__ volatile( /* relies on mm0/mm1 still holding the values set up above */
   1.320 +        "movd          %0, %%mm2 \n\t" /* four rows of four pixels */
   1.321 +        "movd          %1, %%mm3 \n\t"
   1.322 +        "movd          %2, %%mm4 \n\t"
   1.323 +        "movd          %3, %%mm5 \n\t"
   1.324 +        "paddusb    %%mm0, %%mm2 \n\t" /* add the positive part of dc, saturating at 255 */
   1.325 +        "paddusb    %%mm0, %%mm3 \n\t"
   1.326 +        "paddusb    %%mm0, %%mm4 \n\t"
   1.327 +        "paddusb    %%mm0, %%mm5 \n\t"
   1.328 +        "psubusb    %%mm1, %%mm2 \n\t" /* subtract the negative part of dc, saturating at 0 */
   1.329 +        "psubusb    %%mm1, %%mm3 \n\t"
   1.330 +        "psubusb    %%mm1, %%mm4 \n\t"
   1.331 +        "psubusb    %%mm1, %%mm5 \n\t"
   1.332 +        "movd       %%mm2, %0    \n\t"
   1.333 +        "movd       %%mm3, %1    \n\t"
   1.334 +        "movd       %%mm4, %2    \n\t"
   1.335 +        "movd       %%mm5, %3    \n\t"
   1.336 +        :"+m"(*(uint32_t*)(dst+0*stride)),
   1.337 +         "+m"(*(uint32_t*)(dst+1*stride)),
   1.338 +         "+m"(*(uint32_t*)(dst+2*stride)),
   1.339 +         "+m"(*(uint32_t*)(dst+3*stride))
   1.340 +    );
   1.341 +}
   1.342 +
   1.343 +static void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
   1.344 +{
   1.345 +    int dc = (block[0] + 32) >> 6; /* only coefficient 0 is used; +32 rounds the division by 64 */
   1.346 +    int y;
   1.347 +    __asm__ volatile(
   1.348 +        "movd          %0, %%mm0 \n\t"
   1.349 +        "pshufw $0, %%mm0, %%mm0 \n\t" /* broadcast dc to all four words */
   1.350 +        "pxor       %%mm1, %%mm1 \n\t"
   1.351 +        "psubw      %%mm0, %%mm1 \n\t" /* mm1 = -dc */
   1.352 +        "packuswb   %%mm0, %%mm0 \n\t" /* mm0 bytes = dc saturated to 0..255 (0 when dc is negative) */
   1.353 +        "packuswb   %%mm1, %%mm1 \n\t" /* mm1 bytes = -dc saturated to 0..255 (0 when dc is positive) */
   1.354 +        ::"r"(dc)
   1.355 +    );
   1.356 +    for(y=2; y--; dst += 4*stride){ /* two iterations of four rows each = eight rows */
   1.357 +    __asm__ volatile( /* relies on mm0/mm1 still holding the values set up above */
   1.358 +        "movq          %0, %%mm2 \n\t" /* four rows of eight pixels */
   1.359 +        "movq          %1, %%mm3 \n\t"
   1.360 +        "movq          %2, %%mm4 \n\t"
   1.361 +        "movq          %3, %%mm5 \n\t"
   1.362 +        "paddusb    %%mm0, %%mm2 \n\t" /* add the positive part of dc, saturating at 255 */
   1.363 +        "paddusb    %%mm0, %%mm3 \n\t"
   1.364 +        "paddusb    %%mm0, %%mm4 \n\t"
   1.365 +        "paddusb    %%mm0, %%mm5 \n\t"
   1.366 +        "psubusb    %%mm1, %%mm2 \n\t" /* subtract the negative part of dc, saturating at 0 */
   1.367 +        "psubusb    %%mm1, %%mm3 \n\t"
   1.368 +        "psubusb    %%mm1, %%mm4 \n\t"
   1.369 +        "psubusb    %%mm1, %%mm5 \n\t"
   1.370 +        "movq       %%mm2, %0    \n\t"
   1.371 +        "movq       %%mm3, %1    \n\t"
   1.372 +        "movq       %%mm4, %2    \n\t"
   1.373 +        "movq       %%mm5, %3    \n\t"
   1.374 +        :"+m"(*(uint64_t*)(dst+0*stride)),
   1.375 +         "+m"(*(uint64_t*)(dst+1*stride)),
   1.376 +         "+m"(*(uint64_t*)(dst+2*stride)),
   1.377 +         "+m"(*(uint64_t*)(dst+3*stride))
   1.378 +    );
   1.379 +    }
   1.380 +}
   1.381 +
   1.382 +//FIXME this table is a duplicate of the one in h264data.h, and will be removed once the h264 tables have been split
   1.383 +static const uint8_t scan8[16 + 2*4]={ /* each entry is x + y*8 and is used below as an index into the nnzc[6*8] arrays */
   1.384 + 4+1*8, 5+1*8, 4+2*8, 5+2*8,
   1.385 + 6+1*8, 7+1*8, 6+2*8, 7+2*8,
   1.386 + 4+3*8, 5+3*8, 4+4*8, 5+4*8,
   1.387 + 6+3*8, 7+3*8, 6+4*8, 7+4*8,
   1.388 + 1+1*8, 2+1*8,
   1.389 + 1+2*8, 2+2*8,
   1.390 + 1+4*8, 2+4*8,
   1.391 + 1+5*8, 2+5*8,
   1.392 +};
   1.393 +
   1.394 +static void ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
   1.395 +    int blk;
   1.396 +    for(blk=0; blk<16; blk++){
   1.397 +        const int count = nnzc[ scan8[blk] ]; /* zero means nothing is added for this block */
   1.398 +        if(!count)
   1.399 +            continue;
   1.400 +        if(count==1 && block[blk*16]) ff_h264_idct_dc_add_mmx2(dst + block_offset[blk], &block[blk*16], stride);
   1.401 +        else                          ff_h264_idct_add_mmx    (dst + block_offset[blk], &block[blk*16], stride);
   1.402 +    }
   1.403 +}
   1.404 +
   1.405 +static void ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
   1.406 +    int blk; /* index over the 16 blocks, 16 coefficients apart in block[] */
   1.407 +    for(blk=0; blk<16; blk++){
   1.408 +        if     (nnzc[ scan8[blk] ]) ff_h264_idct_add_mmx    (dst + block_offset[blk], &block[blk*16], stride);
   1.409 +        else if(block[blk*16])      ff_h264_idct_dc_add_mmx2(dst + block_offset[blk], &block[blk*16], stride); /* only coefficient 0 present */
   1.410 +    }
   1.411 +}
   1.412 +
   1.413 +static void ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
   1.414 +    int blk;
   1.415 +    for(blk=0; blk<16; blk+=4){
   1.416 +        const int count = nnzc[ scan8[blk] ]; /* only every fourth scan8 entry is consulted */
   1.417 +        if(!count)
   1.418 +            continue;
   1.419 +        if(count==1 && block[blk*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[blk], &block[blk*16], stride);
   1.420 +        else                          ff_h264_idct8_add_mmx    (dst + block_offset[blk], &block[blk*16], stride);
   1.421 +    }
   1.422 +}
   1.423 +
   1.424 +static void ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
   1.425 +    int blk;
   1.426 +    for(blk=0; blk<16; blk+=4){
   1.427 +        const int count = nnzc[ scan8[blk] ]; /* only every fourth scan8 entry is consulted */
   1.428 +        if(!count)
   1.429 +            continue;
   1.430 +        if(count==1 && block[blk*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[blk], &block[blk*16], stride);
   1.431 +        else                          ff_h264_idct8_add_sse2   (dst + block_offset[blk], &block[blk*16], stride);
   1.432 +    }
   1.433 +}
   1.434 +
   1.435 +static void ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
   1.436 +    int blk;
   1.437 +    for(blk=16; blk<16+8; blk++){
   1.438 +        /* blocks 16-19 are added into dest[0], blocks 20-23 into dest[1] */
   1.439 +        const int sel = (blk>>2)&1;
   1.440 +        if     (nnzc[ scan8[blk] ]) ff_h264_idct_add_mmx    (dest[sel] + block_offset[blk], &block[blk*16], stride);
   1.441 +        else if(block[blk*16])      ff_h264_idct_dc_add_mmx2(dest[sel] + block_offset[blk], &block[blk*16], stride);
   1.442 +    }
   1.443 +}
   1.444 +
   1.445 +/***********************************/
   1.446 +/* deblocking */
   1.447 +
   1.448 +// out: o = max(|x-y|-a, 0) per byte, i.e. nonzero exactly where |x-y|>a (not a 0x00/0xFF mask)
   1.449 +// clobbers: t
   1.450 +#define DIFF_GT_MMX(x,y,a,o,t)\
   1.451 +    "movq     "#y", "#t"  \n\t"\
   1.452 +    "movq     "#x", "#o"  \n\t"\
   1.453 +    "psubusb  "#x", "#t"  \n\t" /* t = y-x, saturated at 0 */\
   1.454 +    "psubusb  "#y", "#o"  \n\t" /* o = x-y, saturated at 0 */\
   1.455 +    "por      "#t", "#o"  \n\t" /* one of the two is 0, so o = |x-y| */\
   1.456 +    "psubusb  "#a", "#o"  \n\t"
   1.457 +
   1.458 +// out: o = 0xFF in bytes where |x-y|<=a, 0x00 where |x-y|>a (the inverse sense of DIFF_GT_MMX, as a full mask)
   1.459 +// clobbers: t
   1.460 +#define DIFF_GT2_MMX(x,y,a,o,t)\
   1.461 +    "movq     "#y", "#t"  \n\t"\
   1.462 +    "movq     "#x", "#o"  \n\t"\
   1.463 +    "psubusb  "#x", "#t"  \n\t" /* t = y-x, saturated at 0 */\
   1.464 +    "psubusb  "#y", "#o"  \n\t" /* o = x-y, saturated at 0 */\
   1.465 +    "psubusb  "#a", "#t"  \n\t"\
   1.466 +    "psubusb  "#a", "#o"  \n\t"\
   1.467 +    "pcmpeqb  "#t", "#o"  \n\t" /* equal only when both are 0, i.e. |x-y|<=a */\
   1.468 +
   1.469 +// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1; alpha1/beta1 are operands whose low 16-bit word is broadcast
   1.470 +// out: mm5=beta-1 (in every byte), mm7=mask (0xFF in bytes where none of the three tests below fired)
   1.471 +// clobbers: mm4,mm6
   1.472 +#define H264_DEBLOCK_MASK(alpha1, beta1) \
   1.473 +    "pshufw $0, "#alpha1", %%mm4 \n\t"\
   1.474 +    "pshufw $0, "#beta1 ", %%mm5 \n\t"\
   1.475 +    "packuswb  %%mm4, %%mm4      \n\t" /* words -> bytes, saturated to 0..255 */\
   1.476 +    "packuswb  %%mm5, %%mm5      \n\t"\
   1.477 +    DIFF_GT_MMX(%%mm1, %%mm2, %%mm4, %%mm7, %%mm6) /* |p0-q0| > alpha-1 */\
   1.478 +    DIFF_GT_MMX(%%mm0, %%mm1, %%mm5, %%mm4, %%mm6) /* |p1-p0| > beta-1 */\
   1.479 +    "por       %%mm4, %%mm7      \n\t"\
   1.480 +    DIFF_GT_MMX(%%mm3, %%mm2, %%mm5, %%mm4, %%mm6) /* |q1-q0| > beta-1 */\
   1.481 +    "por       %%mm4, %%mm7      \n\t"\
   1.482 +    "pxor      %%mm6, %%mm6      \n\t"\
   1.483 +    "pcmpeqb   %%mm6, %%mm7      \n\t"
   1.484 +
   1.485 +// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask)
   1.486 +// out: mm1=p0' mm2=q0'
   1.487 +// clobbers: mm3-6 (mm0 is only read); the pb_3f argument is accepted but never referenced in the body
   1.488 +#define H264_DEBLOCK_P0_Q0(pb_01, pb_3f)\
   1.489 +        "movq    %%mm1              , %%mm5 \n\t"\
   1.490 +        "pxor    %%mm2              , %%mm5 \n\t" /* p0^q0*/\
   1.491 +        "pand    "#pb_01"           , %%mm5 \n\t" /* (p0^q0)&1*/\
   1.492 +        "pcmpeqb %%mm4              , %%mm4 \n\t" /* mm4 = all ones */\
   1.493 +        "pxor    %%mm4              , %%mm3 \n\t" /* mm3 = ~q1 */\
   1.494 +        "pavgb   %%mm0              , %%mm3 \n\t" /* (p1 - q1 + 256)>>1*/\
   1.495 +        "pavgb   "MANGLE(ff_pb_3)"  , %%mm3 \n\t" /*(((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2*/\
   1.496 +        "pxor    %%mm1              , %%mm4 \n\t" /* mm4 = ~p0 */\
   1.497 +        "pavgb   %%mm2              , %%mm4 \n\t" /* (q0 - p0 + 256)>>1*/\
   1.498 +        "pavgb   %%mm5              , %%mm3 \n\t"\
   1.499 +        "paddusb %%mm4              , %%mm3 \n\t" /* d+128+33*/\
   1.500 +        "movq    "MANGLE(ff_pb_A1)" , %%mm6 \n\t"\
   1.501 +        "psubusb %%mm3              , %%mm6 \n\t" /* mm6 = magnitude of the delta when it is negative */\
   1.502 +        "psubusb "MANGLE(ff_pb_A1)" , %%mm3 \n\t" /* mm3 = magnitude of the delta when it is positive */\
   1.503 +        "pminub  %%mm7              , %%mm6 \n\t" /* limit both magnitudes to tc&mask */\
   1.504 +        "pminub  %%mm7              , %%mm3 \n\t"\
   1.505 +        "psubusb %%mm6              , %%mm1 \n\t"\
   1.506 +        "psubusb %%mm3              , %%mm2 \n\t"\
   1.507 +        "paddusb %%mm3              , %%mm1 \n\t"\
   1.508 +        "paddusb %%mm6              , %%mm2 \n\t"
   1.509 +
   1.510 +// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask) %9=ff_bone (the body reads operand %9, not %8)
   1.511 +// out: (q1addr) = av_clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 ), where the "p1" argument supplies q1
   1.512 +// clobbers: q2, tmp, tc0
   1.513 +#define H264_DEBLOCK_Q1(p1, q2, q2addr, q1addr, tc0, tmp)\
   1.514 +        "movq     %%mm1,  "#tmp"   \n\t"\
   1.515 +        "pavgb    %%mm2,  "#tmp"   \n\t"\
   1.516 +        "pavgb    "#tmp", "#q2"    \n\t" /* avg(p2,avg(p0,q0)) */\
   1.517 +        "pxor   "q2addr", "#tmp"   \n\t"\
   1.518 +        "pand     %9,     "#tmp"   \n\t" /* (p2^avg(p0,q0))&1 */\
   1.519 +        "psubusb  "#tmp", "#q2"    \n\t" /* (p2+((p0+q0+1)>>1))>>1 */\
   1.520 +        "movq     "#p1",  "#tmp"   \n\t"\
   1.521 +        "psubusb  "#tc0", "#tmp"   \n\t" /* lower clip bound, saturated at 0 */\
   1.522 +        "paddusb  "#p1",  "#tc0"   \n\t" /* upper clip bound, saturated at 255 */\
   1.523 +        "pmaxub   "#tmp", "#q2"    \n\t"\
   1.524 +        "pminub   "#tc0", "#q2"    \n\t"\
   1.525 +        "movq     "#q2",  "q1addr" \n\t"
   1.526 +
   1.527 +static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
   1.528 +{
   1.529 +    DECLARE_ALIGNED(8, uint64_t, tmp0)[2]; /* spill slots: [0] = mask & (tc0 > -1), [1] = expanded tc0 */
   1.530 +
   1.531 +    __asm__ volatile(
   1.532 +        "movq    (%2,%4), %%mm0    \n\t" //p1
   1.533 +        "movq    (%2,%4,2), %%mm1  \n\t" //p0
   1.534 +        "movq    (%3),    %%mm2    \n\t" //q0
   1.535 +        "movq    (%3,%4), %%mm3    \n\t" //q1
   1.536 +        H264_DEBLOCK_MASK(%7, %8)
   1.537 +
   1.538 +        "movd      %6,    %%mm4    \n\t"
   1.539 +        "punpcklbw %%mm4, %%mm4    \n\t"
   1.540 +        "punpcklwd %%mm4, %%mm4    \n\t" // bytes: tc0[0] x4, then tc0[1] x4
   1.541 +        "pcmpeqb   %%mm3, %%mm3    \n\t" // mm3 = -1 in every byte
   1.542 +        "movq      %%mm4, %%mm6    \n\t"
   1.543 +        "pcmpgtb   %%mm3, %%mm4    \n\t" // mm4 = (tc0 > -1)
   1.544 +        "movq      %%mm6, %1       \n\t"
   1.545 +        "pand      %%mm4, %%mm7    \n\t"
   1.546 +        "movq      %%mm7, %0       \n\t"
   1.547 +
   1.548 +        /* filter p1 */
   1.549 +        "movq     (%2),   %%mm3    \n\t" //p2
   1.550 +        DIFF_GT2_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4) // mm6 = |p2-p0|<=beta-1
   1.551 +        "pand     %%mm7,  %%mm6    \n\t" // mask & |p2-p0|<beta
   1.552 +        "pand     %1,     %%mm7    \n\t" // mask & tc0
   1.553 +        "movq     %%mm7,  %%mm4    \n\t"
   1.554 +        "psubb    %%mm6,  %%mm7    \n\t" // subtracting 0xFF (-1) adds 1 to tc where p1 is filtered
   1.555 +        "pand     %%mm4,  %%mm6    \n\t" // mask & |p2-p0|<beta & tc0
   1.556 +        H264_DEBLOCK_Q1(%%mm0, %%mm3, "(%2)", "(%2,%4)", %%mm6, %%mm4)
   1.557 +
   1.558 +        /* filter q1 */
   1.559 +        "movq    (%3,%4,2), %%mm4  \n\t" //q2
   1.560 +        DIFF_GT2_MMX(%%mm2, %%mm4, %%mm5, %%mm6, %%mm3) // mm6 = |q2-q0|<=beta-1
   1.561 +        "pand     %0,     %%mm6    \n\t"
   1.562 +        "movq     %1,     %%mm5    \n\t" // can be merged with the and below but is slower then
   1.563 +        "pand     %%mm6,  %%mm5    \n\t"
   1.564 +        "psubb    %%mm6,  %%mm7    \n\t" // same +1 adjustment for the q side
   1.565 +        "movq    (%3,%4), %%mm3    \n\t"
   1.566 +        H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%3,%4,2)", "(%3,%4)", %%mm5, %%mm6)
   1.567 +
   1.568 +        /* filter p0, q0 */
   1.569 +        H264_DEBLOCK_P0_Q0(%9, unused)
   1.570 +        "movq      %%mm1, (%2,%4,2) \n\t"
   1.571 +        "movq      %%mm2, (%3)      \n\t"
   1.572 +
   1.573 +        : "=m"(tmp0[0]), "=m"(tmp0[1]) /* %0, %1 */
   1.574 +        : "r"(pix-3*stride), "r"(pix), "r"((x86_reg)stride), /* %2 = row of p2, %3 = row of q0, %4 = stride */
   1.575 +          "m"(*tmp0/*unused*/), "m"(*(uint32_t*)tc0), "m"(alpha1), "m"(beta1), /* %5 .. %8 */
   1.576 +          "m"(ff_bone) /* %9 */
   1.577 +        : "memory" ); /* rows p1, p0, q0 and q1 are stored through %2/%3, which no output operand describes */
   1.578 +}
   1.579 +
   1.580 +static void h264_v_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
   1.581 +{
   1.582 +    int half;
   1.583 +    /* two 8-byte-wide halves; a half is skipped when both of its tc0 entries are negative */
   1.584 +    for(half=0; half<2; half++, pix+=8, tc0+=2)
   1.585 +        if((tc0[0] & tc0[1]) >= 0) h264_loop_filter_luma_mmx2(pix, stride, alpha-1, beta-1, tc0);
   1.586 +}
   1.587 +static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
   1.588 +{
   1.589 +    //FIXME: could cut some load/stores by merging transpose with filter
   1.590 +    // also, it only needs to transpose 6x8
   1.591 +    DECLARE_ALIGNED(8, uint8_t, tbuf)[8*8];
   1.592 +    int half;
   1.593 +    for(half=0; half<2; half++, pix+=8*stride, tc0+=2) {
   1.594 +        if((tc0[0] & tc0[1]) >= 0) { // skipped when both tc0 entries are negative
   1.595 +            transpose4x4(tbuf,       pix-4,          8, stride); // gather 8 columns x 8 rows around the edge
   1.596 +            transpose4x4(tbuf  +4*8, pix,            8, stride);
   1.597 +            transpose4x4(tbuf+4,     pix-4+4*stride, 8, stride);
   1.598 +            transpose4x4(tbuf+4+4*8, pix  +4*stride, 8, stride);
   1.599 +            h264_loop_filter_luma_mmx2(tbuf+4*8, 8, alpha-1, beta-1, tc0);
   1.600 +            transpose4x4(pix-2,          tbuf  +2*8, stride, 8); // write back only the 4 middle columns
   1.601 +            transpose4x4(pix-2+4*stride, tbuf+4+2*8, stride, 8);
   1.602 +        }
   1.603 +    }
   1.604 +}
   1.605 +
   1.606 +static inline void h264_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
   1.607 +{
   1.608 +    __asm__ volatile(
   1.609 +        "movq    (%0),    %%mm0     \n\t" //p1
   1.610 +        "movq    (%0,%2), %%mm1     \n\t" //p0
   1.611 +        "movq    (%1),    %%mm2     \n\t" //q0
   1.612 +        "movq    (%1,%2), %%mm3     \n\t" //q1
   1.613 +        H264_DEBLOCK_MASK(%4, %5)
   1.614 +        "movd      %3,    %%mm6     \n\t"
   1.615 +        "punpcklbw %%mm6, %%mm6     \n\t" // each of the four tc0 bytes now covers two pixels
   1.616 +        "pand      %%mm6, %%mm7     \n\t" // mm7 = tc&mask
   1.617 +        H264_DEBLOCK_P0_Q0(%6, %7)
   1.618 +        "movq      %%mm1, (%0,%2)   \n\t"
   1.619 +        "movq      %%mm2, (%1)      \n\t"
   1.620 +
   1.621 +        :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride), /* %0 = row of p1, %1 = row of q0, %2 = stride */
   1.622 +           "r"(*(uint32_t*)tc0), /* %3 = four tc0 bytes read as one 32-bit value */
   1.623 +           "m"(alpha1), "m"(beta1), "m"(ff_bone), "m"(ff_pb_3F)
   1.624 +        : "memory" ); /* the p0 and q0 rows are stored through %0/%1, which no output operand describes */
   1.625 +}
   1.626 +
   1.627 +static void h264_v_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
   1.628 +{
   1.629 +    h264_loop_filter_chroma_mmx2(pix, stride, alpha-1, beta-1, tc0); /* the shared filter takes alpha-1 and beta-1 */
   1.630 +}
   1.631 +
   1.632 +static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
   1.633 +{
   1.634 +    DECLARE_ALIGNED(8, uint8_t, tbuf)[8*4]; //FIXME: could cut some load/stores by merging transpose with filter
   1.635 +    uint8_t *const left = pix-2;            // the four columns handled here start two pixels left of pix
   1.636 +    transpose4x4(tbuf,   left,          8, stride);
   1.637 +    transpose4x4(tbuf+4, left+4*stride, 8, stride);
   1.638 +    h264_loop_filter_chroma_mmx2(tbuf+2*8, 8, alpha-1, beta-1, tc0);
   1.639 +    transpose4x4(left,          tbuf,   stride, 8);
   1.640 +    transpose4x4(left+4*stride, tbuf+4, stride, 8);
   1.641 +}
   1.642 +
   1.643 +// p0 = (p0 + q1 + 2*p1 + 2) >> 2, computed per byte with two pavgb steps; clobbers mm4
   1.644 +#define H264_FILTER_CHROMA4(p0, p1, q1, one) \
   1.645 +    "movq    "#p0", %%mm4  \n\t"\
   1.646 +    "pxor    "#q1", %%mm4  \n\t"\
   1.647 +    "pand   "#one", %%mm4  \n\t" /* mm4 = (p0^q1)&1 */\
   1.648 +    "pavgb   "#q1", "#p0"  \n\t"\
   1.649 +    "psubusb %%mm4, "#p0"  \n\t" /* undo the round-up of the first average */\
   1.650 +    "pavgb   "#p1", "#p0"  \n\t" /* dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) */\
   1.651 +
   1.652 +static inline void h264_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha1, int beta1)
   1.653 +{
   1.654 +    __asm__ volatile(
   1.655 +        "movq    (%0),    %%mm0     \n\t" //p1
   1.656 +        "movq    (%0,%2), %%mm1     \n\t" //p0
   1.657 +        "movq    (%1),    %%mm2     \n\t" //q0
   1.658 +        "movq    (%1,%2), %%mm3     \n\t" //q1
   1.659 +        H264_DEBLOCK_MASK(%3, %4)
   1.660 +        "movq    %%mm1,   %%mm5     \n\t" // keep the unfiltered p0
   1.661 +        "movq    %%mm2,   %%mm6     \n\t" // keep the unfiltered q0
   1.662 +        H264_FILTER_CHROMA4(%%mm1, %%mm0, %%mm3, %5) //p0'
   1.663 +        H264_FILTER_CHROMA4(%%mm2, %%mm3, %%mm0, %5) //q0'
   1.664 +        "psubb   %%mm5,   %%mm1     \n\t" // difference between filtered and original
   1.665 +        "psubb   %%mm6,   %%mm2     \n\t"
   1.666 +        "pand    %%mm7,   %%mm1     \n\t" // keep it only where the mask is set
   1.667 +        "pand    %%mm7,   %%mm2     \n\t"
   1.668 +        "paddb   %%mm5,   %%mm1     \n\t" // original + masked difference
   1.669 +        "paddb   %%mm6,   %%mm2     \n\t"
   1.670 +        "movq    %%mm1,   (%0,%2)   \n\t"
   1.671 +        "movq    %%mm2,   (%1)      \n\t"
   1.672 +        :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride), /* %0 = row of p1, %1 = row of q0, %2 = stride */
   1.673 +           "m"(alpha1), "m"(beta1), "m"(ff_bone)
   1.674 +        : "memory" ); /* the p0 and q0 rows are stored through %0/%1, which no output operand describes */
   1.675 +}
   1.676 +
   1.677 +static void h264_v_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
   1.678 +{
   1.679 +    h264_loop_filter_chroma_intra_mmx2(pix, stride, alpha-1, beta-1); /* the shared filter takes alpha-1 and beta-1 */
   1.680 +}
   1.681 +
   1.682 +static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
   1.683 +{
   1.684 +    DECLARE_ALIGNED(8, uint8_t, tbuf)[8*4]; //FIXME: could cut some load/stores by merging transpose with filter
   1.685 +    uint8_t *const left = pix-2;            // the four columns handled here start two pixels left of pix
   1.686 +    transpose4x4(tbuf,   left,          8, stride);
   1.687 +    transpose4x4(tbuf+4, left+4*stride, 8, stride);
   1.688 +    h264_loop_filter_chroma_intra_mmx2(tbuf+2*8, 8, alpha-1, beta-1);
   1.689 +    transpose4x4(left,          tbuf,   stride, 8);
   1.690 +    transpose4x4(left+4*stride, tbuf+4, stride, 8);
   1.691 +}
   1.692 +
   1.693 +static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2],
   1.694 +                                            int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) { /* writes four 16-bit strengths per bS[dir][edge] from nnz, ref and mv; state is carried between the asm statements in mm0-mm7 */
   1.695 +    int dir;
   1.696 +    __asm__ volatile(
   1.697 +        "movq %0, %%mm7 \n" /* mm7 = ff_pb_1, used below as the clamp value for pminub */
   1.698 +        "movq %1, %%mm6 \n" /* mm6 = ff_pb_3, the bias added to packed mv differences */
   1.699 +        ::"m"(ff_pb_1), "m"(ff_pb_3)
   1.700 +    );
   1.701 +    if(field)
   1.702 +        __asm__ volatile(
   1.703 +            "movq %0, %%mm6 \n" /* field mode: bias bytes alternate 3 and 1 (ff_pb_3_1), so odd bytes get a lower limit */
   1.704 +            ::"m"(ff_pb_3_1)
   1.705 +        );
   1.706 +    __asm__ volatile(
   1.707 +        "movq  %%mm6, %%mm5 \n"
   1.708 +        "paddb %%mm5, %%mm5 \n" /* mm5 = 2*mm6, subtracted with unsigned saturation after the bias is added */
   1.709 +    :);
   1.710 +
   1.711 +    // could do a special case for dir==0 && edges==1, but it only reduces the
   1.712 +    // average filter time by 1.2%
   1.713 +    for( dir=1; dir>=0; dir-- ) {
   1.714 +        const x86_reg d_idx = dir ? -8 : -1; /* index offset from block b to its neighbour bn; presumably an 8-wide cache layout (above / left) -- confirm */
   1.715 +        const int mask_mv = dir ? mask_mv1 : mask_mv0;
   1.716 +        DECLARE_ALIGNED(8, const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL;
   1.717 +        int b_idx, edge;
   1.718 +        for( b_idx=12, edge=0; edge<edges; edge+=step, b_idx+=8*step ) {
   1.719 +            __asm__ volatile(
   1.720 +                "pand %0, %%mm0 \n\t" /* mm0 &= mask_dir: cleared when dir==1, left unchanged when dir==0 */
   1.721 +                ::"m"(mask_dir)
   1.722 +            );
   1.723 +            if(!(mask_mv & edge)) { /* ref/mv comparison is skipped for edges selected by mask_mv */
   1.724 +                if(bidir) {
   1.725 +                    __asm__ volatile(
   1.726 +                        "movd         (%1,%0), %%mm2 \n"
   1.727 +                        "punpckldq  40(%1,%0), %%mm2 \n" // { ref0[bn], ref1[bn] }
   1.728 +                        "pshufw $0x44,   (%1), %%mm0 \n" // { ref0[b], ref0[b] }
   1.729 +                        "pshufw $0x44, 40(%1), %%mm1 \n" // { ref1[b], ref1[b] }
   1.730 +                        "pshufw $0x4E, %%mm2, %%mm3 \n"
   1.731 +                        "psubb         %%mm2, %%mm0 \n" // { ref0[b]!=ref0[bn], ref0[b]!=ref1[bn] }
   1.732 +                        "psubb         %%mm3, %%mm1 \n" // { ref1[b]!=ref1[bn], ref1[b]!=ref0[bn] }
   1.733 +                        "1: \n" /* body runs twice: first with %0 = d_idx (list 0 neighbour), then with %0 = d_idx+40 (list 1 neighbour) */
   1.734 +                        "por           %%mm1, %%mm0 \n"
   1.735 +                        "movq      (%2,%0,4), %%mm1 \n" /* neighbour mvs; each mv entry is 4 bytes */
   1.736 +                        "movq     8(%2,%0,4), %%mm2 \n"
   1.737 +                        "movq          %%mm1, %%mm3 \n"
   1.738 +                        "movq          %%mm2, %%mm4 \n"
   1.739 +                        "psubw          (%2), %%mm1 \n" /* minus mv[0][b] */
   1.740 +                        "psubw         8(%2), %%mm2 \n"
   1.741 +                        "psubw       160(%2), %%mm3 \n" /* minus mv[1][b]; 160 bytes = 40 entries of 4 bytes */
   1.742 +                        "psubw       168(%2), %%mm4 \n"
   1.743 +                        "packsswb      %%mm2, %%mm1 \n"
   1.744 +                        "packsswb      %%mm4, %%mm3 \n"
   1.745 +                        "paddb         %%mm6, %%mm1 \n"
   1.746 +                        "paddb         %%mm6, %%mm3 \n"
   1.747 +                        "psubusb       %%mm5, %%mm1 \n" // abs(mv[b] - mv[bn]) >= limit
   1.748 +                        "psubusb       %%mm5, %%mm3 \n"
   1.749 +                        "packsswb      %%mm3, %%mm1 \n"
   1.750 +                        "add $40, %0 \n"
   1.751 +                        "cmp $40, %0 \n"
   1.752 +                        "jl 1b \n" /* d_idx+40 is below 40 after the first pass only */
   1.753 +                        "sub $80, %0 \n" /* NOTE(review): %0 is an input-only operand; it is modified above and restored to d_idx here */
   1.754 +                        "pshufw $0x4E, %%mm1, %%mm1 \n"
   1.755 +                        "por           %%mm1, %%mm0 \n"
   1.756 +                        "pshufw $0x4E, %%mm0, %%mm1 \n"
   1.757 +                        "pminub        %%mm1, %%mm0 \n" /* byte-wise minimum of the two halves of mm0 */
   1.758 +                        ::"r"(d_idx),
   1.759 +                          "r"(ref[0]+b_idx),
   1.760 +                          "r"(mv[0]+b_idx)
   1.761 +                    );
   1.762 +                } else {
   1.763 +                    __asm__ volatile(
   1.764 +                        "movd        (%1), %%mm0 \n"
   1.765 +                        "psubb    (%1,%0), %%mm0 \n" // ref[b] != ref[bn]
   1.766 +                        "movq        (%2), %%mm1 \n"
   1.767 +                        "movq       8(%2), %%mm2 \n"
   1.768 +                        "psubw  (%2,%0,4), %%mm1 \n"
   1.769 +                        "psubw 8(%2,%0,4), %%mm2 \n"
   1.770 +                        "packsswb   %%mm2, %%mm1 \n"
   1.771 +                        "paddb      %%mm6, %%mm1 \n"
   1.772 +                        "psubusb    %%mm5, %%mm1 \n" // abs(mv[b] - mv[bn]) >= limit
   1.773 +                        "packsswb   %%mm1, %%mm1 \n"
   1.774 +                        "por        %%mm1, %%mm0 \n" /* nonzero byte where ref or mv differs enough */
   1.775 +                        ::"r"(d_idx),
   1.776 +                          "r"(ref[0]+b_idx),
   1.777 +                          "r"(mv[0]+b_idx)
   1.778 +                    );
   1.779 +                }
   1.780 +            }
   1.781 +            __asm__ volatile(
   1.782 +                "movd %0, %%mm1 \n"
   1.783 +                "por  %1, %%mm1 \n" // nnz[b] || nnz[bn]
   1.784 +                ::"m"(nnz[b_idx]),
   1.785 +                  "m"(nnz[b_idx+d_idx])
   1.786 +            );
   1.787 +            __asm__ volatile(
   1.788 +                "pminub    %%mm7, %%mm1 \n" /* clamp the nnz flags to mm7 */
   1.789 +                "pminub    %%mm7, %%mm0 \n" /* clamp the ref/mv flags to mm7 */
   1.790 +                "psllw        $1, %%mm1 \n" /* double the nnz flags so they outrank the ref/mv flags */
   1.791 +                "pxor      %%mm2, %%mm2 \n"
   1.792 +                "pmaxub    %%mm0, %%mm1 \n" /* per byte: the larger of the two flag values */
   1.793 +                "punpcklbw %%mm2, %%mm1 \n" /* widen the low 4 bytes to 16-bit words */
   1.794 +                "movq      %%mm1, %0    \n"
   1.795 +                :"=m"(*bS[dir][edge])
   1.796 +                ::"memory"
   1.797 +            );
   1.798 +        }
   1.799 +        edges = 4; /* the dir==0 pass always visits all four edges with step 1 */
   1.800 +        step = 1;
   1.801 +    }
   1.802 +    __asm__ volatile(
   1.803 +        "movq   (%0), %%mm0 \n\t" /* load the four 4-word rows of bS[0] */
   1.804 +        "movq  8(%0), %%mm1 \n\t"
   1.805 +        "movq 16(%0), %%mm2 \n\t"
   1.806 +        "movq 24(%0), %%mm3 \n\t"
   1.807 +        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4) /* defined outside this view; presumably a 4x4 word transpose with results in mm0, mm3, mm4, mm2 -- confirm */
   1.808 +        "movq %%mm0,   (%0) \n\t"
   1.809 +        "movq %%mm3,  8(%0) \n\t"
   1.810 +        "movq %%mm4, 16(%0) \n\t"
   1.811 +        "movq %%mm2, 24(%0) \n\t"
   1.812 +        ::"r"(bS[0])
   1.813 +        :"memory"
   1.814 +    );
   1.815 +}
   1.816 +
   1.817 +/***********************************/
   1.818 +/* motion compensation */
   1.819 +
   1.820 +#define QPEL_H264V_MM(A,B,C,D,E,F,OP,T,Z,d,q)/* one output row of the vertical 6-tap filter: rows A..E are in registers, row F is loaded from (%0); A is consumed */\
   1.821 +        "mov"#q" "#C", "#T"         \n\t" /* T = C */\
   1.822 +        "mov"#d" (%0), "#F"         \n\t" /* F = next source row (bytes) */\
   1.823 +        "paddw "#D", "#T"           \n\t" /* T = C+D */\
   1.824 +        "psllw $2, "#T"             \n\t" /* T = 4*(C+D) */\
   1.825 +        "psubw "#B", "#T"           \n\t"\
   1.826 +        "psubw "#E", "#T"           \n\t" /* T = 4*(C+D) - (B+E) */\
   1.827 +        "punpcklbw "#Z", "#F"       \n\t" /* widen F to words using Z, which callers have zeroed */\
   1.828 +        "pmullw %4, "#T"            \n\t" /* times operand 4 (callers pass ff_pw_5) */\
   1.829 +        "paddw %5, "#A"             \n\t" /* A += operand 5 (callers pass ff_pw_16) */\
   1.830 +        "add %2, %0                 \n\t" /* advance the source pointer by operand 2 */\
   1.831 +        "paddw "#F", "#A"           \n\t"\
   1.832 +        "paddw "#A", "#T"           \n\t" /* T += A + F + rounding term */\
   1.833 +        "psraw $5, "#T"             \n\t"\
   1.834 +        "packuswb "#T", "#T"        \n\t" /* saturate to unsigned bytes */\
   1.835 +        OP(T, (%1), A, d) /* store through the caller-supplied OP macro; A is passed as its temporary */\
   1.836 +        "add %3, %1                 \n\t" /* advance the destination pointer by operand 3 */
   1.837 +
   1.838 +#define QPEL_H264HV_MM(A,B,C,D,E,F,OF,T,Z,d,q)/* vertical 6-tap pass that keeps 16-bit sums (no shift, no pack) and stores them at byte offset OF from (%1) */\
   1.839 +        "mov"#q" "#C", "#T"         \n\t" /* T = C */\
   1.840 +        "mov"#d" (%0), "#F"         \n\t" /* F = next source row (bytes) */\
   1.841 +        "paddw "#D", "#T"           \n\t"\
   1.842 +        "psllw $2, "#T"             \n\t" /* T = 4*(C+D) */\
   1.843 +        "paddw %4, "#A"             \n\t" /* A += operand 4 (the visible caller passes ff_pw_16) */\
   1.844 +        "psubw "#B", "#T"           \n\t"\
   1.845 +        "psubw "#E", "#T"           \n\t" /* T = 4*(C+D) - (B+E) */\
   1.846 +        "punpcklbw "#Z", "#F"       \n\t" /* widen F to words */\
   1.847 +        "pmullw %3, "#T"            \n\t" /* times operand 3 (the visible caller passes ff_pw_5) */\
   1.848 +        "paddw "#F", "#A"           \n\t"\
   1.849 +        "add %2, %0                 \n\t" /* advance the source pointer by operand 2 */\
   1.850 +        "paddw "#A", "#T"           \n\t"\
   1.851 +        "mov"#q" "#T", "#OF"(%1)    \n\t" /* store the word sums; (%1) itself is not advanced */
   1.852 +
   1.853 +#define QPEL_H264V(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%mm6,%%mm7,d,q) /* MMX form: temp mm6, zero mm7, movd loads, movq copies */
   1.854 +#define QPEL_H264HV(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%mm6,%%mm7,d,q) /* MMX form of the 16-bit intermediate pass */
   1.855 +#define QPEL_H264V_XMM(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%xmm6,%%xmm7,q,dqa) /* XMM form: temp xmm6, zero xmm7, movq loads, movdqa copies */
   1.856 +#define QPEL_H264HV_XMM(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%xmm6,%%xmm7,q,dqa) /* XMM form of the 16-bit intermediate pass; its stores use movdqa */
   1.857 +
   1.858 +
   1.859 +#define QPEL_H264(OPNAME, OP, MMX)/* generates the MMX qpel helpers below; OP is the store macro, OPNAME and MMX are pasted into the function names */\
   1.860 +\
   1.861 +static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){ /* 8x8 horizontal 6-tap filter of src, averaged with src2, stored to dst through OP */\
   1.862 +    int h=8;\
   1.863 +    __asm__ volatile(\
   1.864 +        "pxor %%mm7, %%mm7          \n\t" /* mm7 = 0 for byte-to-word unpacking */\
   1.865 +        "movq %0, %%mm6             \n\t" /* mm6 = ff_pw_5; both registers must survive into the loop asm below */\
   1.866 +        :: "m"(ff_pw_5)\
   1.867 +    );\
   1.868 +    do{\
   1.869 +    __asm__ volatile(\
   1.870 +        "movq    (%0), %%mm0        \n\t"\
   1.871 +        "movq   1(%0), %%mm2        \n\t"\
   1.872 +        "movq %%mm0, %%mm1          \n\t"\
   1.873 +        "movq %%mm2, %%mm3          \n\t"\
   1.874 +        "punpcklbw %%mm7, %%mm0     \n\t"\
   1.875 +        "punpckhbw %%mm7, %%mm1     \n\t"\
   1.876 +        "punpcklbw %%mm7, %%mm2     \n\t"\
   1.877 +        "punpckhbw %%mm7, %%mm3     \n\t"\
   1.878 +        "paddw %%mm2, %%mm0         \n\t"\
   1.879 +        "paddw %%mm3, %%mm1         \n\t"\
   1.880 +        "psllw $2, %%mm0            \n\t"\
   1.881 +        "psllw $2, %%mm1            \n\t" /* mm0:mm1 = 4*(src[x] + src[x+1]) for x = 0..7 */\
   1.882 +        "movq   -1(%0), %%mm2       \n\t"\
   1.883 +        "movq    2(%0), %%mm4       \n\t"\
   1.884 +        "movq %%mm2, %%mm3          \n\t"\
   1.885 +        "movq %%mm4, %%mm5          \n\t"\
   1.886 +        "punpcklbw %%mm7, %%mm2     \n\t"\
   1.887 +        "punpckhbw %%mm7, %%mm3     \n\t"\
   1.888 +        "punpcklbw %%mm7, %%mm4     \n\t"\
   1.889 +        "punpckhbw %%mm7, %%mm5     \n\t"\
   1.890 +        "paddw %%mm4, %%mm2         \n\t"\
   1.891 +        "paddw %%mm3, %%mm5         \n\t" /* mm2:mm5 = src[x-1] + src[x+2]; mm3 and mm4 stay intact for reuse below */\
   1.892 +        "psubw %%mm2, %%mm0         \n\t"\
   1.893 +        "psubw %%mm5, %%mm1         \n\t"\
   1.894 +        "pmullw %%mm6, %%mm0        \n\t"\
   1.895 +        "pmullw %%mm6, %%mm1        \n\t" /* times 5 */\
   1.896 +        "movd   -2(%0), %%mm2       \n\t"\
   1.897 +        "movd    7(%0), %%mm5       \n\t"\
   1.898 +        "punpcklbw %%mm7, %%mm2     \n\t"\
   1.899 +        "punpcklbw %%mm7, %%mm5     \n\t"\
   1.900 +        "paddw %%mm3, %%mm2         \n\t" /* outer taps for x = 0..3: src[-2..1] + src[3..6] */\
   1.901 +        "paddw %%mm5, %%mm4         \n\t" /* outer taps for x = 4..7: src[2..5] + src[7..10] */\
   1.902 +        "movq %5, %%mm5             \n\t" /* rounding constant ff_pw_16 */\
   1.903 +        "paddw %%mm5, %%mm2         \n\t"\
   1.904 +        "paddw %%mm5, %%mm4         \n\t"\
   1.905 +        "paddw %%mm2, %%mm0         \n\t"\
   1.906 +        "paddw %%mm4, %%mm1         \n\t"\
   1.907 +        "psraw $5, %%mm0            \n\t"\
   1.908 +        "psraw $5, %%mm1            \n\t"\
   1.909 +        "movq (%2), %%mm4           \n\t" /* matching row of src2 */\
   1.910 +        "packuswb %%mm1, %%mm0      \n\t"\
   1.911 +        PAVGB" %%mm4, %%mm0         \n\t" /* average the filtered row with src2 */\
   1.912 +        OP(%%mm0, (%1),%%mm5, q)\
   1.913 +        "add %4, %0                 \n\t" /* src advances by dstStride, so src and dst are assumed to share a stride */\
   1.914 +        "add %4, %1                 \n\t"\
   1.915 +        "add %3, %2                 \n\t" /* src2 advances by src2Stride */\
   1.916 +        : "+a"(src), "+c"(dst), "+d"(src2)\
   1.917 +        : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
   1.918 +          "m"(ff_pw_16)\
   1.919 +        : "memory"\
   1.920 +    );\
   1.921 +    }while(--h);\
   1.922 +}\
   1.923 +\
   1.924 +static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){ /* horizontal 6-tap pass over the 16-bit rows in tmp, producing bytes in dst; tmpStride is not used, rows are 48 bytes apart */\
   1.925 +    int w = size>>4; /* with do/while(w--) this gives one 8-column strip for size 8 and two for size 16 */\
   1.926 +    do{\
   1.927 +    int h = size;\
   1.928 +    __asm__ volatile(\
   1.929 +        "1:                         \n\t"\
   1.930 +        "movq     (%0), %%mm0       \n\t"\
   1.931 +        "movq    8(%0), %%mm3       \n\t"\
   1.932 +        "movq    2(%0), %%mm1       \n\t"\
   1.933 +        "movq   10(%0), %%mm4       \n\t"\
   1.934 +        "paddw   %%mm4, %%mm0       \n\t" /* outer taps t[x] + t[x+5], x = 0..3 */\
   1.935 +        "paddw   %%mm3, %%mm1       \n\t" /* t[x+1] + t[x+4], x = 0..3 */\
   1.936 +        "paddw  18(%0), %%mm3       \n\t" /* outer taps for x = 4..7 */\
   1.937 +        "paddw  16(%0), %%mm4       \n\t" /* t[x+1] + t[x+4], x = 4..7 */\
   1.938 +        "movq    4(%0), %%mm2       \n\t"\
   1.939 +        "movq   12(%0), %%mm5       \n\t"\
   1.940 +        "paddw   6(%0), %%mm2       \n\t" /* centre taps t[x+2] + t[x+3] */\
   1.941 +        "paddw  14(%0), %%mm5       \n\t"\
   1.942 +        "psubw %%mm1, %%mm0         \n\t" /* the weighting is applied in stages of subtract, shift and add; presumably to keep intermediates within 16 bits -- confirm */\
   1.943 +        "psubw %%mm4, %%mm3         \n\t"\
   1.944 +        "psraw $2, %%mm0            \n\t"\
   1.945 +        "psraw $2, %%mm3            \n\t"\
   1.946 +        "psubw %%mm1, %%mm0         \n\t"\
   1.947 +        "psubw %%mm4, %%mm3         \n\t"\
   1.948 +        "paddsw %%mm2, %%mm0        \n\t"\
   1.949 +        "paddsw %%mm5, %%mm3        \n\t"\
   1.950 +        "psraw $2, %%mm0            \n\t"\
   1.951 +        "psraw $2, %%mm3            \n\t"\
   1.952 +        "paddw %%mm2, %%mm0         \n\t"\
   1.953 +        "paddw %%mm5, %%mm3         \n\t"\
   1.954 +        "psraw $6, %%mm0            \n\t"\
   1.955 +        "psraw $6, %%mm3            \n\t"\
   1.956 +        "packuswb %%mm3, %%mm0      \n\t" /* 8 output bytes */\
   1.957 +        OP(%%mm0, (%1),%%mm7, q)\
   1.958 +        "add $48, %0                \n\t" /* next tmp row: 48 bytes = 24 int16 */\
   1.959 +        "add %3, %1                 \n\t"\
   1.960 +        "decl %2                    \n\t"\
   1.961 +        " jnz 1b                    \n\t"\
   1.962 +        : "+a"(tmp), "+c"(dst), "+g"(h)\
   1.963 +        : "S"((x86_reg)dstStride)\
   1.964 +        : "memory"\
   1.965 +    );\
   1.966 +    tmp += 8 - size*24; /* rewind size rows and step 8 columns to the right */\
   1.967 +    dst += 8 - size*dstStride;\
   1.968 +    }while(w--);\
   1.969 +}\
   1.970 +\
   1.971 +static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){ /* 16x16 version built from four 8x8 calls */\
   1.972 +    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride); /* top-left 8x8 */\
   1.973 +    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride); /* top-right 8x8 */\
   1.974 +    src += 8*dstStride; /* src is stepped with dstStride: this function has no separate source stride */\
   1.975 +    dst += 8*dstStride;\
   1.976 +    src2 += 8*src2Stride;\
   1.977 +    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride); /* bottom-left 8x8 */\
   1.978 +    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride); /* bottom-right 8x8 */\
   1.979 +}\
   1.980 +static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h) /* per row: (src16 >> 5) saturated to bytes, averaged with src8, stored to dst through OP; 8 pixels wide, two rows per iteration */\
   1.981 +{\
   1.982 +    do{\
   1.983 +    __asm__ volatile(\
   1.984 +        "movq      (%1), %%mm0          \n\t" /* first row: 8 words from src16 */\
   1.985 +        "movq     8(%1), %%mm1          \n\t"\
   1.986 +        "movq    48(%1), %%mm2          \n\t" /* second row starts 48 bytes later */\
   1.987 +        "movq  8+48(%1), %%mm3          \n\t"\
   1.988 +        "psraw      $5,  %%mm0          \n\t"\
   1.989 +        "psraw      $5,  %%mm1          \n\t"\
   1.990 +        "psraw      $5,  %%mm2          \n\t"\
   1.991 +        "psraw      $5,  %%mm3          \n\t"\
   1.992 +        "packuswb %%mm1, %%mm0          \n\t"\
   1.993 +        "packuswb %%mm3, %%mm2          \n\t"\
   1.994 +        PAVGB"     (%0), %%mm0          \n\t" /* average with the src8 rows */\
   1.995 +        PAVGB"  (%0,%3), %%mm2          \n\t"\
   1.996 +        OP(%%mm0, (%2), %%mm5, q)\
   1.997 +        OP(%%mm2, (%2,%4), %%mm5, q)\
   1.998 +        ::"a"(src8), "c"(src16), "d"(dst),\
   1.999 +          "r"((x86_reg)src8Stride), "r"((x86_reg)dstStride)\
  1.1000 +        :"memory");\
  1.1001 +        src8 += 2L*src8Stride;\
  1.1002 +        src16 += 48; /* 48 int16 = two rows of 48 bytes */\
  1.1003 +        dst += 2L*dstStride;\
  1.1004 +    }while(h-=2); /* NOTE(review): terminates only when h reaches exactly 0, so h must be even and positive */\
  1.1005 +}\
  1.1006 +static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h) /* 16-wide version: the 8-wide routine run on the left and right halves */\
  1.1007 +{\
  1.1008 +    OPNAME ## pixels8_l2_shift5_ ## MMX(dst  , src16  , src8  , dstStride, src8Stride, h); /* columns 0..7 */\
  1.1009 +    OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h); /* columns 8..15 */\
  1.1010 +}\
  1.1011 +
  1.1012 +
  1.1013 +#if ARCH_X86_64
  1.1014 +#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)/* x86_64 variant: one 16-wide row per loop iteration, using xmm8-xmm15 */\
  1.1015 +static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){ /* 16x16 horizontal 6-tap filter of src, averaged with src2 */\
  1.1016 +    int h=16;\
  1.1017 +    __asm__ volatile(\
  1.1018 +        "pxor %%xmm15, %%xmm15      \n\t" /* xmm15 = 0 for unpacking */\
  1.1019 +        "movdqa %6, %%xmm14         \n\t" /* xmm14 = ff_pw_5 */\
  1.1020 +        "movdqa %7, %%xmm13         \n\t" /* xmm13 = ff_pw_16 */\
  1.1021 +        "1:                         \n\t"\
  1.1022 +        "lddqu    6(%0), %%xmm1     \n\t" /* bytes src[6..21] */\
  1.1023 +        "lddqu   -2(%0), %%xmm7     \n\t" /* bytes src[-2..13] */\
  1.1024 +        "movdqa  %%xmm1, %%xmm0     \n\t"\
  1.1025 +        "punpckhbw %%xmm15, %%xmm1  \n\t" /* words src[14..21] */\
  1.1026 +        "punpcklbw %%xmm15, %%xmm0  \n\t" /* words src[6..13] */\
  1.1027 +        "punpcklbw %%xmm15, %%xmm7  \n\t" /* words src[-2..5] */\
  1.1028 +        "movdqa  %%xmm1, %%xmm2     \n\t"\
  1.1029 +        "movdqa  %%xmm0, %%xmm6     \n\t"\
  1.1030 +        "movdqa  %%xmm1, %%xmm3     \n\t"\
  1.1031 +        "movdqa  %%xmm0, %%xmm8     \n\t"\
  1.1032 +        "movdqa  %%xmm1, %%xmm4     \n\t"\
  1.1033 +        "movdqa  %%xmm0, %%xmm9     \n\t"\
  1.1034 +        "movdqa  %%xmm0, %%xmm12    \n\t"\
  1.1035 +        "movdqa  %%xmm1, %%xmm11    \n\t"\
  1.1036 +        "palignr $10,%%xmm0, %%xmm11\n\t" /* palignr by 2..10 bytes builds the word vectors shifted by 1..5 pixels for each 8-pixel half */\
  1.1037 +        "palignr $10,%%xmm7, %%xmm12\n\t"\
  1.1038 +        "palignr $2, %%xmm0, %%xmm4 \n\t"\
  1.1039 +        "palignr $2, %%xmm7, %%xmm9 \n\t"\
  1.1040 +        "palignr $4, %%xmm0, %%xmm3 \n\t"\
  1.1041 +        "palignr $4, %%xmm7, %%xmm8 \n\t"\
  1.1042 +        "palignr $6, %%xmm0, %%xmm2 \n\t"\
  1.1043 +        "palignr $6, %%xmm7, %%xmm6 \n\t"\
  1.1044 +        "paddw   %%xmm0 ,%%xmm11    \n\t" /* outer tap sum, right half */\
  1.1045 +        "palignr $8, %%xmm0, %%xmm1 \n\t"\
  1.1046 +        "palignr $8, %%xmm7, %%xmm0 \n\t"\
  1.1047 +        "paddw   %%xmm12,%%xmm7     \n\t" /* outer tap sum, left half */\
  1.1048 +        "paddw   %%xmm3, %%xmm2     \n\t" /* centre tap sum, right half */\
  1.1049 +        "paddw   %%xmm8, %%xmm6     \n\t" /* centre tap sum, left half */\
  1.1050 +        "paddw   %%xmm4, %%xmm1     \n\t" /* inner tap sum, right half */\
  1.1051 +        "paddw   %%xmm9, %%xmm0     \n\t" /* inner tap sum, left half */\
  1.1052 +        "psllw   $2,     %%xmm2     \n\t"\
  1.1053 +        "psllw   $2,     %%xmm6     \n\t"\
  1.1054 +        "psubw   %%xmm1, %%xmm2     \n\t"\
  1.1055 +        "psubw   %%xmm0, %%xmm6     \n\t"\
  1.1056 +        "paddw   %%xmm13,%%xmm11    \n\t" /* add rounding term */\
  1.1057 +        "paddw   %%xmm13,%%xmm7     \n\t"\
  1.1058 +        "pmullw  %%xmm14,%%xmm2     \n\t" /* times 5 */\
  1.1059 +        "pmullw  %%xmm14,%%xmm6     \n\t"\
  1.1060 +        "lddqu   (%2),   %%xmm3     \n\t" /* 16 bytes of src2 */\
  1.1061 +        "paddw   %%xmm11,%%xmm2     \n\t"\
  1.1062 +        "paddw   %%xmm7, %%xmm6     \n\t"\
  1.1063 +        "psraw   $5,     %%xmm2     \n\t"\
  1.1064 +        "psraw   $5,     %%xmm6     \n\t"\
  1.1065 +        "packuswb %%xmm2,%%xmm6     \n\t" /* left half in the low 8 bytes, right half in the high 8 */\
  1.1066 +        "pavgb   %%xmm3, %%xmm6     \n\t"\
  1.1067 +        OP(%%xmm6, (%1), %%xmm4, dqa) /* OP is given the dqa suffix; presumably an aligned 16-byte store, so dst would need 16-byte alignment -- confirm */\
  1.1068 +        "add %5, %0                 \n\t" /* src advances by dstStride */\
  1.1069 +        "add %5, %1                 \n\t"\
  1.1070 +        "add %4, %2                 \n\t" /* src2 advances by src2Stride */\
  1.1071 +        "decl %3                    \n\t"\
  1.1072 +        "jg 1b                      \n\t"\
  1.1073 +        : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
  1.1074 +        : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
  1.1075 +          "m"(ff_pw_5), "m"(ff_pw_16)\
  1.1076 +        : "memory"\
  1.1077 +    );\
  1.1078 +}
  1.1079 +#else // ARCH_X86_64
  1.1080 +#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)/* variant used when ARCH_X86_64 is false: 16x16 built from four 8x8 calls */\
  1.1081 +static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
  1.1082 +    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride); /* top-left 8x8 */\
  1.1083 +    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride); /* top-right 8x8 */\
  1.1084 +    src += 8*dstStride; /* src is stepped with dstStride: there is no separate source stride parameter */\
  1.1085 +    dst += 8*dstStride;\
  1.1086 +    src2 += 8*src2Stride;\
  1.1087 +    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride); /* bottom-left 8x8 */\
  1.1088 +    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride); /* bottom-right 8x8 */\
  1.1089 +}
  1.1090 +#endif // ARCH_X86_64
  1.1091 +
  1.1092 +#define QPEL_H264_H_XMM(OPNAME, OP, MMX)/* generates the XMM horizontal lowpass helpers */\
  1.1093 +static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){ /* 8x8 horizontal 6-tap filter of src, averaged with src2 */\
  1.1094 +    int h=8;\
  1.1095 +    __asm__ volatile(\
  1.1096 +        "pxor %%xmm7, %%xmm7        \n\t" /* xmm7 = 0 for unpacking */\
  1.1097 +        "movdqa %0, %%xmm6          \n\t" /* xmm6 = ff_pw_5; both registers must survive into the loop asm below */\
  1.1098 +        :: "m"(ff_pw_5)\
  1.1099 +    );\
  1.1100 +    do{\
  1.1101 +    __asm__ volatile(\
  1.1102 +        "lddqu   -2(%0), %%xmm1     \n\t" /* bytes src[-2..13] */\
  1.1103 +        "movdqa  %%xmm1, %%xmm0     \n\t"\
  1.1104 +        "punpckhbw %%xmm7, %%xmm1   \n\t" /* words src[6..13] */\
  1.1105 +        "punpcklbw %%xmm7, %%xmm0   \n\t" /* words src[-2..5] */\
  1.1106 +        "movdqa  %%xmm1, %%xmm2     \n\t"\
  1.1107 +        "movdqa  %%xmm1, %%xmm3     \n\t"\
  1.1108 +        "movdqa  %%xmm1, %%xmm4     \n\t"\
  1.1109 +        "movdqa  %%xmm1, %%xmm5     \n\t"\
  1.1110 +        "palignr $2, %%xmm0, %%xmm4 \n\t" /* words src[-1..6] */\
  1.1111 +        "palignr $4, %%xmm0, %%xmm3 \n\t" /* words src[0..7] */\
  1.1112 +        "palignr $6, %%xmm0, %%xmm2 \n\t" /* words src[1..8] */\
  1.1113 +        "palignr $8, %%xmm0, %%xmm1 \n\t" /* words src[2..9] */\
  1.1114 +        "palignr $10,%%xmm0, %%xmm5 \n\t" /* words src[3..10] */\
  1.1115 +        "paddw   %%xmm5, %%xmm0     \n\t" /* outer taps */\
  1.1116 +        "paddw   %%xmm3, %%xmm2     \n\t" /* centre taps */\
  1.1117 +        "paddw   %%xmm4, %%xmm1     \n\t" /* inner taps */\
  1.1118 +        "psllw   $2,     %%xmm2     \n\t"\
  1.1119 +        "movq    (%2),   %%xmm3     \n\t" /* 8 bytes of src2 */\
  1.1120 +        "psubw   %%xmm1, %%xmm2     \n\t"\
  1.1121 +        "paddw   %5,     %%xmm0     \n\t" /* add rounding constant ff_pw_16 */\
  1.1122 +        "pmullw  %%xmm6, %%xmm2     \n\t" /* times 5 */\
  1.1123 +        "paddw   %%xmm0, %%xmm2     \n\t"\
  1.1124 +        "psraw   $5,     %%xmm2     \n\t"\
  1.1125 +        "packuswb %%xmm2, %%xmm2    \n\t"\
  1.1126 +        "pavgb   %%xmm3, %%xmm2     \n\t" /* average with src2 */\
  1.1127 +        OP(%%xmm2, (%1), %%xmm4, q)\
  1.1128 +        "add %4, %0                 \n\t" /* src advances by dstStride */\
  1.1129 +        "add %4, %1                 \n\t"\
  1.1130 +        "add %3, %2                 \n\t" /* src2 advances by src2Stride */\
  1.1131 +        : "+a"(src), "+c"(dst), "+d"(src2)\
  1.1132 +        : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
  1.1133 +          "m"(ff_pw_16)\
  1.1134 +        : "memory"\
  1.1135 +    );\
  1.1136 +    }while(--h);\
  1.1137 +}\
  1.1138 +QPEL_H264_H16_XMM(OPNAME, OP, MMX) /* emits the 16-wide l2 function; which variant depends on ARCH_X86_64 */\
  1.1139 +\
  1.1140 +static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* 8x8 horizontal 6-tap filter, no second source */\
  1.1141 +    int h=8;\
  1.1142 +    __asm__ volatile(\
  1.1143 +        "pxor %%xmm7, %%xmm7        \n\t" /* xmm7 = 0 for unpacking */\
  1.1144 +        "movdqa %5, %%xmm6          \n\t" /* xmm6 = ff_pw_5 */\
  1.1145 +        "1:                         \n\t"\
  1.1146 +        "lddqu   -2(%0), %%xmm1     \n\t" /* bytes src[-2..13] */\
  1.1147 +        "movdqa  %%xmm1, %%xmm0     \n\t"\
  1.1148 +        "punpckhbw %%xmm7, %%xmm1   \n\t"\
  1.1149 +        "punpcklbw %%xmm7, %%xmm0   \n\t"\
  1.1150 +        "movdqa  %%xmm1, %%xmm2     \n\t"\
  1.1151 +        "movdqa  %%xmm1, %%xmm3     \n\t"\
  1.1152 +        "movdqa  %%xmm1, %%xmm4     \n\t"\
  1.1153 +        "movdqa  %%xmm1, %%xmm5     \n\t"\
  1.1154 +        "palignr $2, %%xmm0, %%xmm4 \n\t" /* word vectors shifted by 1..5 pixels */\
  1.1155 +        "palignr $4, %%xmm0, %%xmm3 \n\t"\
  1.1156 +        "palignr $6, %%xmm0, %%xmm2 \n\t"\
  1.1157 +        "palignr $8, %%xmm0, %%xmm1 \n\t"\
  1.1158 +        "palignr $10,%%xmm0, %%xmm5 \n\t"\
  1.1159 +        "paddw   %%xmm5, %%xmm0     \n\t" /* outer taps */\
  1.1160 +        "paddw   %%xmm3, %%xmm2     \n\t" /* centre taps */\
  1.1161 +        "paddw   %%xmm4, %%xmm1     \n\t" /* inner taps */\
  1.1162 +        "psllw   $2,     %%xmm2     \n\t"\
  1.1163 +        "psubw   %%xmm1, %%xmm2     \n\t"\
  1.1164 +        "paddw   %6,     %%xmm0     \n\t" /* add rounding constant ff_pw_16 */\
  1.1165 +        "pmullw  %%xmm6, %%xmm2     \n\t" /* times 5 */\
  1.1166 +        "paddw   %%xmm0, %%xmm2     \n\t"\
  1.1167 +        "psraw   $5,     %%xmm2     \n\t"\
  1.1168 +        "packuswb %%xmm2, %%xmm2    \n\t"\
  1.1169 +        OP(%%xmm2, (%1), %%xmm4, q)\
  1.1170 +        "add %3, %0                 \n\t" /* src advances by srcStride */\
  1.1171 +        "add %4, %1                 \n\t" /* dst advances by dstStride */\
  1.1172 +        "decl %2                    \n\t"\
  1.1173 +        " jnz 1b                    \n\t"\
  1.1174 +        : "+a"(src), "+c"(dst), "+g"(h)\
  1.1175 +        : "D"((x86_reg)srcStride), "S"((x86_reg)dstStride),\
  1.1176 +          "m"(ff_pw_5), "m"(ff_pw_16)\
  1.1177 +        : "memory"\
  1.1178 +    );\
  1.1179 +}\
  1.1180 +static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* 16x16 version built from four 8x8 calls */\
  1.1181 +    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride); /* top-left 8x8 */\
  1.1182 +    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride); /* top-right 8x8 */\
  1.1183 +    src += 8*srcStride; /* move both pointers down 8 rows */\
  1.1184 +    dst += 8*dstStride;\
  1.1185 +    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride); /* bottom-left 8x8 */\
  1.1186 +    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride); /* bottom-right 8x8 */\
  1.1187 +}\
  1.1188 +
  1.1189 +#define QPEL_H264_V_XMM(OPNAME, OP, MMX)/* generates the XMM vertical lowpass helpers */\
  1.1190 +static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){ /* vertical 6-tap filter on an 8-pixel-wide column; 8 rows, plus 8 more when h is 16 */\
  1.1191 +    src -= 2*srcStride; /* the filter needs two rows above the first output row */\
  1.1192 +    \
  1.1193 +    __asm__ volatile(\
  1.1194 +        "pxor %%xmm7, %%xmm7        \n\t" /* xmm7 = 0 for unpacking */\
  1.1195 +        "movq (%0), %%xmm0          \n\t" /* preload five source rows into xmm0..xmm4 */\
  1.1196 +        "add %2, %0                 \n\t"\
  1.1197 +        "movq (%0), %%xmm1          \n\t"\
  1.1198 +        "add %2, %0                 \n\t"\
  1.1199 +        "movq (%0), %%xmm2          \n\t"\
  1.1200 +        "add %2, %0                 \n\t"\
  1.1201 +        "movq (%0), %%xmm3          \n\t"\
  1.1202 +        "add %2, %0                 \n\t"\
  1.1203 +        "movq (%0), %%xmm4          \n\t"\
  1.1204 +        "add %2, %0                 \n\t"\
  1.1205 +        "punpcklbw %%xmm7, %%xmm0   \n\t" /* widen the rows to words */\
  1.1206 +        "punpcklbw %%xmm7, %%xmm1   \n\t"\
  1.1207 +        "punpcklbw %%xmm7, %%xmm2   \n\t"\
  1.1208 +        "punpcklbw %%xmm7, %%xmm3   \n\t"\
  1.1209 +        "punpcklbw %%xmm7, %%xmm4   \n\t"\
  1.1210 +        QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP) /* each step loads one new row and emits one output row; register roles rotate */\
  1.1211 +        QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
  1.1212 +        QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
  1.1213 +        QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
  1.1214 +        QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
  1.1215 +        QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
  1.1216 +        QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
  1.1217 +        QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
  1.1218 +         \
  1.1219 +        : "+a"(src), "+c"(dst)\
  1.1220 +        : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
  1.1221 +        : "memory"\
  1.1222 +    );\
  1.1223 +    if(h==16){\
  1.1224 +        __asm__ volatile(\
  1.1225 +            QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP) /* NOTE(review): continues from the xmm0-xmm7 contents left by the previous asm statement */\
  1.1226 +            QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
  1.1227 +            QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
  1.1228 +            QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
  1.1229 +            QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
  1.1230 +            QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
  1.1231 +            QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
  1.1232 +            QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
  1.1233 +            \
  1.1234 +            : "+a"(src), "+c"(dst)\
  1.1235 +            : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
  1.1236 +            : "memory"\
  1.1237 +        );\
  1.1238 +    }\
  1.1239 +}\
  1.1240 +static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* 8x8 vertical lowpass */\
  1.1241 +    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 8); /* one 8-wide column, 8 rows */\
  1.1242 +}\
  1.1243 +static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* 16x16 vertical lowpass as two 8-wide columns of 16 rows */\
  1.1244 +    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 16); /* columns 0..7 */\
  1.1245 +    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16); /* columns 8..15 */\
  1.1246 +}
  1.1247 +
  1.1248 +static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){ /* vertical 6-tap pass writing unshifted 16-bit sums to tmp; tmpStride is not used, tmp rows are 48 bytes apart */
  1.1249 +    int w = (size+8)>>3; /* number of 8-column strips: 2 for size 8, 3 for size 16 */
  1.1250 +    src -= 2*srcStride+2; /* start two rows up and two columns left of the block */
  1.1251 +    while(w--){
  1.1252 +        __asm__ volatile(
  1.1253 +            "pxor %%xmm7, %%xmm7        \n\t" /* xmm7 = 0 for unpacking */
  1.1254 +            "movq (%0), %%xmm0          \n\t" /* preload five source rows into xmm0..xmm4 */
  1.1255 +            "add %2, %0                 \n\t"
  1.1256 +            "movq (%0), %%xmm1          \n\t"
  1.1257 +            "add %2, %0                 \n\t"
  1.1258 +            "movq (%0), %%xmm2          \n\t"
  1.1259 +            "add %2, %0                 \n\t"
  1.1260 +            "movq (%0), %%xmm3          \n\t"
  1.1261 +            "add %2, %0                 \n\t"
  1.1262 +            "movq (%0), %%xmm4          \n\t"
  1.1263 +            "add %2, %0                 \n\t"
  1.1264 +            "punpcklbw %%xmm7, %%xmm0   \n\t" /* widen the rows to words */
  1.1265 +            "punpcklbw %%xmm7, %%xmm1   \n\t"
  1.1266 +            "punpcklbw %%xmm7, %%xmm2   \n\t"
  1.1267 +            "punpcklbw %%xmm7, %%xmm3   \n\t"
  1.1268 +            "punpcklbw %%xmm7, %%xmm4   \n\t"
  1.1269 +            QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 0*48) /* output row k is stored at byte offset k*48 from tmp */
  1.1270 +            QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 1*48)
  1.1271 +            QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 2*48)
  1.1272 +            QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 3*48)
  1.1273 +            QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 4*48)
  1.1274 +            QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 5*48)
  1.1275 +            QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 6*48)
  1.1276 +            QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 7*48)
  1.1277 +            : "+a"(src)
  1.1278 +            : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
  1.1279 +            : "memory"
  1.1280 +        );
  1.1281 +        if(size==16){
  1.1282 +            __asm__ volatile(
  1.1283 +                QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1,  8*48) /* NOTE(review): continues from the xmm0-xmm7 contents left by the previous asm statement */
  1.1284 +                QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2,  9*48)
  1.1285 +                QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48)
  1.1286 +                QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48)
  1.1287 +                QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48)
  1.1288 +                QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 13*48)
  1.1289 +                QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 14*48)
  1.1290 +                QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 15*48)
  1.1291 +                : "+a"(src)
  1.1292 +                : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
  1.1293 +                : "memory"
  1.1294 +            );
  1.1295 +        }
  1.1296 +        tmp += 8; /* next strip: 8 int16 columns to the right */
  1.1297 +        src += 8 - (size+5)*srcStride; /* the asm advanced src by size+5 rows; rewind them and step 8 columns right */
  1.1298 +    }
  1.1299 +}
  1.1300 +
  1.1301 +#define QPEL_H264_HV2_XMM(OPNAME, OP, MMX) /* Defines OPNAME##h264_qpel8or16_hv2_lowpass_##MMX: second pass that filters along each 16-bit row of tmp and writes pixels to dst through OP. */\
  1.1302 +static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){ /* tmpStride is not referenced below: tmp advances by a fixed 48 bytes per row */\
  1.1303 +    int h = size; /* row counter, decremented inside the asm loop */\
  1.1304 +    if(size == 16){ /* 16 pixels per row: reads all 24 words of each tmp row */\
  1.1305 +        __asm__ volatile(\
  1.1306 +            "1:                         \n\t"\
  1.1307 +            "movdqa 32(%0), %%xmm4      \n\t" /* words 16..23 of the tmp row */\
  1.1308 +            "movdqa 16(%0), %%xmm5      \n\t" /* words 8..15 */\
  1.1309 +            "movdqa   (%0), %%xmm7      \n\t" /* words 0..7 */\
  1.1310 +            "movdqa %%xmm4, %%xmm3      \n\t"\
  1.1311 +            "movdqa %%xmm4, %%xmm2      \n\t"\
  1.1312 +            "movdqa %%xmm4, %%xmm1      \n\t"\
  1.1313 +            "movdqa %%xmm4, %%xmm0      \n\t"\
  1.1314 +            "palignr $10, %%xmm5, %%xmm0 \n\t" /* xmm0..xmm4 become windows starting 5,4,3,2,1 words past xmm5 */\
  1.1315 +            "palignr  $8, %%xmm5, %%xmm1 \n\t"\
  1.1316 +            "palignr  $6, %%xmm5, %%xmm2 \n\t"\
  1.1317 +            "palignr  $4, %%xmm5, %%xmm3 \n\t"\
  1.1318 +            "palignr  $2, %%xmm5, %%xmm4 \n\t"\
  1.1319 +            "paddw  %%xmm5, %%xmm0      \n\t" /* high half: a = t[i] + t[i+5] */\
  1.1320 +            "paddw  %%xmm4, %%xmm1      \n\t" /* high half: b = t[i+1] + t[i+4] */\
  1.1321 +            "paddw  %%xmm3, %%xmm2      \n\t" /* high half: c = t[i+2] + t[i+3] */\
  1.1322 +            "movdqa %%xmm5, %%xmm6      \n\t"\
  1.1323 +            "movdqa %%xmm5, %%xmm4      \n\t"\
  1.1324 +            "movdqa %%xmm5, %%xmm3      \n\t"\
  1.1325 +            "palignr  $8, %%xmm7, %%xmm4 \n\t"\
  1.1326 +            "palignr  $2, %%xmm7, %%xmm6 \n\t"\
  1.1327 +            "palignr $10, %%xmm7, %%xmm3 \n\t"\
  1.1328 +            "paddw  %%xmm6, %%xmm4      \n\t" /* low half: b */\
  1.1329 +            "movdqa %%xmm5, %%xmm6      \n\t"\
  1.1330 +            "palignr  $6, %%xmm7, %%xmm5 \n\t"\
  1.1331 +            "palignr  $4, %%xmm7, %%xmm6 \n\t"\
  1.1332 +            "paddw  %%xmm7, %%xmm3      \n\t" /* low half: a */\
  1.1333 +            "paddw  %%xmm6, %%xmm5      \n\t" /* low half: c */\
  1.1334 +            \
  1.1335 +            "psubw  %%xmm1, %%xmm0      \n\t" /* both halves: ((a-b)>>2 - b + c)>>2 + c, i.e. (a - 5b + 20c)/16 up to truncation, then >>6 */\
  1.1336 +            "psubw  %%xmm4, %%xmm3      \n\t"\
  1.1337 +            "psraw      $2, %%xmm0      \n\t"\
  1.1338 +            "psraw      $2, %%xmm3      \n\t"\
  1.1339 +            "psubw  %%xmm1, %%xmm0      \n\t"\
  1.1340 +            "psubw  %%xmm4, %%xmm3      \n\t"\
  1.1341 +            "paddw  %%xmm2, %%xmm0      \n\t"\
  1.1342 +            "paddw  %%xmm5, %%xmm3      \n\t"\
  1.1343 +            "psraw      $2, %%xmm0      \n\t"\
  1.1344 +            "psraw      $2, %%xmm3      \n\t"\
  1.1345 +            "paddw  %%xmm2, %%xmm0      \n\t"\
  1.1346 +            "paddw  %%xmm5, %%xmm3      \n\t"\
  1.1347 +            "psraw      $6, %%xmm0      \n\t"\
  1.1348 +            "psraw      $6, %%xmm3      \n\t"\
  1.1349 +            "packuswb %%xmm0, %%xmm3    \n\t" /* xmm3 = 16 unsigned-saturated bytes: low 8 from xmm3, high 8 from xmm0 */\
  1.1350 +            OP(%%xmm3, (%1), %%xmm7, dqa)\
  1.1351 +            "add $48, %0                \n\t" /* next tmp row */\
  1.1352 +            "add %3, %1                 \n\t" /* next dst row */\
  1.1353 +            "decl %2                    \n\t"\
  1.1354 +            " jnz 1b                    \n\t"\
  1.1355 +            : "+a"(tmp), "+c"(dst), "+g"(h)\
  1.1356 +            : "S"((x86_reg)dstStride)\
  1.1357 +            : "memory"\
  1.1358 +        );\
  1.1359 +    }else{ /* narrower path: reads only words 0..15 of each tmp row and stores 8 bytes */\
  1.1360 +        __asm__ volatile(\
  1.1361 +            "1:                         \n\t"\
  1.1362 +            "movdqa 16(%0), %%xmm1      \n\t" /* words 8..15 */\
  1.1363 +            "movdqa   (%0), %%xmm0      \n\t" /* words 0..7 */\
  1.1364 +            "movdqa %%xmm1, %%xmm2      \n\t"\
  1.1365 +            "movdqa %%xmm1, %%xmm3      \n\t"\
  1.1366 +            "movdqa %%xmm1, %%xmm4      \n\t"\
  1.1367 +            "movdqa %%xmm1, %%xmm5      \n\t"\
  1.1368 +            "palignr $10, %%xmm0, %%xmm5 \n\t" /* xmm5..xmm1 become windows starting 5,4,3,2,1 words past xmm0 */\
  1.1369 +            "palignr  $8, %%xmm0, %%xmm4 \n\t"\
  1.1370 +            "palignr  $6, %%xmm0, %%xmm3 \n\t"\
  1.1371 +            "palignr  $4, %%xmm0, %%xmm2 \n\t"\
  1.1372 +            "palignr  $2, %%xmm0, %%xmm1 \n\t"\
  1.1373 +            "paddw  %%xmm5, %%xmm0      \n\t" /* a = t[i] + t[i+5] */\
  1.1374 +            "paddw  %%xmm4, %%xmm1      \n\t" /* b = t[i+1] + t[i+4] */\
  1.1375 +            "paddw  %%xmm3, %%xmm2      \n\t" /* c = t[i+2] + t[i+3] */\
  1.1376 +            "psubw  %%xmm1, %%xmm0      \n\t" /* same staged evaluation as the 16-wide path */\
  1.1377 +            "psraw      $2, %%xmm0      \n\t"\
  1.1378 +            "psubw  %%xmm1, %%xmm0      \n\t"\
  1.1379 +            "paddw  %%xmm2, %%xmm0      \n\t"\
  1.1380 +            "psraw      $2, %%xmm0      \n\t"\
  1.1381 +            "paddw  %%xmm2, %%xmm0      \n\t"\
  1.1382 +            "psraw      $6, %%xmm0      \n\t"\
  1.1383 +            "packuswb %%xmm0, %%xmm0    \n\t"\
  1.1384 +            OP(%%xmm0, (%1), %%xmm7, q)\
  1.1385 +            "add $48, %0                \n\t" /* tmp rows are still 48 bytes apart */\
  1.1386 +            "add %3, %1                 \n\t"\
  1.1387 +            "decl %2                    \n\t"\
  1.1388 +            " jnz 1b                    \n\t"\
  1.1389 +            : "+a"(tmp), "+c"(dst), "+g"(h)\
  1.1390 +            : "S"((x86_reg)dstStride)\
  1.1391 +            : "memory"\
  1.1392 +        );\
  1.1393 +    }\
  1.1394 +}
  1.1395 +
  1.1396 +#define QPEL_H264_HV_XMM(OPNAME, OP, MMX) /* Defines the two-pass HV lowpass entry point (hv1 fills tmp from src, hv2 filters tmp into dst) plus fixed-size 8 and 16 wrappers. */\
  1.1397 +static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
  1.1398 +          put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size); /* first pass is always the put_ sse2 variant, whatever OPNAME and MMX are */\
  1.1399 +    OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size); /* second pass applies OPNAME (put_ or avg_) when writing dst */\
  1.1400 +}\
  1.1401 +static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){ /* size fixed to 8 */\
  1.1402 +    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
  1.1403 +}\
  1.1404 +static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){ /* size fixed to 16 */\
  1.1405 +    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
  1.1406 +}\
  1.1407 +
  1.1408 +#define put_pixels8_l2_sse2 put_pixels8_l2_mmx2 /* the sse2/ssse3 names in this group resolve to the mmx2 implementations, so the token-pasted H264_MC_* macros can be expanded with MMX = sse2 or ssse3 */
  1.1409 +#define avg_pixels8_l2_sse2 avg_pixels8_l2_mmx2
  1.1410 +#define put_pixels16_l2_sse2 put_pixels16_l2_mmx2
  1.1411 +#define avg_pixels16_l2_sse2 avg_pixels16_l2_mmx2
  1.1412 +#define put_pixels8_l2_ssse3 put_pixels8_l2_mmx2
  1.1413 +#define avg_pixels8_l2_ssse3 avg_pixels8_l2_mmx2
  1.1414 +#define put_pixels16_l2_ssse3 put_pixels16_l2_mmx2
  1.1415 +#define avg_pixels16_l2_ssse3 avg_pixels16_l2_mmx2
  1.1416 +
  1.1417 +#define put_pixels8_l2_shift5_sse2 put_pixels8_l2_shift5_mmx2 /* l2_shift5 helpers: sse2/ssse3 names also map to mmx2 */
  1.1418 +#define avg_pixels8_l2_shift5_sse2 avg_pixels8_l2_shift5_mmx2
  1.1419 +#define put_pixels16_l2_shift5_sse2 put_pixels16_l2_shift5_mmx2
  1.1420 +#define avg_pixels16_l2_shift5_sse2 avg_pixels16_l2_shift5_mmx2
  1.1421 +#define put_pixels8_l2_shift5_ssse3 put_pixels8_l2_shift5_mmx2
  1.1422 +#define avg_pixels8_l2_shift5_ssse3 avg_pixels8_l2_shift5_mmx2
  1.1423 +#define put_pixels16_l2_shift5_ssse3 put_pixels16_l2_shift5_mmx2
  1.1424 +#define avg_pixels16_l2_shift5_ssse3 avg_pixels16_l2_shift5_mmx2
  1.1425 +
  1.1426 +#define put_h264_qpel8_h_lowpass_l2_sse2 put_h264_qpel8_h_lowpass_l2_mmx2 /* sse2 h_lowpass_l2 names map to mmx2 */
  1.1427 +#define avg_h264_qpel8_h_lowpass_l2_sse2 avg_h264_qpel8_h_lowpass_l2_mmx2
  1.1428 +#define put_h264_qpel16_h_lowpass_l2_sse2 put_h264_qpel16_h_lowpass_l2_mmx2
  1.1429 +#define avg_h264_qpel16_h_lowpass_l2_sse2 avg_h264_qpel16_h_lowpass_l2_mmx2
  1.1430 +
  1.1431 +#define put_h264_qpel8_v_lowpass_ssse3 put_h264_qpel8_v_lowpass_sse2 /* ssse3 vertical lowpass names map to the sse2 versions */
  1.1432 +#define avg_h264_qpel8_v_lowpass_ssse3 avg_h264_qpel8_v_lowpass_sse2
  1.1433 +#define put_h264_qpel16_v_lowpass_ssse3 put_h264_qpel16_v_lowpass_sse2
  1.1434 +#define avg_h264_qpel16_v_lowpass_ssse3 avg_h264_qpel16_v_lowpass_sse2
  1.1435 +
  1.1436 +#define put_h264_qpel8or16_hv2_lowpass_sse2 put_h264_qpel8or16_hv2_lowpass_mmx2 /* sse2 hv second pass maps to mmx2; only ssse3 gets its own via QPEL_H264_HV2_XMM */
  1.1437 +#define avg_h264_qpel8or16_hv2_lowpass_sse2 avg_h264_qpel8or16_hv2_lowpass_mmx2
  1.1438 +
  1.1439 +#define H264_MC(OPNAME, SIZE, MMX, ALIGN) /* Expands all four mcXY function groups (C, V, H, HV) for one OPNAME/SIZE/MMX/ALIGN combination. */ \
  1.1440 +H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
  1.1441 +H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
  1.1442 +H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
  1.1443 +H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\
  1.1444 +
  1.1445 +// static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
  1.1446 +//     put_pixels16_sse2(dst, src, stride, 16);
  1.1447 +// }
  1.1448 +// static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
  1.1449 +//     avg_pixels16_sse2(dst, src, stride, 16);
  1.1450 +// }
  1.1451 +#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmx2 /* 8-wide mc00 has no sse2 version of its own: the name maps to mmx2 */
  1.1452 +#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmx2
  1.1453 +
  1.1454 +#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) /* Defines the mc00 function: forwards to OPNAME##pixels##SIZE##_##MMX with SIZE as the last argument; ALIGN is unused here. */ \
  1.1455 +static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
  1.1456 +    OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
  1.1457 +}\
  1.1458 +
  1.1459 +#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) /* Defines mc10/mc20/mc30, all built on the horizontal lowpass helpers; ALIGN is unused here. */ \
  1.1460 +static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1.1461 +    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride); /* second source is src itself; presumably combined with the lowpass result by the _l2 helper (defined elsewhere) */\
  1.1462 +}\
  1.1463 +\
  1.1464 +static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1.1465 +    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride); /* plain horizontal lowpass, no second source */\
  1.1466 +}\
  1.1467 +\
  1.1468 +static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1.1469 +    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride); /* as mc10, but the second source is one pixel to the right */\
  1.1470 +}\
  1.1471 +
  1.1472 +#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) /* Defines mc01/mc02/mc03, all built on the vertical lowpass helper; ALIGN sets the alignment of the stack scratch buffer. */ \
  1.1473 +static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1.1474 +    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE]; /* SIZE x SIZE scratch block, row stride SIZE */\
  1.1475 +    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride); /* always put_: temp is only an intermediate */\
  1.1476 +    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE); /* two-source helper fed with src and temp */\
  1.1477 +}\
  1.1478 +\
  1.1479 +static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1.1480 +    OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride); /* vertical lowpass written straight to dst */\
  1.1481 +}\
  1.1482 +\
  1.1483 +static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1.1484 +    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
  1.1485 +    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
  1.1486 +    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE); /* as mc01, but the first source is one row down */\
  1.1487 +}\
  1.1488 +
  1.1489 +#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) /* Defines the ten diagonal/centre mcXY functions (mc11..mc33, mc22, mc21, mc23, mc12, mc32); ALIGN sets the alignment of the stack scratch buffers. */ \
  1.1490 +static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1.1491 +    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE]; /* SIZE x SIZE scratch block, row stride SIZE */\
  1.1492 +    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride); /* vertical lowpass of src into temp */\
  1.1493 +    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE); /* horizontal lowpass of src with temp as second source */\
  1.1494 +}\
  1.1495 +\
  1.1496 +static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1.1497 +    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
  1.1498 +    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride); /* vertical pass taken one pixel to the right */\
  1.1499 +    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
  1.1500 +}\
  1.1501 +\
  1.1502 +static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1.1503 +    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
  1.1504 +    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
  1.1505 +    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE); /* horizontal pass taken one row down */\
  1.1506 +}\
  1.1507 +\
  1.1508 +static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1.1509 +    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
  1.1510 +    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride); /* vertical pass one pixel right ... */\
  1.1511 +    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE); /* ... horizontal pass one row down */\
  1.1512 +}\
  1.1513 +\
  1.1514 +static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1.1515 +    DECLARE_ALIGNED(ALIGN, uint16_t, temp)[SIZE*(SIZE<8?12:24)]; /* 16-bit scratch for the HV first pass: 12 words per row when SIZE<8, else 24 */\
  1.1516 +    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
  1.1517 +}\
  1.1518 +\
  1.1519 +static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1.1520 +    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]; /* one buffer, split below into halfHV bytes followed by halfV words */\
  1.1521 +    uint8_t * const halfHV= temp;\
  1.1522 +    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
  1.1523 +    assert(((uintptr_t)temp & 7) == 0); /* uintptr_t, not int: a pointer need not fit in int on 64-bit targets */\
  1.1524 +    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride); /* halfV is the 16-bit scratch, halfHV receives the 8-bit result */\
  1.1525 +    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
  1.1526 +}\
  1.1527 +\
  1.1528 +static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1.1529 +    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
  1.1530 +    uint8_t * const halfHV= temp;\
  1.1531 +    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
  1.1532 +    assert(((uintptr_t)temp & 7) == 0); /* 8-byte alignment check on the scratch buffer */\
  1.1533 +    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
  1.1534 +    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE); /* as mc21, but the horizontal pass is one row down */\
  1.1535 +}\
  1.1536 +\
  1.1537 +static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1.1538 +    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
  1.1539 +    uint8_t * const halfHV= temp;\
  1.1540 +    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
  1.1541 +    assert(((uintptr_t)temp & 7) == 0); /* 8-byte alignment check on the scratch buffer */\
  1.1542 +    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
  1.1543 +    OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+2, halfHV, stride, SIZE, SIZE); /* reuses the 16-bit first-pass data starting 2 words in, together with halfHV */\
  1.1544 +}\
  1.1545 +\
  1.1546 +static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1.1547 +    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
  1.1548 +    uint8_t * const halfHV= temp;\
  1.1549 +    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
  1.1550 +    assert(((uintptr_t)temp & 7) == 0); /* 8-byte alignment check on the scratch buffer */\
  1.1551 +    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
  1.1552 +    OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+3, halfHV, stride, SIZE, SIZE); /* as mc12, but starting 3 words in */\
  1.1553 +}\
  1.1554 +
  1.1555 +#define H264_MC_4816(MMX) /* Expands H264_MC for put_ and avg_ at sizes 4, 8 and 16, with ALIGN = 8. */\
  1.1556 +H264_MC(put_, 4, MMX, 8)\
  1.1557 +H264_MC(put_, 8, MMX, 8)\
  1.1558 +H264_MC(put_, 16,MMX, 8)\
  1.1559 +H264_MC(avg_, 4, MMX, 8)\
  1.1560 +H264_MC(avg_, 8, MMX, 8)\
  1.1561 +H264_MC(avg_, 16,MMX, 8)\
  1.1562 +
  1.1563 +#define H264_MC_816(QPEL, XMM) /* Expands the given generator macro QPEL for put_ and avg_ at sizes 8 and 16, with ALIGN = 16. */\
  1.1564 +QPEL(put_, 8, XMM, 16)\
  1.1565 +QPEL(put_, 16,XMM, 16)\
  1.1566 +QPEL(avg_, 8, XMM, 16)\
  1.1567 +QPEL(avg_, 16,XMM, 16)\
  1.1568 +
  1.1569 +
  1.1570 +#define AVG_3DNOW_OP(a,b,temp, size) /* asm fragment: load b into temp, pavgusb temp into a, store a back to b; size is the mov suffix */ \
  1.1571 +"mov" #size " " #b ", " #temp "   \n\t"\
  1.1572 +"pavgusb " #temp ", " #a "        \n\t"\
  1.1573 +"mov" #size " " #a ", " #b "      \n\t"
  1.1574 +#define AVG_MMX2_OP(a,b,temp, size) /* asm fragment: load b into temp, pavgb temp into a, store a back to b; size is the mov suffix */ \
  1.1575 +"mov" #size " " #b ", " #temp "   \n\t"\
  1.1576 +"pavgb " #temp ", " #a "          \n\t"\
  1.1577 +"mov" #size " " #a ", " #b "      \n\t"
  1.1578 +
  1.1579 +// NOTE: 3DNow! is not detected correctly (presumably by the build configuration); uncomment the two QPEL_H264 lines below on an AMD machine.
  1.1580 +#ifdef HAVE_AMD3DNOW
  1.1581 +#define PAVGB "pavgusb"
  1.1582 +//QPEL_H264(put_,       PUT_OP, 3dnow)
  1.1583 +//QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow)
  1.1584 +#undef PAVGB /* with both instantiations commented out, this block only defines and undefines PAVGB */
  1.1585 +#endif
  1.1586 +
  1.1587 +#define PAVGB "pavgb" /* averaging mnemonic in effect for the mmx2/sse2/ssse3 instantiations below */
  1.1588 +QPEL_H264(put_,       PUT_OP, mmx2)
  1.1589 +QPEL_H264(avg_,  AVG_MMX2_OP, mmx2)
  1.1590 +QPEL_H264_V_XMM(put_,       PUT_OP, sse2)
  1.1591 +QPEL_H264_V_XMM(avg_,  AVG_MMX2_OP, sse2)
  1.1592 +QPEL_H264_HV_XMM(put_,       PUT_OP, sse2) /* its hv2 pass resolves to the mmx2 one through the alias #defines */
  1.1593 +QPEL_H264_HV_XMM(avg_,  AVG_MMX2_OP, sse2)
  1.1594 +#if HAVE_SSSE3
  1.1595 +QPEL_H264_H_XMM(put_,       PUT_OP, ssse3)
  1.1596 +QPEL_H264_H_XMM(avg_,  AVG_MMX2_OP, ssse3)
  1.1597 +QPEL_H264_HV2_XMM(put_,       PUT_OP, ssse3) /* must come before QPEL_H264_HV_XMM(..., ssse3), which calls it */
  1.1598 +QPEL_H264_HV2_XMM(avg_,  AVG_MMX2_OP, ssse3)
  1.1599 +QPEL_H264_HV_XMM(put_,       PUT_OP, ssse3)
  1.1600 +QPEL_H264_HV_XMM(avg_,  AVG_MMX2_OP, ssse3)
  1.1601 +#endif
  1.1602 +#undef PAVGB
  1.1603 +
  1.1604 +H264_MC_816(H264_MC_V, sse2) /* sse2 gets the V and HV groups at sizes 8 and 16 */
  1.1605 +H264_MC_816(H264_MC_HV, sse2)
  1.1606 +#if HAVE_SSSE3
  1.1607 +H264_MC_816(H264_MC_H, ssse3) /* ssse3 gets the H and HV groups at sizes 8 and 16 */
  1.1608 +H264_MC_816(H264_MC_HV, ssse3)
  1.1609 +#endif
  1.1610 +
  1.1611 +/* rnd interleaved with rnd div 8, use p+1 to access rnd div 8 */
  1.1612 +DECLARE_ALIGNED(8, static const uint64_t, h264_rnd_reg)[4] = { /* each entry is one 16-bit value repeated in all four words */
  1.1613 +    0x0020002000200020ULL, 0x0004000400040004ULL, 0x001C001C001C001CULL, 0x0003000300030003ULL /* word values 32, 4, 28, 3 */
  1.1614 +};
  1.1615 +
  1.1616 +#if HAVE_SSSE3
  1.1617 +#define AVG_OP(X) /* first inclusion (put_): AVG_OP expands to nothing */
  1.1618 +#undef H264_CHROMA_MC8_TMPL
  1.1619 +#undef H264_CHROMA_MC4_TMPL
  1.1620 +#define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_ssse3
  1.1621 +#define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_ssse3
  1.1622 +#define H264_CHROMA_MC8_MV0 put_pixels8_mmx
  1.1623 +#include "dsputil_h264_template_ssse3.c"
  1.1624 +static void put_h264_chroma_mc8_ssse3_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
  1.1625 +{
  1.1626 +    put_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 1); /* fixes the template function's extra last argument to 1 */
  1.1627 +}
  1.1628 +
  1.1629 +#undef AVG_OP
  1.1630 +#undef H264_CHROMA_MC8_TMPL
  1.1631 +#undef H264_CHROMA_MC4_TMPL
  1.1632 +#undef H264_CHROMA_MC8_MV0
  1.1633 +#define AVG_OP(X) X /* second inclusion (avg_): AVG_OP keeps its argument */
  1.1634 +#define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_ssse3
  1.1635 +#define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_ssse3
  1.1636 +#define H264_CHROMA_MC8_MV0 avg_pixels8_mmx2
  1.1637 +#include "dsputil_h264_template_ssse3.c"
  1.1638 +static void avg_h264_chroma_mc8_ssse3_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
  1.1639 +{
  1.1640 +    avg_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 1); /* fixes the template function's extra last argument to 1 */
  1.1641 +}
  1.1642 +#undef AVG_OP
  1.1643 +#undef H264_CHROMA_MC8_TMPL
  1.1644 +#undef H264_CHROMA_MC4_TMPL
  1.1645 +#undef H264_CHROMA_MC8_MV0
  1.1646 +#endif
  1.1647 +
  1.1648 +/***********************************/
  1.1649 +/* weighted prediction */
  1.1650 +
  1.1651 +static inline void ff_h264_weight_WxH_mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset, int w, int h)
  1.1652 +{ /* In-place weighting of a w x h block: each pixel becomes (pixel*weight + offset) >> log2_denom, where offset is the pre-scaled value computed below, using 16-bit lanes and unsigned byte saturation on store. */
  1.1653 +    int x, y;
  1.1654 +    offset <<= log2_denom; /* scale the offset up by the denominator ... */
  1.1655 +    offset += (1 << log2_denom) >> 1; /* ... and add half the denominator for rounding */
  1.1656 +    __asm__ volatile( /* NOTE(review): mm4-mm7 are set up here with no clobber list; the loop below relies on them surviving between asm statements */
  1.1657 +        "movd    %0, %%mm4        \n\t"
  1.1658 +        "movd    %1, %%mm5        \n\t"
  1.1659 +        "movd    %2, %%mm6        \n\t" /* mm6 = shift count (log2_denom) */
  1.1660 +        "pshufw  $0, %%mm4, %%mm4 \n\t" /* mm4 = low 16 bits of weight in all four words */
  1.1661 +        "pshufw  $0, %%mm5, %%mm5 \n\t" /* mm5 = low 16 bits of offset in all four words */
  1.1662 +        "pxor    %%mm7, %%mm7     \n\t" /* mm7 = 0, used for byte->word unpacking */
  1.1663 +        :: "g"(weight), "g"(offset), "g"(log2_denom)
  1.1664 +    );
  1.1665 +    for(y=0; y<h; y+=2){ /* two rows per iteration */
  1.1666 +        for(x=0; x<w; x+=4){ /* four pixels per row per iteration */
  1.1667 +            __asm__ volatile(
  1.1668 +                "movd      %0,    %%mm0 \n\t" /* row y */
  1.1669 +                "movd      %1,    %%mm1 \n\t" /* row y+1 */
  1.1670 +                "punpcklbw %%mm7, %%mm0 \n\t"
  1.1671 +                "punpcklbw %%mm7, %%mm1 \n\t"
  1.1672 +                "pmullw    %%mm4, %%mm0 \n\t"
  1.1673 +                "pmullw    %%mm4, %%mm1 \n\t"
  1.1674 +                "paddsw    %%mm5, %%mm0 \n\t" /* signed-saturating add of the offset */
  1.1675 +                "paddsw    %%mm5, %%mm1 \n\t"
  1.1676 +                "psraw     %%mm6, %%mm0 \n\t"
  1.1677 +                "psraw     %%mm6, %%mm1 \n\t"
  1.1678 +                "packuswb  %%mm7, %%mm0 \n\t" /* back to bytes with unsigned saturation */
  1.1679 +                "packuswb  %%mm7, %%mm1 \n\t"
  1.1680 +                "movd      %%mm0, %0    \n\t"
  1.1681 +                "movd      %%mm1, %1    \n\t"
  1.1682 +                : "+m"(*(uint32_t*)(dst+x)),
  1.1683 +                  "+m"(*(uint32_t*)(dst+x+stride))
  1.1684 +            );
  1.1685 +        }
  1.1686 +        dst += 2*stride;
  1.1687 +    }
  1.1688 +}
  1.1689 +
  1.1690 +static inline void ff_h264_biweight_WxH_mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset, int w, int h)
  1.1691 +{ /* Bi-directional weighting of a w x h block: dst becomes (dst*weightd + src*weights + offset) >> (log2_denom+1), where offset is the pre-scaled value computed below, with unsigned byte saturation on store. */
  1.1692 +    int x, y;
  1.1693 +    offset = ((offset + 1) | 1) << log2_denom; /* combined offset/rounding term, pre-scaled by the denominator */
  1.1694 +    __asm__ volatile( /* NOTE(review): mm3-mm7 are set up here with no clobber list; the loop below relies on them surviving between asm statements */
  1.1695 +        "movd    %0, %%mm3        \n\t"
  1.1696 +        "movd    %1, %%mm4        \n\t"
  1.1697 +        "movd    %2, %%mm5        \n\t"
  1.1698 +        "movd    %3, %%mm6        \n\t" /* mm6 = shift count (log2_denom+1) */
  1.1699 +        "pshufw  $0, %%mm3, %%mm3 \n\t" /* mm3 = low 16 bits of weightd in all four words */
  1.1700 +        "pshufw  $0, %%mm4, %%mm4 \n\t" /* mm4 = low 16 bits of weights in all four words */
  1.1701 +        "pshufw  $0, %%mm5, %%mm5 \n\t" /* mm5 = low 16 bits of offset in all four words */
  1.1702 +        "pxor    %%mm7, %%mm7     \n\t" /* mm7 = 0, used for byte->word unpacking */
  1.1703 +        :: "g"(weightd), "g"(weights), "g"(offset), "g"(log2_denom+1)
  1.1704 +    );
  1.1705 +    for(y=0; y<h; y++){ /* one row per iteration */
  1.1706 +        for(x=0; x<w; x+=4){ /* four pixels per iteration */
  1.1707 +            __asm__ volatile(
  1.1708 +                "movd      %0,    %%mm0 \n\t" /* dst pixels */
  1.1709 +                "movd      %1,    %%mm1 \n\t" /* src pixels */
  1.1710 +                "punpcklbw %%mm7, %%mm0 \n\t"
  1.1711 +                "punpcklbw %%mm7, %%mm1 \n\t"
  1.1712 +                "pmullw    %%mm3, %%mm0 \n\t" /* dst * weightd */
  1.1713 +                "pmullw    %%mm4, %%mm1 \n\t" /* src * weights */
  1.1714 +                "paddsw    %%mm1, %%mm0 \n\t" /* signed-saturating sums */
  1.1715 +                "paddsw    %%mm5, %%mm0 \n\t"
  1.1716 +                "psraw     %%mm6, %%mm0 \n\t"
  1.1717 +                "packuswb  %%mm0, %%mm0 \n\t" /* back to bytes with unsigned saturation */
  1.1718 +                "movd      %%mm0, %0    \n\t"
  1.1719 +                : "+m"(*(uint32_t*)(dst+x))
  1.1720 +                :  "m"(*(uint32_t*)(src+x))
  1.1721 +            );
  1.1722 +        }
  1.1723 +        src += stride; /* src and dst share the same stride */
  1.1724 +        dst += stride;
  1.1725 +    }
  1.1726 +}
  1.1727 +
  1.1728 +#define H264_WEIGHT(W,H) /* Defines fixed-size ff_h264_biweight_WxH_mmx2 and ff_h264_weight_WxH_mmx2 wrappers that pass W and H to the generic WxH functions. */ \
  1.1729 +static void ff_h264_biweight_ ## W ## x ## H ## _mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
  1.1730 +    ff_h264_biweight_WxH_mmx2(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \
  1.1731 +} \
  1.1732 +static void ff_h264_weight_ ## W ## x ## H ## _mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset){ \
  1.1733 +    ff_h264_weight_WxH_mmx2(dst, stride, log2_denom, weight, offset, W, H); \
  1.1734 +}
  1.1735 +
  1.1736 +H264_WEIGHT(16,16) /* one biweight + weight wrapper pair per block size listed here */
  1.1737 +H264_WEIGHT(16, 8)
  1.1738 +H264_WEIGHT( 8,16)
  1.1739 +H264_WEIGHT( 8, 8)
  1.1740 +H264_WEIGHT( 8, 4)
  1.1741 +H264_WEIGHT( 4, 8)
  1.1742 +H264_WEIGHT( 4, 4)
  1.1743 +H264_WEIGHT( 4, 2) /* H = 2 still satisfies the weight loop's two-rows-per-iteration step */
  1.1744 +