diff libavcodec/arm/dsputil_neon.S @ 2:897f711a7157

rearrange to work with autoconf
author Nina Engelhardt <nengel@mailbox.tu-berlin.de>
date Tue, 25 Sep 2012 15:55:33 +0200
parents
children
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/libavcodec/arm/dsputil_neon.S	Tue Sep 25 15:55:33 2012 +0200
     1.3 @@ -0,0 +1,1146 @@
     1.4 +/*
     1.5 + * ARM NEON optimised DSP functions
     1.6 + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
     1.7 + *
     1.8 + * This file is part of FFmpeg.
     1.9 + *
    1.10 + * FFmpeg is free software; you can redistribute it and/or
    1.11 + * modify it under the terms of the GNU Lesser General Public
    1.12 + * License as published by the Free Software Foundation; either
    1.13 + * version 2.1 of the License, or (at your option) any later version.
    1.14 + *
    1.15 + * FFmpeg is distributed in the hope that it will be useful,
    1.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
    1.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    1.18 + * Lesser General Public License for more details.
    1.19 + *
    1.20 + * You should have received a copy of the GNU Lesser General Public
    1.21 + * License along with FFmpeg; if not, write to the Free Software
    1.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
    1.23 + */
    1.24 +
    1.25 +#include "config.h"
    1.26 +#include "asm.S"
    1.27 +
    1.28 +        preserve8
    1.29 +        .text
    1.30 +
    1.31 +        .macro pixels16 avg=0                   @ 16-byte rows from r1 to r0, 4 rows per pass; avg=1 blends with the rows already at r0
    1.32 +.if \avg
    1.33 +        mov             ip,  r0                 @ ip: separate read pointer over the rows at r0
    1.34 +.endif
    1.35 +1:      vld1.64         {d0, d1},  [r1], r2     @ load 4 rows of 16 bytes from r1, stepping by r2 after each
    1.36 +        vld1.64         {d2, d3},  [r1], r2
    1.37 +        vld1.64         {d4, d5},  [r1], r2
    1.38 +        pld             [r1, r2, lsl #2]        @ preload hints for rows ahead of the read pointer
    1.39 +        vld1.64         {d6, d7},  [r1], r2
    1.40 +        pld             [r1]
    1.41 +        pld             [r1, r2]
    1.42 +        pld             [r1, r2, lsl #1]
    1.43 +.if \avg
    1.44 +        vld1.64         {d16,d17}, [ip,:128], r2        @ row currently stored at the output address
    1.45 +        vrhadd.u8       q0,  q0,  q8                    @ per byte: (a + b + 1) >> 1
    1.46 +        vld1.64         {d18,d19}, [ip,:128], r2
    1.47 +        vrhadd.u8       q1,  q1,  q9
    1.48 +        vld1.64         {d20,d21}, [ip,:128], r2
    1.49 +        vrhadd.u8       q2,  q2,  q10
    1.50 +        vld1.64         {d22,d23}, [ip,:128], r2
    1.51 +        vrhadd.u8       q3,  q3,  q11
    1.52 +.endif
    1.53 +        subs            r3,  r3,  #4            @ r3: rows remaining; flags are consumed by bne below
    1.54 +        vst1.64         {d0, d1},  [r0,:128], r2        @ store 4 rows; r0 carries a 128-bit alignment hint
    1.55 +        vst1.64         {d2, d3},  [r0,:128], r2
    1.56 +        vst1.64         {d4, d5},  [r0,:128], r2
    1.57 +        vst1.64         {d6, d7},  [r0,:128], r2
    1.58 +        bne             1b                      @ exits only when r3 reaches exactly 0, so r3 must be a positive multiple of 4
    1.59 +        bx              lr
    1.60 +        .endm
    1.61 +
    1.62 +        .macro pixels16_x2 vhadd=vrhadd.u8      @ average every byte with its right-hand neighbour; 16 bytes wide, 2 rows per pass
    1.63 +1:      vld1.64         {d0-d2},   [r1], r2     @ 24 bytes are loaded so that byte 16 is available as a neighbour
    1.64 +        vld1.64         {d4-d6},   [r1], r2
    1.65 +        pld             [r1]
    1.66 +        pld             [r1, r2]
    1.67 +        subs            r3,  r3,  #2            @ r3: rows remaining; flags are consumed by bne below
    1.68 +        vext.8          q1,  q0,  q1,  #1       @ q1 = bytes 1..16 of the first row
    1.69 +        \vhadd          q0,  q0,  q1            @ halving add; the default op rounds, the caller may pass a truncating one
    1.70 +        vext.8          q3,  q2,  q3,  #1       @ q3 = bytes 1..16 of the second row
    1.71 +        \vhadd          q2,  q2,  q3
    1.72 +        vst1.64         {d0, d1},  [r0,:128], r2
    1.73 +        vst1.64         {d4, d5},  [r0,:128], r2
    1.74 +        bne             1b                      @ exits only when r3 reaches exactly 0
    1.75 +        bx              lr
    1.76 +        .endm
    1.77 +
    1.78 +        .macro pixels16_y2 vhadd=vrhadd.u8      @ average every row with the row below it; 16 bytes wide, 2 output rows per pass
    1.79 +        vld1.64         {d0, d1},  [r1], r2     @ q0 = first input row
    1.80 +        vld1.64         {d2, d3},  [r1], r2     @ q1 = second input row
    1.81 +1:      subs            r3,  r3,  #2            @ r3: output rows remaining; flags are consumed by bne below
    1.82 +        \vhadd          q2,  q0,  q1            @ output row n from input rows n and n+1
    1.83 +        vld1.64         {d0, d1},  [r1], r2     @ q0 = input row n+2
    1.84 +        \vhadd          q3,  q0,  q1            @ output row n+1 from input rows n+1 and n+2
    1.85 +        vld1.64         {d2, d3},  [r1], r2     @ q1 = input row n+3; on the final pass this row is loaded but never used
    1.86 +        pld             [r1]
    1.87 +        pld             [r1, r2]
    1.88 +        vst1.64         {d4, d5},  [r0,:128], r2
    1.89 +        vst1.64         {d6, d7},  [r0,:128], r2
    1.90 +        bne             1b                      @ exits only when r3 reaches exactly 0
    1.91 +        bx              lr
    1.92 +        .endm
    1.93 +
    1.94 +        .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0   @ each output byte = sum of a 2x2 input neighbourhood, shifted right by 2; 16 wide, 2 rows per pass
    1.95 +        vld1.64         {d0-d2},   [r1], r2     @ first input row, 24 bytes so that byte 16 is available
    1.96 +        vld1.64         {d4-d6},   [r1], r2     @ second input row
    1.97 +.if \no_rnd
    1.98 +        vmov.i16        q13, #1                 @ bias of 1 per 16-bit lane, added before the narrowing shift
    1.99 +.endif
   1.100 +        pld             [r1]
   1.101 +        pld             [r1, r2]
   1.102 +        vext.8          q1,  q0,  q1,  #1       @ q1 = first row shifted left by one byte
   1.103 +        vext.8          q3,  q2,  q3,  #1       @ q3 = second row shifted left by one byte
   1.104 +        vaddl.u8        q8,  d0,  d2            @ q8 /q10: 16-bit horizontal pair sums of the first row (low / high 8 bytes)
   1.105 +        vaddl.u8        q10, d1,  d3
   1.106 +        vaddl.u8        q9,  d4,  d6            @ q9 /q11: the same for the second row
   1.107 +        vaddl.u8        q11, d5,  d7
   1.108 +1:      subs            r3,  r3,  #2            @ r3: output rows remaining; flags are consumed by bgt at the end
   1.109 +        vld1.64         {d0-d2},   [r1], r2     @ next input row
   1.110 +        vadd.u16        q12, q8,  q9            @ 2x2 sums, low half of the output row
   1.111 +        pld             [r1]
   1.112 +.if \no_rnd
   1.113 +        vadd.u16        q12, q12, q13
   1.114 +.endif
   1.115 +        vext.8          q15, q0,  q1,  #1       @ q15 = new row shifted left by one byte
   1.116 +        vadd.u16        q1 , q10, q11           @ 2x2 sums, high half; q1 is free again after the vext above
   1.117 +        \vshrn          d28, q12, #2            @ narrow to bytes with a shift by 2
   1.118 +.if \no_rnd
   1.119 +        vadd.u16        q1,  q1,  q13
   1.120 +.endif
   1.121 +        \vshrn          d29, q1,  #2
   1.122 +        vaddl.u8        q8,  d0,  d30           @ refresh q8 /q10 with the pair sums of the new row
   1.123 +        vld1.64         {d2-d4},   [r1], r2     @ following input row goes to d2-d4
   1.124 +        vaddl.u8        q10, d1,  d31
   1.125 +        vst1.64         {d28,d29}, [r0,:128], r2        @ first output row of this pass
   1.126 +        vadd.u16        q12, q8,  q9
   1.127 +        pld             [r1, r2]
   1.128 +.if \no_rnd
   1.129 +        vadd.u16        q12, q12, q13
   1.130 +.endif
   1.131 +        vext.8          q2,  q1,  q2,  #1       @ q2 = row in q1 shifted left by one byte
   1.132 +        vadd.u16        q0,  q10, q11
   1.133 +        \vshrn          d30, q12, #2
   1.134 +.if \no_rnd
   1.135 +        vadd.u16        q0,  q0,  q13
   1.136 +.endif
   1.137 +        \vshrn          d31, q0,  #2
   1.138 +        vaddl.u8        q9,  d2,  d4            @ refresh q9 /q11 with the pair sums of the row just loaded
   1.139 +        vaddl.u8        q11, d3,  d5
   1.140 +        vst1.64         {d30,d31}, [r0,:128], r2        @ second output row of this pass
   1.141 +        bgt             1b
   1.142 +        bx              lr
   1.143 +        .endm
   1.144 +
   1.145 +        .macro pixels8 avg=0                    @ 8-byte rows from r1 to r0, 4 rows per pass; avg=1 blends with the rows already at r0
   1.146 +1:      vld1.64         {d0}, [r1], r2          @ load 4 rows of 8 bytes from r1, stepping by r2 after each
   1.147 +        vld1.64         {d1}, [r1], r2
   1.148 +        vld1.64         {d2}, [r1], r2
   1.149 +        pld             [r1, r2, lsl #2]        @ preload hints for rows ahead of the read pointer
   1.150 +        vld1.64         {d3}, [r1], r2
   1.151 +        pld             [r1]
   1.152 +        pld             [r1, r2]
   1.153 +        pld             [r1, r2, lsl #1]
   1.154 +.if \avg
   1.155 +        vld1.64         {d4}, [r0,:64], r2      @ row currently stored at the output address; r0 itself is advanced here
   1.156 +        vrhadd.u8       d0,  d0,  d4            @ per byte: (a + b + 1) >> 1
   1.157 +        vld1.64         {d5}, [r0,:64], r2
   1.158 +        vrhadd.u8       d1,  d1,  d5
   1.159 +        vld1.64         {d6}, [r0,:64], r2
   1.160 +        vrhadd.u8       d2,  d2,  d6
   1.161 +        vld1.64         {d7}, [r0,:64], r2
   1.162 +        vrhadd.u8       d3,  d3,  d7
   1.163 +        sub             r0,  r0,  r2,  lsl #2   @ rewind r0 by the 4 rows just read so the stores overwrite them
   1.164 +.endif
   1.165 +        subs            r3,  r3,  #4            @ r3: rows remaining; flags are consumed by bne below
   1.166 +        vst1.64         {d0}, [r0,:64], r2      @ store 4 rows; r0 carries a 64-bit alignment hint
   1.167 +        vst1.64         {d1}, [r0,:64], r2
   1.168 +        vst1.64         {d2}, [r0,:64], r2
   1.169 +        vst1.64         {d3}, [r0,:64], r2
   1.170 +        bne             1b                      @ exits only when r3 reaches exactly 0, so r3 must be a positive multiple of 4
   1.171 +        bx              lr
   1.172 +        .endm
   1.173 +
   1.174 +        .macro pixels8_x2 vhadd=vrhadd.u8       @ average every byte with its right-hand neighbour; 8 bytes wide, 2 rows per pass
   1.175 +1:      vld1.64         {d0, d1},  [r1], r2     @ 16 bytes are loaded so that byte 8 is available as a neighbour
   1.176 +        vext.8          d1,  d0,  d1,  #1       @ d1 = bytes 1..8 of the first row
   1.177 +        vld1.64         {d2, d3},  [r1], r2
   1.178 +        vext.8          d3,  d2,  d3,  #1       @ d3 = bytes 1..8 of the second row
   1.179 +        pld             [r1]
   1.180 +        pld             [r1, r2]
   1.181 +        subs            r3,  r3,  #2            @ r3: rows remaining; flags are consumed by bne below
   1.182 +        vswp            d1,  d2                 @ regroup: q0 = both rows, q1 = both shifted rows
   1.183 +        \vhadd          q0,  q0,  q1            @ one 128-bit halving add covers both rows
   1.184 +        vst1.64         {d0},      [r0,:64], r2
   1.185 +        vst1.64         {d1},      [r0,:64], r2
   1.186 +        bne             1b                      @ exits only when r3 reaches exactly 0
   1.187 +        bx              lr
   1.188 +        .endm
   1.189 +
   1.190 +        .macro pixels8_y2 vhadd=vrhadd.u8       @ average every row with the row below it; 8 bytes wide, 2 output rows per pass
   1.191 +        vld1.64         {d0},      [r1], r2     @ d0 = first input row
   1.192 +        vld1.64         {d1},      [r1], r2     @ d1 = second input row
   1.193 +1:      subs            r3,  r3,  #2            @ r3: output rows remaining; flags are consumed by bne below
   1.194 +        \vhadd          d4,  d0,  d1            @ output row n from input rows n and n+1
   1.195 +        vld1.64         {d0},      [r1], r2     @ d0 = input row n+2
   1.196 +        \vhadd          d5,  d0,  d1            @ output row n+1 from input rows n+1 and n+2
   1.197 +        vld1.64         {d1},      [r1], r2     @ d1 = input row n+3; on the final pass this row is loaded but never used
   1.198 +        pld             [r1]
   1.199 +        pld             [r1, r2]
   1.200 +        vst1.64         {d4},      [r0,:64], r2
   1.201 +        vst1.64         {d5},      [r0,:64], r2
   1.202 +        bne             1b                      @ exits only when r3 reaches exactly 0
   1.203 +        bx              lr
   1.204 +        .endm
   1.205 +
   1.206 +        .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0    @ each output byte = sum of a 2x2 input neighbourhood, shifted right by 2; 8 wide, 2 rows per pass
   1.207 +        vld1.64         {d0, d1},  [r1], r2     @ first input row, 16 bytes so that byte 8 is available
   1.208 +        vld1.64         {d2, d3},  [r1], r2     @ second input row
   1.209 +.if \no_rnd
   1.210 +        vmov.i16        q11, #1                 @ bias of 1 per 16-bit lane, added before the narrowing shift
   1.211 +.endif
   1.212 +        pld             [r1]
   1.213 +        pld             [r1, r2]
   1.214 +        vext.8          d4,  d0,  d1,  #1       @ d4 = first row shifted left by one byte
   1.215 +        vext.8          d6,  d2,  d3,  #1       @ d6 = second row shifted left by one byte
   1.216 +        vaddl.u8        q8,  d0,  d4            @ q8: 16-bit horizontal pair sums of the first row
   1.217 +        vaddl.u8        q9,  d2,  d6            @ q9: the same for the second row
   1.218 +1:      subs            r3,  r3,  #2            @ r3: output rows remaining; flags are consumed by bgt at the end
   1.219 +        vld1.64         {d0, d1},  [r1], r2     @ next input row
   1.220 +        pld             [r1]
   1.221 +        vadd.u16        q10, q8,  q9            @ 2x2 sums for the first output row of this pass
   1.222 +        vext.8          d4,  d0,  d1,  #1
   1.223 +.if \no_rnd
   1.224 +        vadd.u16        q10, q10, q11
   1.225 +.endif
   1.226 +        vaddl.u8        q8,  d0,  d4            @ refresh q8 with the pair sums of the new row
   1.227 +        \vshrn          d5,  q10, #2            @ narrow to bytes with a shift by 2
   1.228 +        vld1.64         {d2, d3},  [r1], r2     @ following input row
   1.229 +        vadd.u16        q10, q8,  q9            @ 2x2 sums for the second output row of this pass
   1.230 +        pld             [r1, r2]
   1.231 +.if \no_rnd
   1.232 +        vadd.u16        q10, q10, q11
   1.233 +.endif
   1.234 +        vst1.64         {d5},      [r0,:64], r2
   1.235 +        \vshrn          d7,  q10, #2
   1.236 +        vext.8          d6,  d2,  d3,  #1
   1.237 +        vaddl.u8        q9,  d2,  d6            @ refresh q9 with the pair sums of the row just loaded
   1.238 +        vst1.64         {d7},      [r0,:64], r2
   1.239 +        bgt             1b
   1.240 +        bx              lr
   1.241 +        .endm
   1.242 +
   1.243 +        .macro pixfunc pfx name suf rnd_op args:vararg  @ emit one exported function whose body is the macro called name
   1.244 +function ff_\pfx\name\suf\()_neon, export=1             @ symbol is built as ff_ + pfx + name + suf + _neon
   1.245 +        \name \rnd_op \args                     @ rnd_op and args are forwarded as the arguments of the body macro
   1.246 +endfunc
   1.247 +        .endm
   1.248 +
   1.249 +        .macro pixfunc2 pfx name args:vararg    @ emit two functions from one body macro
   1.250 +        pixfunc \pfx \name                      @ first: no suffix, body macro uses its default arguments
   1.251 +        pixfunc \pfx \name \args                @ second: args supply the suffix followed by the body macro arguments
   1.252 +        .endm
   1.253 +
   1.254 +function ff_put_h264_qpel16_mc00_neon, export=1         @ only sets the row count; there is no return instruction in this body
   1.255 +        mov             r3,  #16                @ NOTE(review): presumably falls through into ff_put_pixels16_neon emitted next - confirm that endfunc adds no code
   1.256 +endfunc
   1.257 +
   1.258 +        pixfunc  put_ pixels16                  @ ff_put_pixels16_neon
   1.259 +        pixfunc2 put_ pixels16_x2,  _no_rnd, vhadd.u8           @ ff_put_pixels16_x2_neon and ff_put_pixels16_x2_no_rnd_neon
   1.260 +        pixfunc2 put_ pixels16_y2,  _no_rnd, vhadd.u8           @ ff_put_pixels16_y2_neon and ff_put_pixels16_y2_no_rnd_neon
   1.261 +        pixfunc2 put_ pixels16_xy2, _no_rnd, vshrn.u16, 1       @ ff_put_pixels16_xy2_neon and ff_put_pixels16_xy2_no_rnd_neon
   1.262 +
   1.263 +function ff_avg_h264_qpel16_mc00_neon, export=1         @ only sets the row count; there is no return instruction in this body
   1.264 +        mov             r3,  #16                @ NOTE(review): presumably falls through into ff_avg_pixels16_neon emitted next - confirm that endfunc adds no code
   1.265 +endfunc
   1.266 +
   1.267 +        pixfunc  avg_ pixels16,, 1              @ ff_avg_pixels16_neon: empty suffix, body macro invoked with avg=1
   1.268 +
   1.269 +function ff_put_h264_qpel8_mc00_neon, export=1          @ only sets the row count; there is no return instruction in this body
   1.270 +        mov             r3,  #8                 @ NOTE(review): presumably falls through into ff_put_pixels8_neon emitted next - confirm that endfunc adds no code
   1.271 +endfunc
   1.272 +
   1.273 +        pixfunc  put_ pixels8                   @ ff_put_pixels8_neon
   1.274 +        pixfunc2 put_ pixels8_x2,   _no_rnd, vhadd.u8           @ ff_put_pixels8_x2_neon and ff_put_pixels8_x2_no_rnd_neon
   1.275 +        pixfunc2 put_ pixels8_y2,   _no_rnd, vhadd.u8           @ ff_put_pixels8_y2_neon and ff_put_pixels8_y2_no_rnd_neon
   1.276 +        pixfunc2 put_ pixels8_xy2,  _no_rnd, vshrn.u16, 1       @ ff_put_pixels8_xy2_neon and ff_put_pixels8_xy2_no_rnd_neon
   1.277 +
   1.278 +function ff_avg_h264_qpel8_mc00_neon, export=1          @ only sets the row count; there is no return instruction in this body
   1.279 +        mov             r3,  #8                 @ NOTE(review): presumably falls through into ff_avg_pixels8_neon emitted next - confirm that endfunc adds no code
   1.280 +endfunc
   1.281 +
   1.282 +        pixfunc  avg_ pixels8,, 1               @ ff_avg_pixels8_neon: empty suffix, body macro invoked with avg=1
   1.283 +
   1.284 +function ff_put_pixels_clamped_neon, export=1           @ r0: 64 signed 16-bit inputs, r1: byte output, r2: output row step; writes 8 rows of 8 bytes
   1.285 +        vld1.64         {d16-d19}, [r0,:128]!   @ each load brings in 16 inputs, which is two output rows
   1.286 +        vqmovun.s16     d0, q8                  @ saturate signed 16-bit to unsigned 8-bit (0..255)
   1.287 +        vld1.64         {d20-d23}, [r0,:128]!
   1.288 +        vqmovun.s16     d1, q9
   1.289 +        vld1.64         {d24-d27}, [r0,:128]!
   1.290 +        vqmovun.s16     d2, q10
   1.291 +        vld1.64         {d28-d31}, [r0,:128]!
   1.292 +        vqmovun.s16     d3, q11
   1.293 +        vst1.64         {d0},      [r1,:64], r2 @ stores are interleaved with the remaining narrowing ops
   1.294 +        vqmovun.s16     d4, q12
   1.295 +        vst1.64         {d1},      [r1,:64], r2
   1.296 +        vqmovun.s16     d5, q13
   1.297 +        vst1.64         {d2},      [r1,:64], r2
   1.298 +        vqmovun.s16     d6, q14
   1.299 +        vst1.64         {d3},      [r1,:64], r2
   1.300 +        vqmovun.s16     d7, q15
   1.301 +        vst1.64         {d4},      [r1,:64], r2
   1.302 +        vst1.64         {d5},      [r1,:64], r2
   1.303 +        vst1.64         {d6},      [r1,:64], r2
   1.304 +        vst1.64         {d7},      [r1,:64], r2
   1.305 +        bx              lr
   1.306 +endfunc
   1.307 +
   1.308 +function ff_put_signed_pixels_clamped_neon, export=1    @ r0: 64 signed 16-bit inputs, r1: byte output, r2: output row step; writes 8 rows of 8 bytes
   1.309 +        vmov.u8         d31, #128               @ bias added to every narrowed byte
   1.310 +        vld1.64         {d16-d17}, [r0,:128]!   @ each load brings in 8 inputs, which is one output row
   1.311 +        vqmovn.s16      d0, q8                  @ saturate signed 16-bit to signed 8-bit (-128..127)
   1.312 +        vld1.64         {d18-d19}, [r0,:128]!
   1.313 +        vqmovn.s16      d1, q9
   1.314 +        vld1.64         {d16-d17}, [r0,:128]!   @ q8 is reused for the third row
   1.315 +        vqmovn.s16      d2, q8
   1.316 +        vld1.64         {d18-d19}, [r0,:128]!   @ q9 is reused for the fourth row
   1.317 +        vadd.u8         d0, d0, d31             @ wrapping add of 128 maps -128..127 onto 0..255
   1.318 +        vld1.64         {d20-d21}, [r0,:128]!
   1.319 +        vadd.u8         d1, d1, d31
   1.320 +        vld1.64         {d22-d23}, [r0,:128]!
   1.321 +        vadd.u8         d2, d2, d31
   1.322 +        vst1.64         {d0},      [r1,:64], r2
   1.323 +        vqmovn.s16      d3, q9
   1.324 +        vst1.64         {d1},      [r1,:64], r2
   1.325 +        vqmovn.s16      d4, q10
   1.326 +        vst1.64         {d2},      [r1,:64], r2
   1.327 +        vqmovn.s16      d5, q11
   1.328 +        vld1.64         {d24-d25}, [r0,:128]!
   1.329 +        vadd.u8         d3, d3, d31
   1.330 +        vld1.64         {d26-d27}, [r0,:128]!
   1.331 +        vadd.u8         d4, d4, d31
   1.332 +        vadd.u8         d5, d5, d31
   1.333 +        vst1.64         {d3},      [r1,:64], r2
   1.334 +        vqmovn.s16      d6, q12
   1.335 +        vst1.64         {d4},      [r1,:64], r2
   1.336 +        vqmovn.s16      d7, q13
   1.337 +        vst1.64         {d5},      [r1,:64], r2
   1.338 +        vadd.u8         d6, d6, d31
   1.339 +        vadd.u8         d7, d7, d31
   1.340 +        vst1.64         {d6},      [r1,:64], r2
   1.341 +        vst1.64         {d7},      [r1,:64], r2
   1.342 +        bx              lr
   1.343 +endfunc
   1.344 +
   1.345 +function ff_add_pixels_clamped_neon, export=1           @ r0: 64 signed 16-bit inputs, r1: byte rows updated in place, r2: row step; 8 rows of 8 bytes
   1.346 +        mov             r3, r1                  @ r3: write pointer over the same rows that r1 reads
   1.347 +        vld1.64         {d16},   [r1,:64], r2   @ 8 existing bytes of the row
   1.348 +        vld1.64         {d0-d1}, [r0,:128]!     @ 8 16-bit inputs for the row
   1.349 +        vaddw.u8        q0, q0, d16             @ widen the bytes to 16 bits and add them to the inputs
   1.350 +        vld1.64         {d17},   [r1,:64], r2
   1.351 +        vld1.64         {d2-d3}, [r0,:128]!
   1.352 +        vqmovun.s16     d0, q0                  @ saturate the signed 16-bit sums to unsigned 8-bit (0..255)
   1.353 +        vld1.64         {d18},   [r1,:64], r2
   1.354 +        vaddw.u8        q1, q1, d17
   1.355 +        vld1.64         {d4-d5}, [r0,:128]!
   1.356 +        vaddw.u8        q2, q2, d18
   1.357 +        vst1.64         {d0},    [r3,:64], r2   @ write row 0 back; r1 has already read past it
   1.358 +        vqmovun.s16     d2, q1
   1.359 +        vld1.64         {d19},   [r1,:64], r2
   1.360 +        vld1.64         {d6-d7}, [r0,:128]!
   1.361 +        vaddw.u8        q3, q3, d19
   1.362 +        vqmovun.s16     d4, q2
   1.363 +        vst1.64         {d2},    [r3,:64], r2
   1.364 +        vld1.64         {d16},   [r1,:64], r2   @ rows 4..7 reuse the same registers as rows 0..3
   1.365 +        vqmovun.s16     d6, q3
   1.366 +        vld1.64         {d0-d1}, [r0,:128]!
   1.367 +        vaddw.u8        q0, q0, d16
   1.368 +        vst1.64         {d4},    [r3,:64], r2
   1.369 +        vld1.64         {d17},   [r1,:64], r2
   1.370 +        vld1.64         {d2-d3}, [r0,:128]!
   1.371 +        vaddw.u8        q1, q1, d17
   1.372 +        vst1.64         {d6},    [r3,:64], r2
   1.373 +        vqmovun.s16     d0, q0
   1.374 +        vld1.64         {d18},   [r1,:64], r2
   1.375 +        vld1.64         {d4-d5}, [r0,:128]!
   1.376 +        vaddw.u8        q2, q2, d18
   1.377 +        vst1.64         {d0},    [r3,:64], r2
   1.378 +        vqmovun.s16     d2, q1
   1.379 +        vld1.64         {d19},   [r1,:64], r2
   1.380 +        vqmovun.s16     d4, q2
   1.381 +        vld1.64         {d6-d7}, [r0,:128]!
   1.382 +        vaddw.u8        q3, q3, d19
   1.383 +        vst1.64         {d2},    [r3,:64], r2
   1.384 +        vqmovun.s16     d6, q3
   1.385 +        vst1.64         {d4},    [r3,:64], r2
   1.386 +        vst1.64         {d6},    [r3,:64], r2
   1.387 +        bx              lr
   1.388 +endfunc
   1.389 +
   1.390 +function ff_float_to_int16_neon, export=1               @ r0: 16-bit output, r1: float input, r2: element count; NOTE(review): every path works in blocks of 8, so the count is presumably a multiple of 8 - confirm
   1.391 +        subs            r2,  r2,  #8            @ the first 8 elements are loaded and converted up front
   1.392 +        vld1.64         {d0-d1},  [r1,:128]!
   1.393 +        vcvt.s32.f32    q8,  q0,  #16           @ float to signed 32-bit fixed point with 16 fraction bits
   1.394 +        vld1.64         {d2-d3},  [r1,:128]!
   1.395 +        vcvt.s32.f32    q9,  q1,  #16
   1.396 +        beq             3f                      @ count was exactly 8: narrow and store the pending block
   1.397 +        bics            ip,  r2,  #15           @ ip = remaining count rounded down to a multiple of 16
   1.398 +        beq             2f                      @ fewer than 16 remain: one more block of 8 at label 2
   1.399 +1:      subs            ip,  ip,  #16           @ main loop: stores 16 elements per pass, keeps 8 converted values pending in q8 /q9
   1.400 +        vshrn.s32       d4,  q8,  #16           @ keep the upper 16 bits of each 32-bit value
   1.401 +        vld1.64         {d0-d1},  [r1,:128]!
   1.402 +        vcvt.s32.f32    q0,  q0,  #16
   1.403 +        vshrn.s32       d5,  q9,  #16
   1.404 +        vld1.64         {d2-d3},  [r1,:128]!
   1.405 +        vcvt.s32.f32    q1,  q1,  #16
   1.406 +        vshrn.s32       d6,  q0,  #16
   1.407 +        vst1.64         {d4-d5},  [r0,:128]!
   1.408 +        vshrn.s32       d7,  q1,  #16
   1.409 +        vld1.64         {d16-d17},[r1,:128]!    @ refill the pending block for the next pass or the tail
   1.410 +        vcvt.s32.f32    q8,  q8,  #16
   1.411 +        vld1.64         {d18-d19},[r1,:128]!
   1.412 +        vcvt.s32.f32    q9,  q9,  #16
   1.413 +        vst1.64         {d6-d7},  [r0,:128]!
   1.414 +        bne             1b                      @ flags come from the subs at the loop top
   1.415 +        ands            r2,  r2,  #15           @ anything left beyond the pending block?
   1.416 +        beq             3f                      @ no: only the pending block remains
   1.417 +2:      vld1.64         {d0-d1},  [r1,:128]!    @ tail of 16: the pending block plus 8 freshly loaded elements
   1.418 +        vshrn.s32       d4,  q8,  #16
   1.419 +        vcvt.s32.f32    q0,  q0,  #16
   1.420 +        vld1.64         {d2-d3},  [r1,:128]!
   1.421 +        vshrn.s32       d5,  q9,  #16
   1.422 +        vcvt.s32.f32    q1,  q1,  #16
   1.423 +        vshrn.s32       d6,  q0,  #16
   1.424 +        vst1.64         {d4-d5},  [r0,:128]!
   1.425 +        vshrn.s32       d7,  q1,  #16
   1.426 +        vst1.64         {d6-d7},  [r0,:128]!
   1.427 +        bx              lr
   1.428 +3:      vshrn.s32       d4,  q8,  #16           @ tail of 8: narrow and store the pending block
   1.429 +        vshrn.s32       d5,  q9,  #16
   1.430 +        vst1.64         {d4-d5},  [r0,:128]!
   1.431 +        bx              lr
   1.432 +endfunc
   1.433 +
   1.434 +function ff_float_to_int16_interleave_neon, export=1    @ r0: interleaved 16-bit output, r1: array of per-channel float pointers, r2: samples per channel, r3: channel count
   1.435 +        cmp             r3, #2
   1.436 +        ldrlt           r1, [r1]                @ fewer than 2 channels: fetch the first channel pointer ...
   1.437 +        blt             ff_float_to_int16_neon  @ ... and tail-call the plain conversion (r0 and r2 are already in place)
   1.438 +        bne             4f                      @ more than 2 channels: general path at the first label 4 below
   1.439 +
   1.440 +        ldr             r3, [r1]                @ exactly 2 channels: r3 = channel 0 pointer
   1.441 +        ldr             r1, [r1, #4]            @ r1 = channel 1 pointer
   1.442 +
   1.443 +        subs            r2,  r2,  #8            @ the first 8 samples of each channel are converted up front
   1.444 +        vld1.64         {d0-d1},  [r3,:128]!
   1.445 +        vcvt.s32.f32    q8,  q0,  #16           @ float to signed 32-bit fixed point with 16 fraction bits
   1.446 +        vld1.64         {d2-d3},  [r3,:128]!
   1.447 +        vcvt.s32.f32    q9,  q1,  #16
   1.448 +        vld1.64         {d20-d21},[r1,:128]!
   1.449 +        vcvt.s32.f32    q10, q10, #16
   1.450 +        vld1.64         {d22-d23},[r1,:128]!
   1.451 +        vcvt.s32.f32    q11, q11, #16
   1.452 +        beq             3f                      @ exactly 8 samples: merge and store the pending block
   1.453 +        bics            ip,  r2,  #15           @ ip = remaining count rounded down to a multiple of 16
   1.454 +        beq             2f                      @ fewer than 16 remain: one more block of 8 at label 2
   1.455 +1:      subs            ip,  ip,  #16           @ stereo main loop: 16 samples per channel per pass
   1.456 +        vld1.64         {d0-d1},  [r3,:128]!
   1.457 +        vcvt.s32.f32    q0,  q0,  #16
   1.458 +        vsri.32         q10, q8,  #16           @ each word: upper half stays channel 1, lower half becomes the upper 16 bits of channel 0
   1.459 +        vld1.64         {d2-d3},  [r3,:128]!
   1.460 +        vcvt.s32.f32    q1,  q1,  #16
   1.461 +        vld1.64         {d24-d25},[r1,:128]!
   1.462 +        vcvt.s32.f32    q12, q12, #16
   1.463 +        vld1.64         {d26-d27},[r1,:128]!
   1.464 +        vsri.32         q11, q9,  #16
   1.465 +        vst1.64         {d20-d21},[r0,:128]!
   1.466 +        vcvt.s32.f32    q13, q13, #16
   1.467 +        vst1.64         {d22-d23},[r0,:128]!
   1.468 +        vsri.32         q12, q0,  #16
   1.469 +        vld1.64         {d16-d17},[r3,:128]!    @ refill the pending block (q8-q11) for the next pass or the tail
   1.470 +        vsri.32         q13, q1,  #16
   1.471 +        vst1.64         {d24-d25},[r0,:128]!
   1.472 +        vcvt.s32.f32    q8,  q8,  #16
   1.473 +        vld1.64         {d18-d19},[r3,:128]!
   1.474 +        vcvt.s32.f32    q9,  q9,  #16
   1.475 +        vld1.64         {d20-d21},[r1,:128]!
   1.476 +        vcvt.s32.f32    q10, q10, #16
   1.477 +        vld1.64         {d22-d23},[r1,:128]!
   1.478 +        vcvt.s32.f32    q11, q11, #16
   1.479 +        vst1.64         {d26-d27},[r0,:128]!
   1.480 +        bne             1b                      @ flags come from the subs at the loop top
   1.481 +        ands            r2,  r2,  #15           @ anything left beyond the pending block?
   1.482 +        beq             3f
   1.483 +2:      vsri.32         q10, q8,  #16           @ tail of 16: the pending block plus 8 freshly loaded samples per channel
   1.484 +        vld1.64         {d0-d1},  [r3,:128]!
   1.485 +        vcvt.s32.f32    q0,  q0,  #16
   1.486 +        vld1.64         {d2-d3},  [r3,:128]!
   1.487 +        vcvt.s32.f32    q1,  q1,  #16
   1.488 +        vld1.64         {d24-d25},[r1,:128]!
   1.489 +        vcvt.s32.f32    q12, q12, #16
   1.490 +        vsri.32         q11, q9,  #16
   1.491 +        vld1.64         {d26-d27},[r1,:128]!
   1.492 +        vcvt.s32.f32    q13, q13, #16
   1.493 +        vst1.64         {d20-d21},[r0,:128]!
   1.494 +        vsri.32         q12, q0,  #16
   1.495 +        vst1.64         {d22-d23},[r0,:128]!
   1.496 +        vsri.32         q13, q1,  #16
   1.497 +        vst1.64         {d24-d27},[r0,:128]!
   1.498 +        bx              lr
   1.499 +3:      vsri.32         q10, q8,  #16           @ tail of 8: merge and store the pending block
   1.500 +        vsri.32         q11, q9,  #16
   1.501 +        vst1.64         {d20-d23},[r0,:128]!
   1.502 +        bx              lr
   1.503 +
   1.504 +4:      push            {r4-r8,lr}              @ general path: channels are consumed in groups of 4, then 2, then 1
   1.505 +        cmp             r3,  #4
   1.506 +        lsl             ip,  r3,  #1            @ ip = channel count * 2 = byte distance between output frames; does not touch the flags
   1.507 +        blt             4f                      @ fewer than 4 channels: go to the 2-channel stage
   1.508 +
   1.509 +        @ 4 channels
   1.510 +5:      ldmia           r1!, {r4-r7}            @ next four channel pointers
   1.511 +        mov             lr,  r2                 @ lr: samples remaining for this group
   1.512 +        mov             r8,  r0                 @ r8: write pointer; r0 stays at the group base
   1.513 +        vld1.64         {d16-d17},[r4,:128]!    @ preload 4 samples from each of the four channels
   1.514 +        vcvt.s32.f32    q8,  q8,  #16
   1.515 +        vld1.64         {d18-d19},[r5,:128]!
   1.516 +        vcvt.s32.f32    q9,  q9,  #16
   1.517 +        vld1.64         {d20-d21},[r6,:128]!
   1.518 +        vcvt.s32.f32    q10, q10, #16
   1.519 +        vld1.64         {d22-d23},[r7,:128]!
   1.520 +        vcvt.s32.f32    q11, q11, #16
   1.521 +6:      subs            lr,  lr,  #8            @ 8 frames per pass; flags are consumed by beq 7f below
   1.522 +        vld1.64         {d0-d1},  [r4,:128]!
   1.523 +        vcvt.s32.f32    q0,  q0,  #16
   1.524 +        vsri.32         q9,  q8,  #16           @ q9 words = 16-bit pairs from the first and second channel
   1.525 +        vld1.64         {d2-d3},  [r5,:128]!
   1.526 +        vcvt.s32.f32    q1,  q1,  #16
   1.527 +        vsri.32         q11, q10, #16           @ q11 words = 16-bit pairs from the third and fourth channel
   1.528 +        vld1.64         {d4-d5},  [r6,:128]!
   1.529 +        vcvt.s32.f32    q2,  q2,  #16
   1.530 +        vzip.32         d18, d22                @ after the zip each d register holds one 4-channel frame (8 bytes)
   1.531 +        vld1.64         {d6-d7},  [r7,:128]!
   1.532 +        vcvt.s32.f32    q3,  q3,  #16
   1.533 +        vzip.32         d19, d23
   1.534 +        vst1.64         {d18},    [r8], ip      @ frames are written ip bytes apart
   1.535 +        vsri.32         q1,  q0,  #16
   1.536 +        vst1.64         {d22},    [r8], ip
   1.537 +        vsri.32         q3,  q2,  #16
   1.538 +        vst1.64         {d19},    [r8], ip
   1.539 +        vzip.32         d2,  d6
   1.540 +        vst1.64         {d23},    [r8], ip
   1.541 +        vzip.32         d3,  d7
   1.542 +        beq             7f                      @ last pass: skip the preload and just store the second half
   1.543 +        vld1.64         {d16-d17},[r4,:128]!    @ preload for the next pass, interleaved with the stores of the second half
   1.544 +        vcvt.s32.f32    q8,  q8,  #16
   1.545 +        vst1.64         {d2},     [r8], ip
   1.546 +        vld1.64         {d18-d19},[r5,:128]!
   1.547 +        vcvt.s32.f32    q9,  q9,  #16
   1.548 +        vst1.64         {d6},     [r8], ip
   1.549 +        vld1.64         {d20-d21},[r6,:128]!
   1.550 +        vcvt.s32.f32    q10, q10, #16
   1.551 +        vst1.64         {d3},     [r8], ip
   1.552 +        vld1.64         {d22-d23},[r7,:128]!
   1.553 +        vcvt.s32.f32    q11, q11, #16
   1.554 +        vst1.64         {d7},     [r8], ip
   1.555 +        b               6b
   1.556 +7:      vst1.64         {d2},     [r8], ip
   1.557 +        vst1.64         {d6},     [r8], ip
   1.558 +        vst1.64         {d3},     [r8], ip
   1.559 +        vst1.64         {d7},     [r8], ip
   1.560 +        subs            r3,  r3,  #4            @ four channels done
   1.561 +        popeq           {r4-r8,pc}              @ return when no channels remain
   1.562 +        cmp             r3,  #4
   1.563 +        add             r0,  r0,  #8            @ advance the base past these four channels (4 * 2 bytes); flags from cmp are kept
   1.564 +        bge             5b
   1.565 +
   1.566 +        @ 2 channels
   1.567 +4:      cmp             r3,  #2
   1.568 +        blt             4f                      @ only one channel left: go to the 1-channel stage
   1.569 +        ldmia           r1!, {r4-r5}            @ next two channel pointers
   1.570 +        mov             lr,  r2                 @ lr: samples remaining for this pair
   1.571 +        mov             r8,  r0                 @ r8: write pointer; r0 stays at the pair base
   1.572 +        tst             lr,  #8                 @ does the count contain an odd block of 8? flags are consumed by beq 6f below
   1.573 +        vld1.64         {d16-d17},[r4,:128]!    @ preload 8 samples from each of the two channels
   1.574 +        vcvt.s32.f32    q8,  q8,  #16
   1.575 +        vld1.64         {d18-d19},[r5,:128]!
   1.576 +        vcvt.s32.f32    q9,  q9,  #16
   1.577 +        vld1.64         {d20-d21},[r4,:128]!
   1.578 +        vcvt.s32.f32    q10, q10, #16
   1.579 +        vld1.64         {d22-d23},[r5,:128]!
   1.580 +        vcvt.s32.f32    q11, q11, #16
   1.581 +        beq             6f                      @ bit 3 clear: go straight to the 16-frame loop
   1.582 +        subs            lr,  lr,  #8
   1.583 +        beq             7f                      @ the count was exactly 8: store the preloaded block and finish
   1.584 +        vsri.32         d18, d16, #16           @ odd block of 8: merge, store, and preload again
   1.585 +        vsri.32         d19, d17, #16
   1.586 +        vld1.64         {d16-d17},[r4,:128]!
   1.587 +        vcvt.s32.f32    q8,  q8,  #16
   1.588 +        vst1.32         {d18[0]}, [r8], ip      @ one 32-bit lane = one 2-channel frame; frames are ip bytes apart
   1.589 +        vsri.32         d22, d20, #16
   1.590 +        vst1.32         {d18[1]}, [r8], ip
   1.591 +        vsri.32         d23, d21, #16
   1.592 +        vst1.32         {d19[0]}, [r8], ip
   1.593 +        vst1.32         {d19[1]}, [r8], ip
   1.594 +        vld1.64         {d18-d19},[r5,:128]!
   1.595 +        vcvt.s32.f32    q9,  q9,  #16
   1.596 +        vst1.32         {d22[0]}, [r8], ip
   1.597 +        vst1.32         {d22[1]}, [r8], ip
   1.598 +        vld1.64         {d20-d21},[r4,:128]!
   1.599 +        vcvt.s32.f32    q10, q10, #16
   1.600 +        vst1.32         {d23[0]}, [r8], ip
   1.601 +        vst1.32         {d23[1]}, [r8], ip
   1.602 +        vld1.64         {d22-d23},[r5,:128]!
   1.603 +        vcvt.s32.f32    q11, q11, #16
   1.604 +6:      subs            lr,  lr,  #16           @ 16 frames per pass; flags are consumed by beq 6f and bgt 6b below
   1.605 +        vld1.64         {d0-d1},  [r4,:128]!
   1.606 +        vcvt.s32.f32    q0,  q0,  #16
   1.607 +        vsri.32         d18, d16, #16
   1.608 +        vld1.64         {d2-d3},  [r5,:128]!
   1.609 +        vcvt.s32.f32    q1,  q1,  #16
   1.610 +        vsri.32         d19, d17, #16
   1.611 +        vld1.64         {d4-d5},  [r4,:128]!
   1.612 +        vcvt.s32.f32    q2,  q2,  #16
   1.613 +        vld1.64         {d6-d7},  [r5,:128]!
   1.614 +        vcvt.s32.f32    q3,  q3,  #16
   1.615 +        vst1.32         {d18[0]}, [r8], ip
   1.616 +        vsri.32         d22, d20, #16
   1.617 +        vst1.32         {d18[1]}, [r8], ip
   1.618 +        vsri.32         d23, d21, #16
   1.619 +        vst1.32         {d19[0]}, [r8], ip
   1.620 +        vsri.32         d2,  d0,  #16
   1.621 +        vst1.32         {d19[1]}, [r8], ip
   1.622 +        vsri.32         d3,  d1,  #16
   1.623 +        vst1.32         {d22[0]}, [r8], ip
   1.624 +        vsri.32         d6,  d4,  #16
   1.625 +        vst1.32         {d22[1]}, [r8], ip
   1.626 +        vsri.32         d7,  d5,  #16
   1.627 +        vst1.32         {d23[0]}, [r8], ip
   1.628 +        vst1.32         {d23[1]}, [r8], ip
   1.629 +        beq             6f                      @ last pass: skip the preload and just store the second half
   1.630 +        vld1.64         {d16-d17},[r4,:128]!    @ preload for the next pass, interleaved with the stores of the second half
   1.631 +        vcvt.s32.f32    q8,  q8,  #16
   1.632 +        vst1.32         {d2[0]},  [r8], ip
   1.633 +        vst1.32         {d2[1]},  [r8], ip
   1.634 +        vld1.64         {d18-d19},[r5,:128]!
   1.635 +        vcvt.s32.f32    q9,  q9,  #16
   1.636 +        vst1.32         {d3[0]},  [r8], ip
   1.637 +        vst1.32         {d3[1]},  [r8], ip
   1.638 +        vld1.64         {d20-d21},[r4,:128]!
   1.639 +        vcvt.s32.f32    q10, q10, #16
   1.640 +        vst1.32         {d6[0]},  [r8], ip
   1.641 +        vst1.32         {d6[1]},  [r8], ip
   1.642 +        vld1.64         {d22-d23},[r5,:128]!
   1.643 +        vcvt.s32.f32    q11, q11, #16
   1.644 +        vst1.32         {d7[0]},  [r8], ip
   1.645 +        vst1.32         {d7[1]},  [r8], ip
   1.646 +        bgt             6b
   1.647 +6:      vst1.32         {d2[0]},  [r8], ip      @ final pass: store the second half without preloading
   1.648 +        vst1.32         {d2[1]},  [r8], ip
   1.649 +        vst1.32         {d3[0]},  [r8], ip
   1.650 +        vst1.32         {d3[1]},  [r8], ip
   1.651 +        vst1.32         {d6[0]},  [r8], ip
   1.652 +        vst1.32         {d6[1]},  [r8], ip
   1.653 +        vst1.32         {d7[0]},  [r8], ip
   1.654 +        vst1.32         {d7[1]},  [r8], ip
   1.655 +        b               8f
   1.656 +7:      vsri.32         d18, d16, #16           @ count of exactly 8: merge and store the preloaded block
   1.657 +        vsri.32         d19, d17, #16
   1.658 +        vst1.32         {d18[0]}, [r8], ip
   1.659 +        vsri.32         d22, d20, #16
   1.660 +        vst1.32         {d18[1]}, [r8], ip
   1.661 +        vsri.32         d23, d21, #16
   1.662 +        vst1.32         {d19[0]}, [r8], ip
   1.663 +        vst1.32         {d19[1]}, [r8], ip
   1.664 +        vst1.32         {d22[0]}, [r8], ip
   1.665 +        vst1.32         {d22[1]}, [r8], ip
   1.666 +        vst1.32         {d23[0]}, [r8], ip
   1.667 +        vst1.32         {d23[1]}, [r8], ip
   1.668 +8:      subs            r3,  r3,  #2            @ two channels done
   1.669 +        add             r0,  r0,  #4            @ advance the base past these two channels (2 * 2 bytes); flags from subs are kept
   1.670 +        popeq           {r4-r8,pc}              @ return when no channels remain
   1.671 +
   1.672 +        @ 1 channel
   1.673 +4:      ldr             r4,  [r1],#4            @ last channel pointer
   1.674 +        tst             r2,  #8                 @ does the count contain an odd block of 8? flags are consumed by bne 8f below
   1.675 +        mov             lr,  r2                 @ lr: samples remaining
   1.676 +        mov             r5,  r0                 @ r5: write pointer
   1.677 +        vld1.64         {d0-d1},  [r4,:128]!    @ preload 8 samples
   1.678 +        vcvt.s32.f32    q0,  q0,  #16
   1.679 +        vld1.64         {d2-d3},  [r4,:128]!
   1.680 +        vcvt.s32.f32    q1,  q1,  #16
   1.681 +        bne             8f                      @ bit 3 set: handle the odd block of 8 first
   1.682 +6:      subs            lr,  lr,  #16           @ 16 samples per pass; flags are consumed by beq 7f and bgt 6b below
   1.683 +        vld1.64         {d4-d5},  [r4,:128]!
   1.684 +        vcvt.s32.f32    q2,  q2,  #16
   1.685 +        vld1.64         {d6-d7},  [r4,:128]!
   1.686 +        vcvt.s32.f32    q3,  q3,  #16
   1.687 +        vst1.16         {d0[1]},  [r5,:16], ip  @ odd 16-bit lanes hold the upper halves of the 32-bit values
   1.688 +        vst1.16         {d0[3]},  [r5,:16], ip
   1.689 +        vst1.16         {d1[1]},  [r5,:16], ip
   1.690 +        vst1.16         {d1[3]},  [r5,:16], ip
   1.691 +        vst1.16         {d2[1]},  [r5,:16], ip
   1.692 +        vst1.16         {d2[3]},  [r5,:16], ip
   1.693 +        vst1.16         {d3[1]},  [r5,:16], ip
   1.694 +        vst1.16         {d3[3]},  [r5,:16], ip
   1.695 +        beq             7f                      @ last pass: skip the preload
   1.696 +        vld1.64         {d0-d1},  [r4,:128]!
   1.697 +        vcvt.s32.f32    q0,  q0,  #16
   1.698 +        vld1.64         {d2-d3},  [r4,:128]!
   1.699 +        vcvt.s32.f32    q1,  q1,  #16
   1.700 +7:      vst1.16         {d4[1]},  [r5,:16], ip
   1.701 +        vst1.16         {d4[3]},  [r5,:16], ip
   1.702 +        vst1.16         {d5[1]},  [r5,:16], ip
   1.703 +        vst1.16         {d5[3]},  [r5,:16], ip
   1.704 +        vst1.16         {d6[1]},  [r5,:16], ip
   1.705 +        vst1.16         {d6[3]},  [r5,:16], ip
   1.706 +        vst1.16         {d7[1]},  [r5,:16], ip
   1.707 +        vst1.16         {d7[3]},  [r5,:16], ip
   1.708 +        bgt             6b
   1.709 +        pop             {r4-r8,pc}
   1.710 +8:      subs            lr,  lr,  #8            @ odd block of 8: store the preloaded samples
   1.711 +        vst1.16         {d0[1]},  [r5,:16], ip
   1.712 +        vst1.16         {d0[3]},  [r5,:16], ip
   1.713 +        vst1.16         {d1[1]},  [r5,:16], ip
   1.714 +        vst1.16         {d1[3]},  [r5,:16], ip
   1.715 +        vst1.16         {d2[1]},  [r5,:16], ip
   1.716 +        vst1.16         {d2[3]},  [r5,:16], ip
   1.717 +        vst1.16         {d3[1]},  [r5,:16], ip
   1.718 +        vst1.16         {d3[3]},  [r5,:16], ip
   1.719 +        popeq           {r4-r8,pc}              @ the count was exactly 8: done
   1.720 +        vld1.64         {d0-d1},  [r4,:128]!    @ otherwise preload again and join the 16-sample loop
   1.721 +        vcvt.s32.f32    q0,  q0,  #16
   1.722 +        vld1.64         {d2-d3},  [r4,:128]!
   1.723 +        vcvt.s32.f32    q1,  q1,  #16
   1.724 +        b               6b
   1.725 +endfunc
   1.726 +
   1.727 +function ff_vector_fmul_neon, export=1  @ in-place product: r0[i] *= r1[i]; r2 = float count
   1.728 +        mov             r3,  r0  @ r3 = store pointer; results overwrite the r0 buffer
   1.729 +        subs            r2,  r2,  #8  @ account for the 8 floats loaded below; Z set when count == 8 (assumes count is a positive multiple of 8 - TODO confirm)
   1.730 +        vld1.64         {d0-d3},  [r0,:128]!  @ :128 = 16-byte aligned access
   1.731 +        vld1.64         {d4-d7},  [r1,:128]!
   1.732 +        vmul.f32        q8,  q0,  q2  @ q8/q9 = products computed but not yet stored
   1.733 +        vmul.f32        q9,  q1,  q3
   1.734 +        beq             3f  @ exactly 8 floats: store and return
   1.735 +        bics            ip,  r2,  #15  @ ip = remaining count rounded down to a multiple of 16
   1.736 +        beq             2f  @ no full 16-float block left: go to the 8-float tail
   1.737 +1:      subs            ip,  ip,  #16  @ main loop: 16 floats per iteration; flags are consumed by the bne below
   1.738 +        vld1.64         {d0-d1},  [r0,:128]!
   1.739 +        vld1.64         {d4-d5},  [r1,:128]!
   1.740 +        vmul.f32        q10, q0,  q2
   1.741 +        vld1.64         {d2-d3},  [r0,:128]!
   1.742 +        vld1.64         {d6-d7},  [r1,:128]!
   1.743 +        vmul.f32        q11, q1,  q3
   1.744 +        vst1.64         {d16-d19},[r3,:128]!  @ store the 8 products pending in q8/q9
   1.745 +        vld1.64         {d0-d1},  [r0,:128]!
   1.746 +        vld1.64         {d4-d5},  [r1,:128]!
   1.747 +        vmul.f32        q8,  q0,  q2  @ q8/q9 are refilled and stay pending for the next pass or the exit path
   1.748 +        vld1.64         {d2-d3},  [r0,:128]!
   1.749 +        vld1.64         {d6-d7},  [r1,:128]!
   1.750 +        vmul.f32        q9,  q1,  q3
   1.751 +        vst1.64         {d20-d23},[r3,:128]!
   1.752 +        bne             1b
   1.753 +        ands            r2,  r2,  #15  @ floats left after the 16-float blocks
   1.754 +        beq             3f
   1.755 +2:      vld1.64         {d0-d1},  [r0,:128]!  @ tail: 8 more floats, interleaved with storing the pending q8/q9
   1.756 +        vld1.64         {d4-d5},  [r1,:128]!
   1.757 +        vst1.64         {d16-d17},[r3,:128]!  @ q8 must be stored before it is overwritten on the next line
   1.758 +        vmul.f32        q8,  q0,  q2
   1.759 +        vld1.64         {d2-d3},  [r0,:128]!
   1.760 +        vld1.64         {d6-d7},  [r1,:128]!
   1.761 +        vst1.64         {d18-d19},[r3,:128]!  @ likewise for q9
   1.762 +        vmul.f32        q9,  q1,  q3
   1.763 +3:      vst1.64         {d16-d19},[r3,:128]!  @ flush the last 8 pending products
   1.764 +        bx              lr
   1.765 +endfunc
   1.766 +
   1.767 +function ff_vector_fmul_window_neon, export=1  @ windowed combination of r1 (read forward) and r2 (read backward) using window r3; output to r0
   1.768 +VFP     vdup.32         q8,  d0[0]  @ variant with the additive constant in s0: broadcast it to q8
   1.769 +NOVFP   vld1.32         {d16[],d17[]}, [sp,:32]  @ variant with the additive constant on the stack: load and broadcast to q8
   1.770 +        push            {r4,r5,lr}
   1.771 +VFP     ldr             lr,  [sp, #12]  @ count = first stack word above the 3 registers just pushed
   1.772 +NOVFP   ldr             lr,  [sp, #16]  @ count = second stack word (the first one was read above)
   1.773 +        sub             r2,  r2,  #8
   1.774 +        sub             r5,  lr,  #2
   1.775 +        add             r2,  r2,  r5, lsl #2  @ r2 = r2 + 4*count - 16: last 4 floats of a count-float array
   1.776 +        add             r4,  r3,  r5, lsl #3  @ r4 = r3 + 8*count - 16: last 4 floats of a 2*count-float window
   1.777 +        add             ip,  r0,  r5, lsl #3  @ ip = r0 + 8*count - 16: last 4 floats of a 2*count-float output
   1.778 +        mov             r5,  #-16  @ post-index step for the pointers that walk backward
   1.779 +        vld1.64         {d0,d1},  [r1,:128]!  @ q0 = r1 input, forward
   1.780 +        vld1.64         {d2,d3},  [r2,:128], r5  @ q1 = r2 input, backward
   1.781 +        vld1.64         {d4,d5},  [r3,:128]!  @ q2 = window, forward
   1.782 +        vld1.64         {d6,d7},  [r4,:128], r5  @ q3 = window, backward
   1.783 +1:      subs            lr,  lr,  #4  @ 4 outputs at each end per iteration; Z set on the last one
   1.784 +        vmov            q11, q8  @ q11 accumulates the block stored through ip (back end)
   1.785 +        vmla.f32        d22, d0,  d4
   1.786 +        vmov            q10, q8  @ q10 accumulates the block stored through r0 (front end)
   1.787 +        vmla.f32        d23, d1,  d5
   1.788 +        vrev64.32       q3,  q3  @ swap lanes within each d; using d6/d7 crosswise below completes the 4-float reversal
   1.789 +        vmla.f32        d20, d0,  d7
   1.790 +        vrev64.32       q1,  q1  @ same reversal for the backward-read input
   1.791 +        vmla.f32        d21, d1,  d6
   1.792 +        beq             2f  @ last iteration: finish without loading further data
   1.793 +        vmla.f32        d22, d3,  d7
   1.794 +        vld1.64         {d0,d1},  [r1,:128]!
   1.795 +        vmla.f32        d23, d2,  d6
   1.796 +        vld1.64         {d18,d19},[r2,:128], r5  @ next backward block goes to q9 because q1 is still being read
   1.797 +        vmls.f32        d20, d3,  d4
   1.798 +        vld1.64         {d24,d25},[r3,:128]!  @ next window block goes to q12 because q2 is still being read
   1.799 +        vmls.f32        d21, d2,  d5
   1.800 +        vld1.64         {d6,d7},  [r4,:128], r5
   1.801 +        vmov            q1,  q9
   1.802 +        vrev64.32       q11, q11  @ together with the vswp: reverse q11 so it lands in ascending address order
   1.803 +        vmov            q2,  q12
   1.804 +        vswp            d22, d23
   1.805 +        vst1.64         {d20,d21},[r0,:128]!  @ front end advances upward
   1.806 +        vst1.64         {d22,d23},[ip,:128], r5  @ back end advances downward
   1.807 +        b               1b
   1.808 +2:      vmla.f32        d22, d3,  d7  @ final block: same arithmetic as above, no further loads
   1.809 +        vmla.f32        d23, d2,  d6
   1.810 +        vmls.f32        d20, d3,  d4
   1.811 +        vmls.f32        d21, d2,  d5
   1.812 +        vrev64.32       q11, q11
   1.813 +        vswp            d22, d23
   1.814 +        vst1.64         {d20,d21},[r0,:128]!
   1.815 +        vst1.64         {d22,d23},[ip,:128], r5
   1.816 +        pop             {r4,r5,pc}
   1.817 +endfunc
   1.818 +
   1.819 +#if CONFIG_VORBIS_DECODER
   1.820 +function ff_vorbis_inverse_coupling_neon, export=1  @ updates the float arrays r0 and r1 in place; r2 = float count
   1.821 +        vmov.i32        q10, #1<<31  @ q10 = sign-bit mask in every lane
   1.822 +        subs            r2,  r2,  #4  @ Z set when count == 4
   1.823 +        mov             r3,  r0  @ r3/r12 = store pointers trailing the r0/r1 load pointers
   1.824 +        mov             r12, r1
   1.825 +        beq             3f  @ single group of 4: handle it in place at the end
   1.826 +
   1.827 +        vld1.32         {d24-d25},[r1,:128]!  @ prologue: first group of 4 from each array
   1.828 +        vld1.32         {d22-d23},[r0,:128]!
   1.829 +        vcle.s32        q8,  q12, #0  @ integer compare on the float bits: lanes where the r1 value has its sign bit set or is +0.0
   1.830 +        vand            q9,  q11, q10  @ q9 = sign bit of each r0 value
   1.831 +        veor            q12, q12, q9  @ flip the sign of the r1 value where the r0 value is negative
   1.832 +        vand            q2,  q12, q8  @ q2 = adjusted value in the masked lanes, 0 elsewhere
   1.833 +        vbic            q3,  q12, q8  @ q3 = adjusted value in the remaining lanes, 0 elsewhere
   1.834 +        vadd.f32        q12, q11, q2  @ new value for the r0 array
   1.835 +        vsub.f32        q11, q11, q3  @ new value for the r1 array
   1.836 +1:      vld1.32         {d2-d3},  [r1,:128]!  @ loop: same computation on q1/q0 while the previous results are stored
   1.837 +        vld1.32         {d0-d1},  [r0,:128]!
   1.838 +        vcle.s32        q8,  q1,  #0
   1.839 +        vand            q9,  q0,  q10
   1.840 +        veor            q1,  q1,  q9
   1.841 +        vst1.32         {d24-d25},[r3, :128]!
   1.842 +        vst1.32         {d22-d23},[r12,:128]!
   1.843 +        vand            q2,  q1,  q8
   1.844 +        vbic            q3,  q1,  q8
   1.845 +        vadd.f32        q1,  q0,  q2
   1.846 +        vsub.f32        q0,  q0,  q3
   1.847 +        subs            r2,  r2,  #8  @ two groups of 4 are consumed per trip round the loop
   1.848 +        ble             2f
   1.849 +        vld1.32         {d24-d25},[r1,:128]!  @ second half: back to q12/q11 while q1/q0 are stored
   1.850 +        vld1.32         {d22-d23},[r0,:128]!
   1.851 +        vcle.s32        q8,  q12, #0
   1.852 +        vand            q9,  q11, q10
   1.853 +        veor            q12, q12, q9
   1.854 +        vst1.32         {d2-d3},  [r3, :128]!
   1.855 +        vst1.32         {d0-d1},  [r12,:128]!
   1.856 +        vand            q2,  q12, q8
   1.857 +        vbic            q3,  q12, q8
   1.858 +        vadd.f32        q12, q11, q2
   1.859 +        vsub.f32        q11, q11, q3
   1.860 +        b               1b
   1.861 +
   1.862 +2:      vst1.32         {d2-d3},  [r3, :128]!  @ flush the results still held in q1/q0
   1.863 +        vst1.32         {d0-d1},  [r12,:128]!
   1.864 +        bxlt            lr  @ counter went negative: done; on exactly zero one group of 4 remains
   1.865 +
   1.866 +3:      vld1.32         {d2-d3},  [r1,:128]  @ final group: loads do not advance r0/r1, so the stores below reuse them
   1.867 +        vld1.32         {d0-d1},  [r0,:128]
   1.868 +        vcle.s32        q8,  q1,  #0
   1.869 +        vand            q9,  q0,  q10
   1.870 +        veor            q1,  q1,  q9
   1.871 +        vand            q2,  q1,  q8
   1.872 +        vbic            q3,  q1,  q8
   1.873 +        vadd.f32        q1,  q0,  q2
   1.874 +        vsub.f32        q0,  q0,  q3
   1.875 +        vst1.32         {d2-d3},  [r0,:128]!
   1.876 +        vst1.32         {d0-d1},  [r1,:128]!
   1.877 +        bx              lr
   1.878 +endfunc
   1.879 +#endif
   1.880 +
   1.881 +function ff_vector_fmul_scalar_neon, export=1  @ r0[i] = r1[i] * scalar
   1.882 +VFP     len .req r2  @ variant with the scalar in s0: count is in r2
   1.883 +NOVFP   len .req r3  @ variant with the scalar in r2: count is in r3
   1.884 +VFP     vdup.32         q8,  d0[0]  @ q8 = scalar in all 4 lanes
   1.885 +NOVFP   vdup.32         q8,  r2
   1.886 +        bics            r12, len, #15  @ r12 = count rounded down to a multiple of 16
   1.887 +        beq             3f  @ fewer than 16 floats: only the 4-float loop runs
   1.888 +        vld1.32         {q0},[r1,:128]!  @ preload the first 8 floats of the first 16-float block
   1.889 +        vld1.32         {q1},[r1,:128]!
   1.890 +1:      vmul.f32        q0,  q0,  q8  @ main loop: 16 floats per iteration, loads interleaved with multiplies and stores
   1.891 +        vld1.32         {q2},[r1,:128]!
   1.892 +        vmul.f32        q1,  q1,  q8
   1.893 +        vld1.32         {q3},[r1,:128]!
   1.894 +        vmul.f32        q2,  q2,  q8
   1.895 +        vst1.32         {q0},[r0,:128]!
   1.896 +        vmul.f32        q3,  q3,  q8
   1.897 +        vst1.32         {q1},[r0,:128]!
   1.898 +        subs            r12, r12, #16
   1.899 +        beq             2f  @ last block: skip the preload for a following block
   1.900 +        vld1.32         {q0},[r1,:128]!
   1.901 +        vst1.32         {q2},[r0,:128]!
   1.902 +        vld1.32         {q1},[r1,:128]!
   1.903 +        vst1.32         {q3},[r0,:128]!
   1.904 +        b               1b
   1.905 +2:      vst1.32         {q2},[r0,:128]!  @ flush the second half of the last block
   1.906 +        vst1.32         {q3},[r0,:128]!
   1.907 +        ands            len, len, #15  @ floats left after the 16-float blocks
   1.908 +        bxeq            lr
   1.909 +3:      vld1.32         {q0},[r1,:128]!  @ tail loop: 4 floats per iteration
   1.910 +        vmul.f32        q0,  q0,  q8
   1.911 +        vst1.32         {q0},[r0,:128]!
   1.912 +        subs            len, len, #4
   1.913 +        bgt             3b
   1.914 +        bx              lr
   1.915 +        .unreq          len
   1.916 +endfunc
   1.917 +
   1.918 +function ff_vector_fmul_sv_scalar_2_neon, export=1  @ r0 = r1 * scalar * 2-float vectors fetched through the pointer table r2
   1.919 +VFP     vdup.32         d16, d0[0]  @ variant with the scalar in s0; count stays in r3
   1.920 +NOVFP   vdup.32         d16, r3  @ variant with the scalar in r3 ...
   1.921 +NOVFP   ldr             r3,  [sp]  @ ... whose count is then read from the stack into r3
   1.922 +        vld1.32         {d0},[r1,:64]!  @ preload the first 4 floats from r1
   1.923 +        vld1.32         {d1},[r1,:64]!
   1.924 +1:      subs            r3,  r3,  #4  @ 4 floats per iteration; the loop exits only when the count reaches exactly 0
   1.925 +        vmul.f32        d4,  d0,  d16
   1.926 +        vmul.f32        d5,  d1,  d16
   1.927 +        ldr             r12, [r2], #4  @ next pointer from the r2 table
   1.928 +        vld1.32         {d2},[r12,:64]  @ the 2 floats it points at (8-byte aligned access)
   1.929 +        ldr             r12, [r2], #4
   1.930 +        vld1.32         {d3},[r12,:64]
   1.931 +        vmul.f32        d4,  d4,  d2
   1.932 +        vmul.f32        d5,  d5,  d3
   1.933 +        beq             2f  @ flags still come from the subs above
   1.934 +        vld1.32         {d0},[r1,:64]!  @ preload the next 4 floats before storing the current results
   1.935 +        vld1.32         {d1},[r1,:64]!
   1.936 +        vst1.32         {d4},[r0,:64]!
   1.937 +        vst1.32         {d5},[r0,:64]!
   1.938 +        b               1b
   1.939 +2:      vst1.32         {d4},[r0,:64]!  @ final 4 results
   1.940 +        vst1.32         {d5},[r0,:64]!
   1.941 +        bx              lr
   1.942 +endfunc
   1.943 +
   1.944 +function ff_vector_fmul_sv_scalar_4_neon, export=1  @ r0 = r1 * scalar * 4-float vectors fetched through the pointer table r2
   1.945 +VFP     vdup.32         q10, d0[0]  @ variant with the scalar in s0; count stays in r3
   1.946 +NOVFP   vdup.32         q10, r3  @ variant with the scalar in r3 ...
   1.947 +NOVFP   ldr             r3,  [sp]  @ ... whose count is read from the stack; must happen before the push moves sp
   1.948 +        push            {lr}  @ lr is used as the block counter below
   1.949 +        bics            lr,  r3,  #7  @ lr = count rounded down to a multiple of 8
   1.950 +        beq             3f  @ fewer than 8 floats: only the 4-float loop runs
   1.951 +        vld1.32         {q0},[r1,:128]!  @ preload the first 8 floats from r1
   1.952 +        vld1.32         {q2},[r1,:128]!
   1.953 +1:      ldr             r12, [r2], #4  @ main loop, 8 floats per iteration: next pointer from the r2 table
   1.954 +        vld1.32         {q1},[r12,:128]  @ the 4 floats it points at (16-byte aligned access)
   1.955 +        ldr             r12, [r2], #4
   1.956 +        vld1.32         {q3},[r12,:128]
   1.957 +        vmul.f32        q8,  q0,  q10
   1.958 +        vmul.f32        q8,  q8,  q1
   1.959 +        vmul.f32        q9,  q2,  q10
   1.960 +        vmul.f32        q9,  q9,  q3
   1.961 +        subs            lr,  lr,  #8
   1.962 +        beq             2f
   1.963 +        vld1.32         {q0},[r1,:128]!  @ preload the next 8 floats before storing the current results
   1.964 +        vld1.32         {q2},[r1,:128]!
   1.965 +        vst1.32         {q8},[r0,:128]!
   1.966 +        vst1.32         {q9},[r0,:128]!
   1.967 +        b               1b
   1.968 +2:      vst1.32         {q8},[r0,:128]!  @ results of the last 8-float block
   1.969 +        vst1.32         {q9},[r0,:128]!
   1.970 +        ands            r3,  r3,  #7  @ floats left after the 8-float blocks
   1.971 +        popeq           {pc}
   1.972 +3:      vld1.32         {q0},[r1,:128]!  @ tail loop: 4 floats per iteration
   1.973 +        ldr             r12, [r2], #4
   1.974 +        vld1.32         {q1},[r12,:128]
   1.975 +        vmul.f32        q0,  q0,  q10
   1.976 +        vmul.f32        q0,  q0,  q1
   1.977 +        vst1.32         {q0},[r0,:128]!
   1.978 +        subs            r3,  r3,  #4
   1.979 +        bgt             3b
   1.980 +        pop             {pc}
   1.981 +endfunc
   1.982 +
   1.983 +function ff_sv_fmul_scalar_2_neon, export=1  @ r0 = scalar * 2-float vectors fetched through the pointer table r1
   1.984 +VFP     len .req r2  @ variant with the scalar in s0: count is in r2
   1.985 +NOVFP   len .req r3  @ variant with the scalar in r2: count is in r3
   1.986 +VFP     vdup.32         q8,  d0[0]  @ q8 = scalar in all 4 lanes
   1.987 +NOVFP   vdup.32         q8,  r2
   1.988 +        ldr             r12, [r1], #4  @ next pointer from the r1 table
   1.989 +        vld1.32         {d0},[r12,:64]  @ the 2 floats it points at (8-byte aligned access)
   1.990 +        ldr             r12, [r1], #4
   1.991 +        vld1.32         {d1},[r12,:64]
   1.992 +1:      vmul.f32        q1,  q0,  q8  @ 4 floats (two gathered pairs) per iteration
   1.993 +        subs            len, len, #4  @ the loop exits only when the count reaches exactly 0
   1.994 +        beq             2f
   1.995 +        ldr             r12, [r1], #4  @ gather the next two pairs before storing the current result
   1.996 +        vld1.32         {d0},[r12,:64]
   1.997 +        ldr             r12, [r1], #4
   1.998 +        vld1.32         {d1},[r12,:64]
   1.999 +        vst1.32         {q1},[r0,:128]!
  1.1000 +        b               1b
  1.1001 +2:      vst1.32         {q1},[r0,:128]!  @ final 4 results
  1.1002 +        bx              lr
  1.1003 +        .unreq          len
  1.1004 +endfunc
  1.1005 +
  1.1006 +function ff_sv_fmul_scalar_4_neon, export=1  @ r0 = scalar * 4-float vectors fetched through the pointer table r1
  1.1007 +VFP     len .req r2  @ variant with the scalar in s0: count is in r2
  1.1008 +NOVFP   len .req r3  @ variant with the scalar in r2: count is in r3
  1.1009 +VFP     vdup.32         q8,  d0[0]  @ q8 = scalar in all 4 lanes
  1.1010 +NOVFP   vdup.32         q8,  r2
  1.1011 +1:      ldr             r12, [r1], #4  @ next pointer from the r1 table
  1.1012 +        vld1.32         {q0},[r12,:128]  @ the 4 floats it points at (16-byte aligned access)
  1.1013 +        vmul.f32        q0,  q0,  q8
  1.1014 +        vst1.32         {q0},[r0,:128]!
  1.1015 +        subs            len, len, #4  @ 4 floats per iteration
  1.1016 +        bgt             1b
  1.1017 +        bx              lr
  1.1018 +        .unreq          len
  1.1019 +endfunc
  1.1020 +
  1.1021 +function ff_butterflies_float_neon, export=1  @ in place: r0[i] becomes the sum and r1[i] the difference (r0 - r1) of the old values; r2 = float count
  1.1022 +1:      vld1.32         {q0},[r0,:128]  @ loads do not advance the pointers; the stores below do
  1.1023 +        vld1.32         {q1},[r1,:128]
  1.1024 +        vsub.f32        q2,  q0,  q1  @ difference goes to q2 so q1 is still intact for the add
  1.1025 +        vadd.f32        q1,  q0,  q1
  1.1026 +        vst1.32         {q2},[r1,:128]!
  1.1027 +        vst1.32         {q1},[r0,:128]!
  1.1028 +        subs            r2,  r2,  #4  @ 4 floats per iteration
  1.1029 +        bgt             1b
  1.1030 +        bx              lr
  1.1031 +endfunc
  1.1032 +
  1.1033 +function ff_scalarproduct_float_neon, export=1  @ dot product of the float arrays r0 and r1; r2 = float count
  1.1034 +        vmov.f32        q2,  #0.0  @ q2 = four running partial sums
  1.1035 +1:      vld1.32         {q0},[r0,:128]!
  1.1036 +        vld1.32         {q1},[r1,:128]!
  1.1037 +        vmla.f32        q2,  q0,  q1
  1.1038 +        subs            r2,  r2,  #4  @ 4 floats per iteration
  1.1039 +        bgt             1b
  1.1040 +        vadd.f32        d0,  d4,  d5  @ fold the 4 partial sums down to 2
  1.1041 +        vpadd.f32       d0,  d0,  d0  @ fold to 1: total is now in d0[0] (s0)
  1.1042 +NOVFP   vmov.32         r0,  d0[0]  @ this variant also copies the result to r0
  1.1043 +        bx              lr
  1.1044 +endfunc
  1.1045 +
  1.1046 +function ff_int32_to_float_fmul_scalar_neon, export=1  @ r0[i] = (float)r1[i] * scalar, with r1 holding signed 32-bit integers
  1.1047 +VFP     vdup.32         q0,  d0[0]  @ variant with the scalar in s0: broadcast it to q0 ...
  1.1048 +VFP     len     .req    r2  @ ... and the count is in r2
  1.1049 +NOVFP   vdup.32         q0,  r2  @ variant with the scalar in r2 ...
  1.1050 +NOVFP   len     .req    r3  @ ... and the count is in r3
  1.1051 +
  1.1052 +        vld1.32         {q1},[r1,:128]!  @ preload and convert the first 8 integers
  1.1053 +        vcvt.f32.s32    q3,  q1
  1.1054 +        vld1.32         {q2},[r1,:128]!
  1.1055 +        vcvt.f32.s32    q8,  q2
  1.1056 +1:      subs            len, len, #8  @ 8 values per iteration; the loop exits only when the count reaches exactly 0
  1.1057 +        pld             [r1, #16]  @ prefetch hint for upcoming input
  1.1058 +        vmul.f32        q9,  q3,  q0
  1.1059 +        vmul.f32        q10, q8,  q0
  1.1060 +        beq             2f  @ flags still come from the subs above
  1.1061 +        vld1.32         {q1},[r1,:128]!  @ load and convert the next 8 before storing the current results
  1.1062 +        vcvt.f32.s32    q3,  q1
  1.1063 +        vld1.32         {q2},[r1,:128]!
  1.1064 +        vcvt.f32.s32    q8,  q2
  1.1065 +        vst1.32         {q9}, [r0,:128]!
  1.1066 +        vst1.32         {q10},[r0,:128]!
  1.1067 +        b               1b
  1.1068 +2:      vst1.32         {q9}, [r0,:128]!  @ final 8 results
  1.1069 +        vst1.32         {q10},[r0,:128]!
  1.1070 +        bx              lr
  1.1071 +        .unreq  len
  1.1072 +endfunc
  1.1073 +
  1.1074 +function ff_vector_fmul_reverse_neon, export=1  @ r0[i] = r1[i] * r2[count-1-i]; r3 = float count
  1.1075 +        add             r2,  r2,  r3,  lsl #2
  1.1076 +        sub             r2,  r2,  #32  @ r2 now points at the last 8 floats of its array
  1.1077 +        mov             r12, #-32  @ post-index step: r2 walks backward 8 floats at a time
  1.1078 +        vld1.32         {q0-q1},  [r1,:128]!
  1.1079 +        vld1.32         {q2-q3},  [r2,:128], r12
  1.1080 +1:      pld             [r1, #32]  @ prefetch hint for the forward input
  1.1081 +        vrev64.32       q3,  q3  @ swap lanes within each d; pairing d0 with d7, d1 with d6 etc. completes the 8-float reversal
  1.1082 +        vmul.f32        d16, d0,  d7
  1.1083 +        vmul.f32        d17, d1,  d6
  1.1084 +        pld             [r2, #-32]  @ prefetch hint for the backward input
  1.1085 +        vrev64.32       q2,  q2
  1.1086 +        vmul.f32        d18, d2,  d5
  1.1087 +        vmul.f32        d19, d3,  d4
  1.1088 +        subs            r3,  r3,  #8  @ 8 floats per iteration; the loop exits only when the count reaches exactly 0
  1.1089 +        beq             2f
  1.1090 +        vld1.32         {q0-q1},  [r1,:128]!  @ load the next 8 from each input before storing the current results
  1.1091 +        vld1.32         {q2-q3},  [r2,:128], r12
  1.1092 +        vst1.32         {q8-q9},  [r0,:128]!
  1.1093 +        b               1b
  1.1094 +2:      vst1.32         {q8-q9},  [r0,:128]!  @ final 8 results
  1.1095 +        bx              lr
  1.1096 +endfunc
  1.1097 +
  1.1098 +function ff_vector_fmul_add_neon, export=1  @ r0[i] = r1[i] * r2[i] + r3[i]
  1.1099 +        ldr             r12, [sp]  @ r12 = float count, read from the first stack word
  1.1100 +        vld1.32         {q0-q1},  [r1,:128]!  @ preload the first 8 floats of each input
  1.1101 +        vld1.32         {q8-q9},  [r2,:128]!
  1.1102 +        vld1.32         {q2-q3},  [r3,:128]!
  1.1103 +        vmul.f32        q10, q0,  q8
  1.1104 +        vmul.f32        q11, q1,  q9
  1.1105 +1:      vadd.f32        q12, q2,  q10  @ q12/q13 = finished results for the current 8 floats
  1.1106 +        vadd.f32        q13, q3,  q11
  1.1107 +        pld             [r1, #16]  @ prefetch hints for the three inputs
  1.1108 +        pld             [r2, #16]
  1.1109 +        pld             [r3, #16]
  1.1110 +        subs            r12, r12, #8  @ 8 floats per iteration; the loop exits only when the count reaches exactly 0
  1.1111 +        beq             2f
  1.1112 +        vld1.32         {q0},     [r1,:128]!  @ load and multiply the next 8 before storing the current results
  1.1113 +        vld1.32         {q8},     [r2,:128]!
  1.1114 +        vmul.f32        q10, q0,  q8
  1.1115 +        vld1.32         {q1},     [r1,:128]!
  1.1116 +        vld1.32         {q9},     [r2,:128]!
  1.1117 +        vmul.f32        q11, q1,  q9
  1.1118 +        vld1.32         {q2-q3},  [r3,:128]!
  1.1119 +        vst1.32         {q12-q13},[r0,:128]!
  1.1120 +        b               1b
  1.1121 +2:      vst1.32         {q12-q13},[r0,:128]!  @ final 8 results
  1.1122 +        bx              lr
  1.1123 +endfunc
  1.1124 +
  1.1125 +function ff_vector_clipf_neon, export=1  @ r0[i] = r1[i] clamped to the range [q0 lanes, q1 lanes]
  1.1126 +VFP     vdup.32         q1,  d0[1]  @ variant with the bounds in s0/s1: take the upper bound from s1 first, since the next line overwrites d0
  1.1127 +VFP     vdup.32         q0,  d0[0]  @ lower bound from s0; the count stays in r2
  1.1128 +NOVFP   vdup.32         q0,  r2  @ variant with the bounds in r2/r3: lower bound from r2
  1.1129 +NOVFP   vdup.32         q1,  r3  @ upper bound from r3
  1.1130 +NOVFP   ldr             r2,  [sp]  @ the count then comes from the first stack word
  1.1131 +        vld1.f32        {q2},[r1,:128]!  @ preload the first 8 floats and apply the upper bound
  1.1132 +        vmin.f32        q10, q2,  q1
  1.1133 +        vld1.f32        {q3},[r1,:128]!
  1.1134 +        vmin.f32        q11, q3,  q1
  1.1135 +1:      vmax.f32        q8,  q10, q0  @ apply the lower bound; q8/q9 = finished results
  1.1136 +        vmax.f32        q9,  q11, q0
  1.1137 +        subs            r2,  r2,  #8  @ 8 floats per iteration; the loop exits only when the count reaches exactly 0
  1.1138 +        beq             2f
  1.1139 +        vld1.f32        {q2},[r1,:128]!  @ load and upper-clamp the next 8 before storing the current results
  1.1140 +        vmin.f32        q10, q2,  q1
  1.1141 +        vld1.f32        {q3},[r1,:128]!
  1.1142 +        vmin.f32        q11, q3,  q1
  1.1143 +        vst1.f32        {q8},[r0,:128]!
  1.1144 +        vst1.f32        {q9},[r0,:128]!
  1.1145 +        b               1b
  1.1146 +2:      vst1.f32        {q8},[r0,:128]!  @ final 8 results
  1.1147 +        vst1.f32        {q9},[r0,:128]!
  1.1148 +        bx              lr
  1.1149 +endfunc