Mercurial > cgi-bin > hgwebdir.cgi > PR > Applications > VSs > VSs__H264__App
diff libavcodec/arm/dsputil_neon.S @ 2:897f711a7157
rearrange to work with autoconf
| author | Nina Engelhardt <nengel@mailbox.tu-berlin.de> |
|---|---|
| date | Tue, 25 Sep 2012 15:55:33 +0200 |
| parents | |
| children | |
line diff
/*
 * Provenance: reconstructed from an hgweb diff view.  The per-line
 * "1.N +" markers of that view have been removed and one statement per
 * line restored.  Original unified-diff header:
 *
 *   --- /dev/null Thu Jan 01 00:00:00 1970 +0000
 *   +++ b/libavcodec/arm/dsputil_neon.S Tue Sep 25 15:55:33 2012 +0200
 *   @@ -0,0 +1,1146 @@
 */
/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
 * Conventions used throughout this file (as visible in the code below):
 *  - Syntax is GNU as for 32-bit ARM with NEON; the file is run through
 *    the C preprocessor (see the #include / #if lines).
 *  - function / endfunc / preserve8 / VFP / NOVFP are not defined here;
 *    they are expected to come from asm.S.
 *  - Lines prefixed VFP and NOVFP come in alternative pairs.
 *    NOTE(review): presumably asm.S keeps only the variant matching the
 *    float ABI (scalar float argument in d0/s0 vs. in a core register or
 *    on the stack) -- confirm against asm.S.
 *  - Numeric local labels (1:, 2:, ...) are reused; "1b"/"1f" refer to
 *    the nearest definition backwards/forwards.
 *  - A ":64" / ":128" qualifier on a NEON address asserts that the
 *    pointer is aligned to that many bits.
 */

#include "config.h"
#include "asm.S"

        preserve8
        .text

/*
 * pixels16: copy (avg=0) or average-into-destination (avg=1) rows of
 * 16 bytes.
 *   r0 = destination, 16-byte aligned, advanced by r2 per row
 *   r1 = source (no alignment asserted), advanced by r2 per row
 *   r2 = stride in bytes, shared by source and destination
 *   r3 = row count; decremented by 4 per iteration and the loop ends
 *        only when it reaches exactly zero (bne)
 * With avg=1 the existing destination rows are read through ip (a copy
 * of r0) and combined with the source using a rounding halving add.
 * Clobbers q0-q3, flags; additionally ip and q8-q11 when avg=1.
 */
        .macro pixels16 avg=0
.if \avg
        mov             ip,  r0
.endif
1:      vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d2, d3},  [r1], r2
        vld1.64         {d4, d5},  [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d6, d7},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
.if \avg
        vld1.64         {d16,d17}, [ip,:128], r2
        vrhadd.u8       q0,  q0,  q8
        vld1.64         {d18,d19}, [ip,:128], r2
        vrhadd.u8       q1,  q1,  q9
        vld1.64         {d20,d21}, [ip,:128], r2
        vrhadd.u8       q2,  q2,  q10
        vld1.64         {d22,d23}, [ip,:128], r2
        vrhadd.u8       q3,  q3,  q11
.endif
        subs            r3,  r3,  #4
        vst1.64         {d0, d1},  [r0,:128], r2
        vst1.64         {d2, d3},  [r0,:128], r2
        vst1.64         {d4, d5},  [r0,:128], r2
        vst1.64         {d6, d7},  [r0,:128], r2
        bne             1b
        bx              lr
        .endm

/*
 * pixels16_x2: each output byte is the halving add of a source byte and
 * its right-hand neighbour (horizontal half-pel), 16 bytes per row.
 *   r0 = destination (16-byte aligned), r1 = source, r2 = stride,
 *   r3 = row count, 2 rows per iteration.
 *   vhadd = halving-add instruction to use; the default rounds,
 *           vhadd.u8 truncates.
 * 24 source bytes are loaded per row so that byte 16 is available for
 * the shifted copy built with vext.
 * Clobbers q0-q3, flags.
 */
        .macro pixels16_x2 vhadd=vrhadd.u8
1:      vld1.64         {d0-d2},   [r1], r2
        vld1.64         {d4-d6},   [r1], r2
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vext.8          q1,  q0,  q1,  #1
        \vhadd          q0,  q0,  q1
        vext.8          q3,  q2,  q3,  #1
        \vhadd          q2,  q2,  q3
        vst1.64         {d0, d1},  [r0,:128], r2
        vst1.64         {d4, d5},  [r0,:128], r2
        bne             1b
        bx              lr
        .endm

/*
 * pixels16_y2: each output row is the halving add of two consecutive
 * source rows (vertical half-pel), 16 bytes per row.
 *   r0 = destination (16-byte aligned), r1 = source, r2 = stride,
 *   r3 = row count, 2 rows per iteration.
 * Two rows are preloaded before the loop, and each iteration loads two
 * more, so the source is read two rows ahead of the rows being stored.
 * Clobbers q0-q3, flags.
 */
        .macro pixels16_y2 vhadd=vrhadd.u8
        vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d2, d3},  [r1], r2
1:      subs            r3,  r3,  #2
        \vhadd          q2,  q0,  q1
        vld1.64         {d0, d1},  [r1], r2
        \vhadd          q3,  q0,  q1
        vld1.64         {d2, d3},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        vst1.64         {d4, d5},  [r0,:128], r2
        vst1.64         {d6, d7},  [r0,:128], r2
        bne             1b
        bx              lr
        .endm

/*
 * pixels16_xy2: each output byte is built from a 2x2 source
 * neighbourhood: the four bytes are summed in 16 bits and narrowed
 * with a right shift by 2.
 *   r0 = destination (16-byte aligned), r1 = source, r2 = stride,
 *   r3 = row count, 2 rows per iteration (loop continues while > 0).
 *   vshrn  = narrowing shift; the default vrshrn.u16 rounds.
 *   no_rnd = when non-zero, 1 is added to every sum before the shift
 *            (q13 holds the constant); intended for use with the
 *            truncating vshrn.u16.
 * q8/q10 and q9/q11 hold the horizontal pair sums (low/high 8 pixels)
 * of the two most recent source rows and are reused across iterations.
 * Clobbers q0-q3, q8-q15, flags.
 */
        .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0
        vld1.64         {d0-d2},   [r1], r2
        vld1.64         {d4-d6},   [r1], r2
.if \no_rnd
        vmov.i16        q13, #1
.endif
        pld             [r1]
        pld             [r1, r2]
        vext.8          q1,  q0,  q1,  #1
        vext.8          q3,  q2,  q3,  #1
        vaddl.u8        q8,  d0,  d2
        vaddl.u8        q10, d1,  d3
        vaddl.u8        q9,  d4,  d6
        vaddl.u8        q11, d5,  d7
1:      subs            r3,  r3,  #2
        vld1.64         {d0-d2},   [r1], r2
        vadd.u16        q12, q8,  q9
        pld             [r1]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1,  q10, q11
        \vshrn          d28, q12, #2
.if \no_rnd
        vadd.u16        q1,  q1,  q13
.endif
        \vshrn          d29, q1,  #2
        vaddl.u8        q8,  d0,  d30
        vld1.64         {d2-d4},   [r1], r2
        vaddl.u8        q10, d1,  d31
        vst1.64         {d28,d29}, [r0,:128], r2
        vadd.u16        q12, q8,  q9
        pld             [r1, r2]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q2,  q1,  q2,  #1
        vadd.u16        q0,  q10, q11
        \vshrn          d30, q12, #2
.if \no_rnd
        vadd.u16        q0,  q0,  q13
.endif
        \vshrn          d31, q0,  #2
        vaddl.u8        q9,  d2,  d4
        vaddl.u8        q11, d3,  d5
        vst1.64         {d30,d31}, [r0,:128], r2
        bgt             1b
        bx              lr
        .endm

/*
 * pixels8: 8-byte-wide counterpart of pixels16.
 *   r0 = destination (8-byte aligned), r1 = source, r2 = stride,
 *   r3 = row count, 4 rows per iteration (must reach exactly zero).
 * With avg=1 the destination rows are read through r0 itself, which is
 * then rewound by 4 rows (sub r0, r0, r2, lsl #2) before the stores.
 * Clobbers d0-d3, flags; additionally d4-d7 when avg=1.
 */
        .macro pixels8 avg=0
1:      vld1.64         {d0}, [r1], r2
        vld1.64         {d1}, [r1], r2
        vld1.64         {d2}, [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d3}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
.if \avg
        vld1.64         {d4}, [r0,:64], r2
        vrhadd.u8       d0,  d0,  d4
        vld1.64         {d5}, [r0,:64], r2
        vrhadd.u8       d1,  d1,  d5
        vld1.64         {d6}, [r0,:64], r2
        vrhadd.u8       d2,  d2,  d6
        vld1.64         {d7}, [r0,:64], r2
        vrhadd.u8       d3,  d3,  d7
        sub             r0,  r0,  r2,  lsl #2
.endif
        subs            r3,  r3,  #4
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        vst1.64         {d2}, [r0,:64], r2
        vst1.64         {d3}, [r0,:64], r2
        bne             1b
        bx              lr
        .endm

/*
 * pixels8_x2: 8-byte-wide horizontal half-pel.
 *   r0 = destination (8-byte aligned), r1 = source, r2 = stride,
 *   r3 = row count, 2 rows per iteration.
 * After the vswp, q0 = {row0, row1} and q1 = {row0 shifted, row1
 * shifted}, so one q-register halving add handles both rows.
 * Clobbers q0-q1, flags.
 */
        .macro pixels8_x2 vhadd=vrhadd.u8
1:      vld1.64         {d0, d1},  [r1], r2
        vext.8          d1,  d0,  d1,  #1
        vld1.64         {d2, d3},  [r1], r2
        vext.8          d3,  d2,  d3,  #1
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vswp            d1,  d2
        \vhadd          q0,  q0,  q1
        vst1.64         {d0},      [r0,:64], r2
        vst1.64         {d1},      [r0,:64], r2
        bne             1b
        bx              lr
        .endm

/*
 * pixels8_y2: 8-byte-wide vertical half-pel.
 *   r0 = destination (8-byte aligned), r1 = source, r2 = stride,
 *   r3 = row count, 2 rows per iteration.
 * Same read-ahead structure as pixels16_y2.
 * Clobbers d0, d1, d4, d5, flags.
 */
        .macro pixels8_y2 vhadd=vrhadd.u8
        vld1.64         {d0},      [r1], r2
        vld1.64         {d1},      [r1], r2
1:      subs            r3,  r3,  #2
        \vhadd          d4,  d0,  d1
        vld1.64         {d0},      [r1], r2
        \vhadd          d5,  d0,  d1
        vld1.64         {d1},      [r1], r2
        pld             [r1]
        pld             [r1, r2]
        vst1.64         {d4},      [r0,:64], r2
        vst1.64         {d5},      [r0,:64], r2
        bne             1b
        bx              lr
        .endm

/*
 * pixels8_xy2: 8-byte-wide 2x2 filter; see pixels16_xy2 for the scheme.
 *   r0 = destination (8-byte aligned), r1 = source, r2 = stride,
 *   r3 = row count, 2 rows per iteration (loop continues while > 0).
 *   q11 holds the constant 1 when no_rnd is non-zero.
 * q8/q9 carry the horizontal pair sums of the two most recent rows.
 * Clobbers q0-q3, q8-q10 (and q11 when no_rnd), flags.
 */
        .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
        vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d2, d3},  [r1], r2
.if \no_rnd
        vmov.i16        q11, #1
.endif
        pld             [r1]
        pld             [r1, r2]
        vext.8          d4,  d0,  d1,  #1
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q8,  d0,  d4
        vaddl.u8        q9,  d2,  d6
1:      subs            r3,  r3,  #2
        vld1.64         {d0, d1},  [r1], r2
        pld             [r1]
        vadd.u16        q10, q8,  q9
        vext.8          d4,  d0,  d1,  #1
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vaddl.u8        q8,  d0,  d4
        \vshrn          d5,  q10, #2
        vld1.64         {d2, d3},  [r1], r2
        vadd.u16        q10, q8,  q9
        pld             [r1, r2]
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vst1.64         {d5},      [r0,:64], r2
        \vshrn          d7,  q10, #2
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q9,  d2,  d6
        vst1.64         {d7},      [r0,:64], r2
        bgt             1b
        bx              lr
        .endm

/*
 * pixfunc: emit one exported function named ff_<pfx><name><suf>_neon
 * whose body is the macro <name> invoked with <rnd_op> and any extra
 * arguments.
 */
        .macro pixfunc pfx name suf rnd_op args:vararg
function ff_\pfx\name\suf\()_neon, export=1
        \name \rnd_op \args
endfunc
        .endm

/*
 * pixfunc2: emit two variants of <name>: first with the macro defaults
 * and no suffix, then with the supplied suffix/arguments.
 */
        .macro pixfunc2 pfx name args:vararg
        pixfunc \pfx \name
        pixfunc \pfx \name \args
        .endm

/*
 * The *_mc00 entry points below only load a row count into r3 and
 * contain no return instruction.
 * NOTE(review): presumably execution falls through into the function
 * emitted by the pixfunc line that immediately follows each of them;
 * this relies on function/endfunc not inserting anything executable
 * or any padding that breaks the fall-through -- confirm in asm.S.
 * Do not reorder these stubs relative to the following pixfunc line.
 */
function ff_put_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
endfunc

        pixfunc  put_ pixels16
        pixfunc2 put_ pixels16_x2,  _no_rnd, vhadd.u8
        pixfunc2 put_ pixels16_y2,  _no_rnd, vhadd.u8
        pixfunc2 put_ pixels16_xy2, _no_rnd, vshrn.u16, 1

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
endfunc

        pixfunc  avg_ pixels16,, 1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov             r3,  #8
endfunc

        pixfunc  put_ pixels8
        pixfunc2 put_ pixels8_x2,   _no_rnd, vhadd.u8
        pixfunc2 put_ pixels8_y2,   _no_rnd, vhadd.u8
        pixfunc2 put_ pixels8_xy2,  _no_rnd, vshrn.u16, 1

function ff_avg_h264_qpel8_mc00_neon, export=1
        mov             r3,  #8
endfunc

        pixfunc  avg_ pixels8,, 1

/*
 * ff_put_pixels_clamped_neon
 *   r0 = source: 64 signed 16-bit values, contiguous, 16-byte aligned
 *   r1 = destination, 8-byte aligned, 8 rows of 8 bytes
 *   r2 = destination stride in bytes
 * Each value is saturated to unsigned 8 bits (vqmovun) and stored.
 * Loads and stores are interleaved by hand; the order is deliberate.
 * Clobbers r0, r1, q0-q3, q8-q15.
 */
function ff_put_pixels_clamped_neon, export=1
        vld1.64         {d16-d19}, [r0,:128]!
        vqmovun.s16     d0,  q8
        vld1.64         {d20-d23}, [r0,:128]!
        vqmovun.s16     d1,  q9
        vld1.64         {d24-d27}, [r0,:128]!
        vqmovun.s16     d2,  q10
        vld1.64         {d28-d31}, [r0,:128]!
        vqmovun.s16     d3,  q11
        vst1.64         {d0},      [r1,:64], r2
        vqmovun.s16     d4,  q12
        vst1.64         {d1},      [r1,:64], r2
        vqmovun.s16     d5,  q13
        vst1.64         {d2},      [r1,:64], r2
        vqmovun.s16     d6,  q14
        vst1.64         {d3},      [r1,:64], r2
        vqmovun.s16     d7,  q15
        vst1.64         {d4},      [r1,:64], r2
        vst1.64         {d5},      [r1,:64], r2
        vst1.64         {d6},      [r1,:64], r2
        vst1.64         {d7},      [r1,:64], r2
        bx              lr
endfunc

/*
 * ff_put_signed_pixels_clamped_neon
 *   r0 = source: 64 signed 16-bit values, contiguous, 16-byte aligned
 *   r1 = destination, 8-byte aligned, 8 rows of 8 bytes
 *   r2 = destination stride in bytes
 * Each value is saturated to signed 8 bits (vqmovn) and then biased by
 * 128 with a wrapping byte add (d31 holds 128 in every lane).
 * Clobbers r0, r1, q0-q3, q8-q13, d31.
 */
function ff_put_signed_pixels_clamped_neon, export=1
        vmov.u8         d31, #128
        vld1.64         {d16-d17}, [r0,:128]!
        vqmovn.s16      d0,  q8
        vld1.64         {d18-d19}, [r0,:128]!
        vqmovn.s16      d1,  q9
        vld1.64         {d16-d17}, [r0,:128]!
        vqmovn.s16      d2,  q8
        vld1.64         {d18-d19}, [r0,:128]!
        vadd.u8         d0,  d0,  d31
        vld1.64         {d20-d21}, [r0,:128]!
        vadd.u8         d1,  d1,  d31
        vld1.64         {d22-d23}, [r0,:128]!
        vadd.u8         d2,  d2,  d31
        vst1.64         {d0},      [r1,:64], r2
        vqmovn.s16      d3,  q9
        vst1.64         {d1},      [r1,:64], r2
        vqmovn.s16      d4,  q10
        vst1.64         {d2},      [r1,:64], r2
        vqmovn.s16      d5,  q11
        vld1.64         {d24-d25}, [r0,:128]!
        vadd.u8         d3,  d3,  d31
        vld1.64         {d26-d27}, [r0,:128]!
        vadd.u8         d4,  d4,  d31
        vadd.u8         d5,  d5,  d31
        vst1.64         {d3},      [r1,:64], r2
        vqmovn.s16      d6,  q12
        vst1.64         {d4},      [r1,:64], r2
        vqmovn.s16      d7,  q13
        vst1.64         {d5},      [r1,:64], r2
        vadd.u8         d6,  d6,  d31
        vadd.u8         d7,  d7,  d31
        vst1.64         {d6},      [r1,:64], r2
        vst1.64         {d7},      [r1,:64], r2
        bx              lr
endfunc

/*
 * ff_add_pixels_clamped_neon
 *   r0 = source: 64 16-bit values, contiguous, 16-byte aligned
 *   r1 = pixel block, 8-byte aligned, 8 rows of 8 bytes; read AND
 *        written (the write pointer is r3, a copy of the initial r1)
 *   r2 = stride in bytes of the pixel block
 * Each existing pixel is widened and added to the matching 16-bit
 * value, then saturated to unsigned 8 bits and written back in place.
 * Clobbers r0, r1, r3, q0-q3, q8-q9.
 */
function ff_add_pixels_clamped_neon, export=1
        mov             r3,  r1
        vld1.64         {d16},     [r1,:64], r2
        vld1.64         {d0-d1},   [r0,:128]!
        vaddw.u8        q0,  q0,  d16
        vld1.64         {d17},     [r1,:64], r2
        vld1.64         {d2-d3},   [r0,:128]!
        vqmovun.s16     d0,  q0
        vld1.64         {d18},     [r1,:64], r2
        vaddw.u8        q1,  q1,  d17
        vld1.64         {d4-d5},   [r0,:128]!
        vaddw.u8        q2,  q2,  d18
        vst1.64         {d0},      [r3,:64], r2
        vqmovun.s16     d2,  q1
        vld1.64         {d19},     [r1,:64], r2
        vld1.64         {d6-d7},   [r0,:128]!
        vaddw.u8        q3,  q3,  d19
        vqmovun.s16     d4,  q2
        vst1.64         {d2},      [r3,:64], r2
        vld1.64         {d16},     [r1,:64], r2
        vqmovun.s16     d6,  q3
        vld1.64         {d0-d1},   [r0,:128]!
        vaddw.u8        q0,  q0,  d16
        vst1.64         {d4},      [r3,:64], r2
        vld1.64         {d17},     [r1,:64], r2
        vld1.64         {d2-d3},   [r0,:128]!
        vaddw.u8        q1,  q1,  d17
        vst1.64         {d6},      [r3,:64], r2
        vqmovun.s16     d0,  q0
        vld1.64         {d18},     [r1,:64], r2
        vld1.64         {d4-d5},   [r0,:128]!
        vaddw.u8        q2,  q2,  d18
        vst1.64         {d0},      [r3,:64], r2
        vqmovun.s16     d2,  q1
        vld1.64         {d19},     [r1,:64], r2
        vqmovun.s16     d4,  q2
        vld1.64         {d6-d7},   [r0,:128]!
        vaddw.u8        q3,  q3,  d19
        vst1.64         {d2},      [r3,:64], r2
        vqmovun.s16     d6,  q3
        vst1.64         {d4},      [r3,:64], r2
        vst1.64         {d6},      [r3,:64], r2
        bx              lr
endfunc

/*
 * ff_float_to_int16_neon
 *   r0 = destination (16-bit elements), 16-byte aligned
 *   r1 = source (32-bit floats), 16-byte aligned
 *   r2 = element count
 * Each float is converted to signed 32-bit fixed point with 16 fraction
 * bits (vcvt ..., #16) and the upper 16 bits are kept (vshrn #16).
 * Structure: the first 8 elements are always loaded and converted;
 *   1: main loop, 16 elements per iteration (count taken from the
 *      remaining length with the low 4 bits cleared),
 *   2: tail that finishes the pending 8 and handles 8 more,
 *   3: tail that only finishes the pending 8.
 * NOTE(review): the code handles lengths in steps of 8 only and always
 * processes at least 8 elements -- confirm callers guarantee this.
 * Clobbers r0-r2, ip, q0-q3, q8-q9, flags.
 */
function ff_float_to_int16_neon, export=1
        subs            r2,  r2,  #8
        vld1.64         {d0-d1},   [r1,:128]!
        vcvt.s32.f32    q8,  q0,  #16
        vld1.64         {d2-d3},   [r1,:128]!
        vcvt.s32.f32    q9,  q1,  #16
        beq             3f
        bics            ip,  r2,  #15
        beq             2f
1:      subs            ip,  ip,  #16
        vshrn.s32       d4,  q8,  #16
        vld1.64         {d0-d1},   [r1,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vshrn.s32       d5,  q9,  #16
        vld1.64         {d2-d3},   [r1,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vshrn.s32       d6,  q0,  #16
        vst1.64         {d4-d5},   [r0,:128]!
        vshrn.s32       d7,  q1,  #16
        vld1.64         {d16-d17}, [r1,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19}, [r1,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.64         {d6-d7},   [r0,:128]!
        bne             1b
        ands            r2,  r2,  #15
        beq             3f
2:      vld1.64         {d0-d1},   [r1,:128]!
        vshrn.s32       d4,  q8,  #16
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},   [r1,:128]!
        vshrn.s32       d5,  q9,  #16
        vcvt.s32.f32    q1,  q1,  #16
        vshrn.s32       d6,  q0,  #16
        vst1.64         {d4-d5},   [r0,:128]!
        vshrn.s32       d7,  q1,  #16
        vst1.64         {d6-d7},   [r0,:128]!
        bx              lr
3:      vshrn.s32       d4,  q8,  #16
        vshrn.s32       d5,  q9,  #16
        vst1.64         {d4-d5},   [r0,:128]!
        bx              lr
endfunc

/*
 * ff_float_to_int16_interleave_neon
 *   r0 = destination (16-bit elements)
 *   r1 = pointer to an array of per-channel source pointers; each
 *        channel buffer holds 32-bit floats and is 16-byte aligned
 *   r2 = element count per channel
 *   r3 = channel count
 * Dispatch on r3:
 *   < 2 : load the single channel pointer into r1 and tail-call
 *         ff_float_to_int16_neon
 *   = 2 : dedicated stereo path (labels 1-3); vsri merges the upper
 *         halves of the two converted channels into 32-bit pairs
 *   > 2 : generic path (first 4:), which consumes channels 4 at a time
 *         (5:-7:), then 2 (second 4:), then 1 (third 4:).  In this
 *         path ip = 2 * channel count is the output step in bytes
 *         between consecutive samples of the same channel, r8/r5 is
 *         the running write pointer and lr the per-channel counter.
 * Conversion is the same fixed-point scheme as ff_float_to_int16_neon.
 * The generic path saves r4-r8 and lr and returns by popping into pc.
 */
function ff_float_to_int16_interleave_neon, export=1
        cmp             r3,  #2
        ldrlt           r1,  [r1]
        blt             ff_float_to_int16_neon
        bne             4f

        @ exactly two channels: r3 = channel 0, r1 = channel 1
        ldr             r3,  [r1]
        ldr             r1,  [r1, #4]

        subs            r2,  r2,  #8
        vld1.64         {d0-d1},   [r3,:128]!
        vcvt.s32.f32    q8,  q0,  #16
        vld1.64         {d2-d3},   [r3,:128]!
        vcvt.s32.f32    q9,  q1,  #16
        vld1.64         {d20-d21}, [r1,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23}, [r1,:128]!
        vcvt.s32.f32    q11, q11, #16
        beq             3f
        bics            ip,  r2,  #15
        beq             2f
1:      subs            ip,  ip,  #16
        vld1.64         {d0-d1},   [r3,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         q10, q8,  #16
        vld1.64         {d2-d3},   [r3,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vld1.64         {d24-d25}, [r1,:128]!
        vcvt.s32.f32    q12, q12, #16
        vld1.64         {d26-d27}, [r1,:128]!
        vsri.32         q11, q9,  #16
        vst1.64         {d20-d21}, [r0,:128]!
        vcvt.s32.f32    q13, q13, #16
        vst1.64         {d22-d23}, [r0,:128]!
        vsri.32         q12, q0,  #16
        vld1.64         {d16-d17}, [r3,:128]!
        vsri.32         q13, q1,  #16
        vst1.64         {d24-d25}, [r0,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19}, [r3,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21}, [r1,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23}, [r1,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.64         {d26-d27}, [r0,:128]!
        bne             1b
        ands            r2,  r2,  #15
        beq             3f
2:      vsri.32         q10, q8,  #16
        vld1.64         {d0-d1},   [r3,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},   [r3,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vld1.64         {d24-d25}, [r1,:128]!
        vcvt.s32.f32    q12, q12, #16
        vsri.32         q11, q9,  #16
        vld1.64         {d26-d27}, [r1,:128]!
        vcvt.s32.f32    q13, q13, #16
        vst1.64         {d20-d21}, [r0,:128]!
        vsri.32         q12, q0,  #16
        vst1.64         {d22-d23}, [r0,:128]!
        vsri.32         q13, q1,  #16
        vst1.64         {d24-d27}, [r0,:128]!
        bx              lr
3:      vsri.32         q10, q8,  #16
        vsri.32         q11, q9,  #16
        vst1.64         {d20-d23}, [r0,:128]!
        bx              lr

        @ generic path: more than two channels
4:      push            {r4-r8,lr}
        cmp             r3,  #4
        lsl             ip,  r3,  #1
        blt             4f

        @ 4 channels
5:      ldmia           r1!, {r4-r7}
        mov             lr,  r2
        mov             r8,  r0
        vld1.64         {d16-d17}, [r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19}, [r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21}, [r6,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23}, [r7,:128]!
        vcvt.s32.f32    q11, q11, #16
6:      subs            lr,  lr,  #8
        vld1.64         {d0-d1},   [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         q9,  q8,  #16
        vld1.64         {d2-d3},   [r5,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vsri.32         q11, q10, #16
        vld1.64         {d4-d5},   [r6,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vzip.32         d18, d22
        vld1.64         {d6-d7},   [r7,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vzip.32         d19, d23
        vst1.64         {d18},     [r8], ip
        vsri.32         q1,  q0,  #16
        vst1.64         {d22},     [r8], ip
        vsri.32         q3,  q2,  #16
        vst1.64         {d19},     [r8], ip
        vzip.32         d2,  d6
        vst1.64         {d23},     [r8], ip
        vzip.32         d3,  d7
        beq             7f
        vld1.64         {d16-d17}, [r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vst1.64         {d2},      [r8], ip
        vld1.64         {d18-d19}, [r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.64         {d6},      [r8], ip
        vld1.64         {d20-d21}, [r6,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.64         {d3},      [r8], ip
        vld1.64         {d22-d23}, [r7,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.64         {d7},      [r8], ip
        b               6b
7:      vst1.64         {d2},      [r8], ip
        vst1.64         {d6},      [r8], ip
        vst1.64         {d3},      [r8], ip
        vst1.64         {d7},      [r8], ip
        subs            r3,  r3,  #4
        popeq           {r4-r8,pc}
        cmp             r3,  #4
        add             r0,  r0,  #8
        bge             5b

        @ 2 channels
4:      cmp             r3,  #2
        blt             4f
        ldmia           r1!, {r4-r5}
        mov             lr,  r2
        mov             r8,  r0
        tst             lr,  #8
        vld1.64         {d16-d17}, [r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19}, [r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21}, [r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23}, [r5,:128]!
        vcvt.s32.f32    q11, q11, #16
        beq             6f
        subs            lr,  lr,  #8
        beq             7f
        vsri.32         d18, d16, #16
        vsri.32         d19, d17, #16
        vld1.64         {d16-d17}, [r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vst1.32         {d18[0]},  [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]},  [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]},  [r8], ip
        vst1.32         {d19[1]},  [r8], ip
        vld1.64         {d18-d19}, [r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.32         {d22[0]},  [r8], ip
        vst1.32         {d22[1]},  [r8], ip
        vld1.64         {d20-d21}, [r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.32         {d23[0]},  [r8], ip
        vst1.32         {d23[1]},  [r8], ip
        vld1.64         {d22-d23}, [r5,:128]!
        vcvt.s32.f32    q11, q11, #16
6:      subs            lr,  lr,  #16
        vld1.64         {d0-d1},   [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         d18, d16, #16
        vld1.64         {d2-d3},   [r5,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vsri.32         d19, d17, #16
        vld1.64         {d4-d5},   [r4,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vld1.64         {d6-d7},   [r5,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vst1.32         {d18[0]},  [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]},  [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]},  [r8], ip
        vsri.32         d2,  d0,  #16
        vst1.32         {d19[1]},  [r8], ip
        vsri.32         d3,  d1,  #16
        vst1.32         {d22[0]},  [r8], ip
        vsri.32         d6,  d4,  #16
        vst1.32         {d22[1]},  [r8], ip
        vsri.32         d7,  d5,  #16
        vst1.32         {d23[0]},  [r8], ip
        vst1.32         {d23[1]},  [r8], ip
        beq             6f
        vld1.64         {d16-d17}, [r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vst1.32         {d2[0]},   [r8], ip
        vst1.32         {d2[1]},   [r8], ip
        vld1.64         {d18-d19}, [r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.32         {d3[0]},   [r8], ip
        vst1.32         {d3[1]},   [r8], ip
        vld1.64         {d20-d21}, [r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.32         {d6[0]},   [r8], ip
        vst1.32         {d6[1]},   [r8], ip
        vld1.64         {d22-d23}, [r5,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.32         {d7[0]},   [r8], ip
        vst1.32         {d7[1]},   [r8], ip
        bgt             6b
6:      vst1.32         {d2[0]},   [r8], ip
        vst1.32         {d2[1]},   [r8], ip
        vst1.32         {d3[0]},   [r8], ip
        vst1.32         {d3[1]},   [r8], ip
        vst1.32         {d6[0]},   [r8], ip
        vst1.32         {d6[1]},   [r8], ip
        vst1.32         {d7[0]},   [r8], ip
        vst1.32         {d7[1]},   [r8], ip
        b               8f
7:      vsri.32         d18, d16, #16
        vsri.32         d19, d17, #16
        vst1.32         {d18[0]},  [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]},  [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]},  [r8], ip
        vst1.32         {d19[1]},  [r8], ip
        vst1.32         {d22[0]},  [r8], ip
        vst1.32         {d22[1]},  [r8], ip
        vst1.32         {d23[0]},  [r8], ip
        vst1.32         {d23[1]},  [r8], ip
8:      subs            r3,  r3,  #2
        add             r0,  r0,  #4
        popeq           {r4-r8,pc}

        @ 1 channel
4:      ldr             r4,  [r1],#4
        tst             r2,  #8
        mov             lr,  r2
        mov             r5,  r0
        vld1.64         {d0-d1},   [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},   [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        bne             8f
        @ odd 16-bit lanes ([1], [3]) are the upper halves of the
        @ 32-bit fixed-point values, i.e. the converted samples
6:      subs            lr,  lr,  #16
        vld1.64         {d4-d5},   [r4,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vld1.64         {d6-d7},   [r4,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vst1.16         {d0[1]},   [r5,:16], ip
        vst1.16         {d0[3]},   [r5,:16], ip
        vst1.16         {d1[1]},   [r5,:16], ip
        vst1.16         {d1[3]},   [r5,:16], ip
        vst1.16         {d2[1]},   [r5,:16], ip
        vst1.16         {d2[3]},   [r5,:16], ip
        vst1.16         {d3[1]},   [r5,:16], ip
        vst1.16         {d3[3]},   [r5,:16], ip
        beq             7f
        vld1.64         {d0-d1},   [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},   [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
7:      vst1.16         {d4[1]},   [r5,:16], ip
        vst1.16         {d4[3]},   [r5,:16], ip
        vst1.16         {d5[1]},   [r5,:16], ip
        vst1.16         {d5[3]},   [r5,:16], ip
        vst1.16         {d6[1]},   [r5,:16], ip
        vst1.16         {d6[3]},   [r5,:16], ip
        vst1.16         {d7[1]},   [r5,:16], ip
        vst1.16         {d7[3]},   [r5,:16], ip
        bgt             6b
        pop             {r4-r8,pc}
8:      subs            lr,  lr,  #8
        vst1.16         {d0[1]},   [r5,:16], ip
        vst1.16         {d0[3]},   [r5,:16], ip
        vst1.16         {d1[1]},   [r5,:16], ip
        vst1.16         {d1[3]},   [r5,:16], ip
        vst1.16         {d2[1]},   [r5,:16], ip
        vst1.16         {d2[3]},   [r5,:16], ip
        vst1.16         {d3[1]},   [r5,:16], ip
        vst1.16         {d3[3]},   [r5,:16], ip
        popeq           {r4-r8,pc}
        vld1.64         {d0-d1},   [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},   [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        b               6b
endfunc

/*
 * ff_vector_fmul_neon: in-place element-wise float multiply.
 *   r0 = first operand AND destination (written through r3, a copy of
 *        the initial r0), 16-byte aligned
 *   r1 = second operand, 16-byte aligned
 *   r2 = element count
 * Same 8 / 16 / tail structure as ff_float_to_int16_neon: the first 8
 * products are always computed, 1: handles 16 per iteration, 2: and 3:
 * drain the pipeline.
 * Clobbers r0-r3, ip, q0-q3, q8-q11, flags.
 */
function ff_vector_fmul_neon, export=1
        mov             r3,  r0
        subs            r2,  r2,  #8
        vld1.64         {d0-d3},   [r0,:128]!
        vld1.64         {d4-d7},   [r1,:128]!
        vmul.f32        q8,  q0,  q2
        vmul.f32        q9,  q1,  q3
        beq             3f
        bics            ip,  r2,  #15
        beq             2f
1:      subs            ip,  ip,  #16
        vld1.64         {d0-d1},   [r0,:128]!
        vld1.64         {d4-d5},   [r1,:128]!
        vmul.f32        q10, q0,  q2
        vld1.64         {d2-d3},   [r0,:128]!
        vld1.64         {d6-d7},   [r1,:128]!
        vmul.f32        q11, q1,  q3
        vst1.64         {d16-d19}, [r3,:128]!
        vld1.64         {d0-d1},   [r0,:128]!
        vld1.64         {d4-d5},   [r1,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.64         {d2-d3},   [r0,:128]!
        vld1.64         {d6-d7},   [r1,:128]!
        vmul.f32        q9,  q1,  q3
        vst1.64         {d20-d23}, [r3,:128]!
        bne             1b
        ands            r2,  r2,  #15
        beq             3f
2:      vld1.64         {d0-d1},   [r0,:128]!
        vld1.64         {d4-d5},   [r1,:128]!
        vst1.64         {d16-d17}, [r3,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.64         {d2-d3},   [r0,:128]!
        vld1.64         {d6-d7},   [r1,:128]!
        vst1.64         {d18-d19}, [r3,:128]!
        vmul.f32        q9,  q1,  q3
3:      vst1.64         {d16-d19}, [r3,:128]!
        bx              lr
endfunc

/*
 * ff_vector_fmul_window_neon: windowed overlap of two float vectors
 * with an additive bias.
 *   r0 = destination; written forwards through r0 and backwards
 *        through ip, which starts at r0 + 8*(len - 2)
 *   r1 = first source, read forwards
 *   r2 = second source, read backwards starting from its end
 *   r3 = window, read forwards through r3 and backwards through r4
 *   bias = scalar float broadcast into q8 (from d0[0] in the VFP
 *          variant, from the first stack word in the NOVFP variant)
 *   len  = loaded into lr from the stack; 4 elements per iteration
 * The stack offsets for len (#12 / #16) account for the 12 bytes
 * pushed by this function.
 * r5 is first a scratch value (len - 2) and then the constant -16 used
 * as the backward post-increment.
 * Clobbers r0-r3, ip, q0-q3, q8-q12, flags; saves/restores r4, r5, lr.
 */
function ff_vector_fmul_window_neon, export=1
VFP     vdup.32         q8,  d0[0]
NOVFP   vld1.32         {d16[],d17[]}, [sp,:32]
        push            {r4,r5,lr}
VFP     ldr             lr,  [sp, #12]
NOVFP   ldr             lr,  [sp, #16]
        sub             r2,  r2,  #8
        sub             r5,  lr,  #2
        add             r2,  r2,  r5, lsl #2
        add             r4,  r3,  r5, lsl #3
        add             ip,  r0,  r5, lsl #3
        mov             r5,  #-16
        vld1.64         {d0,d1},   [r1,:128]!
        vld1.64         {d2,d3},   [r2,:128], r5
        vld1.64         {d4,d5},   [r3,:128]!
        vld1.64         {d6,d7},   [r4,:128], r5
1:      subs            lr,  lr,  #4
        vmov            q11, q8
        vmla.f32        d22, d0,  d4
        vmov            q10, q8
        vmla.f32        d23, d1,  d5
        vrev64.32       q3,  q3
        vmla.f32        d20, d0,  d7
        vrev64.32       q1,  q1
        vmla.f32        d21, d1,  d6
        beq             2f
        vmla.f32        d22, d3,  d7
        vld1.64         {d0,d1},   [r1,:128]!
        vmla.f32        d23, d2,  d6
        vld1.64         {d18,d19}, [r2,:128], r5
        vmls.f32        d20, d3,  d4
        vld1.64         {d24,d25}, [r3,:128]!
        vmls.f32        d21, d2,  d5
        vld1.64         {d6,d7},   [r4,:128], r5
        vmov            q1,  q9
        vrev64.32       q11, q11
        vmov            q2,  q12
        vswp            d22, d23
        vst1.64         {d20,d21}, [r0,:128]!
        vst1.64         {d22,d23}, [ip,:128], r5
        b               1b
2:      vmla.f32        d22, d3,  d7
        vmla.f32        d23, d2,  d6
        vmls.f32        d20, d3,  d4
        vmls.f32        d21, d2,  d5
        vrev64.32       q11, q11
        vswp            d22, d23
        vst1.64         {d20,d21}, [r0,:128]!
        vst1.64         {d22,d23}, [ip,:128], r5
        pop             {r4,r5,pc}
endfunc

#if CONFIG_VORBIS_DECODER
/*
 * ff_vorbis_inverse_coupling_neon: in-place transform of two float
 * vectors, 4 lanes at a time.
 *   r0 = first vector  (read via r0, written via r3), 16-byte aligned
 *   r1 = second vector (read via r1, written via r12), 16-byte aligned
 *   r2 = element count
 * Per lane, with a = first[i], b = second[i] (treated as raw bits for
 * the sign tests):
 *   q10 = sign-bit mask (1<<31)
 *   b   ^= sign bit of a
 *   mask = (b as s32 <= 0)
 *   first[i]  = a + (b & mask)
 *   second[i] = a - (b & ~mask)
 * Structure: the loop at 1: is unrolled by two and software-pipelined
 * (4 lanes computed while the previous 4 are stored); 3: handles a
 * final group of 4 when one remains (or when the count is exactly 4).
 * Clobbers r0-r3, r12, q0-q3, q8-q12, flags.
 */
function ff_vorbis_inverse_coupling_neon, export=1
        vmov.i32        q10, #1<<31
        subs            r2,  r2,  #4
        mov             r3,  r0
        mov             r12, r1
        beq             3f

        vld1.32         {d24-d25}, [r1,:128]!
        vld1.32         {d22-d23}, [r0,:128]!
        vcle.s32        q8,  q12, #0
        vand            q9,  q11, q10
        veor            q12, q12, q9
        vand            q2,  q12, q8
        vbic            q3,  q12, q8
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
1:      vld1.32         {d2-d3},   [r1,:128]!
        vld1.32         {d0-d1},   [r0,:128]!
        vcle.s32        q8,  q1,  #0
        vand            q9,  q0,  q10
        veor            q1,  q1,  q9
        vst1.32         {d24-d25}, [r3, :128]!
        vst1.32         {d22-d23}, [r12,:128]!
        vand            q2,  q1,  q8
        vbic            q3,  q1,  q8
        vadd.f32        q1,  q0,  q2
        vsub.f32        q0,  q0,  q3
        subs            r2,  r2,  #8
        ble             2f
        vld1.32         {d24-d25}, [r1,:128]!
        vld1.32         {d22-d23}, [r0,:128]!
        vcle.s32        q8,  q12, #0
        vand            q9,  q11, q10
        veor            q12, q12, q9
        vst1.32         {d2-d3},   [r3, :128]!
        vst1.32         {d0-d1},   [r12,:128]!
        vand            q2,  q12, q8
        vbic            q3,  q12, q8
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
        b               1b

2:      vst1.32         {d2-d3},   [r3, :128]!
        vst1.32         {d0-d1},   [r12,:128]!
        bxlt            lr

3:      vld1.32         {d2-d3},   [r1,:128]
        vld1.32         {d0-d1},   [r0,:128]
        vcle.s32        q8,  q1,  #0
        vand            q9,  q0,  q10
        veor            q1,  q1,  q9
        vand            q2,  q1,  q8
        vbic            q3,  q1,  q8
        vadd.f32        q1,  q0,  q2
        vsub.f32        q0,  q0,  q3
        vst1.32         {d2-d3},   [r0,:128]!
        vst1.32         {d0-d1},   [r1,:128]!
        bx              lr
endfunc
#endif

/*
 * ff_vector_fmul_scalar_neon: dst[i] = src[i] * scalar.
 *   r0 = destination, r1 = source, both 16-byte aligned
 *   scalar = d0[0] (VFP variant) or r2 (NOVFP variant), broadcast to q8
 *   len    = r2 (VFP variant) or r3 (NOVFP variant)
 * 1:/2: process 16 elements per iteration; 3: processes the remaining
 * elements 4 at a time (and is entered directly when len < 16).
 * NOTE(review): the loop at 3: runs at least once, so len = 0 would
 * still process 4 elements -- confirm callers never pass 0.
 * Clobbers r0, r1, r12, len, q0-q3, q8, flags.
 */
function ff_vector_fmul_scalar_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8,  d0[0]
NOVFP   vdup.32         q8,  r2
        bics            r12, len, #15
        beq             3f
        vld1.32         {q0},[r1,:128]!
        vld1.32         {q1},[r1,:128]!
1:      vmul.f32        q0,  q0,  q8
        vld1.32         {q2},[r1,:128]!
        vmul.f32        q1,  q1,  q8
        vld1.32         {q3},[r1,:128]!
        vmul.f32        q2,  q2,  q8
        vst1.32         {q0},[r0,:128]!
        vmul.f32        q3,  q3,  q8
        vst1.32         {q1},[r0,:128]!
        subs            r12, r12, #16
        beq             2f
        vld1.32         {q0},[r1,:128]!
        vst1.32         {q2},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vst1.32         {q3},[r0,:128]!
        b               1b
2:      vst1.32         {q2},[r0,:128]!
        vst1.32         {q3},[r0,:128]!
        ands            len, len, #15
        bxeq            lr
3:      vld1.32         {q0},[r1,:128]!
        vmul.f32        q0,  q0,  q8
        vst1.32         {q0},[r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len
endfunc

/*
 * ff_vector_fmul_sv_scalar_2_neon
 *   r0 = destination, 8-byte aligned
 *   r1 = source floats, 8-byte aligned, read 2 at a time
 *   r2 = array of pointers; each points at a 2-float vector (8-byte
 *        aligned); one pointer is consumed per 2 output elements
 *   scalar = d0[0] (VFP) or r3 (NOVFP), broadcast to d16
 *   len    = r3 (VFP) or first stack word (NOVFP), loaded into r3
 * Output = source * scalar * pointed-to vector, 4 elements per
 * iteration; the loop ends when the count reaches exactly zero.
 * Clobbers r0-r3, r12, d0-d5, d16, flags.
 */
function ff_vector_fmul_sv_scalar_2_neon, export=1
VFP     vdup.32         d16, d0[0]
NOVFP   vdup.32         d16, r3
NOVFP   ldr             r3,  [sp]
        vld1.32         {d0},[r1,:64]!
        vld1.32         {d1},[r1,:64]!
1:      subs            r3,  r3,  #4
        vmul.f32        d4,  d0,  d16
        vmul.f32        d5,  d1,  d16
        ldr             r12, [r2], #4
        vld1.32         {d2},[r12,:64]
        ldr             r12, [r2], #4
        vld1.32         {d3},[r12,:64]
        vmul.f32        d4,  d4,  d2
        vmul.f32        d5,  d5,  d3
        beq             2f
        vld1.32         {d0},[r1,:64]!
        vld1.32         {d1},[r1,:64]!
        vst1.32         {d4},[r0,:64]!
        vst1.32         {d5},[r0,:64]!
        b               1b
2:      vst1.32         {d4},[r0,:64]!
        vst1.32         {d5},[r0,:64]!
        bx              lr
endfunc

/*
 * ff_vector_fmul_sv_scalar_4_neon
 *   Same operation as the _2 variant with 4-float pointed-to vectors.
 *   r0 = destination, r1 = source, both 16-byte aligned
 *   r2 = array of pointers to 16-byte-aligned 4-float vectors
 *   scalar = d0[0] (VFP) or r3 (NOVFP), broadcast to q10
 *   len    = r3 (VFP) or first stack word (NOVFP); the stack argument
 *            is read before lr is pushed, so offset 0 is correct
 * 1:/2: process 8 elements per iteration (count in lr); 3: processes
 * the remainder 4 at a time.
 * Clobbers r0-r3, r12, q0-q3, q8-q10, flags; saves/restores lr.
 */
function ff_vector_fmul_sv_scalar_4_neon, export=1
VFP     vdup.32         q10, d0[0]
NOVFP   vdup.32         q10, r3
NOVFP   ldr             r3,  [sp]
        push            {lr}
        bics            lr,  r3,  #7
        beq             3f
        vld1.32         {q0},[r1,:128]!
        vld1.32         {q2},[r1,:128]!
1:      ldr             r12, [r2], #4
        vld1.32         {q1},[r12,:128]
        ldr             r12, [r2], #4
        vld1.32         {q3},[r12,:128]
        vmul.f32        q8,  q0,  q10
        vmul.f32        q8,  q8,  q1
        vmul.f32        q9,  q2,  q10
        vmul.f32        q9,  q9,  q3
        subs            lr,  lr,  #8
        beq             2f
        vld1.32         {q0},[r1,:128]!
        vld1.32         {q2},[r1,:128]!
        vst1.32         {q8},[r0,:128]!
        vst1.32         {q9},[r0,:128]!
        b               1b
2:      vst1.32         {q8},[r0,:128]!
        vst1.32         {q9},[r0,:128]!
        ands            r3,  r3,  #7
        popeq           {pc}
3:      vld1.32         {q0},[r1,:128]!
        ldr             r12, [r2], #4
        vld1.32         {q1},[r12,:128]
        vmul.f32        q0,  q0,  q10
        vmul.f32        q0,  q0,  q1
        vst1.32         {q0},[r0,:128]!
        subs            r3,  r3,  #4
        bgt             3b
        pop             {pc}
endfunc

/*
 * ff_sv_fmul_scalar_2_neon
 *   r0 = destination, 16-byte aligned
 *   r1 = array of pointers to 8-byte-aligned 2-float vectors; two
 *        pointers are consumed per 4 output elements
 *   scalar = d0[0] (VFP) or r2 (NOVFP), broadcast to q8
 *   len    = r2 (VFP) or r3 (NOVFP); loop ends at exactly zero
 * Output = gathered vectors * scalar.
 * Clobbers r0, r1, r12, len, q0, q1, q8, flags.
 */
function ff_sv_fmul_scalar_2_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8,  d0[0]
NOVFP   vdup.32         q8,  r2
        ldr             r12, [r1], #4
        vld1.32         {d0},[r12,:64]
        ldr             r12, [r1], #4
        vld1.32         {d1},[r12,:64]
1:      vmul.f32        q1,  q0,  q8
        subs            len, len, #4
        beq             2f
        ldr             r12, [r1], #4
        vld1.32         {d0},[r12,:64]
        ldr             r12, [r1], #4
        vld1.32         {d1},[r12,:64]
        vst1.32         {q1},[r0,:128]!
        b               1b
2:      vst1.32         {q1},[r0,:128]!
        bx              lr
        .unreq          len
endfunc

/*
 * ff_sv_fmul_scalar_4_neon
 *   r0 = destination, 16-byte aligned
 *   r1 = array of pointers to 16-byte-aligned 4-float vectors; one
 *        pointer is consumed per 4 output elements
 *   scalar = d0[0] (VFP) or r2 (NOVFP), broadcast to q8
 *   len    = r2 (VFP) or r3 (NOVFP); loop runs while the count is > 0
 * Clobbers r0, r1, r12, len, q0, q8, flags.
 */
function ff_sv_fmul_scalar_4_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8,  d0[0]
NOVFP   vdup.32         q8,  r2
1:      ldr             r12, [r1], #4
        vld1.32         {q0},[r12,:128]
        vmul.f32        q0,  q0,  q8
        vst1.32         {q0},[r0,:128]!
        subs            len, len, #4
        bgt             1b
        bx              lr
        .unreq          len
endfunc

/*
 * ff_butterflies_float_neon: in-place sum/difference of two vectors.
 *   r0 = vector A, r1 = vector B, both 16-byte aligned
 *   r2 = element count, 4 per iteration (loop runs while > 0)
 * After the call: A[i] = A[i] + B[i], B[i] = A[i] - B[i] (using the
 * original values on the right-hand sides).
 * Clobbers r0-r2, q0-q2, flags.
 */
function ff_butterflies_float_neon, export=1
1:      vld1.32         {q0},[r0,:128]
        vld1.32         {q1},[r1,:128]
        vsub.f32        q2,  q0,  q1
        vadd.f32        q1,  q0,  q1
        vst1.32         {q2},[r1,:128]!
        vst1.32         {q1},[r0,:128]!
        subs            r2,  r2,  #4
        bgt             1b
        bx              lr
endfunc

/*
 * ff_scalarproduct_float_neon: dot product of two float vectors.
 *   r0, r1 = input vectors, 16-byte aligned
 *   r2 = element count, 4 per iteration (loop runs while > 0)
 * Four partial sums are accumulated in q2 and reduced at the end; the
 * result is left in d0[0] and, in the NOVFP variant, also moved to r0.
 * Clobbers r0-r2, q0-q2, flags.
 */
function ff_scalarproduct_float_neon, export=1
        vmov.f32        q2,  #0.0
1:      vld1.32         {q0},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vmla.f32        q2,  q0,  q1
        subs            r2,  r2,  #4
        bgt             1b
        vadd.f32        d0,  d4,  d5
        vpadd.f32       d0,  d0,  d0
NOVFP   vmov.32         r0,  d0[0]
        bx              lr
endfunc

/*
 * ff_int32_to_float_fmul_scalar_neon: dst[i] = (float)src[i] * scalar.
 *   r0 = destination floats, r1 = source signed 32-bit integers, both
 *        16-byte aligned
 *   scalar = d0[0] (VFP) or r2 (NOVFP), broadcast to q0
 *   len    = r2 (VFP) or r3 (NOVFP); 8 elements per iteration and the
 *            loop ends only when the count reaches exactly zero
 * Clobbers r0, r1, len, q0-q3, q8-q10, flags.
 */
function ff_int32_to_float_fmul_scalar_neon, export=1
VFP     vdup.32         q0,  d0[0]
VFP     len .req r2
NOVFP   vdup.32         q0,  r2
NOVFP   len .req r3

        vld1.32         {q1},[r1,:128]!
        vcvt.f32.s32    q3,  q1
        vld1.32         {q2},[r1,:128]!
        vcvt.f32.s32    q8,  q2
1:      subs            len, len, #8
        pld             [r1, #16]
        vmul.f32        q9,  q3,  q0
        vmul.f32        q10, q8,  q0
        beq             2f
        vld1.32         {q1},[r1,:128]!
        vcvt.f32.s32    q3,  q1
        vld1.32         {q2},[r1,:128]!
        vcvt.f32.s32    q8,  q2
        vst1.32         {q9}, [r0,:128]!
        vst1.32         {q10},[r0,:128]!
        b               1b
2:      vst1.32         {q9}, [r0,:128]!
        vst1.32         {q10},[r0,:128]!
        bx              lr
        .unreq          len
endfunc

/*
 * ff_vector_fmul_reverse_neon: dst[i] = src0[i] * src1[len - 1 - i].
 *   r0 = destination, 16-byte aligned
 *   r1 = src0, read forwards, 16-byte aligned
 *   r2 = src1; repositioned to its last 8 elements (r2 + 4*len - 32)
 *        and read backwards with a post-increment of -32 (r12)
 *   r3 = element count, 8 per iteration; loop ends at exactly zero
 * vrev64 reverses each pair of floats; the crossed d-register operands
 * in the multiplies complete the 8-element reversal.
 * Clobbers r0-r3, r12, q0-q3, q8-q9, flags.
 */
function ff_vector_fmul_reverse_neon, export=1
        add             r2,  r2,  r3,  lsl #2
        sub             r2,  r2,  #32
        mov             r12, #-32
        vld1.32         {q0-q1},   [r1,:128]!
        vld1.32         {q2-q3},   [r2,:128], r12
1:      pld             [r1, #32]
        vrev64.32       q3,  q3
        vmul.f32        d16, d0,  d7
        vmul.f32        d17, d1,  d6
        pld             [r2, #-32]
        vrev64.32       q2,  q2
        vmul.f32        d18, d2,  d5
        vmul.f32        d19, d3,  d4
        subs            r3,  r3,  #8
        beq             2f
        vld1.32         {q0-q1},   [r1,:128]!
        vld1.32         {q2-q3},   [r2,:128], r12
        vst1.32         {q8-q9},   [r0,:128]!
        b               1b
2:      vst1.32         {q8-q9},   [r0,:128]!
        bx              lr
endfunc

/*
 * ff_vector_fmul_add_neon: dst[i] = src0[i] * src1[i] + src2[i].
 *   r0 = destination, r1 = src0, r2 = src1, r3 = src2, all 16-byte
 *        aligned
 *   len = first stack word, loaded into r12; 8 elements per iteration
 *         and the loop ends only when the count reaches exactly zero
 * Clobbers r0-r3, r12, q0-q3, q8-q13, flags.
 */
function ff_vector_fmul_add_neon, export=1
        ldr             r12, [sp]
        vld1.32         {q0-q1},   [r1,:128]!
        vld1.32         {q8-q9},   [r2,:128]!
        vld1.32         {q2-q3},   [r3,:128]!
        vmul.f32        q10, q0,  q8
        vmul.f32        q11, q1,  q9
1:      vadd.f32        q12, q2,  q10
        vadd.f32        q13, q3,  q11
        pld             [r1, #16]
        pld             [r2, #16]
        pld             [r3, #16]
        subs            r12, r12, #8
        beq             2f
        vld1.32         {q0},      [r1,:128]!
        vld1.32         {q8},      [r2,:128]!
        vmul.f32        q10, q0,  q8
        vld1.32         {q1},      [r1,:128]!
        vld1.32         {q9},      [r2,:128]!
        vmul.f32        q11, q1,  q9
        vld1.32         {q2-q3},   [r3,:128]!
        vst1.32         {q12-q13}, [r0,:128]!
        b               1b
2:      vst1.32         {q12-q13}, [r0,:128]!
        bx              lr
endfunc

/*
 * ff_vector_clipf_neon: dst[i] = max(min(src[i], hi), lo).
 *   r0 = destination, r1 = source, both 16-byte aligned
 *   lo = d0[0] (VFP) or r2 (NOVFP), broadcast to q0
 *   hi = d0[1] (VFP) or r3 (NOVFP), broadcast to q1
 *   len = r2 (VFP) or first stack word (NOVFP), held in r2; 8 elements
 *         per iteration, loop ends only at exactly zero
 * In the VFP variant q1 must be written before q0 because both read
 * d0, which is the low half of q0.
 * Clobbers r0-r2, q0-q3, q8-q11, flags.
 */
function ff_vector_clipf_neon, export=1
VFP     vdup.32         q1,  d0[1]
VFP     vdup.32         q0,  d0[0]
NOVFP   vdup.32         q0,  r2
NOVFP   vdup.32         q1,  r3
NOVFP   ldr             r2,  [sp]
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2,  q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3,  q1
1:      vmax.f32        q8,  q10, q0
        vmax.f32        q9,  q11, q0
        subs            r2,  r2,  #8
        beq             2f
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2,  q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3,  q1
        vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        b               1b
2:      vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        bx              lr
endfunc
