Mercurial > cgi-bin > hgwebdir.cgi > PR > Applications > VSs > VSs__H264__App
comparison libavcodec/ppc/dsputil_altivec.c @ 2:897f711a7157
rearrange to work with autoconf
| author | Nina Engelhardt <nengel@mailbox.tu-berlin.de> |
|---|---|
| date | Tue, 25 Sep 2012 15:55:33 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:bf340f3ebb27 |
|---|---|
| 1 /* | |
| 2 * Copyright (c) 2002 Brian Foley | |
| 3 * Copyright (c) 2002 Dieter Shirley | |
| 4 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org> | |
| 5 * | |
| 6 * This file is part of FFmpeg. | |
| 7 * | |
| 8 * FFmpeg is free software; you can redistribute it and/or | |
| 9 * modify it under the terms of the GNU Lesser General Public | |
| 10 * License as published by the Free Software Foundation; either | |
| 11 * version 2.1 of the License, or (at your option) any later version. | |
| 12 * | |
| 13 * FFmpeg is distributed in the hope that it will be useful, | |
| 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
| 16 * Lesser General Public License for more details. | |
| 17 * | |
| 18 * You should have received a copy of the GNU Lesser General Public | |
| 19 * License along with FFmpeg; if not, write to the Free Software | |
| 20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
| 21 */ | |
| 22 | |
| 23 #include "config.h" | |
| 24 #if HAVE_ALTIVEC_H | |
| 25 #include <altivec.h> | |
| 26 #endif | |
| 27 #include "libavcodec/dsputil.h" | |
| 28 #include "dsputil_ppc.h" | |
| 29 #include "util_altivec.h" | |
| 30 #include "types_altivec.h" | |
| 31 #include "dsputil_altivec.h" | |
| 32 | |
| 33 | |
| 34 static void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size) | |
| 35 { | |
| 36 int i; | |
| 37 vector unsigned char perm, bytes, *pixv; | |
| 38 const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); | |
| 39 vector signed short shorts; | |
| 40 | |
| 41 for (i = 0; i < 8; i++) { | |
| 42 // Read potentially unaligned pixels. | |
| 43 // We're reading 16 pixels, and actually only want 8, | |
| 44 // but we simply ignore the extras. | |
| 45 perm = vec_lvsl(0, pixels); | |
| 46 pixv = (vector unsigned char *) pixels; | |
| 47 bytes = vec_perm(pixv[0], pixv[1], perm); | |
| 48 | |
| 49 // convert the bytes into shorts | |
| 50 shorts = (vector signed short)vec_mergeh(zero, bytes); | |
| 51 | |
| 52 // save the data to the block, we assume the block is 16-byte aligned | |
| 53 vec_st(shorts, i*16, (vector signed short*)block); | |
| 54 | |
| 55 pixels += line_size; | |
| 56 } | |
| 57 } | |
| 58 | |
| 59 static void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1, | |
| 60 const uint8_t *s2, int stride) | |
| 61 { | |
| 62 int i; | |
| 63 vector unsigned char perm, bytes, *pixv; | |
| 64 const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); | |
| 65 vector signed short shorts1, shorts2; | |
| 66 | |
| 67 for (i = 0; i < 4; i++) { | |
| 68 // Read potentially unaligned pixels | |
| 69 // We're reading 16 pixels, and actually only want 8, | |
| 70 // but we simply ignore the extras. | |
| 71 perm = vec_lvsl(0, s1); | |
| 72 pixv = (vector unsigned char *) s1; | |
| 73 bytes = vec_perm(pixv[0], pixv[1], perm); | |
| 74 | |
| 75 // convert the bytes into shorts | |
| 76 shorts1 = (vector signed short)vec_mergeh(zero, bytes); | |
| 77 | |
| 78 // Do the same for the second block of pixels | |
| 79 perm = vec_lvsl(0, s2); | |
| 80 pixv = (vector unsigned char *) s2; | |
| 81 bytes = vec_perm(pixv[0], pixv[1], perm); | |
| 82 | |
| 83 // convert the bytes into shorts | |
| 84 shorts2 = (vector signed short)vec_mergeh(zero, bytes); | |
| 85 | |
| 86 // Do the subtraction | |
| 87 shorts1 = vec_sub(shorts1, shorts2); | |
| 88 | |
| 89 // save the data to the block, we assume the block is 16-byte aligned | |
| 90 vec_st(shorts1, 0, (vector signed short*)block); | |
| 91 | |
| 92 s1 += stride; | |
| 93 s2 += stride; | |
| 94 block += 8; | |
| 95 | |
| 96 | |
| 97 // The code below is a copy of the code above... This is a manual | |
| 98 // unroll. | |
| 99 | |
| 100 // Read potentially unaligned pixels | |
| 101 // We're reading 16 pixels, and actually only want 8, | |
| 102 // but we simply ignore the extras. | |
| 103 perm = vec_lvsl(0, s1); | |
| 104 pixv = (vector unsigned char *) s1; | |
| 105 bytes = vec_perm(pixv[0], pixv[1], perm); | |
| 106 | |
| 107 // convert the bytes into shorts | |
| 108 shorts1 = (vector signed short)vec_mergeh(zero, bytes); | |
| 109 | |
| 110 // Do the same for the second block of pixels | |
| 111 perm = vec_lvsl(0, s2); | |
| 112 pixv = (vector unsigned char *) s2; | |
| 113 bytes = vec_perm(pixv[0], pixv[1], perm); | |
| 114 | |
| 115 // convert the bytes into shorts | |
| 116 shorts2 = (vector signed short)vec_mergeh(zero, bytes); | |
| 117 | |
| 118 // Do the subtraction | |
| 119 shorts1 = vec_sub(shorts1, shorts2); | |
| 120 | |
| 121 // save the data to the block, we assume the block is 16-byte aligned | |
| 122 vec_st(shorts1, 0, (vector signed short*)block); | |
| 123 | |
| 124 s1 += stride; | |
| 125 s2 += stride; | |
| 126 block += 8; | |
| 127 } | |
| 128 } | |
| 129 | |
| 130 | |
| 131 static void clear_block_altivec(DCTELEM *block) { | |
| 132 LOAD_ZERO; | |
| 133 vec_st(zero_s16v, 0, block); | |
| 134 vec_st(zero_s16v, 16, block); | |
| 135 vec_st(zero_s16v, 32, block); | |
| 136 vec_st(zero_s16v, 48, block); | |
| 137 vec_st(zero_s16v, 64, block); | |
| 138 vec_st(zero_s16v, 80, block); | |
| 139 vec_st(zero_s16v, 96, block); | |
| 140 vec_st(zero_s16v, 112, block); | |
| 141 } | |
| 142 | |
| 143 | |
| 144 | |
| 145 /* next one assumes that ((line_size % 16) == 0) */ | |
| 146 void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
| 147 { | |
| 148 POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1); | |
| 149 register vector unsigned char pixelsv1, pixelsv2; | |
| 150 register vector unsigned char pixelsv1B, pixelsv2B; | |
| 151 register vector unsigned char pixelsv1C, pixelsv2C; | |
| 152 register vector unsigned char pixelsv1D, pixelsv2D; | |
| 153 | |
| 154 register vector unsigned char perm = vec_lvsl(0, pixels); | |
| 155 int i; | |
| 156 register int line_size_2 = line_size << 1; | |
| 157 register int line_size_3 = line_size + line_size_2; | |
| 158 register int line_size_4 = line_size << 2; | |
| 159 | |
| 160 POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1); | |
| 161 // hand-unrolling the loop by 4 gains about 15% | |
| 162 // mininum execution time goes from 74 to 60 cycles | |
| 163 // it's faster than -funroll-loops, but using | |
| 164 // -funroll-loops w/ this is bad - 74 cycles again. | |
| 165 // all this is on a 7450, tuning for the 7450 | |
| 166 #if 0 | |
| 167 for (i = 0; i < h; i++) { | |
| 168 pixelsv1 = vec_ld(0, pixels); | |
| 169 pixelsv2 = vec_ld(16, pixels); | |
| 170 vec_st(vec_perm(pixelsv1, pixelsv2, perm), | |
| 171 0, block); | |
| 172 pixels+=line_size; | |
| 173 block +=line_size; | |
| 174 } | |
| 175 #else | |
| 176 for (i = 0; i < h; i += 4) { | |
| 177 pixelsv1 = vec_ld( 0, pixels); | |
| 178 pixelsv2 = vec_ld(15, pixels); | |
| 179 pixelsv1B = vec_ld(line_size, pixels); | |
| 180 pixelsv2B = vec_ld(15 + line_size, pixels); | |
| 181 pixelsv1C = vec_ld(line_size_2, pixels); | |
| 182 pixelsv2C = vec_ld(15 + line_size_2, pixels); | |
| 183 pixelsv1D = vec_ld(line_size_3, pixels); | |
| 184 pixelsv2D = vec_ld(15 + line_size_3, pixels); | |
| 185 vec_st(vec_perm(pixelsv1, pixelsv2, perm), | |
| 186 0, (unsigned char*)block); | |
| 187 vec_st(vec_perm(pixelsv1B, pixelsv2B, perm), | |
| 188 line_size, (unsigned char*)block); | |
| 189 vec_st(vec_perm(pixelsv1C, pixelsv2C, perm), | |
| 190 line_size_2, (unsigned char*)block); | |
| 191 vec_st(vec_perm(pixelsv1D, pixelsv2D, perm), | |
| 192 line_size_3, (unsigned char*)block); | |
| 193 pixels+=line_size_4; | |
| 194 block +=line_size_4; | |
| 195 } | |
| 196 #endif | |
| 197 POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1); | |
| 198 } | |
| 199 | |
| 200 /* next one assumes that ((line_size % 16) == 0) */ | |
| 201 #define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) ) | |
| 202 void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
| 203 { | |
| 204 POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1); | |
| 205 register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; | |
| 206 register vector unsigned char perm = vec_lvsl(0, pixels); | |
| 207 int i; | |
| 208 | |
| 209 POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1); | |
| 210 | |
| 211 for (i = 0; i < h; i++) { | |
| 212 pixelsv1 = vec_ld( 0, pixels); | |
| 213 pixelsv2 = vec_ld(16,pixels); | |
| 214 blockv = vec_ld(0, block); | |
| 215 pixelsv = vec_perm(pixelsv1, pixelsv2, perm); | |
| 216 blockv = vec_avg(blockv,pixelsv); | |
| 217 vec_st(blockv, 0, (unsigned char*)block); | |
| 218 pixels+=line_size; | |
| 219 block +=line_size; | |
| 220 } | |
| 221 | |
| 222 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1); | |
| 223 } | |
| 224 | |
| 225 /* next one assumes that ((line_size % 8) == 0) */ | |
| 226 static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) | |
| 227 { | |
| 228 POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1); | |
| 229 register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; | |
| 230 int i; | |
| 231 | |
| 232 POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1); | |
| 233 | |
| 234 for (i = 0; i < h; i++) { | |
| 235 /* block is 8 bytes-aligned, so we're either in the | |
| 236 left block (16 bytes-aligned) or in the right block (not) */ | |
| 237 int rightside = ((unsigned long)block & 0x0000000F); | |
| 238 | |
| 239 blockv = vec_ld(0, block); | |
| 240 pixelsv1 = vec_ld( 0, pixels); | |
| 241 pixelsv2 = vec_ld(16, pixels); | |
| 242 pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels)); | |
| 243 | |
| 244 if (rightside) { | |
| 245 pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1)); | |
| 246 } else { | |
| 247 pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3)); | |
| 248 } | |
| 249 | |
| 250 blockv = vec_avg(blockv, pixelsv); | |
| 251 | |
| 252 vec_st(blockv, 0, block); | |
| 253 | |
| 254 pixels += line_size; | |
| 255 block += line_size; | |
| 256 } | |
| 257 | |
| 258 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1); | |
| 259 } | |
| 260 | |
| 261 /* next one assumes that ((line_size % 8) == 0) */ | |
| 262 static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
| 263 { | |
| 264 POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1); | |
| 265 register int i; | |
| 266 register vector unsigned char pixelsv1, pixelsv2, pixelsavg; | |
| 267 register vector unsigned char blockv, temp1, temp2; | |
| 268 register vector unsigned short pixelssum1, pixelssum2, temp3; | |
| 269 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); | |
| 270 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); | |
| 271 | |
| 272 temp1 = vec_ld(0, pixels); | |
| 273 temp2 = vec_ld(16, pixels); | |
| 274 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); | |
| 275 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { | |
| 276 pixelsv2 = temp2; | |
| 277 } else { | |
| 278 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); | |
| 279 } | |
| 280 pixelsv1 = vec_mergeh(vczero, pixelsv1); | |
| 281 pixelsv2 = vec_mergeh(vczero, pixelsv2); | |
| 282 pixelssum1 = vec_add((vector unsigned short)pixelsv1, | |
| 283 (vector unsigned short)pixelsv2); | |
| 284 pixelssum1 = vec_add(pixelssum1, vctwo); | |
| 285 | |
| 286 POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1); | |
| 287 for (i = 0; i < h ; i++) { | |
| 288 int rightside = ((unsigned long)block & 0x0000000F); | |
| 289 blockv = vec_ld(0, block); | |
| 290 | |
| 291 temp1 = vec_ld(line_size, pixels); | |
| 292 temp2 = vec_ld(line_size + 16, pixels); | |
| 293 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); | |
| 294 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { | |
| 295 pixelsv2 = temp2; | |
| 296 } else { | |
| 297 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); | |
| 298 } | |
| 299 | |
| 300 pixelsv1 = vec_mergeh(vczero, pixelsv1); | |
| 301 pixelsv2 = vec_mergeh(vczero, pixelsv2); | |
| 302 pixelssum2 = vec_add((vector unsigned short)pixelsv1, | |
| 303 (vector unsigned short)pixelsv2); | |
| 304 temp3 = vec_add(pixelssum1, pixelssum2); | |
| 305 temp3 = vec_sra(temp3, vctwo); | |
| 306 pixelssum1 = vec_add(pixelssum2, vctwo); | |
| 307 pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); | |
| 308 | |
| 309 if (rightside) { | |
| 310 blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); | |
| 311 } else { | |
| 312 blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); | |
| 313 } | |
| 314 | |
| 315 vec_st(blockv, 0, block); | |
| 316 | |
| 317 block += line_size; | |
| 318 pixels += line_size; | |
| 319 } | |
| 320 | |
| 321 POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1); | |
| 322 } | |
| 323 | |
| 324 /* next one assumes that ((line_size % 8) == 0) */ | |
| 325 static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
| 326 { | |
| 327 POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1); | |
| 328 register int i; | |
| 329 register vector unsigned char pixelsv1, pixelsv2, pixelsavg; | |
| 330 register vector unsigned char blockv, temp1, temp2; | |
| 331 register vector unsigned short pixelssum1, pixelssum2, temp3; | |
| 332 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); | |
| 333 register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); | |
| 334 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); | |
| 335 | |
| 336 temp1 = vec_ld(0, pixels); | |
| 337 temp2 = vec_ld(16, pixels); | |
| 338 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); | |
| 339 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { | |
| 340 pixelsv2 = temp2; | |
| 341 } else { | |
| 342 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); | |
| 343 } | |
| 344 pixelsv1 = vec_mergeh(vczero, pixelsv1); | |
| 345 pixelsv2 = vec_mergeh(vczero, pixelsv2); | |
| 346 pixelssum1 = vec_add((vector unsigned short)pixelsv1, | |
| 347 (vector unsigned short)pixelsv2); | |
| 348 pixelssum1 = vec_add(pixelssum1, vcone); | |
| 349 | |
| 350 POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); | |
| 351 for (i = 0; i < h ; i++) { | |
| 352 int rightside = ((unsigned long)block & 0x0000000F); | |
| 353 blockv = vec_ld(0, block); | |
| 354 | |
| 355 temp1 = vec_ld(line_size, pixels); | |
| 356 temp2 = vec_ld(line_size + 16, pixels); | |
| 357 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); | |
| 358 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { | |
| 359 pixelsv2 = temp2; | |
| 360 } else { | |
| 361 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); | |
| 362 } | |
| 363 | |
| 364 pixelsv1 = vec_mergeh(vczero, pixelsv1); | |
| 365 pixelsv2 = vec_mergeh(vczero, pixelsv2); | |
| 366 pixelssum2 = vec_add((vector unsigned short)pixelsv1, | |
| 367 (vector unsigned short)pixelsv2); | |
| 368 temp3 = vec_add(pixelssum1, pixelssum2); | |
| 369 temp3 = vec_sra(temp3, vctwo); | |
| 370 pixelssum1 = vec_add(pixelssum2, vcone); | |
| 371 pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); | |
| 372 | |
| 373 if (rightside) { | |
| 374 blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); | |
| 375 } else { | |
| 376 blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); | |
| 377 } | |
| 378 | |
| 379 vec_st(blockv, 0, block); | |
| 380 | |
| 381 block += line_size; | |
| 382 pixels += line_size; | |
| 383 } | |
| 384 | |
| 385 POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); | |
| 386 } | |
| 387 | |
| 388 /* next one assumes that ((line_size % 16) == 0) */ | |
| 389 static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) | |
| 390 { | |
| 391 POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1); | |
| 392 register int i; | |
| 393 register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4; | |
| 394 register vector unsigned char blockv, temp1, temp2; | |
| 395 register vector unsigned short temp3, temp4, | |
| 396 pixelssum1, pixelssum2, pixelssum3, pixelssum4; | |
| 397 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); | |
| 398 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); | |
| 399 | |
| 400 POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1); | |
| 401 | |
| 402 temp1 = vec_ld(0, pixels); | |
| 403 temp2 = vec_ld(16, pixels); | |
| 404 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); | |
| 405 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { | |
| 406 pixelsv2 = temp2; | |
| 407 } else { | |
| 408 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); | |
| 409 } | |
| 410 pixelsv3 = vec_mergel(vczero, pixelsv1); | |
| 411 pixelsv4 = vec_mergel(vczero, pixelsv2); | |
| 412 pixelsv1 = vec_mergeh(vczero, pixelsv1); | |
| 413 pixelsv2 = vec_mergeh(vczero, pixelsv2); | |
| 414 pixelssum3 = vec_add((vector unsigned short)pixelsv3, | |
| 415 (vector unsigned short)pixelsv4); | |
| 416 pixelssum3 = vec_add(pixelssum3, vctwo); | |
| 417 pixelssum1 = vec_add((vector unsigned short)pixelsv1, | |
| 418 (vector unsigned short)pixelsv2); | |
| 419 pixelssum1 = vec_add(pixelssum1, vctwo); | |
| 420 | |
| 421 for (i = 0; i < h ; i++) { | |
| 422 blockv = vec_ld(0, block); | |
| 423 | |
| 424 temp1 = vec_ld(line_size, pixels); | |
| 425 temp2 = vec_ld(line_size + 16, pixels); | |
| 426 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); | |
| 427 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { | |
| 428 pixelsv2 = temp2; | |
| 429 } else { | |
| 430 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); | |
| 431 } | |
| 432 | |
| 433 pixelsv3 = vec_mergel(vczero, pixelsv1); | |
| 434 pixelsv4 = vec_mergel(vczero, pixelsv2); | |
| 435 pixelsv1 = vec_mergeh(vczero, pixelsv1); | |
| 436 pixelsv2 = vec_mergeh(vczero, pixelsv2); | |
| 437 | |
| 438 pixelssum4 = vec_add((vector unsigned short)pixelsv3, | |
| 439 (vector unsigned short)pixelsv4); | |
| 440 pixelssum2 = vec_add((vector unsigned short)pixelsv1, | |
| 441 (vector unsigned short)pixelsv2); | |
| 442 temp4 = vec_add(pixelssum3, pixelssum4); | |
| 443 temp4 = vec_sra(temp4, vctwo); | |
| 444 temp3 = vec_add(pixelssum1, pixelssum2); | |
| 445 temp3 = vec_sra(temp3, vctwo); | |
| 446 | |
| 447 pixelssum3 = vec_add(pixelssum4, vctwo); | |
| 448 pixelssum1 = vec_add(pixelssum2, vctwo); | |
| 449 | |
| 450 blockv = vec_packsu(temp3, temp4); | |
| 451 | |
| 452 vec_st(blockv, 0, block); | |
| 453 | |
| 454 block += line_size; | |
| 455 pixels += line_size; | |
| 456 } | |
| 457 | |
| 458 POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1); | |
| 459 } | |
| 460 | |
| 461 /* next one assumes that ((line_size % 16) == 0) */ | |
| 462 static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) | |
| 463 { | |
| 464 POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1); | |
| 465 register int i; | |
| 466 register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4; | |
| 467 register vector unsigned char blockv, temp1, temp2; | |
| 468 register vector unsigned short temp3, temp4, | |
| 469 pixelssum1, pixelssum2, pixelssum3, pixelssum4; | |
| 470 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); | |
| 471 register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); | |
| 472 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); | |
| 473 | |
| 474 POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); | |
| 475 | |
| 476 temp1 = vec_ld(0, pixels); | |
| 477 temp2 = vec_ld(16, pixels); | |
| 478 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); | |
| 479 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { | |
| 480 pixelsv2 = temp2; | |
| 481 } else { | |
| 482 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); | |
| 483 } | |
| 484 pixelsv3 = vec_mergel(vczero, pixelsv1); | |
| 485 pixelsv4 = vec_mergel(vczero, pixelsv2); | |
| 486 pixelsv1 = vec_mergeh(vczero, pixelsv1); | |
| 487 pixelsv2 = vec_mergeh(vczero, pixelsv2); | |
| 488 pixelssum3 = vec_add((vector unsigned short)pixelsv3, | |
| 489 (vector unsigned short)pixelsv4); | |
| 490 pixelssum3 = vec_add(pixelssum3, vcone); | |
| 491 pixelssum1 = vec_add((vector unsigned short)pixelsv1, | |
| 492 (vector unsigned short)pixelsv2); | |
| 493 pixelssum1 = vec_add(pixelssum1, vcone); | |
| 494 | |
| 495 for (i = 0; i < h ; i++) { | |
| 496 blockv = vec_ld(0, block); | |
| 497 | |
| 498 temp1 = vec_ld(line_size, pixels); | |
| 499 temp2 = vec_ld(line_size + 16, pixels); | |
| 500 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); | |
| 501 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { | |
| 502 pixelsv2 = temp2; | |
| 503 } else { | |
| 504 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); | |
| 505 } | |
| 506 | |
| 507 pixelsv3 = vec_mergel(vczero, pixelsv1); | |
| 508 pixelsv4 = vec_mergel(vczero, pixelsv2); | |
| 509 pixelsv1 = vec_mergeh(vczero, pixelsv1); | |
| 510 pixelsv2 = vec_mergeh(vczero, pixelsv2); | |
| 511 | |
| 512 pixelssum4 = vec_add((vector unsigned short)pixelsv3, | |
| 513 (vector unsigned short)pixelsv4); | |
| 514 pixelssum2 = vec_add((vector unsigned short)pixelsv1, | |
| 515 (vector unsigned short)pixelsv2); | |
| 516 temp4 = vec_add(pixelssum3, pixelssum4); | |
| 517 temp4 = vec_sra(temp4, vctwo); | |
| 518 temp3 = vec_add(pixelssum1, pixelssum2); | |
| 519 temp3 = vec_sra(temp3, vctwo); | |
| 520 | |
| 521 pixelssum3 = vec_add(pixelssum4, vcone); | |
| 522 pixelssum1 = vec_add(pixelssum2, vcone); | |
| 523 | |
| 524 blockv = vec_packsu(temp3, temp4); | |
| 525 | |
| 526 vec_st(blockv, 0, block); | |
| 527 | |
| 528 block += line_size; | |
| 529 pixels += line_size; | |
| 530 } | |
| 531 | |
| 532 POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); | |
| 533 } | |
| 534 | |
| 535 /* next one assumes that ((line_size % 8) == 0) */ | |
| 536 static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
| 537 { | |
| 538 POWERPC_PERF_DECLARE(altivec_avg_pixels8_xy2_num, 1); | |
| 539 register int i; | |
| 540 register vector unsigned char pixelsv1, pixelsv2, pixelsavg; | |
| 541 register vector unsigned char blockv, temp1, temp2, blocktemp; | |
| 542 register vector unsigned short pixelssum1, pixelssum2, temp3; | |
| 543 | |
| 544 register const vector unsigned char vczero = (const vector unsigned char) | |
| 545 vec_splat_u8(0); | |
| 546 register const vector unsigned short vctwo = (const vector unsigned short) | |
| 547 vec_splat_u16(2); | |
| 548 | |
| 549 temp1 = vec_ld(0, pixels); | |
| 550 temp2 = vec_ld(16, pixels); | |
| 551 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); | |
| 552 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { | |
| 553 pixelsv2 = temp2; | |
| 554 } else { | |
| 555 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); | |
| 556 } | |
| 557 pixelsv1 = vec_mergeh(vczero, pixelsv1); | |
| 558 pixelsv2 = vec_mergeh(vczero, pixelsv2); | |
| 559 pixelssum1 = vec_add((vector unsigned short)pixelsv1, | |
| 560 (vector unsigned short)pixelsv2); | |
| 561 pixelssum1 = vec_add(pixelssum1, vctwo); | |
| 562 | |
| 563 POWERPC_PERF_START_COUNT(altivec_avg_pixels8_xy2_num, 1); | |
| 564 for (i = 0; i < h ; i++) { | |
| 565 int rightside = ((unsigned long)block & 0x0000000F); | |
| 566 blockv = vec_ld(0, block); | |
| 567 | |
| 568 temp1 = vec_ld(line_size, pixels); | |
| 569 temp2 = vec_ld(line_size + 16, pixels); | |
| 570 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); | |
| 571 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { | |
| 572 pixelsv2 = temp2; | |
| 573 } else { | |
| 574 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); | |
| 575 } | |
| 576 | |
| 577 pixelsv1 = vec_mergeh(vczero, pixelsv1); | |
| 578 pixelsv2 = vec_mergeh(vczero, pixelsv2); | |
| 579 pixelssum2 = vec_add((vector unsigned short)pixelsv1, | |
| 580 (vector unsigned short)pixelsv2); | |
| 581 temp3 = vec_add(pixelssum1, pixelssum2); | |
| 582 temp3 = vec_sra(temp3, vctwo); | |
| 583 pixelssum1 = vec_add(pixelssum2, vctwo); | |
| 584 pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); | |
| 585 | |
| 586 if (rightside) { | |
| 587 blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); | |
| 588 } else { | |
| 589 blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); | |
| 590 } | |
| 591 | |
| 592 blockv = vec_avg(blocktemp, blockv); | |
| 593 vec_st(blockv, 0, block); | |
| 594 | |
| 595 block += line_size; | |
| 596 pixels += line_size; | |
| 597 } | |
| 598 | |
| 599 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_xy2_num, 1); | |
| 600 } | |
| 601 | |
| 602 void dsputil_init_altivec(DSPContext* c) | |
| 603 { | |
| 604 c->diff_pixels = diff_pixels_altivec; | |
| 605 c->get_pixels = get_pixels_altivec; | |
| 606 c->clear_block = clear_block_altivec; | |
| 607 | |
| 608 c->put_pixels_tab[0][0] = put_pixels16_altivec; | |
| 609 /* the two functions do the same thing, so use the same code */ | |
| 610 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec; | |
| 611 c->avg_pixels_tab[0][0] = avg_pixels16_altivec; | |
| 612 c->avg_pixels_tab[1][0] = avg_pixels8_altivec; | |
| 613 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec; | |
| 614 c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec; | |
| 615 c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec; | |
| 616 c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec; | |
| 617 c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec; | |
| 618 | |
| 619 } |
