view libavcodec/ppc/dsputil_altivec.c @ 2:897f711a7157

rearrange to work with autoconf
author Nina Engelhardt <nengel@mailbox.tu-berlin.de>
date Tue, 25 Sep 2012 15:55:33 +0200
parents
children
line source
1 /*
2 * Copyright (c) 2002 Brian Foley
3 * Copyright (c) 2002 Dieter Shirley
4 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 */
23 #include "config.h"
24 #if HAVE_ALTIVEC_H
25 #include <altivec.h>
26 #endif
27 #include "libavcodec/dsputil.h"
28 #include "dsputil_ppc.h"
29 #include "util_altivec.h"
30 #include "types_altivec.h"
31 #include "dsputil_altivec.h"
34 static void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
35 {
36 int i;
37 vector unsigned char perm, bytes, *pixv;
38 const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
39 vector signed short shorts;
41 for (i = 0; i < 8; i++) {
42 // Read potentially unaligned pixels.
43 // We're reading 16 pixels, and actually only want 8,
44 // but we simply ignore the extras.
45 perm = vec_lvsl(0, pixels);
46 pixv = (vector unsigned char *) pixels;
47 bytes = vec_perm(pixv[0], pixv[1], perm);
49 // convert the bytes into shorts
50 shorts = (vector signed short)vec_mergeh(zero, bytes);
52 // save the data to the block, we assume the block is 16-byte aligned
53 vec_st(shorts, i*16, (vector signed short*)block);
55 pixels += line_size;
56 }
57 }
59 static void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
60 const uint8_t *s2, int stride)
61 {
62 int i;
63 vector unsigned char perm, bytes, *pixv;
64 const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
65 vector signed short shorts1, shorts2;
67 for (i = 0; i < 4; i++) {
68 // Read potentially unaligned pixels
69 // We're reading 16 pixels, and actually only want 8,
70 // but we simply ignore the extras.
71 perm = vec_lvsl(0, s1);
72 pixv = (vector unsigned char *) s1;
73 bytes = vec_perm(pixv[0], pixv[1], perm);
75 // convert the bytes into shorts
76 shorts1 = (vector signed short)vec_mergeh(zero, bytes);
78 // Do the same for the second block of pixels
79 perm = vec_lvsl(0, s2);
80 pixv = (vector unsigned char *) s2;
81 bytes = vec_perm(pixv[0], pixv[1], perm);
83 // convert the bytes into shorts
84 shorts2 = (vector signed short)vec_mergeh(zero, bytes);
86 // Do the subtraction
87 shorts1 = vec_sub(shorts1, shorts2);
89 // save the data to the block, we assume the block is 16-byte aligned
90 vec_st(shorts1, 0, (vector signed short*)block);
92 s1 += stride;
93 s2 += stride;
94 block += 8;
97 // The code below is a copy of the code above... This is a manual
98 // unroll.
100 // Read potentially unaligned pixels
101 // We're reading 16 pixels, and actually only want 8,
102 // but we simply ignore the extras.
103 perm = vec_lvsl(0, s1);
104 pixv = (vector unsigned char *) s1;
105 bytes = vec_perm(pixv[0], pixv[1], perm);
107 // convert the bytes into shorts
108 shorts1 = (vector signed short)vec_mergeh(zero, bytes);
110 // Do the same for the second block of pixels
111 perm = vec_lvsl(0, s2);
112 pixv = (vector unsigned char *) s2;
113 bytes = vec_perm(pixv[0], pixv[1], perm);
115 // convert the bytes into shorts
116 shorts2 = (vector signed short)vec_mergeh(zero, bytes);
118 // Do the subtraction
119 shorts1 = vec_sub(shorts1, shorts2);
121 // save the data to the block, we assume the block is 16-byte aligned
122 vec_st(shorts1, 0, (vector signed short*)block);
124 s1 += stride;
125 s2 += stride;
126 block += 8;
127 }
128 }
131 static void clear_block_altivec(DCTELEM *block) {
132 LOAD_ZERO;
133 vec_st(zero_s16v, 0, block);
134 vec_st(zero_s16v, 16, block);
135 vec_st(zero_s16v, 32, block);
136 vec_st(zero_s16v, 48, block);
137 vec_st(zero_s16v, 64, block);
138 vec_st(zero_s16v, 80, block);
139 vec_st(zero_s16v, 96, block);
140 vec_st(zero_s16v, 112, block);
141 }
145 /* next one assumes that ((line_size % 16) == 0) */
146 void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
147 {
148 POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1);
149 register vector unsigned char pixelsv1, pixelsv2;
150 register vector unsigned char pixelsv1B, pixelsv2B;
151 register vector unsigned char pixelsv1C, pixelsv2C;
152 register vector unsigned char pixelsv1D, pixelsv2D;
154 register vector unsigned char perm = vec_lvsl(0, pixels);
155 int i;
156 register int line_size_2 = line_size << 1;
157 register int line_size_3 = line_size + line_size_2;
158 register int line_size_4 = line_size << 2;
160 POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
161 // hand-unrolling the loop by 4 gains about 15%
162 // mininum execution time goes from 74 to 60 cycles
163 // it's faster than -funroll-loops, but using
164 // -funroll-loops w/ this is bad - 74 cycles again.
165 // all this is on a 7450, tuning for the 7450
166 #if 0
167 for (i = 0; i < h; i++) {
168 pixelsv1 = vec_ld(0, pixels);
169 pixelsv2 = vec_ld(16, pixels);
170 vec_st(vec_perm(pixelsv1, pixelsv2, perm),
171 0, block);
172 pixels+=line_size;
173 block +=line_size;
174 }
175 #else
176 for (i = 0; i < h; i += 4) {
177 pixelsv1 = vec_ld( 0, pixels);
178 pixelsv2 = vec_ld(15, pixels);
179 pixelsv1B = vec_ld(line_size, pixels);
180 pixelsv2B = vec_ld(15 + line_size, pixels);
181 pixelsv1C = vec_ld(line_size_2, pixels);
182 pixelsv2C = vec_ld(15 + line_size_2, pixels);
183 pixelsv1D = vec_ld(line_size_3, pixels);
184 pixelsv2D = vec_ld(15 + line_size_3, pixels);
185 vec_st(vec_perm(pixelsv1, pixelsv2, perm),
186 0, (unsigned char*)block);
187 vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
188 line_size, (unsigned char*)block);
189 vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
190 line_size_2, (unsigned char*)block);
191 vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
192 line_size_3, (unsigned char*)block);
193 pixels+=line_size_4;
194 block +=line_size_4;
195 }
196 #endif
197 POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);
198 }
200 /* next one assumes that ((line_size % 16) == 0) */
201 #define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
202 void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
203 {
204 POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1);
205 register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
206 register vector unsigned char perm = vec_lvsl(0, pixels);
207 int i;
209 POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);
211 for (i = 0; i < h; i++) {
212 pixelsv1 = vec_ld( 0, pixels);
213 pixelsv2 = vec_ld(16,pixels);
214 blockv = vec_ld(0, block);
215 pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
216 blockv = vec_avg(blockv,pixelsv);
217 vec_st(blockv, 0, (unsigned char*)block);
218 pixels+=line_size;
219 block +=line_size;
220 }
222 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);
223 }
225 /* next one assumes that ((line_size % 8) == 0) */
226 static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
227 {
228 POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1);
229 register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
230 int i;
232 POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);
234 for (i = 0; i < h; i++) {
235 /* block is 8 bytes-aligned, so we're either in the
236 left block (16 bytes-aligned) or in the right block (not) */
237 int rightside = ((unsigned long)block & 0x0000000F);
239 blockv = vec_ld(0, block);
240 pixelsv1 = vec_ld( 0, pixels);
241 pixelsv2 = vec_ld(16, pixels);
242 pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));
244 if (rightside) {
245 pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
246 } else {
247 pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
248 }
250 blockv = vec_avg(blockv, pixelsv);
252 vec_st(blockv, 0, block);
254 pixels += line_size;
255 block += line_size;
256 }
258 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);
259 }
261 /* next one assumes that ((line_size % 8) == 0) */
262 static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
263 {
264 POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1);
265 register int i;
266 register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
267 register vector unsigned char blockv, temp1, temp2;
268 register vector unsigned short pixelssum1, pixelssum2, temp3;
269 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
270 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
272 temp1 = vec_ld(0, pixels);
273 temp2 = vec_ld(16, pixels);
274 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
275 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
276 pixelsv2 = temp2;
277 } else {
278 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
279 }
280 pixelsv1 = vec_mergeh(vczero, pixelsv1);
281 pixelsv2 = vec_mergeh(vczero, pixelsv2);
282 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
283 (vector unsigned short)pixelsv2);
284 pixelssum1 = vec_add(pixelssum1, vctwo);
286 POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
287 for (i = 0; i < h ; i++) {
288 int rightside = ((unsigned long)block & 0x0000000F);
289 blockv = vec_ld(0, block);
291 temp1 = vec_ld(line_size, pixels);
292 temp2 = vec_ld(line_size + 16, pixels);
293 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
294 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
295 pixelsv2 = temp2;
296 } else {
297 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
298 }
300 pixelsv1 = vec_mergeh(vczero, pixelsv1);
301 pixelsv2 = vec_mergeh(vczero, pixelsv2);
302 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
303 (vector unsigned short)pixelsv2);
304 temp3 = vec_add(pixelssum1, pixelssum2);
305 temp3 = vec_sra(temp3, vctwo);
306 pixelssum1 = vec_add(pixelssum2, vctwo);
307 pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
309 if (rightside) {
310 blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
311 } else {
312 blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
313 }
315 vec_st(blockv, 0, block);
317 block += line_size;
318 pixels += line_size;
319 }
321 POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
322 }
324 /* next one assumes that ((line_size % 8) == 0) */
325 static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
326 {
327 POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);
328 register int i;
329 register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
330 register vector unsigned char blockv, temp1, temp2;
331 register vector unsigned short pixelssum1, pixelssum2, temp3;
332 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
333 register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
334 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
336 temp1 = vec_ld(0, pixels);
337 temp2 = vec_ld(16, pixels);
338 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
339 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
340 pixelsv2 = temp2;
341 } else {
342 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
343 }
344 pixelsv1 = vec_mergeh(vczero, pixelsv1);
345 pixelsv2 = vec_mergeh(vczero, pixelsv2);
346 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
347 (vector unsigned short)pixelsv2);
348 pixelssum1 = vec_add(pixelssum1, vcone);
350 POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
351 for (i = 0; i < h ; i++) {
352 int rightside = ((unsigned long)block & 0x0000000F);
353 blockv = vec_ld(0, block);
355 temp1 = vec_ld(line_size, pixels);
356 temp2 = vec_ld(line_size + 16, pixels);
357 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
358 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
359 pixelsv2 = temp2;
360 } else {
361 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
362 }
364 pixelsv1 = vec_mergeh(vczero, pixelsv1);
365 pixelsv2 = vec_mergeh(vczero, pixelsv2);
366 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
367 (vector unsigned short)pixelsv2);
368 temp3 = vec_add(pixelssum1, pixelssum2);
369 temp3 = vec_sra(temp3, vctwo);
370 pixelssum1 = vec_add(pixelssum2, vcone);
371 pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
373 if (rightside) {
374 blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
375 } else {
376 blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
377 }
379 vec_st(blockv, 0, block);
381 block += line_size;
382 pixels += line_size;
383 }
385 POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
386 }
388 /* next one assumes that ((line_size % 16) == 0) */
389 static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
390 {
391 POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1);
392 register int i;
393 register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
394 register vector unsigned char blockv, temp1, temp2;
395 register vector unsigned short temp3, temp4,
396 pixelssum1, pixelssum2, pixelssum3, pixelssum4;
397 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
398 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
400 POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);
402 temp1 = vec_ld(0, pixels);
403 temp2 = vec_ld(16, pixels);
404 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
405 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
406 pixelsv2 = temp2;
407 } else {
408 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
409 }
410 pixelsv3 = vec_mergel(vczero, pixelsv1);
411 pixelsv4 = vec_mergel(vczero, pixelsv2);
412 pixelsv1 = vec_mergeh(vczero, pixelsv1);
413 pixelsv2 = vec_mergeh(vczero, pixelsv2);
414 pixelssum3 = vec_add((vector unsigned short)pixelsv3,
415 (vector unsigned short)pixelsv4);
416 pixelssum3 = vec_add(pixelssum3, vctwo);
417 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
418 (vector unsigned short)pixelsv2);
419 pixelssum1 = vec_add(pixelssum1, vctwo);
421 for (i = 0; i < h ; i++) {
422 blockv = vec_ld(0, block);
424 temp1 = vec_ld(line_size, pixels);
425 temp2 = vec_ld(line_size + 16, pixels);
426 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
427 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
428 pixelsv2 = temp2;
429 } else {
430 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
431 }
433 pixelsv3 = vec_mergel(vczero, pixelsv1);
434 pixelsv4 = vec_mergel(vczero, pixelsv2);
435 pixelsv1 = vec_mergeh(vczero, pixelsv1);
436 pixelsv2 = vec_mergeh(vczero, pixelsv2);
438 pixelssum4 = vec_add((vector unsigned short)pixelsv3,
439 (vector unsigned short)pixelsv4);
440 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
441 (vector unsigned short)pixelsv2);
442 temp4 = vec_add(pixelssum3, pixelssum4);
443 temp4 = vec_sra(temp4, vctwo);
444 temp3 = vec_add(pixelssum1, pixelssum2);
445 temp3 = vec_sra(temp3, vctwo);
447 pixelssum3 = vec_add(pixelssum4, vctwo);
448 pixelssum1 = vec_add(pixelssum2, vctwo);
450 blockv = vec_packsu(temp3, temp4);
452 vec_st(blockv, 0, block);
454 block += line_size;
455 pixels += line_size;
456 }
458 POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
459 }
461 /* next one assumes that ((line_size % 16) == 0) */
462 static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
463 {
464 POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);
465 register int i;
466 register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
467 register vector unsigned char blockv, temp1, temp2;
468 register vector unsigned short temp3, temp4,
469 pixelssum1, pixelssum2, pixelssum3, pixelssum4;
470 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
471 register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
472 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
474 POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
476 temp1 = vec_ld(0, pixels);
477 temp2 = vec_ld(16, pixels);
478 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
479 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
480 pixelsv2 = temp2;
481 } else {
482 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
483 }
484 pixelsv3 = vec_mergel(vczero, pixelsv1);
485 pixelsv4 = vec_mergel(vczero, pixelsv2);
486 pixelsv1 = vec_mergeh(vczero, pixelsv1);
487 pixelsv2 = vec_mergeh(vczero, pixelsv2);
488 pixelssum3 = vec_add((vector unsigned short)pixelsv3,
489 (vector unsigned short)pixelsv4);
490 pixelssum3 = vec_add(pixelssum3, vcone);
491 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
492 (vector unsigned short)pixelsv2);
493 pixelssum1 = vec_add(pixelssum1, vcone);
495 for (i = 0; i < h ; i++) {
496 blockv = vec_ld(0, block);
498 temp1 = vec_ld(line_size, pixels);
499 temp2 = vec_ld(line_size + 16, pixels);
500 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
501 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
502 pixelsv2 = temp2;
503 } else {
504 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
505 }
507 pixelsv3 = vec_mergel(vczero, pixelsv1);
508 pixelsv4 = vec_mergel(vczero, pixelsv2);
509 pixelsv1 = vec_mergeh(vczero, pixelsv1);
510 pixelsv2 = vec_mergeh(vczero, pixelsv2);
512 pixelssum4 = vec_add((vector unsigned short)pixelsv3,
513 (vector unsigned short)pixelsv4);
514 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
515 (vector unsigned short)pixelsv2);
516 temp4 = vec_add(pixelssum3, pixelssum4);
517 temp4 = vec_sra(temp4, vctwo);
518 temp3 = vec_add(pixelssum1, pixelssum2);
519 temp3 = vec_sra(temp3, vctwo);
521 pixelssum3 = vec_add(pixelssum4, vcone);
522 pixelssum1 = vec_add(pixelssum2, vcone);
524 blockv = vec_packsu(temp3, temp4);
526 vec_st(blockv, 0, block);
528 block += line_size;
529 pixels += line_size;
530 }
532 POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
533 }
535 /* next one assumes that ((line_size % 8) == 0) */
536 static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
537 {
538 POWERPC_PERF_DECLARE(altivec_avg_pixels8_xy2_num, 1);
539 register int i;
540 register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
541 register vector unsigned char blockv, temp1, temp2, blocktemp;
542 register vector unsigned short pixelssum1, pixelssum2, temp3;
544 register const vector unsigned char vczero = (const vector unsigned char)
545 vec_splat_u8(0);
546 register const vector unsigned short vctwo = (const vector unsigned short)
547 vec_splat_u16(2);
549 temp1 = vec_ld(0, pixels);
550 temp2 = vec_ld(16, pixels);
551 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
552 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
553 pixelsv2 = temp2;
554 } else {
555 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
556 }
557 pixelsv1 = vec_mergeh(vczero, pixelsv1);
558 pixelsv2 = vec_mergeh(vczero, pixelsv2);
559 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
560 (vector unsigned short)pixelsv2);
561 pixelssum1 = vec_add(pixelssum1, vctwo);
563 POWERPC_PERF_START_COUNT(altivec_avg_pixels8_xy2_num, 1);
564 for (i = 0; i < h ; i++) {
565 int rightside = ((unsigned long)block & 0x0000000F);
566 blockv = vec_ld(0, block);
568 temp1 = vec_ld(line_size, pixels);
569 temp2 = vec_ld(line_size + 16, pixels);
570 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
571 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
572 pixelsv2 = temp2;
573 } else {
574 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
575 }
577 pixelsv1 = vec_mergeh(vczero, pixelsv1);
578 pixelsv2 = vec_mergeh(vczero, pixelsv2);
579 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
580 (vector unsigned short)pixelsv2);
581 temp3 = vec_add(pixelssum1, pixelssum2);
582 temp3 = vec_sra(temp3, vctwo);
583 pixelssum1 = vec_add(pixelssum2, vctwo);
584 pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
586 if (rightside) {
587 blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
588 } else {
589 blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
590 }
592 blockv = vec_avg(blocktemp, blockv);
593 vec_st(blockv, 0, block);
595 block += line_size;
596 pixels += line_size;
597 }
599 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_xy2_num, 1);
600 }
602 void dsputil_init_altivec(DSPContext* c)
603 {
604 c->diff_pixels = diff_pixels_altivec;
605 c->get_pixels = get_pixels_altivec;
606 c->clear_block = clear_block_altivec;
608 c->put_pixels_tab[0][0] = put_pixels16_altivec;
609 /* the two functions do the same thing, so use the same code */
610 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec;
611 c->avg_pixels_tab[0][0] = avg_pixels16_altivec;
612 c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
613 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec;
614 c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
615 c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
616 c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;
617 c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
619 }