diff libavcodec/arm/dsputil_iwmmxt.c @ 2:897f711a7157

rearrange to work with autoconf
author Nina Engelhardt <nengel@mailbox.tu-berlin.de>
date Tue, 25 Sep 2012 15:55:33 +0200
parents
children
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/libavcodec/arm/dsputil_iwmmxt.c	Tue Sep 25 15:55:33 2012 +0200
     1.3 @@ -0,0 +1,205 @@
     1.4 +/*
     1.5 + * iWMMXt optimized DSP utils
     1.6 + * Copyright (c) 2004 AGAWA Koji
     1.7 + *
     1.8 + * This file is part of FFmpeg.
     1.9 + *
    1.10 + * FFmpeg is free software; you can redistribute it and/or
    1.11 + * modify it under the terms of the GNU Lesser General Public
    1.12 + * License as published by the Free Software Foundation; either
    1.13 + * version 2.1 of the License, or (at your option) any later version.
    1.14 + *
    1.15 + * FFmpeg is distributed in the hope that it will be useful,
    1.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
    1.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    1.18 + * Lesser General Public License for more details.
    1.19 + *
    1.20 + * You should have received a copy of the GNU Lesser General Public
    1.21 + * License along with FFmpeg; if not, write to the Free Software
    1.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
    1.23 + */
    1.24 +
    1.25 +#include "libavcodec/dsputil.h"
    1.26 +
    1.27 +#define DEF(x, y) x ## _no_rnd_ ## y ##_iwmmxt
    1.28 +#define SET_RND(regd)  __asm__ volatile ("mov r12, #1 \n\t tbcsth " #regd ", r12":::"r12");
    1.29 +#define WAVG2B "wavg2b"
    1.30 +#include "dsputil_iwmmxt_rnd_template.c"
    1.31 +#undef DEF
    1.32 +#undef SET_RND
    1.33 +#undef WAVG2B
    1.34 +
    1.35 +#define DEF(x, y) x ## _ ## y ##_iwmmxt
    1.36 +#define SET_RND(regd)  __asm__ volatile ("mov r12, #2 \n\t tbcsth " #regd ", r12":::"r12");
    1.37 +#define WAVG2B "wavg2br"
    1.38 +#include "dsputil_iwmmxt_rnd_template.c"
    1.39 +#undef DEF
    1.40 +#undef SET_RND
    1.41 +#undef WAVG2BR
    1.42 +
    1.43 +// need scheduling
    1.44 +#define OP(AVG)                                         \
    1.45 +    __asm__ volatile (                                      \
    1.46 +        /* alignment */                                 \
    1.47 +        "and r12, %[pixels], #7 \n\t"                   \
    1.48 +        "bic %[pixels], %[pixels], #7 \n\t"             \
    1.49 +        "tmcr wcgr1, r12 \n\t"                          \
    1.50 +                                                        \
    1.51 +        "wldrd wr0, [%[pixels]] \n\t"                   \
    1.52 +        "wldrd wr1, [%[pixels], #8] \n\t"               \
    1.53 +        "add %[pixels], %[pixels], %[line_size] \n\t"   \
    1.54 +        "walignr1 wr4, wr0, wr1 \n\t"                   \
    1.55 +                                                        \
    1.56 +        "1: \n\t"                                       \
    1.57 +                                                        \
    1.58 +        "wldrd wr2, [%[pixels]] \n\t"                   \
    1.59 +        "wldrd wr3, [%[pixels], #8] \n\t"               \
    1.60 +        "add %[pixels], %[pixels], %[line_size] \n\t"   \
    1.61 +        "pld [%[pixels]] \n\t"                          \
    1.62 +        "walignr1 wr5, wr2, wr3 \n\t"                   \
    1.63 +        AVG " wr6, wr4, wr5 \n\t"                       \
    1.64 +        "wstrd wr6, [%[block]] \n\t"                    \
    1.65 +        "add %[block], %[block], %[line_size] \n\t"     \
    1.66 +                                                        \
    1.67 +        "wldrd wr0, [%[pixels]] \n\t"                   \
    1.68 +        "wldrd wr1, [%[pixels], #8] \n\t"               \
    1.69 +        "add %[pixels], %[pixels], %[line_size] \n\t"   \
    1.70 +        "walignr1 wr4, wr0, wr1 \n\t"                   \
    1.71 +        "pld [%[pixels]] \n\t"                          \
    1.72 +        AVG " wr6, wr4, wr5 \n\t"                       \
    1.73 +        "wstrd wr6, [%[block]] \n\t"                    \
    1.74 +        "add %[block], %[block], %[line_size] \n\t"     \
    1.75 +                                                        \
    1.76 +        "subs %[h], %[h], #2 \n\t"                      \
    1.77 +        "bne 1b \n\t"                                   \
    1.78 +        : [block]"+r"(block), [pixels]"+r"(pixels), [h]"+r"(h)  \
    1.79 +        : [line_size]"r"(line_size) \
    1.80 +        : "memory", "r12");
    1.81 +void put_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
    1.82 +{
    1.83 +    OP("wavg2br");
    1.84 +}
    1.85 +void put_no_rnd_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
    1.86 +{
    1.87 +    OP("wavg2b");
    1.88 +}
    1.89 +#undef OP
    1.90 +
    1.91 +void add_pixels_clamped_iwmmxt(const DCTELEM *block, uint8_t *pixels, int line_size)
    1.92 +{
    1.93 +    uint8_t *pixels2 = pixels + line_size;
    1.94 +
    1.95 +    __asm__ volatile (
    1.96 +        "mov            r12, #4                 \n\t"
    1.97 +        "1:                                     \n\t"
    1.98 +        "pld            [%[pixels], %[line_size2]]              \n\t"
    1.99 +        "pld            [%[pixels2], %[line_size2]]             \n\t"
   1.100 +        "wldrd          wr4, [%[pixels]]        \n\t"
   1.101 +        "wldrd          wr5, [%[pixels2]]       \n\t"
   1.102 +        "pld            [%[block], #32]         \n\t"
   1.103 +        "wunpckelub     wr6, wr4                \n\t"
   1.104 +        "wldrd          wr0, [%[block]]         \n\t"
   1.105 +        "wunpckehub     wr7, wr4                \n\t"
   1.106 +        "wldrd          wr1, [%[block], #8]     \n\t"
   1.107 +        "wunpckelub     wr8, wr5                \n\t"
   1.108 +        "wldrd          wr2, [%[block], #16]    \n\t"
   1.109 +        "wunpckehub     wr9, wr5                \n\t"
   1.110 +        "wldrd          wr3, [%[block], #24]    \n\t"
   1.111 +        "add            %[block], %[block], #32 \n\t"
   1.112 +        "waddhss        wr10, wr0, wr6          \n\t"
   1.113 +        "waddhss        wr11, wr1, wr7          \n\t"
   1.114 +        "waddhss        wr12, wr2, wr8          \n\t"
   1.115 +        "waddhss        wr13, wr3, wr9          \n\t"
   1.116 +        "wpackhus       wr14, wr10, wr11        \n\t"
   1.117 +        "wpackhus       wr15, wr12, wr13        \n\t"
   1.118 +        "wstrd          wr14, [%[pixels]]       \n\t"
   1.119 +        "add            %[pixels], %[pixels], %[line_size2]     \n\t"
   1.120 +        "subs           r12, r12, #1            \n\t"
   1.121 +        "wstrd          wr15, [%[pixels2]]      \n\t"
   1.122 +        "add            %[pixels2], %[pixels2], %[line_size2]   \n\t"
   1.123 +        "bne            1b                      \n\t"
   1.124 +        : [block]"+r"(block), [pixels]"+r"(pixels), [pixels2]"+r"(pixels2)
   1.125 +        : [line_size2]"r"(line_size << 1)
   1.126 +        : "cc", "memory", "r12");
   1.127 +}
   1.128 +
   1.129 +static void clear_blocks_iwmmxt(DCTELEM *blocks)
   1.130 +{
   1.131 +    __asm__ volatile(
   1.132 +                "wzero wr0                      \n\t"
   1.133 +                "mov r1, #(128 * 6 / 32)        \n\t"
   1.134 +                "1:                             \n\t"
   1.135 +                "wstrd wr0, [%0]                \n\t"
   1.136 +                "wstrd wr0, [%0, #8]            \n\t"
   1.137 +                "wstrd wr0, [%0, #16]           \n\t"
   1.138 +                "wstrd wr0, [%0, #24]           \n\t"
   1.139 +                "subs r1, r1, #1                \n\t"
   1.140 +                "add %0, %0, #32                \n\t"
   1.141 +                "bne 1b                         \n\t"
   1.142 +                : "+r"(blocks)
   1.143 +                :
   1.144 +                : "r1"
   1.145 +        );
   1.146 +}
   1.147 +
   1.148 +static void nop(uint8_t *block, const uint8_t *pixels, int line_size, int h)
   1.149 +{
   1.150 +    return;
   1.151 +}
   1.152 +
   1.153 +/* A run time test is not simple. If this file is compiled in
   1.154 + * then we should install the functions
   1.155 + */
   1.156 +int mm_flags = FF_MM_IWMMXT; /* multimedia extension flags */
   1.157 +
   1.158 +void ff_dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx)
   1.159 +{
   1.160 +    if (avctx->dsp_mask) {
   1.161 +        if (avctx->dsp_mask & FF_MM_FORCE)
   1.162 +            mm_flags |= (avctx->dsp_mask & 0xffff);
   1.163 +        else
   1.164 +            mm_flags &= ~(avctx->dsp_mask & 0xffff);
   1.165 +    }
   1.166 +
   1.167 +    if (!(mm_flags & FF_MM_IWMMXT)) return;
   1.168 +
   1.169 +    c->add_pixels_clamped = add_pixels_clamped_iwmmxt;
   1.170 +
   1.171 +    c->clear_blocks = clear_blocks_iwmmxt;
   1.172 +
   1.173 +    c->put_pixels_tab[0][0] = put_pixels16_iwmmxt;
   1.174 +    c->put_pixels_tab[0][1] = put_pixels16_x2_iwmmxt;
   1.175 +    c->put_pixels_tab[0][2] = put_pixels16_y2_iwmmxt;
   1.176 +    c->put_pixels_tab[0][3] = put_pixels16_xy2_iwmmxt;
   1.177 +    c->put_no_rnd_pixels_tab[0][0] = put_pixels16_iwmmxt;
   1.178 +    c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_iwmmxt;
   1.179 +    c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_iwmmxt;
   1.180 +    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_iwmmxt;
   1.181 +
   1.182 +    c->put_pixels_tab[1][0] = put_pixels8_iwmmxt;
   1.183 +    c->put_pixels_tab[1][1] = put_pixels8_x2_iwmmxt;
   1.184 +    c->put_pixels_tab[1][2] = put_pixels8_y2_iwmmxt;
   1.185 +    c->put_pixels_tab[1][3] = put_pixels8_xy2_iwmmxt;
   1.186 +    c->put_no_rnd_pixels_tab[1][0] = put_pixels8_iwmmxt;
   1.187 +    c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_iwmmxt;
   1.188 +    c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_iwmmxt;
   1.189 +    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_iwmmxt;
   1.190 +
   1.191 +    c->avg_pixels_tab[0][0] = avg_pixels16_iwmmxt;
   1.192 +    c->avg_pixels_tab[0][1] = avg_pixels16_x2_iwmmxt;
   1.193 +    c->avg_pixels_tab[0][2] = avg_pixels16_y2_iwmmxt;
   1.194 +    c->avg_pixels_tab[0][3] = avg_pixels16_xy2_iwmmxt;
   1.195 +    c->avg_no_rnd_pixels_tab[0][0] = avg_pixels16_iwmmxt;
   1.196 +    c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_iwmmxt;
   1.197 +    c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_iwmmxt;
   1.198 +    c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_iwmmxt;
   1.199 +
   1.200 +    c->avg_pixels_tab[1][0] = avg_pixels8_iwmmxt;
   1.201 +    c->avg_pixels_tab[1][1] = avg_pixels8_x2_iwmmxt;
   1.202 +    c->avg_pixels_tab[1][2] = avg_pixels8_y2_iwmmxt;
   1.203 +    c->avg_pixels_tab[1][3] = avg_pixels8_xy2_iwmmxt;
   1.204 +    c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_iwmmxt;
   1.205 +    c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_iwmmxt;
   1.206 +    c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_iwmmxt;
   1.207 +    c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_iwmmxt;
   1.208 +}