diff libavcodec/cell/spe_ed.c @ 2:897f711a7157

rearrange to work with autoconf
author Nina Engelhardt <nengel@mailbox.tu-berlin.de>
date Tue, 25 Sep 2012 15:55:33 +0200
parents
children
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/libavcodec/cell/spe_ed.c	Tue Sep 25 15:55:33 2012 +0200
     1.3 @@ -0,0 +1,508 @@
     1.4 +#define CELL_SPE
     1.5 +
     1.6 +#include <string.h>
     1.7 +#include <stdio.h>
     1.8 +#include <spu_intrinsics.h>
     1.9 +#include <spu_mfcio.h>
    1.10 +#include "libavcodec/avcodec.h"
    1.11 +#include "h264_cabac_spu.h"
    1.12 +#include "cabac_spu.h"
    1.13 +#include "h264_types_spu.h"
    1.14 +#include "h264_tables.h"
    1.15 +#include "h264_dma.h"
    1.16 +#include "h264_tables.h"
    1.17 +
    1.18 +#define MB_WIDTH 240
    1.19 +#define MB_STRIDE (MB_WIDTH+16)
    1.20 +
    1.21 +H264Cabac_spu hcabac;
    1.22 +CABACContext cabac;
    1.23 +DECLARE_ALIGNED_16(EDSlice_spu, slice[2]);
    1.24 +DECLARE_ALIGNED_16(H264Mb, mb[2]);
    1.25 +DECLARE_ALIGNED_16(H264spe, spe);
    1.26 +
    1.27 +DECLARE_ALIGNED_16(uint8_t, non_zero_count_table[2][MB_STRIDE][32]);
    1.28 +DECLARE_ALIGNED_16(uint8_t, mvd_table[2][2][8*MB_STRIDE][2]);
    1.29 +DECLARE_ALIGNED_16(uint8_t, direct_table[2][4*MB_STRIDE]);
    1.30 +DECLARE_ALIGNED_16(uint8_t, chroma_pred_mode_table[2][MB_STRIDE]);
    1.31 +DECLARE_ALIGNED_16(uint8_t, intra4x4_pred_mode_table[2][8*MB_STRIDE]);
    1.32 +DECLARE_ALIGNED_16(uint16_t,cbp_table[2][MB_STRIDE]);
    1.33 +DECLARE_ALIGNED_16(uint8_t, qscale_table[2][MB_STRIDE]);
    1.34 +
    1.35 +DECLARE_ALIGNED_16(uint32_t, mb_type_table[2][MB_STRIDE]);
    1.36 +DECLARE_ALIGNED_16(int8_t, ref_index_table[2][2][4*MB_STRIDE]);
    1.37 +DECLARE_ALIGNED_16(int16_t, motion_val_table[2][2][4*4*MB_WIDTH][2]);
    1.38 +
    1.39 +DECLARE_ALIGNED(128, uint8_t, bytestream_ls[4096]);
    1.40 +DECLARE_ALIGNED_16(uint32_t, list1_mb_type_table[2][MB_STRIDE]);
    1.41 +DECLARE_ALIGNED_16(int8_t, list1_ref_index_table[2][2][4*MB_STRIDE]);
    1.42 +
    1.43 +DECLARE_ALIGNED_16(spe_pos, dma_temp); //dma temp for sending
    1.44 +//mb position of neighbouring spes
    1.45 +DECLARE_ALIGNED_16(volatile spe_pos, src_spe); //written by SPE_ID -1
    1.46 +static int total_lines;
    1.47 +
    1.48 +static inline int dep_resolved(H264spe *p){
    1.49 +	int spe_id = p->spe_id;
    1.50 +	volatile int lines_proc = src_spe.count;
    1.51 +	if (spe_id==0)
    1.52 +		return (total_lines < lines_proc-1 +p->mb_height)? 1:0;
    1.53 +	else
    1.54 +		return (total_lines < lines_proc-1)? 1:0;
    1.55 +}
    1.56 +
    1.57 +static void update_tgt_spe_dep(H264spe *p, int end){
    1.58 +	// 	if (end ){
    1.59 +   total_lines++;
    1.60 +   spe_pos* dma_spe = &dma_temp;
    1.61 +   spe_pos* tgt_spe = p->tgt_spe + (unsigned) &src_spe; //located in target spe local store
    1.62 +   dma_spe->count = end? total_lines+1: total_lines;
    1.63 +   spu_dma_barrier_put(dma_spe, (unsigned) tgt_spe, sizeof(dma_temp), ED_put);
    1.64 +   // 	}
    1.65 +   
    1.66 +}
    1.67 +
    1.68 +static int init_cabac(H264spe *p, H264Cabac_spu *hc){
    1.69 +	hc->mb_height = p->mb_height;
    1.70 +	hc->mb_width = p->mb_width;
    1.71 +	hc->b_stride = 4*p->mb_width;
    1.72 +	hc->mb_stride = p->mb_stride;
    1.73 +	
    1.74 +	for(int i=0; i<16; i++){
    1.75 +		#define T(x) (x>>2) | ((x<<2) & 0xF)
    1.76 +		hc->zigzag_scan[i] = T(zigzag_scan[i]);
    1.77 +		#undef T
    1.78 +	}
    1.79 +	for(int i=0; i<64; i++){
    1.80 +		#define T(x) (x>>3) | ((x&7)<<3)
    1.81 +		hc->zigzag_scan8x8[i] = T(ff_zigzag_direct[i]);
    1.82 +		#undef T
    1.83 +	}
    1.84 +}
    1.85 +
    1.86 +static void reset_cabac_buffers(){
    1.87 + memset(intra4x4_pred_mode_table, 0, sizeof(intra4x4_pred_mode_table));
    1.88 +	memset(mvd_table, 0, sizeof(mvd_table));
    1.89 +	memset(direct_table, 0, sizeof(direct_table));
    1.90 +	memset(chroma_pred_mode_table, 0, sizeof(chroma_pred_mode_table));
    1.91 +	memset(cbp_table, 0, sizeof(cbp_table));
    1.92 +	memset(qscale_table, 0, sizeof(qscale_table));
    1.93 + 	memset(mb_type_table, 0, sizeof(mb_type_table));
    1.94 +	memset(ref_index_table, 0, sizeof(ref_index_table));
    1.95 +	memset(motion_val_table, 0, sizeof(motion_val_table));
    1.96 +}
    1.97 +
    1.98 +static void ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int bufsize){
    1.99 +	int align = (unsigned) buf & 0xF;
   1.100 +	int dma_size;
   1.101 +	
   1.102 +	c->bytestream_ea_start=
   1.103 +	c->bytestream_ea= buf;
   1.104 +	c->bytestream_ea_end= buf + bufsize;
   1.105 +	c->bufsize = bufsize;
   1.106 +	
   1.107 +	if (bufsize + align >= sizeof(bytestream_ls)){
   1.108 +		dma_size = sizeof(bytestream_ls);
   1.109 +		c->bufsize = c->bufsize +align - sizeof(bytestream_ls);				
   1.110 +	}else{
   1.111 +		int align_end = (bufsize+align) &0xF;
   1.112 +		if (align_end)
   1.113 +			dma_size = bufsize+align + 16-align_end;
   1.114 +		else
   1.115 +			dma_size = bufsize+align;
   1.116 +		c->bufsize = 0;
   1.117 +	}
   1.118 +// 	printf("%d\n", dma_size);
   1.119 +	c->bytestream_end  = &bytestream_ls[dma_size]; 
   1.120 +	c->bytestream_start= c->bytestream = &bytestream_ls[align];
   1.121 + 	spu_dma_get(bytestream_ls, (unsigned) buf - align, dma_size, ED_get );
   1.122 +	c->bytestream_ea_start=
   1.123 +	c->bytestream_ea= buf + dma_size -align;
   1.124 +
   1.125 +	wait_dma_id(ED_get);
   1.126 +	
   1.127 +	if (align %2){
   1.128 +		c->low =  (*c->bytestream++)<<18;
   1.129 +		c->low+=  (*c->bytestream++)<<10;
   1.130 +		c->low+= ((*c->bytestream++)<<2) + 2;
   1.131 +	}else {
   1.132 +		c->low =  (*c->bytestream++)<<18;
   1.133 +		c->low+=  (*c->bytestream++)<<10;
   1.134 +		c->low+=  (2<<8);
   1.135 +	}
   1.136 +
   1.137 +	c->range= 0x1FE;
   1.138 +	bytecount=0;
   1.139 +}
   1.140 +
   1.141 +static void init_dequant8_coeff_table(EDSlice_spu *s, H264Cabac_spu *hc){
   1.142 +    int i,q,x;
   1.143 +    const int transpose = HAVE_ALTIVEC;
   1.144 +    hc->dequant8_coeff[0] = hc->dequant8_buffer[0];
   1.145 +    hc->dequant8_coeff[1] = hc->dequant8_buffer[1];
   1.146 +
   1.147 +    for(i=0; i<2; i++){
   1.148 +        if(i && !memcmp(s->pps.scaling_matrix8[0], s->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
   1.149 +            hc->dequant8_coeff[1] = hc->dequant8_buffer[0];
   1.150 +            break;
   1.151 +        }
   1.152 +
   1.153 +        for(q=0; q<52; q++){
   1.154 +            int shift = div6[q];
   1.155 +            int idx = rem6[q];
   1.156 +            for(x=0; x<64; x++)
   1.157 +                hc->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
   1.158 +                    ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
   1.159 +                    s->pps.scaling_matrix8[i][x]) << shift;
   1.160 +        }
   1.161 +    }
   1.162 +}
   1.163 +
   1.164 +static void init_dequant4_coeff_table(EDSlice_spu *s, H264Cabac_spu *hc){
   1.165 +    int i,j,q,x;
   1.166 +    const int transpose = HAVE_MMX | HAVE_ALTIVEC | HAVE_NEON;
   1.167 +    for(i=0; i<6; i++ ){
   1.168 +        hc->dequant4_coeff[i] = hc->dequant4_buffer[i];
   1.169 +        for(j=0; j<i; j++){
   1.170 +            if(!memcmp(s->pps.scaling_matrix4[j], s->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
   1.171 +                hc->dequant4_coeff[i] = hc->dequant4_buffer[j];
   1.172 +                break;
   1.173 +            }
   1.174 +        }
   1.175 +        if(j<i)
   1.176 +            continue;
   1.177 +
   1.178 +        for(q=0; q<52; q++){
   1.179 +            int shift = div6[q] + 2;
   1.180 +            int idx = rem6[q];
   1.181 +            for(x=0; x<16; x++)
   1.182 +                hc->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
   1.183 +                    ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
   1.184 +                    s->pps.scaling_matrix4[i][x]) << shift;
   1.185 +        }
   1.186 +    }
   1.187 +}
   1.188 +
   1.189 +static void init_dequant_tables(EDSlice_spu *s, H264Cabac_spu *hc){
   1.190 +    int i,x;
   1.191 +
   1.192 +    init_dequant4_coeff_table(s, hc);
   1.193 +    if(s->pps.transform_8x8_mode)
   1.194 +        init_dequant8_coeff_table(s, hc);
   1.195 +    if(s->transform_bypass){
   1.196 +        for(i=0; i<6; i++)
   1.197 +            for(x=0; x<16; x++)
   1.198 +                hc->dequant4_coeff[i][0][x] = 1<<6;
   1.199 +        if(s->pps.transform_8x8_mode)
   1.200 +            for(i=0; i<2; i++)
   1.201 +                for(x=0; x<64; x++)
   1.202 +                    hc->dequant8_coeff[i][0][x] = 1<<6;
   1.203 +    }
   1.204 +}
   1.205 +
   1.206 +static void init_entropy_buf(H264Cabac_spu *hc, EDSlice_spu *s){
   1.207 +	hc->non_zero_count_top 		= non_zero_count_table[0];
   1.208 +	hc->non_zero_count     		= non_zero_count_table[1];
   1.209 +	hc->mvd_top[0]				= mvd_table[0][0];
   1.210 +	hc->mvd[0]					= mvd_table[0][1];
   1.211 +	hc->mvd_top[1]				= mvd_table[1][0];
   1.212 +	hc->mvd[1]					= mvd_table[1][1];
   1.213 +	hc->direct_top		   		= direct_table[0];
   1.214 +	hc->direct			   		= direct_table[1];
   1.215 +	hc->chroma_pred_mode_top	= chroma_pred_mode_table[0];
   1.216 +	hc->chroma_pred_mode  		= chroma_pred_mode_table[1];
   1.217 +	hc->intra4x4_pred_mode_top	= intra4x4_pred_mode_table[0];
   1.218 +	hc->intra4x4_pred_mode  	= intra4x4_pred_mode_table[1];
   1.219 +	hc->cbp_top			   		= cbp_table[0];
   1.220 +	hc->cbp				   		= cbp_table[1];
   1.221 +	hc->qscale_top			   	= qscale_table[0] +1;
   1.222 +	hc->qscale				   	= qscale_table[1] +1;
   1.223 +
   1.224 +	hc->mb_type_top 			= mb_type_table[0]+1;
   1.225 +	hc->mb_type		 			= mb_type_table[1]+1;
   1.226 +	hc->ref_index_top[0]		= ref_index_table[0][0];
   1.227 +	hc->ref_index_top[1]		= ref_index_table[1][0];
   1.228 +	hc->ref_index[0]			= ref_index_table[0][1];
   1.229 +	hc->ref_index[1]			= ref_index_table[1][1];
   1.230 +	hc->motion_val_top[0] 		= motion_val_table[0][0];
   1.231 +	hc->motion_val_top[1] 		= motion_val_table[1][0];
   1.232 +	hc->motion_val[0] 			= motion_val_table[0][1];
   1.233 +	hc->motion_val[1] 			= motion_val_table[1][1];
   1.234 +
   1.235 +	int mb_stride = hc->mb_stride;
   1.236 +
   1.237 +	if (s->slice_type_nos == FF_B_TYPE){
   1.238 +		while(!dep_resolved(&spe));
   1.239 +		spu_dma_get(list1_mb_type_table[0], (unsigned) (s->list1.mb_type -1), mb_stride*sizeof(uint32_t), ED_get);
   1.240 +		spu_dma_get(list1_ref_index_table[0][0], (unsigned) s->list1.ref_index[0], mb_stride*4*sizeof(int8_t), ED_get);
   1.241 +		spu_dma_get(list1_ref_index_table[0][1], (unsigned) s->list1.ref_index[1], mb_stride*4*sizeof(int8_t), ED_get);
   1.242 +		wait_dma_id(ED_get);
   1.243 +		spu_dma_get(list1_mb_type_table[1], (unsigned) (s->list1.mb_type -1 + mb_stride), mb_stride*sizeof(uint32_t), ED_get);
   1.244 +		spu_dma_get(list1_ref_index_table[1][0], (unsigned) (s->list1.ref_index[0] + 4*mb_stride), mb_stride*4*sizeof(int8_t), ED_get);
   1.245 +		spu_dma_get(list1_ref_index_table[1][1], (unsigned) (s->list1.ref_index[1] + 4*mb_stride), mb_stride*4*sizeof(int8_t), ED_get);
   1.246 +		hc->list1_mb_type = list1_mb_type_table[0]+1;
   1.247 +		hc->list1_ref_index[0] = list1_ref_index_table[0][0];
   1.248 +		hc->list1_ref_index[1] = list1_ref_index_table[0][1];
   1.249 +	}	
   1.250 +
   1.251 +}
   1.252 +
   1.253 +static void update_entropy_buf(H264Cabac_spu *hc, EDSlice_spu *s, int line){
   1.254 +	int mb_stride = hc->mb_stride;
   1.255 +	int mb_width = hc->mb_width;
   1.256 +	int top = (line+1)%2;
   1.257 +	int cur = line%2;
   1.258 +	int bottom = (line+1)%2; //same as top, but to identify prebuffering of next line.
   1.259 +
   1.260 +	hc->non_zero_count_top 		= non_zero_count_table[top];
   1.261 +	hc->non_zero_count     		= non_zero_count_table[cur];
   1.262 +	hc->mvd_top[0]				= mvd_table[0][top];
   1.263 +	hc->mvd[0]					= mvd_table[0][cur];
   1.264 +	hc->mvd_top[1]				= mvd_table[1][top];
   1.265 +	hc->mvd[1]					= mvd_table[1][cur];
   1.266 +	hc->direct_top		   		= direct_table[top];
   1.267 +	hc->direct			   		= direct_table[cur];
   1.268 +	hc->chroma_pred_mode_top	= chroma_pred_mode_table[top];
   1.269 +	hc->chroma_pred_mode  		= chroma_pred_mode_table[cur];
   1.270 +	hc->intra4x4_pred_mode_top	= intra4x4_pred_mode_table[top];
   1.271 +	hc->intra4x4_pred_mode  	= intra4x4_pred_mode_table[cur];
   1.272 +	hc->cbp_top			   		= cbp_table[top];
   1.273 +	hc->cbp				   		= cbp_table[cur];
   1.274 +	hc->qscale_top			   	= qscale_table[top] +1;
   1.275 +	hc->qscale				   	= qscale_table[cur] +1;
   1.276 +
   1.277 +	hc->mb_type_top 			= mb_type_table[top]+1;
   1.278 +	hc->mb_type		 			= mb_type_table[cur]+1;
   1.279 +	hc->ref_index_top[0]		= ref_index_table[0][top];
   1.280 +	hc->ref_index_top[1]		= ref_index_table[1][top];
   1.281 +	hc->ref_index[0]			= ref_index_table[0][cur];
   1.282 +	hc->ref_index[1]			= ref_index_table[1][cur];
   1.283 +	hc->motion_val_top[0] 		= motion_val_table[0][top];
   1.284 +	hc->motion_val_top[1] 		= motion_val_table[1][top];
   1.285 +	hc->motion_val[0] 			= motion_val_table[0][cur];
   1.286 +	hc->motion_val[1] 			= motion_val_table[1][cur];
   1.287 +
   1.288 +	wait_dma_id(ED_put);
   1.289 +	
   1.290 +	spu_dma_put(mb_type_table[top], (unsigned) (s->pic.mb_type -1 + line*mb_stride), mb_stride*sizeof(uint32_t), ED_put);
   1.291 +	spu_dma_put(ref_index_table[0][top], (unsigned) (s->pic.ref_index[0] + line*4*mb_stride), 4*mb_stride*sizeof(int8_t), ED_put);
   1.292 +	spu_dma_put(ref_index_table[1][top], (unsigned) (s->pic.ref_index[1] + line*4*mb_stride), 4*mb_stride*sizeof(int8_t), ED_put);
   1.293 +	spu_dma_put(motion_val_table[0][top], (unsigned) (s->pic.motion_val[0]+ line*16*mb_width), 16*mb_width*2*sizeof(int16_t), ED_put);
   1.294 +	spu_dma_put(motion_val_table[1][top], (unsigned) (s->pic.motion_val[1]+ line*16*mb_width), 16*mb_width*2*sizeof(int16_t), ED_put);
   1.295 +
   1.296 +	if (s->slice_type_nos == FF_B_TYPE){
   1.297 +		update_tgt_spe_dep(&spe, 0);
   1.298 +		wait_dma_id(ED_get);
   1.299 +						
   1.300 +		if (line + 2 < hc->mb_height){
   1.301 +			while(!dep_resolved(&spe));
   1.302 +			spu_dma_get(list1_mb_type_table[cur], (unsigned) (s->list1.mb_type -1 + (line+2)*mb_stride), mb_stride*sizeof(uint32_t), ED_get);
   1.303 +			spu_dma_get(list1_ref_index_table[cur][0], (unsigned) (s->list1.ref_index[0] + (line+2)*4*mb_stride), mb_stride*4*sizeof(int8_t), ED_get);
   1.304 +			spu_dma_get(list1_ref_index_table[cur][1], (unsigned) (s->list1.ref_index[1] + (line+2)*4*mb_stride), mb_stride*4*sizeof(int8_t), ED_get);
   1.305 +		}
   1.306 +		hc->list1_mb_type = list1_mb_type_table[bottom]+1;
   1.307 +		hc->list1_ref_index[0] = list1_ref_index_table[bottom][0];
   1.308 +		hc->list1_ref_index[1] = list1_ref_index_table[bottom][1];
   1.309 +	}
   1.310 +
   1.311 +}
   1.312 +
   1.313 +// void printmbdiff(EDSlice_spu *s, H264Cabac_spu *hc, H264Mb *mp, H264Mb *ms){
   1.314 +// 
   1.315 +// 	printf("mb_x %d, %d\n", mp->mb_x, ms->mb_x);
   1.316 +// 	printf("mb_y %d, %d\n", mp->mb_y, ms->mb_y);
   1.317 +// 	printf("mb_xy %d, %d\n", mp->mb_xy, ms->mb_xy);
   1.318 +// 	printf("top_mb_xy %d, %d\n", mp->top_mb_xy, ms->top_mb_xy);
   1.319 +// 	printf("left_mb_xy %d, %d\n", mp->left_mb_xy, ms->left_mb_xy);
   1.320 +// 	printf("chroma_pred_mode %d, %d\n", mp->chroma_pred_mode, ms->chroma_pred_mode);
   1.321 +// 	printf("intra16x16_pred_mode %d, %d\n", mp->intra16x16_pred_mode, ms->intra16x16_pred_mode);
   1.322 +// 	printf("topleft_samples %d, %d\n", mp->topleft_samples_available, ms->topleft_samples_available);
   1.323 +// 	printf("topright_samples %d, %d\n", mp->topright_samples_available, ms->topright_samples_available);
   1.324 +// 	printf("top_samples %d, %d\n", mp->top_samples_available, ms->top_samples_available);
   1.325 +// 	printf("left_samples %d, %d\n", mp->left_samples_available, ms->left_samples_available);
   1.326 +// 
   1.327 +// 	if (memcmp(mp->intra4x4_pred_mode_cache, ms->intra4x4_pred_mode_cache, 40)){
   1.328 +// 		for (int i=0; i<5; i++){
   1.329 +// 			for (int j=0; j<8; j++){
   1.330 +// 				printf("%d, %d\t", mp->intra4x4_pred_mode_cache[i*8+j],ms->intra4x4_pred_mode_cache[i*8+j]);
   1.331 +// 			}
   1.332 +// 			printf("\n");
   1.333 +// 		}
   1.334 +// 	}
   1.335 +// 
   1.336 +// 	if (memcmp(mp->non_zero_count_cache, ms->non_zero_count_cache, 48)){
   1.337 +// 		for (int i=0; i<6; i++){
   1.338 +// 			for (int j=0; j<8; j++){
   1.339 +// 				printf("%u, %u\t", mp->non_zero_count_cache[i*8+j],ms->non_zero_count_cache[i*8+j]);
   1.340 +// 			}
   1.341 +// 			printf("\n");
   1.342 +// 		}
   1.343 +// 	}
   1.344 +// 
   1.345 +// 	if (memcmp(mp->sub_mb_type, ms->sub_mb_type, 8)){
   1.346 +// 		for (int i=0; i<4; i++){
   1.347 +// 			printf("%u, %u\t", mp->sub_mb_type[i], mp->sub_mb_type[i]);
   1.348 +// 			printf("\n");
   1.349 +// 		}
   1.350 +// 	}
   1.351 +// 
   1.352 +// 	if (memcmp(mp->mv_cache, ms->mv_cache, 320)){
   1.353 +// 		for (int k=0; k<2; k++){
   1.354 +// 			for (int i=0; i<5; i++){
   1.355 +// 				for (int j=0; j<8; j++){
   1.356 +// 					printf("%d, %d, %d, %d\t", mp->mv_cache[k][i*8+j][0], mp->mv_cache[k][i*8+j][1], ms->mv_cache[k][i*8+j][0], ms->mv_cache[k][i*8+j][1]);
   1.357 +// 				}
   1.358 +// 				printf("\n");
   1.359 +// 			}
   1.360 +// 		}
   1.361 +// 	}
   1.362 +// 
   1.363 +// 	if (memcmp(mp->ref_cache, ms->ref_cache, 80)){
   1.364 +// 		for (int k=0; k<2; k++){
   1.365 +// 			for (int i=0; i<5; i++){
   1.366 +// 				for (int j=0; j<8; j++){
   1.367 +// 					printf("%d, %d\t", mp->ref_cache[k][i*8+j], ms->ref_cache[k][i*8+j]);
   1.368 +// 				}
   1.369 +// 				printf("\n");
   1.370 +// 			}
   1.371 +// 		}
   1.372 +// 	}
   1.373 +// 
   1.374 +// 	printf("cbp %d, %d\n", mp->cbp, ms->cbp);
   1.375 +// 	for (int i=0; i<hc->mb_stride; i++){
   1.376 +//    		printf("%d, ", hc->cbp[i]); fflush(0);
   1.377 +//    	}
   1.378 +// 	printf("\n");
   1.379 +// 
   1.380 +// 	printf("mb_type %x, %x\n", mp->mb_type, ms->mb_type);
   1.381 +// 	printf("mb_type IS_INTRA %d, IS_INTRA16x16 %d, IS_DIRECT %d\n", IS_INTRA(ms->mb_type), IS_INTRA16x16(ms->mb_type), IS_DIRECT(ms->mb_type) );
   1.382 +// 	printf("left_type %d, %d\n", mp->left_type, ms->left_type);
   1.383 +// 	printf("top_type %d, %d\n", mp->top_type, ms->top_type);
   1.384 +// 	printf("qscale_mb_xy %d, %d\n", mp->qscale_mb_xy, ms->qscale_mb_xy);
   1.385 +// 	printf("qscale_left_mb_xy %d, %d\n", mp->qscale_left_mb_xy, ms->qscale_left_mb_xy);
   1.386 +// 	printf("qscale_top_mb_xy %d, %d\n", mp->qscale_top_mb_xy, ms->qscale_top_mb_xy);
   1.387 +// // 	for (int i=0; i<hc->mb_stride; i++){
   1.388 +// // 		printf("%d, ", qscale_table[0][i]); fflush(0);
   1.389 +// // 	}
   1.390 +// 
   1.391 +// 	if (memcmp(mp->mb, ms->mb, 768)){
   1.392 +// 		for (int i=0; i<16; i++){
   1.393 +// 			for (int j=0; j<16; j++){
   1.394 +// 				printf("%d, %d\t", mp->mb[j + i*16], ms->ref_cache[j + i*16]);
   1.395 +// 			}
   1.396 +// 			printf("\n");
   1.397 +// 		}
   1.398 +// 		for (int i=0; i<8; i++){
   1.399 +// 			for (int j=0; j<8; j++){
   1.400 +// 				printf("%d, %d\t", mp->mb[256 + j + i*8], ms->ref_cache[j + i*8]);
   1.401 +// 			}
   1.402 +// 			printf("\n");
   1.403 +// 		}
   1.404 +// 		for (int i=0; i<8; i++){
   1.405 +// 			for (int j=0; j<8; j++){
   1.406 +// 				printf("%d, %d\t", mp->mb[320+ j + i*8], ms->ref_cache[j + i*8]);
   1.407 +// 			}
   1.408 +// 			printf("\n");
   1.409 +// 		}
   1.410 +// 	}
   1.411 +// 
   1.412 +// 	if (memcmp(mp->bS, ms->bS, 32)){
   1.413 +// 		for (int k=0; k<2; k++){
   1.414 +// 			for (int i=0; i<4; i++){
   1.415 +// 				for (int j=0; j<4; j++){
   1.416 +// 					printf("%d, %d\t", mp->bS[k][i][j], mp->mv_cache[k][i][j]);
   1.417 +// 				}
   1.418 +// 				printf("\n");
   1.419 +// 			}
   1.420 +// 		}
   1.421 +// 	}
   1.422 +// 	if (memcmp(mp->edges, ms->edges, 4)){
   1.423 +// 		printf("edges %d, %d, %d, %d\n", mp->edges[0], ms->edges[0], mp->edges[1], ms->edges[1]);
   1.424 +// 		printf("deblock %d, %d\n", mp->deblock_mb, ms->deblock_mb);
   1.425 +// 	}
   1.426 +// 
   1.427 +// 	printf("dequant4_coeff_y %d, %d\n", mp->dequant4_coeff_y, ms->dequant4_coeff_y);
   1.428 +// 	printf("dequant4_coeff_cb %d, %d\n", mp->dequant4_coeff_cb, ms->dequant4_coeff_cb);
   1.429 +// 	printf("dequant4_coeff_cr %d, %d\n", mp->dequant4_coeff_cr, ms->dequant4_coeff_cr);
   1.430 +// }
   1.431 +// DECLARE_ALIGNED_16(H264Mb, tmp);
   1.432 +
   1.433 +
   1.434 +int main(unsigned long long id, unsigned long long argp){
   1.435 +	EDSlice_spu *s;
   1.436 +	H264Cabac_spu *hc = &hcabac;
   1.437 +	CABACContext *c = &cabac;
   1.438 +	H264spe *p = &spe;
   1.439 +	
   1.440 +	spu_write_out_mbox((unsigned) slice);
   1.441 +	spu_dma_get(p, (unsigned) argp, sizeof(H264spe), ED_spe); //ID_slice is used out of convienience
   1.442 +	wait_dma_id(ED_spe);
   1.443 +
   1.444 +	ff_init_cabac_states();
   1.445 +	init_cabac(p, hc);
   1.446 +	hc->blocking=0;
   1.447 +	for(;;){
   1.448 +		spu_read_in_mbox();
   1.449 +		s = &slice[0];
   1.450 +		reset_cabac_buffers();
   1.451 +		init_entropy_buf(hc, s);
   1.452 +
   1.453 +		if (hc->blocking) wait_dma_id(ED_get);
   1.454 +		//printf("framesize %d\n", s->byte_bufsize);fflush(0);
   1.455 + 		init_dequant_tables(s, hc);
   1.456 +		ff_init_cabac_decoder( c, s->bytestream_start, s->byte_bufsize );
   1.457 + 		ff_h264_init_cabac_states(s, c);
   1.458 +
   1.459 +		int mb_slot=0;
   1.460 + 		for(int j=0; j<hc->mb_height; j++){
   1.461 +			for(int i=0; i<hc->mb_width; i++){
   1.462 +				int eos,ret;
   1.463 +				H264Mb *m = &mb[mb_slot];
   1.464 +				m->mb_x=i;
   1.465 +				m->mb_y=j;
   1.466 +				s->m = m;
   1.467 +
   1.468 +				ret = ff_h264_decode_mb_cabac(hc, s, c);
   1.469 +
   1.470 +// 				spu_dma_get(&tmp, (unsigned) &s->mbs[j*hc->mb_width + i], sizeof(H264Mb), ED_get);
   1.471 +// 				wait_dma_id(ED_get);
   1.472 +// 				if (memcmp(&tmp, m, sizeof(H264Mb))){
   1.473 +// 					printf("coded pic num %d\n", s->coded_pic_num);
   1.474 +// 					printmbdiff(s, hc,&tmp, m);
   1.475 +// 					return 0;
   1.476 +// 				}
   1.477 +				//printf("qscale %d\n", m->qscale_mb_xy);
   1.478 +				if (!hc->blocking){
   1.479 +					if (mb_slot){
   1.480 +						spu_dma_put(m, (unsigned) &s->mbs[j*hc->mb_width + i], sizeof(H264Mb), ED_putmb1);
   1.481 +						wait_dma_id(ED_putmb0);
   1.482 +					}else {
   1.483 +						spu_dma_put(m, (unsigned) &s->mbs[j*hc->mb_width + i], sizeof(H264Mb), ED_putmb0);
   1.484 +						wait_dma_id(ED_putmb1);
   1.485 +					}
   1.486 +					mb_slot++; mb_slot%=2;
   1.487 +				}else {
   1.488 +					spu_dma_put(m, (unsigned) &s->mbs[j*hc->mb_width + i], sizeof(H264Mb), ED_putmb0);
   1.489 +					wait_dma_id(ED_putmb0);
   1.490 +				}
   1.491 +				
   1.492 +
   1.493 +				eos = get_cabac_terminate( c);
   1.494 +
   1.495 +				if( ret < 0) {
   1.496 +					fprintf(stderr, "error at %d bytecount\n", bytecount);
   1.497 +					return -1;
   1.498 +				}
   1.499 +			}
   1.500 +			update_entropy_buf(hc, s, j);
   1.501 +			if (hc->blocking){ wait_dma_id(ED_get); wait_dma_id(ED_put);}
   1.502 +		}
   1.503 +		wait_dma_id(ED_put);
   1.504 +		spu_write_out_mbox(1);
   1.505 +
   1.506 +	}
   1.507 +
   1.508 +	return 0;
   1.509 +
   1.510 +
   1.511 +}