/******************************************************************************
 * Fast DXT - a realtime DXT compression tool
 *
 * Author : Luc Renambot
 *
 * Copyright (C) 2007 Electronic Visualization Laboratory,
 * University of Illinois at Chicago
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above
 *     copyright notice, this list of conditions and the following disclaimer
 *     in the documentation and/or other materials provided with the distribution.
 *   * Neither the name of the University of Illinois at Chicago nor
 *     the names of its contributors may be used to endorse or promote
 *     products derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * Direct questions, comments etc about SAGE to http://www.evl.uic.edu/cavern/forum/
 *
 *****************************************************************************/

/*
	Code converted from asm to intrinsics from:

	Copyright (C) 2006 Id Software, Inc.
	Written by J.M.P. van Waveren
	This code is free software; you can redistribute it and/or
	modify it under the terms of the GNU Lesser General Public
	License as published by the Free Software Foundation; either
	version 2.1 of the License, or (at your option) any later version.
	This code is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	Lesser General Public License for more details.
*/
---|
| 53 | |
---|
| 54 | #include "dxt.h" |
---|
| 55 | |
---|
| 56 | #include <emmintrin.h> // sse2 |
---|
| 57 | |
---|
| 58 | |
---|
| 59 | void ExtractBlock_Intrinsics( const byte *inPtr, int width, byte *colorBlock ) |
---|
| 60 | { |
---|
| 61 | __m128i t0, t1, t2, t3; |
---|
| 62 | register int w = width << 2; // width*4 |
---|
| 63 | |
---|
| 64 | t0 = _mm_load_si128 ( (__m128i*) inPtr ); |
---|
| 65 | _mm_store_si128 ( (__m128i*) &colorBlock[0], t0 ); // copy first row, 16bytes |
---|
| 66 | |
---|
| 67 | t1 = _mm_load_si128 ( (__m128i*) (inPtr + w) ); |
---|
| 68 | _mm_store_si128 ( (__m128i*) &colorBlock[16], t1 ); // copy second row |
---|
| 69 | |
---|
| 70 | t2 = _mm_load_si128 ( (__m128i*) (inPtr + 2*w) ); |
---|
| 71 | _mm_store_si128 ( (__m128i*) &colorBlock[32], t2 ); // copy third row |
---|
| 72 | |
---|
| 73 | inPtr = inPtr + w; // add width, intead of *3 |
---|
| 74 | |
---|
| 75 | t3 = _mm_load_si128 ( (__m128i*) (inPtr + 2*w) ); |
---|
| 76 | _mm_store_si128 ( (__m128i*) &colorBlock[48], t3 ); // copy last row |
---|
| 77 | } |
---|
| 78 | |
---|
// Build the 8-bit immediate operand for SSE2 shuffle intrinsics
// (_mm_shuffle_epi32 / _mm_shufflelo_epi16): destination elements 0..3 are
// taken from source elements x, y, z, w respectively (each masked to 0..3).
#define R_SHUFFLE_D( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))

// Sixteen zero bytes; used to zero-extend packed bytes to 16-bit words via unpacklo.
ALIGN16( static byte SIMD_SSE2_byte_0[16] ) = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
---|
| 82 | |
---|
/*
 * Compute the min/max RGBA bounding-box colors of a 4x4 color block
 * (64 bytes, 16-byte aligned, as produced by ExtractBlock_Intrinsics) and
 * inset the box by 1/(2^INSET_SHIFT) of its extent to reduce quantization
 * error in the subsequent 565 encoding.
 *
 * NOTE(review): the final stores write a full 16 bytes through minColor and
 * maxColor, so both buffers must be at least 16 bytes and 16-byte aligned,
 * even though EmitColorIndices_Intrinsics only reads the low 4 bytes —
 * confirm with callers.
 */
void GetMinMaxColors_Intrinsics( const byte *colorBlock, byte *minColor, byte *maxColor )
{
	__m128i t0, t1, t3, t4, t6, t7;

	// get bounding box
	// ----------------

	// load the first row into both the running min (t0) and running max (t1)
	t0 = _mm_load_si128 ( (__m128i*) colorBlock );
	t1 = _mm_load_si128 ( (__m128i*) colorBlock );

	__m128i t16 = _mm_load_si128 ( (__m128i*) (colorBlock+16) );
	// Minimum of Packed Unsigned Byte Integers
	t0 = _mm_min_epu8 ( t0, t16);
	// Maximum of Packed Unsigned Byte Integers
	t1 = _mm_max_epu8 ( t1, t16);

	__m128i t32 = _mm_load_si128 ( (__m128i*) (colorBlock+32) );
	t0 = _mm_min_epu8 ( t0, t32);
	t1 = _mm_max_epu8 ( t1, t32);

	__m128i t48 = _mm_load_si128 ( (__m128i*) (colorBlock+48) );
	t0 = _mm_min_epu8 ( t0, t48);
	t1 = _mm_max_epu8 ( t1, t48);

	// Horizontal reduction step 1: fold the high 8 bytes (pixels 2-3 of the
	// row accumulator) onto the low 8 bytes.
	// Shuffle Packed Doublewords
	t3 = _mm_shuffle_epi32( t0, R_SHUFFLE_D( 2, 3, 2, 3 ) );
	t4 = _mm_shuffle_epi32( t1, R_SHUFFLE_D( 2, 3, 2, 3 ) );

	t0 = _mm_min_epu8 ( t0, t3);
	t1 = _mm_max_epu8 ( t1, t4);

	// Horizontal reduction step 2: fold the second pixel (bytes 4-7) onto the
	// first, leaving the per-channel min/max in the low 4 bytes.
	// Shuffle Packed Low Words
	t6 = _mm_shufflelo_epi16( t0, R_SHUFFLE_D( 2, 3, 2, 3 ) );
	t7 = _mm_shufflelo_epi16( t1, R_SHUFFLE_D( 2, 3, 2, 3 ) );

	t0 = _mm_min_epu8 ( t0, t6);
	t1 = _mm_max_epu8 ( t1, t7);

	// inset the bounding box
	// ----------------------

	// Widen bytes to 16-bit words (zero-extend) so the inset arithmetic
	// cannot wrap.
	// Unpack Low Data
	//__m128i t66 = _mm_set1_epi8( 0 );
	__m128i t66 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_byte_0 );
	t0 = _mm_unpacklo_epi8(t0, t66);
	t1 = _mm_unpacklo_epi8(t1, t66);

	// copy (movdqa)
	//__m128i t2 = _mm_load_si128 ( &t1 );
	__m128i t2 = t1;

	// t2 = (max - min): the box extent per channel
	// Subtract Packed Integers
	t2 = _mm_sub_epi16(t2, t0);

	// inset amount = extent >> INSET_SHIFT
	// Shift Packed Data Right Logical
	t2 = _mm_srli_epi16(t2, INSET_SHIFT);

	// min += inset
	// Add Packed Integers
	t0 = _mm_add_epi16(t0, t2);

	// max -= inset
	t1 = _mm_sub_epi16(t1, t2);

	// Narrow the words back to bytes.
	// Pack with Unsigned Saturation
	t0 = _mm_packus_epi16(t0, t0);
	t1 = _mm_packus_epi16(t1, t1);

	// store bounding box extents
	// --------------------------
	_mm_store_si128 ( (__m128i*) minColor, t0 );
	_mm_store_si128 ( (__m128i*) maxColor, t1 );
}
---|
| 155 | |
---|
| 156 | |
---|
// Constant tables for EmitColorIndices_Intrinsics.
ALIGN16( static word SIMD_SSE2_word_0[8] ) = { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 };	// eight 0 words
ALIGN16( static word SIMD_SSE2_word_1[8] ) = { 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001 };	// eight 1 words
ALIGN16( static word SIMD_SSE2_word_2[8] ) = { 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002 };	// eight 2 words
// Fixed-point reciprocal of 3: (x * ((1<<16)/3+1)) >> 16 ~= x/3 via pmulhw.
ALIGN16( static word SIMD_SSE2_word_div_by_3[8] ) = { (1<<16)/3+1, (1<<16)/3+1, (1<<16)/3+1, (1<<16)/3+1, (1<<16)/3+1, (1<<16)/3+1, (1<<16)/3+1, (1<<16)/3+1 };
// Masks two RGBA pixels down to 5:6:5 color precision (alpha zeroed).
ALIGN16( static byte SIMD_SSE2_byte_colorMask[16] ) = { C565_5_MASK, C565_6_MASK, C565_5_MASK, 0x00, 0x00, 0x00, 0x00, 0x00, C565_5_MASK, C565_6_MASK, C565_5_MASK, 0x00, 0x00, 0x00, 0x00, 0x00 };
---|
| 162 | |
---|
/*
 * Emit the 32-bit DXT1 color-index word for one 4x4 pixel block.
 *
 * Builds the four-color palette from the (already 565-quantized) endpoints:
 *   color0 = maxColor, color1 = minColor,
 *   color2 = (2*color0 + color1) / 3, color3 = (color0 + 2*color1) / 3,
 * then, for each of the 16 pixels, picks the palette entry with the smallest
 * sum-of-absolute-differences and packs the resulting 2-bit indices.
 *
 * colorBlock - 64-byte, 16-byte-aligned 4x4 RGBA block.
 * minColor   - inset min endpoint; only the low 4 bytes are read.
 * maxColor   - inset max endpoint; only the low 4 bytes are read.
 * outData    - output cursor (by reference): 4 bytes are written and the
 *              pointer is advanced by 4.
 */
void EmitColorIndices_Intrinsics( const byte *colorBlock, const byte *minColor, const byte *maxColor, byte *&outData )
{
	// Palette colors replicated across a full XMM register, plus the running
	// packed index accumulator.
	ALIGN16( byte color0[16] );
	ALIGN16( byte color1[16] );
	ALIGN16( byte color2[16] );
	ALIGN16( byte color3[16] );
	ALIGN16( byte result[16] );

	// mov esi, maxColor
	// mov edi, minColor

	__m128i t0, t1, t2, t3, t4, t5, t6, t7;

	t7 = _mm_setzero_si128();
	//t7 = _mm_xor_si128(t7, t7);
	_mm_store_si128 ( (__m128i*) &result, t7 );

	// Quantize maxColor to 565 precision, then replicate the truncated bits
	// into the low end of each channel so the palette color matches what a
	// decoder reconstructs from the 565 value.
	//t0 = _mm_load_si128 ( (__m128i*) maxColor );
	t0 = _mm_cvtsi32_si128( *(int*)maxColor);

	// Bitwise AND
	__m128i tt = _mm_load_si128 ( (__m128i*) SIMD_SSE2_byte_colorMask );
	t0 = _mm_and_si128(t0, tt);

	t0 = _mm_unpacklo_epi8(t0, t7);

	t4 = _mm_shufflelo_epi16( t0, R_SHUFFLE_D( 0, 3, 2, 3 ));
	t5 = _mm_shufflelo_epi16( t0, R_SHUFFLE_D( 3, 1, 3, 3 ));

	t4 = _mm_srli_epi16(t4, 5);	// red: top 3 bits into the low end
	t5 = _mm_srli_epi16(t5, 6);	// green: top 2 bits into the low end

	// Bitwise Logical OR
	t0 = _mm_or_si128(t0, t4);
	t0 = _mm_or_si128(t0, t5);		// t0 contains color0 in 565

	// Same quantization for minColor.
	//t1 = _mm_load_si128 ( (__m128i*) minColor );
	t1 = _mm_cvtsi32_si128( *(int*)minColor);

	t1 = _mm_and_si128(t1, tt);

	t1 = _mm_unpacklo_epi8(t1, t7);

	t4 = _mm_shufflelo_epi16( t1, R_SHUFFLE_D( 0, 3, 2, 3 ));
	t5 = _mm_shufflelo_epi16( t1, R_SHUFFLE_D( 3, 1, 3, 3 ));

	t4 = _mm_srli_epi16(t4, 5);
	t5 = _mm_srli_epi16(t5, 6);

	t1 = _mm_or_si128(t1, t4);
	t1 = _mm_or_si128(t1, t5);		// t1 contains color1 in 565

	// color0: pack back to bytes and replicate across the register.
	t2 = t0;

	t2 = _mm_packus_epi16(t2, t7);

	t2 = _mm_shuffle_epi32( t2, R_SHUFFLE_D( 0, 1, 0, 1 ));

	_mm_store_si128 ( (__m128i*) &color0, t2 );

	// color2 = (2*color0 + color1) / 3, division done as a fixed-point
	// multiply-high by (1<<16)/3+1.
	t6 = t0;
	t6 = _mm_add_epi16(t6, t0);
	t6 = _mm_add_epi16(t6, t1);

	// Multiply Packed Signed Integers and Store High Result
	__m128i tw3 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_div_by_3 );
	t6 = _mm_mulhi_epi16(t6, tw3);
	t6 = _mm_packus_epi16(t6, t7);

	t6 = _mm_shuffle_epi32( t6, R_SHUFFLE_D( 0, 1, 0, 1 ));

	_mm_store_si128 ( (__m128i*) &color2, t6 );

	// color1: pack and replicate.
	t3 = t1;
	t3 = _mm_packus_epi16(t3, t7);
	t3 = _mm_shuffle_epi32( t3, R_SHUFFLE_D( 0, 1, 0, 1 ));

	_mm_store_si128 ( (__m128i*) &color1, t3 );

	// color3 = (color0 + 2*color1) / 3.
	t1 = _mm_add_epi16(t1, t1);
	t0 = _mm_add_epi16(t0, t1);

	t0 = _mm_mulhi_epi16(t0, tw3);
	t0 = _mm_packus_epi16(t0, t7);

	t0 = _mm_shuffle_epi32( t0, R_SHUFFLE_D( 0, 1, 0, 1 ));
	_mm_store_si128 ( (__m128i*) &color3, t0 );

	__m128i w0 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_0);
	__m128i w1 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_1);
	__m128i w2 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_2);

	// Process the block in two 8-pixel halves, high half first (x = 32, then
	// x = 0), shifting earlier results up as later ones are merged in.
	// mov eax, 32
	// mov esi, colorBlock
	int x = 32;
	//const byte *c = colorBlock;
	while (x >= 0)
	{
		// Load pixel pairs; the dword shuffle (0,2,1,3) spreads the two
		// pixels into separate 64-bit halves so psadbw produces one distance
		// per pixel.
		t3 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+0));
		t3 = _mm_shuffle_epi32( t3, R_SHUFFLE_D( 0, 2, 1, 3 ));

		t5 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+8));
		t5 = _mm_shuffle_epi32( t5, R_SHUFFLE_D( 0, 2, 1, 3 ));

		// Distances of pixels 0-3 to color0.
		t0 = t3;
		t6 = t5;
		// Compute Sum of Absolute Difference
		__m128i c0 = _mm_load_si128 ( (__m128i*) color0 );
		t0 = _mm_sad_epu8(t0, c0);
		t6 = _mm_sad_epu8(t6, c0);
		// Pack with Signed Saturation
		t0 = _mm_packs_epi32 (t0, t6);

		// Distances to color1.
		t1 = t3;
		t6 = t5;
		__m128i c1 = _mm_load_si128 ( (__m128i*) color1 );
		t1 = _mm_sad_epu8(t1, c1);
		t6 = _mm_sad_epu8(t6, c1);
		t1 = _mm_packs_epi32 (t1, t6);

		// Distances to color2.
		t2 = t3;
		t6 = t5;
		__m128i c2 = _mm_load_si128 ( (__m128i*) color2 );
		t2 = _mm_sad_epu8(t2, c2);
		t6 = _mm_sad_epu8(t6, c2);
		t2 = _mm_packs_epi32 (t2, t6);

		// Distances to color3.
		__m128i c3 = _mm_load_si128 ( (__m128i*) color3 );
		t3 = _mm_sad_epu8(t3, c3);
		t5 = _mm_sad_epu8(t5, c3);
		t3 = _mm_packs_epi32 (t3, t5);

		// Second group of four pixels; append their distances so t0..t3 each
		// hold eight 16-bit distances (d0..d3).
		t4 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+16));
		t4 = _mm_shuffle_epi32( t4, R_SHUFFLE_D( 0, 2, 1, 3 ));

		t5 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+24));
		t5 = _mm_shuffle_epi32( t5, R_SHUFFLE_D( 0, 2, 1, 3 ));

		t6 = t4;
		t7 = t5;
		t6 = _mm_sad_epu8(t6, c0);
		t7 = _mm_sad_epu8(t7, c0);
		t6 = _mm_packs_epi32 (t6, t7);
		t0 = _mm_packs_epi32 (t0, t6);	// d0

		t6 = t4;
		t7 = t5;
		t6 = _mm_sad_epu8(t6, c1);
		t7 = _mm_sad_epu8(t7, c1);
		t6 = _mm_packs_epi32 (t6, t7);
		t1 = _mm_packs_epi32 (t1, t6);	// d1

		t6 = t4;
		t7 = t5;
		t6 = _mm_sad_epu8(t6, c2);
		t7 = _mm_sad_epu8(t7, c2);
		t6 = _mm_packs_epi32 (t6, t7);
		t2 = _mm_packs_epi32 (t2, t6);	// d2

		t4 = _mm_sad_epu8(t4, c3);
		t5 = _mm_sad_epu8(t5, c3);
		t4 = _mm_packs_epi32 (t4, t5);
		t3 = _mm_packs_epi32 (t3, t4);	// d3

		// Merge this iteration's indices into the accumulator, shifting the
		// previous iteration's contribution up.
		t7 = _mm_load_si128 ( (__m128i*) result );

		t7 = _mm_slli_epi32( t7, 16);

		// Derive each pixel's 2-bit palette index from distance comparisons.
		t4 = t0;
		t5 = t1;
		// Compare Packed Signed Integers for Greater Than
		t0 = _mm_cmpgt_epi16(t0, t3);	// b0
		t1 = _mm_cmpgt_epi16(t1, t2);	// b1
		t4 = _mm_cmpgt_epi16(t4, t2);	// b2
		t5 = _mm_cmpgt_epi16(t5, t3);	// b3
		t2 = _mm_cmpgt_epi16(t2, t3);	// b4

		t4 = _mm_and_si128(t4, t1);	// x0
		t5 = _mm_and_si128(t5, t0);	// x1
		t2 = _mm_and_si128(t2, t0);	// x2

		t4 = _mm_or_si128(t4, t5);
		t2 = _mm_and_si128(t2, w1);
		t4 = _mm_and_si128(t4, w2);
		t2 = _mm_or_si128(t2, t4);

		t5 = _mm_shuffle_epi32( t2, R_SHUFFLE_D( 2, 3, 0, 1 ));

		// Widen the 16-bit indices to 32-bit lanes and interleave the two
		// halves into the accumulator.
		// Unpack Low Data
		t2 = _mm_unpacklo_epi16 ( t2, w0);
		t5 = _mm_unpacklo_epi16 ( t5, w0);

		//t5 = _mm_slli_si128 ( t5, 8);
		t5 = _mm_slli_epi32( t5, 8);

		t7 = _mm_or_si128(t7, t5);
		t7 = _mm_or_si128(t7, t2);

		_mm_store_si128 ( (__m128i*) &result, t7 );

		x -=32;
	}

	// Fold the four accumulator dwords into one: each holds 2-bit indices at
	// a different bit position after these shifts.
	t4 = _mm_shuffle_epi32( t7, R_SHUFFLE_D( 1, 2, 3, 0 ));
	t5 = _mm_shuffle_epi32( t7, R_SHUFFLE_D( 2, 3, 0, 1 ));
	t6 = _mm_shuffle_epi32( t7, R_SHUFFLE_D( 3, 0, 1, 2 ));

	t4 = _mm_slli_epi32 ( t4, 2);
	t5 = _mm_slli_epi32 ( t5, 4);
	t6 = _mm_slli_epi32 ( t6, 6);

	t7 = _mm_or_si128(t7, t4);
	t7 = _mm_or_si128(t7, t5);
	t7 = _mm_or_si128(t7, t6);

	//_mm_store_si128 ( (__m128i*) outData, t7 );

	// Write only the low 32 bits; memcpy avoids alignment assumptions on
	// outData.
	int r = _mm_cvtsi128_si32 (t7);
	memcpy(outData, &r, 4);   // Anything better ?

	outData += 4;
}
---|
| 392 | |
---|
| 393 | |
---|
| 394 | |
---|
// Constant tables for the SSE2 alpha-index emitter. NOTE(review): they are
// referenced only by the commented-out inline asm in
// EmitAlphaIndices_Intrinsics below, so they are currently unused.
ALIGN16( static byte SIMD_SSE2_byte_1[16] ) = { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 };	// sixteen 1 bytes
ALIGN16( static byte SIMD_SSE2_byte_2[16] ) = { 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02 };	// sixteen 2 bytes
ALIGN16( static byte SIMD_SSE2_byte_7[16] ) = { 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07 };	// sixteen 7 bytes (3-bit index mask)
// Fixed-point reciprocals: (x * ((1<<16)/N+1)) >> 16 ~= x/N via pmulhw.
ALIGN16( static word SIMD_SSE2_word_div_by_7[8] ) = { (1<<16)/7+1, (1<<16)/7+1, (1<<16)/7+1, (1<<16)/7+1, (1<<16)/7+1, (1<<16)/7+1, (1<<16)/7+1, (1<<16)/7+1 };
ALIGN16( static word SIMD_SSE2_word_div_by_14[8] ) = { (1<<16)/14+1, (1<<16)/14+1, (1<<16)/14+1, (1<<16)/14+1, (1<<16)/14+1, (1<<16)/14+1, (1<<16)/14+1, (1<<16)/14+1 };
// Interpolation weight pairs for the 8-entry DXT5 alpha palette.
ALIGN16( static word SIMD_SSE2_word_scale66554400[8] ) = { 6, 6, 5, 5, 4, 4, 0, 0 };
ALIGN16( static word SIMD_SSE2_word_scale11223300[8] ) = { 1, 1, 2, 2, 3, 3, 0, 0 };
// Bit-position masks for packing eight 3-bit alpha indices per qword.
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask0[4] ) = { 7<<0, 0, 7<<0, 0 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask1[4] ) = { 7<<3, 0, 7<<3, 0 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask2[4] ) = { 7<<6, 0, 7<<6, 0 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask3[4] ) = { 7<<9, 0, 7<<9, 0 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask4[4] ) = { 7<<12, 0, 7<<12, 0 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask5[4] ) = { 7<<15, 0, 7<<15, 0 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask6[4] ) = { 7<<18, 0, 7<<18, 0 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask7[4] ) = { 7<<21, 0, 7<<21, 0 };
---|
| 410 | |
---|
| 411 | |
---|
/*
 * WARNING(review): this function is an UNIMPLEMENTED STUB. The entire body —
 * MSVC x86 inline assembly ported from the Id Software DXT5 alpha-index
 * emitter — is commented out, including the `outData += 6;` cursor advance.
 * Calling it writes nothing and does not advance outData, so any DXT5 path
 * relying on it produces no alpha indices. The asm is kept below as the
 * reference for a future intrinsics port (compare EmitColorIndices_Intrinsics
 * above, which was converted the same way).
 *
 * Intended contract (per the asm): compute 3-bit DXT5 alpha indices for the
 * 16 alpha bytes of colorBlock against the 8-entry palette interpolated from
 * minAlpha/maxAlpha, write 6 bytes to outData, and advance outData by 6.
 *
 * The unused parameters may also trigger compiler warnings until this is
 * implemented.
 */
void EmitAlphaIndices_Intrinsics( const byte *colorBlock, const byte minAlpha, const byte maxAlpha, byte *&outData)
{
/*
	__asm {
		mov esi, colorBlock
		movdqa xmm0, [esi+ 0]
		movdqa xmm5, [esi+16]
		psrld xmm0, 24
		psrld xmm5, 24
		packuswb xmm0, xmm5

		movdqa xmm6, [esi+32]
		movdqa xmm4, [esi+48]
		psrld xmm6, 24
		psrld xmm4, 24
		packuswb xmm6, xmm4

		movzx ecx, maxAlpha
		movd xmm5, ecx
		pshuflw xmm5, xmm5, R_SHUFFLE_D( 0, 0, 0, 0 )
		pshufd xmm5, xmm5, R_SHUFFLE_D( 0, 0, 0, 0 )
		movdqa xmm7, xmm5

		movzx edx, minAlpha
		movd xmm2, edx
		pshuflw xmm2, xmm2, R_SHUFFLE_D( 0, 0, 0, 0 )
		pshufd xmm2, xmm2, R_SHUFFLE_D( 0, 0, 0, 0 )
		movdqa xmm3, xmm2

		movdqa xmm4, xmm5
		psubw xmm4, xmm2
		pmulhw xmm4, SIMD_SSE2_word_div_by_14 // * ( ( 1 << 16 ) / 14 + 1 ) ) >> 16
		movdqa xmm1, xmm2
		paddw xmm1, xmm4
		packuswb xmm1, xmm1 // ab1

		pmullw xmm5, SIMD_SSE2_word_scale66554400
		pmullw xmm7, SIMD_SSE2_word_scale11223300
		pmullw xmm2, SIMD_SSE2_word_scale11223300
		pmullw xmm3, SIMD_SSE2_word_scale66554400
		paddw xmm5, xmm2
		paddw xmm7, xmm3
		pmulhw xmm5, SIMD_SSE2_word_div_by_7 // * ( ( 1 << 16 ) / 7 + 1 ) ) >> 16
		pmulhw xmm7, SIMD_SSE2_word_div_by_7 // * ( ( 1 << 16 ) / 7 + 1 ) ) >> 16
		paddw xmm5, xmm4
		paddw xmm7, xmm4

		pshufd xmm2, xmm5, R_SHUFFLE_D( 0, 0, 0, 0 )
		pshufd xmm3, xmm5, R_SHUFFLE_D( 1, 1, 1, 1 )
		pshufd xmm4, xmm5, R_SHUFFLE_D( 2, 2, 2, 2 )
		packuswb xmm2, xmm2 // ab2
		packuswb xmm3, xmm3 // ab3
		packuswb xmm4, xmm4 // ab4

		packuswb xmm0, xmm6 // alpha values

		pshufd xmm5, xmm7, R_SHUFFLE_D( 2, 2, 2, 2 )
		pshufd xmm6, xmm7, R_SHUFFLE_D( 1, 1, 1, 1 )
		pshufd xmm7, xmm7, R_SHUFFLE_D( 0, 0, 0, 0 )
		packuswb xmm5, xmm5 // ab5
		packuswb xmm6, xmm6 // ab6
		packuswb xmm7, xmm7 // ab7

		pminub xmm1, xmm0
		pminub xmm2, xmm0
		pminub xmm3, xmm0
		pcmpeqb xmm1, xmm0
		pcmpeqb xmm2, xmm0
		pcmpeqb xmm3, xmm0
		pminub xmm4, xmm0
		pminub xmm5, xmm0
		pminub xmm6, xmm0
		pminub xmm7, xmm0
		pcmpeqb xmm4, xmm0
		pcmpeqb xmm5, xmm0
		pcmpeqb xmm6, xmm0
		pcmpeqb xmm7, xmm0
		pand xmm1, SIMD_SSE2_byte_1
		pand xmm2, SIMD_SSE2_byte_1
		pand xmm3, SIMD_SSE2_byte_1
		pand xmm4, SIMD_SSE2_byte_1
		pand xmm5, SIMD_SSE2_byte_1
		pand xmm6, SIMD_SSE2_byte_1
		pand xmm7, SIMD_SSE2_byte_1
		movdqa xmm0, SIMD_SSE2_byte_1
		paddusb xmm0, xmm1
		paddusb xmm2, xmm3
		paddusb xmm4, xmm5
		paddusb xmm6, xmm7
		paddusb xmm0, xmm2
		paddusb xmm4, xmm6
		paddusb xmm0, xmm4
		pand xmm0, SIMD_SSE2_byte_7
		movdqa xmm1, SIMD_SSE2_byte_2
		pcmpgtb xmm1, xmm0
		pand xmm1, SIMD_SSE2_byte_1
		pxor xmm0, xmm1
		movdqa xmm1, xmm0
		movdqa xmm2, xmm0
		movdqa xmm3, xmm0
		movdqa xmm4, xmm0
		movdqa xmm5, xmm0
		movdqa xmm6, xmm0
		movdqa xmm7, xmm0
		psrlq xmm1, 8- 3
		psrlq xmm2, 16- 6
		psrlq xmm3, 24- 9

		psrlq xmm4, 32-12
		psrlq xmm5, 40-15
		psrlq xmm6, 48-18
		psrlq xmm7, 56-21
		pand xmm0, SIMD_SSE2_dword_alpha_bit_mask0
		pand xmm1, SIMD_SSE2_dword_alpha_bit_mask1
		pand xmm2, SIMD_SSE2_dword_alpha_bit_mask2
		pand xmm3, SIMD_SSE2_dword_alpha_bit_mask3
		pand xmm4, SIMD_SSE2_dword_alpha_bit_mask4
		pand xmm5, SIMD_SSE2_dword_alpha_bit_mask5
		pand xmm6, SIMD_SSE2_dword_alpha_bit_mask6
		pand xmm7, SIMD_SSE2_dword_alpha_bit_mask7
		por xmm0, xmm1
		por xmm2, xmm3
		por xmm4, xmm5
		por xmm6, xmm7
		por xmm0, xmm2
		por xmm4, xmm6
		por xmm0, xmm4
		mov esi, outData
		movd [esi+0], xmm0
		pshufd xmm1, xmm0, R_SHUFFLE_D( 2, 3, 0, 1 )
		movd [esi+3], xmm1
	}
	outData += 6;
*/
}
---|
| 547 | |
---|