source: trunk/src/testing/app/ishare/FastDXT/intrinsic.cpp @ 4

Revision 4, 17.7 KB checked in by ajaworski, 13 years ago

Added modified SAGE sources

/******************************************************************************
 * Fast DXT - a realtime DXT compression tool
 *
 * Author : Luc Renambot
 *
 * Copyright (C) 2007 Electronic Visualization Laboratory,
 * University of Illinois at Chicago
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above
 *    copyright notice, this list of conditions and the following disclaimer
 *    in the documentation and/or other materials provided with the distribution.
 *  * Neither the name of the University of Illinois at Chicago nor
 *    the names of its contributors may be used to endorse or promote
 *    products derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * Direct questions, comments etc about SAGE to http://www.evl.uic.edu/cavern/forum/
 *
 *****************************************************************************/

/*
        Code converted from asm to intrinsics, from:

                Copyright (C) 2006 Id Software, Inc.
                Written by J.M.P. van Waveren
                This code is free software; you can redistribute it and/or
                modify it under the terms of the GNU Lesser General Public
                License as published by the Free Software Foundation; either
                version 2.1 of the License, or (at your option) any later version.
                This code is distributed in the hope that it will be useful,
                but WITHOUT ANY WARRANTY; without even the implied warranty of
                MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
                Lesser General Public License for more details.
*/

#include "dxt.h"

#include <emmintrin.h>  // sse2

void ExtractBlock_Intrinsics( const byte *inPtr, int width, byte *colorBlock )
{
        __m128i t0, t1, t2, t3;
        register int w = width << 2;  // width*4 = row stride in bytes (RGBA)

        t0 = _mm_load_si128 ( (__m128i*) inPtr );
        _mm_store_si128 ( (__m128i*) &colorBlock[0], t0 );   // copy first row, 16 bytes

        t1 = _mm_load_si128 ( (__m128i*) (inPtr + w) );
        _mm_store_si128 ( (__m128i*) &colorBlock[16], t1 );   // copy second row

        t2 = _mm_load_si128 ( (__m128i*) (inPtr + 2*w) );
        _mm_store_si128 ( (__m128i*) &colorBlock[32], t2 );   // copy third row

        inPtr = inPtr + w;     // advance one row, instead of multiplying by 3

        t3 = _mm_load_si128 ( (__m128i*) (inPtr + 2*w) );
        _mm_store_si128 ( (__m128i*) &colorBlock[48], t3 );   // copy last row
}
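/*
  Illustrative sketch only (not part of the original FastDXT driver): a caller
  walks the RGBA image in 4x4 pixel tiles and passes each tile's top-left pixel
  to ExtractBlock_Intrinsics.  The names image, w, h and block are hypothetical,
  and the image base is assumed 16-byte aligned with w a multiple of 4, since
  _mm_load_si128 is an aligned load.

        ALIGN16( byte block[64] );
        for ( int y = 0; y < h; y += 4 ) {
                for ( int x = 0; x < w; x += 4 ) {
                        ExtractBlock_Intrinsics( image + (y*w + x)*4, w, block );
                        // ... compress the 64-byte block ...
                }
        }
*/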

#define R_SHUFFLE_D( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
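// For example, R_SHUFFLE_D( 2, 3, 2, 3 ) packs the four 2-bit selectors into
// the immediate 0xEE, so _mm_shuffle_epi32 produces elements { 2, 3, 2, 3 } of
// its source; the same encoding is used for _mm_shufflelo_epi16 below.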

ALIGN16( static byte SIMD_SSE2_byte_0[16] ) = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };

void GetMinMaxColors_Intrinsics( const byte *colorBlock, byte *minColor, byte *maxColor )
{
    __m128i t0, t1, t3, t4, t6, t7;

    // get bounding box
    // ----------------

    // load the first row
    t0 = _mm_load_si128 ( (__m128i*) colorBlock );
    t1 = _mm_load_si128 ( (__m128i*) colorBlock );

    __m128i t16 = _mm_load_si128 ( (__m128i*) (colorBlock+16) );
    // Minimum of Packed Unsigned Byte Integers
    t0 = _mm_min_epu8 ( t0, t16);
    // Maximum of Packed Unsigned Byte Integers
    t1 = _mm_max_epu8 ( t1, t16);

    __m128i t32 = _mm_load_si128 ( (__m128i*) (colorBlock+32) );
    t0 = _mm_min_epu8 ( t0, t32);
    t1 = _mm_max_epu8 ( t1, t32);

    __m128i t48 = _mm_load_si128 ( (__m128i*) (colorBlock+48) );
    t0 = _mm_min_epu8 ( t0, t48);
    t1 = _mm_max_epu8 ( t1, t48);

    // Shuffle Packed Doublewords
    t3 = _mm_shuffle_epi32( t0, R_SHUFFLE_D( 2, 3, 2, 3 ) );
    t4 = _mm_shuffle_epi32( t1, R_SHUFFLE_D( 2, 3, 2, 3 ) );

    t0 = _mm_min_epu8 ( t0, t3);
    t1 = _mm_max_epu8 ( t1, t4);

    // Shuffle Packed Low Words
    t6 = _mm_shufflelo_epi16( t0, R_SHUFFLE_D( 2, 3, 2, 3 ) );
    t7 = _mm_shufflelo_epi16( t1, R_SHUFFLE_D( 2, 3, 2, 3 ) );

    t0 = _mm_min_epu8 ( t0, t6);
    t1 = _mm_max_epu8 ( t1, t7);

    // inset the bounding box
    // ----------------------

    // Unpack Low Data
    //__m128i t66 = _mm_set1_epi8( 0 );
    __m128i t66 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_byte_0 );
    t0 = _mm_unpacklo_epi8(t0, t66);
    t1 = _mm_unpacklo_epi8(t1, t66);

    // copy (movdqa)
    //__m128i t2 = _mm_load_si128 ( &t1 );
    __m128i t2 = t1;

    // Subtract Packed Integers
    t2 = _mm_sub_epi16(t2, t0);

    // Shift Packed Data Right Logical
    t2 = _mm_srli_epi16(t2, INSET_SHIFT);

    // Add Packed Integers
    t0 = _mm_add_epi16(t0, t2);

    t1 = _mm_sub_epi16(t1, t2);

    // Pack with Unsigned Saturation
    t0 = _mm_packus_epi16(t0, t0);
    t1 = _mm_packus_epi16(t1, t1);

    // store bounding box extents
    // --------------------------
    _mm_store_si128 ( (__m128i*) minColor, t0 );
    _mm_store_si128 ( (__m128i*) maxColor, t1 );
}
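/*
  What the "inset the bounding box" step above computes, written out per
  channel in scalar form for reference (INSET_SHIFT comes from dxt.h):

      inset = ( max - min ) >> INSET_SHIFT;
      min  += inset;
      max  -= inset;

  i.e. the box is shrunk slightly so the endpoints sit inside the cluster of
  block colors, as in van Waveren's real-time DXT compression scheme.
*/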

ALIGN16( static word SIMD_SSE2_word_0[8] ) = { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 };
ALIGN16( static word SIMD_SSE2_word_1[8] ) = { 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001 };
ALIGN16( static word SIMD_SSE2_word_2[8] ) = { 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002 };
ALIGN16( static word SIMD_SSE2_word_div_by_3[8] ) = { (1<<16)/3+1, (1<<16)/3+1, (1<<16)/3+1, (1<<16)/3+1, (1<<16)/3+1, (1<<16)/3+1, (1<<16)/3+1, (1<<16)/3+1 };
ALIGN16( static byte SIMD_SSE2_byte_colorMask[16] ) = { C565_5_MASK, C565_6_MASK, C565_5_MASK, 0x00, 0x00, 0x00, 0x00, 0x00, C565_5_MASK, C565_6_MASK, C565_5_MASK, 0x00, 0x00, 0x00, 0x00, 0x00 };
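// Note: SIMD_SSE2_word_div_by_3 holds (1<<16)/3+1 in every 16-bit lane, so a
// pmulhw (_mm_mulhi_epi16) against it keeps the high 16 bits of the product,
// which amounts to an integer division by 3.  EmitColorIndices uses it below
// to build the two interpolated colors (2*c0+c1)/3 and (c0+2*c1)/3.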

void EmitColorIndices_Intrinsics( const byte *colorBlock, const byte *minColor, const byte *maxColor, byte *&outData )
{
        ALIGN16( byte color0[16] );
        ALIGN16( byte color1[16] );
        ALIGN16( byte color2[16] );
        ALIGN16( byte color3[16] );
        ALIGN16( byte result[16] );

        // mov esi, maxColor
        // mov edi, minColor

        __m128i t0, t1, t2, t3, t4, t5, t6, t7;

        t7 = _mm_setzero_si128();
        //t7 = _mm_xor_si128(t7, t7);
        _mm_store_si128 ( (__m128i*) &result, t7 );

        //t0 = _mm_load_si128 ( (__m128i*)  maxColor );
        t0 = _mm_cvtsi32_si128( *(int*)maxColor);

        // Bitwise AND
        __m128i tt = _mm_load_si128 ( (__m128i*) SIMD_SSE2_byte_colorMask );
        t0 = _mm_and_si128(t0, tt);

        t0 = _mm_unpacklo_epi8(t0, t7);

        t4 = _mm_shufflelo_epi16( t0, R_SHUFFLE_D( 0, 3, 2, 3 ));
        t5 = _mm_shufflelo_epi16( t0, R_SHUFFLE_D( 3, 1, 3, 3 ));

        t4 = _mm_srli_epi16(t4, 5);
        t5 = _mm_srli_epi16(t5, 6);

        // Bitwise Logical OR
        t0 = _mm_or_si128(t0, t4);
        t0 = _mm_or_si128(t0, t5);   // t0 contains color0 in 565

        //t1 = _mm_load_si128 ( (__m128i*)  minColor );
        t1 = _mm_cvtsi32_si128( *(int*)minColor);

        t1 = _mm_and_si128(t1, tt);

        t1 = _mm_unpacklo_epi8(t1, t7);

        t4 = _mm_shufflelo_epi16( t1, R_SHUFFLE_D( 0, 3, 2, 3 ));
        t5 = _mm_shufflelo_epi16( t1, R_SHUFFLE_D( 3, 1, 3, 3 ));

        t4 = _mm_srli_epi16(t4, 5);
        t5 = _mm_srli_epi16(t5, 6);

        t1 = _mm_or_si128(t1, t4);
        t1 = _mm_or_si128(t1, t5);  // t1 contains color1 in 565

        t2 = t0;

        t2 = _mm_packus_epi16(t2, t7);

        t2 = _mm_shuffle_epi32( t2, R_SHUFFLE_D( 0, 1, 0, 1 ));

        _mm_store_si128 ( (__m128i*) &color0, t2 );

        t6 = t0;
        t6 = _mm_add_epi16(t6, t0);
        t6 = _mm_add_epi16(t6, t1);

        // Multiply Packed Signed Integers and Store High Result
        __m128i tw3 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_div_by_3 );
        t6 = _mm_mulhi_epi16(t6, tw3);
        t6 = _mm_packus_epi16(t6, t7);

        t6 = _mm_shuffle_epi32( t6, R_SHUFFLE_D( 0, 1, 0, 1 ));

        _mm_store_si128 ( (__m128i*) &color2, t6 );

        t3 = t1;
        t3 = _mm_packus_epi16(t3, t7);
        t3 = _mm_shuffle_epi32( t3, R_SHUFFLE_D( 0, 1, 0, 1 ));

        _mm_store_si128 ( (__m128i*) &color1, t3 );

        t1 = _mm_add_epi16(t1, t1);
        t0 = _mm_add_epi16(t0, t1);

        t0 = _mm_mulhi_epi16(t0, tw3);
        t0 = _mm_packus_epi16(t0, t7);

        t0 = _mm_shuffle_epi32( t0, R_SHUFFLE_D( 0, 1, 0, 1 ));
        _mm_store_si128 ( (__m128i*) &color3, t0 );

        __m128i w0 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_0);
        __m128i w1 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_1);
        __m128i w2 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_2);

            // mov eax, 32
            // mov esi, colorBlock
        int x = 32;
        //const byte *c = colorBlock;
        while (x >= 0)
          {
            t3 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+0));
            t3 = _mm_shuffle_epi32( t3, R_SHUFFLE_D( 0, 2, 1, 3 ));

            t5 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+8));
            t5 = _mm_shuffle_epi32( t5, R_SHUFFLE_D( 0, 2, 1, 3 ));

            t0 = t3;
            t6 = t5;
            // Compute Sum of Absolute Difference
            __m128i c0 = _mm_load_si128 ( (__m128i*)  color0 );
            t0 = _mm_sad_epu8(t0, c0);
            t6 = _mm_sad_epu8(t6, c0);
            // Pack with Signed Saturation
            t0 = _mm_packs_epi32 (t0, t6);

            t1 = t3;
            t6 = t5;
            __m128i c1 = _mm_load_si128 ( (__m128i*)  color1 );
            t1 = _mm_sad_epu8(t1, c1);
            t6 = _mm_sad_epu8(t6, c1);
            t1 = _mm_packs_epi32 (t1, t6);

            t2 = t3;
            t6 = t5;
            __m128i c2 = _mm_load_si128 ( (__m128i*)  color2 );
            t2 = _mm_sad_epu8(t2, c2);
            t6 = _mm_sad_epu8(t6, c2);
            t2 = _mm_packs_epi32 (t2, t6);

            __m128i c3 = _mm_load_si128 ( (__m128i*)  color3 );
            t3 = _mm_sad_epu8(t3, c3);
            t5 = _mm_sad_epu8(t5, c3);
            t3 = _mm_packs_epi32 (t3, t5);

            t4 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+16));
            t4 = _mm_shuffle_epi32( t4, R_SHUFFLE_D( 0, 2, 1, 3 ));

            t5 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+24));
            t5 = _mm_shuffle_epi32( t5, R_SHUFFLE_D( 0, 2, 1, 3 ));

            t6 = t4;
            t7 = t5;
            t6 = _mm_sad_epu8(t6, c0);
            t7 = _mm_sad_epu8(t7, c0);
            t6 = _mm_packs_epi32 (t6, t7);
            t0 = _mm_packs_epi32 (t0, t6);  // d0

            t6 = t4;
            t7 = t5;
            t6 = _mm_sad_epu8(t6, c1);
            t7 = _mm_sad_epu8(t7, c1);
            t6 = _mm_packs_epi32 (t6, t7);
            t1 = _mm_packs_epi32 (t1, t6);  // d1

            t6 = t4;
            t7 = t5;
            t6 = _mm_sad_epu8(t6, c2);
            t7 = _mm_sad_epu8(t7, c2);
            t6 = _mm_packs_epi32 (t6, t7);
            t2 = _mm_packs_epi32 (t2, t6);  // d2

            t4 = _mm_sad_epu8(t4, c3);
            t5 = _mm_sad_epu8(t5, c3);
            t4 = _mm_packs_epi32 (t4, t5);
            t3 = _mm_packs_epi32 (t3, t4);  // d3

            t7 = _mm_load_si128 ( (__m128i*) result );

            t7 = _mm_slli_epi32( t7, 16);

            t4 = t0;
            t5 = t1;
            // Compare Packed Signed Integers for Greater Than
            t0 = _mm_cmpgt_epi16(t0, t3); // b0
            t1 = _mm_cmpgt_epi16(t1, t2); // b1
            t4 = _mm_cmpgt_epi16(t4, t2); // b2
            t5 = _mm_cmpgt_epi16(t5, t3); // b3
            t2 = _mm_cmpgt_epi16(t2, t3); // b4

            t4 = _mm_and_si128(t4, t1); // x0
            t5 = _mm_and_si128(t5, t0); // x1
            t2 = _mm_and_si128(t2, t0); // x2

            t4 = _mm_or_si128(t4, t5);
            t2 = _mm_and_si128(t2, w1);
            t4 = _mm_and_si128(t4, w2);
            t2 = _mm_or_si128(t2, t4);

            t5 = _mm_shuffle_epi32( t2, R_SHUFFLE_D( 2, 3, 0, 1 ));

            // Unpack Low Data
            t2 = _mm_unpacklo_epi16 ( t2, w0);
            t5 = _mm_unpacklo_epi16 ( t5, w0);

            //t5 = _mm_slli_si128 ( t5, 8);
            t5 = _mm_slli_epi32( t5, 8);

            t7 = _mm_or_si128(t7, t5);
            t7 = _mm_or_si128(t7, t2);

            _mm_store_si128 ( (__m128i*) &result, t7 );

            x -= 32;
          }

        t4 = _mm_shuffle_epi32( t7, R_SHUFFLE_D( 1, 2, 3, 0 ));
        t5 = _mm_shuffle_epi32( t7, R_SHUFFLE_D( 2, 3, 0, 1 ));
        t6 = _mm_shuffle_epi32( t7, R_SHUFFLE_D( 3, 0, 1, 2 ));

        t4 = _mm_slli_epi32 ( t4, 2);
        t5 = _mm_slli_epi32 ( t5, 4);
        t6 = _mm_slli_epi32 ( t6, 6);

        t7 = _mm_or_si128(t7, t4);
        t7 = _mm_or_si128(t7, t5);
        t7 = _mm_or_si128(t7, t6);

        //_mm_store_si128 ( (__m128i*) outData, t7 );

        int r = _mm_cvtsi128_si32 (t7);
        memcpy(outData, &r, 4);   // Anything better ?

        outData += 4;
}
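/*
  Illustrative sketch of how a caller might combine the routines in this file
  for one DXT1 block (hypothetical names; the actual driver lives elsewhere in
  FastDXT):

        ALIGN16( byte block[64] );
        ALIGN16( byte minColor[16] );
        ALIGN16( byte maxColor[16] );

        ExtractBlock_Intrinsics( inPtr, width, block );
        GetMinMaxColors_Intrinsics( block, minColor, maxColor );
        // write the two 565 endpoints (derived from maxColor and minColor) ...
        EmitColorIndices_Intrinsics( block, minColor, maxColor, outData );
*/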

ALIGN16( static byte SIMD_SSE2_byte_1[16] ) = { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 };
ALIGN16( static byte SIMD_SSE2_byte_2[16] ) = { 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02 };
ALIGN16( static byte SIMD_SSE2_byte_7[16] ) = { 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07 };
ALIGN16( static word SIMD_SSE2_word_div_by_7[8] ) = { (1<<16)/7+1, (1<<16)/7+1, (1<<16)/7+1, (1<<16)/7+1, (1<<16)/7+1, (1<<16)/7+1, (1<<16)/7+1, (1<<16)/7+1 };
ALIGN16( static word SIMD_SSE2_word_div_by_14[8] ) = { (1<<16)/14+1, (1<<16)/14+1, (1<<16)/14+1, (1<<16)/14+1, (1<<16)/14+1, (1<<16)/14+1, (1<<16)/14+1, (1<<16)/14+1 };
ALIGN16( static word SIMD_SSE2_word_scale66554400[8] ) = { 6, 6, 5, 5, 4, 4, 0, 0 };
ALIGN16( static word SIMD_SSE2_word_scale11223300[8] ) = { 1, 1, 2, 2, 3, 3, 0, 0 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask0[4] ) = { 7<<0, 0, 7<<0, 0 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask1[4] ) = { 7<<3, 0, 7<<3, 0 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask2[4] ) = { 7<<6, 0, 7<<6, 0 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask3[4] ) = { 7<<9, 0, 7<<9, 0 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask4[4] ) = { 7<<12, 0, 7<<12, 0 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask5[4] ) = { 7<<15, 0, 7<<15, 0 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask6[4] ) = { 7<<18, 0, 7<<18, 0 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask7[4] ) = { 7<<21, 0, 7<<21, 0 };


void EmitAlphaIndices_Intrinsics( const byte *colorBlock, const byte minAlpha, const byte maxAlpha, byte *&outData)
{
/*
  __asm {
    mov esi, colorBlock
      movdqa xmm0, [esi+ 0]
      movdqa xmm5, [esi+16]
      psrld xmm0, 24
      psrld xmm5, 24
      packuswb xmm0, xmm5

      movdqa xmm6, [esi+32]
      movdqa xmm4, [esi+48]
      psrld xmm6, 24
      psrld xmm4, 24
      packuswb xmm6, xmm4

      movzx ecx, maxAlpha
      movd xmm5, ecx
      pshuflw xmm5, xmm5, R_SHUFFLE_D( 0, 0, 0, 0 )
      pshufd xmm5, xmm5, R_SHUFFLE_D( 0, 0, 0, 0 )
      movdqa xmm7, xmm5

      movzx edx, minAlpha
      movd xmm2, edx
      pshuflw xmm2, xmm2, R_SHUFFLE_D( 0, 0, 0, 0 )
      pshufd xmm2, xmm2, R_SHUFFLE_D( 0, 0, 0, 0 )
      movdqa xmm3, xmm2

      movdqa xmm4, xmm5
      psubw xmm4, xmm2
      pmulhw xmm4, SIMD_SSE2_word_div_by_14    // * ( ( 1 << 16 ) / 14 + 1 ) ) >> 16
      movdqa xmm1, xmm2
      paddw xmm1, xmm4
      packuswb xmm1, xmm1                      // ab1

      pmullw xmm5, SIMD_SSE2_word_scale66554400
      pmullw xmm7, SIMD_SSE2_word_scale11223300
      pmullw xmm2, SIMD_SSE2_word_scale11223300
      pmullw xmm3, SIMD_SSE2_word_scale66554400
      paddw xmm5, xmm2
      paddw xmm7, xmm3
      pmulhw xmm5, SIMD_SSE2_word_div_by_7 // * ( ( 1 << 16 ) / 7 + 1 ) ) >> 16
      pmulhw xmm7, SIMD_SSE2_word_div_by_7 // * ( ( 1 << 16 ) / 7 + 1 ) ) >> 16
      paddw xmm5, xmm4
      paddw xmm7, xmm4

      pshufd xmm2, xmm5, R_SHUFFLE_D( 0, 0, 0, 0 )
      pshufd xmm3, xmm5, R_SHUFFLE_D( 1, 1, 1, 1 )
      pshufd xmm4, xmm5, R_SHUFFLE_D( 2, 2, 2, 2 )
      packuswb xmm2, xmm2 // ab2
      packuswb xmm3, xmm3 // ab3
      packuswb xmm4, xmm4 // ab4

      packuswb xmm0, xmm6 // alpha values

      pshufd xmm5, xmm7, R_SHUFFLE_D( 2, 2, 2, 2 )
      pshufd xmm6, xmm7, R_SHUFFLE_D( 1, 1, 1, 1 )
      pshufd xmm7, xmm7, R_SHUFFLE_D( 0, 0, 0, 0 )
      packuswb xmm5, xmm5 // ab5
      packuswb xmm6, xmm6 // ab6
      packuswb xmm7, xmm7 // ab7

      pminub xmm1, xmm0
      pminub xmm2, xmm0
      pminub xmm3, xmm0
      pcmpeqb xmm1, xmm0
      pcmpeqb xmm2, xmm0
      pcmpeqb xmm3, xmm0
      pminub xmm4, xmm0
      pminub xmm5, xmm0
      pminub xmm6, xmm0
      pminub xmm7, xmm0
      pcmpeqb xmm4, xmm0
      pcmpeqb xmm5, xmm0
      pcmpeqb xmm6, xmm0
      pcmpeqb xmm7, xmm0
      pand xmm1, SIMD_SSE2_byte_1
      pand xmm2, SIMD_SSE2_byte_1
      pand xmm3, SIMD_SSE2_byte_1
      pand xmm4, SIMD_SSE2_byte_1
      pand xmm5, SIMD_SSE2_byte_1
      pand xmm6, SIMD_SSE2_byte_1
      pand xmm7, SIMD_SSE2_byte_1
      movdqa xmm0, SIMD_SSE2_byte_1
      paddusb xmm0, xmm1
      paddusb xmm2, xmm3
      paddusb xmm4, xmm5
      paddusb xmm6, xmm7
      paddusb xmm0, xmm2
      paddusb xmm4, xmm6
      paddusb xmm0, xmm4
      pand xmm0, SIMD_SSE2_byte_7
      movdqa xmm1, SIMD_SSE2_byte_2
      pcmpgtb xmm1, xmm0
      pand xmm1, SIMD_SSE2_byte_1
      pxor xmm0, xmm1
      movdqa xmm1, xmm0
      movdqa xmm2, xmm0
      movdqa xmm3, xmm0
      movdqa xmm4, xmm0
      movdqa xmm5, xmm0
      movdqa xmm6, xmm0
      movdqa xmm7, xmm0
      psrlq xmm1, 8- 3
      psrlq xmm2, 16- 6
      psrlq xmm3, 24- 9

      psrlq xmm4, 32-12
      psrlq xmm5, 40-15
      psrlq xmm6, 48-18
      psrlq xmm7, 56-21
      pand xmm0, SIMD_SSE2_dword_alpha_bit_mask0
      pand xmm1, SIMD_SSE2_dword_alpha_bit_mask1
      pand xmm2, SIMD_SSE2_dword_alpha_bit_mask2
      pand xmm3, SIMD_SSE2_dword_alpha_bit_mask3
      pand xmm4, SIMD_SSE2_dword_alpha_bit_mask4
      pand xmm5, SIMD_SSE2_dword_alpha_bit_mask5
      pand xmm6, SIMD_SSE2_dword_alpha_bit_mask6
      pand xmm7, SIMD_SSE2_dword_alpha_bit_mask7
      por xmm0, xmm1
      por xmm2, xmm3
      por xmm4, xmm5
      por xmm6, xmm7
      por xmm0, xmm2
      por xmm4, xmm6
      por xmm0, xmm4
      mov esi, outData
      movd [esi+0], xmm0
      pshufd xmm1, xmm0, R_SHUFFLE_D( 2, 3, 0, 1 )
      movd [esi+3], xmm1
      }
  outData += 6;
*/
}
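/*
  The MSVC inline assembly above has not been converted to intrinsics yet, so
  EmitAlphaIndices_Intrinsics currently emits nothing.  Below is a minimal
  scalar sketch of the same DXT5 alpha-index step, kept only as a reference:
  the helper name EmitAlphaIndices_Scalar is hypothetical, and it assumes the
  usual 8-alpha palette (maxAlpha as alpha_0, minAlpha as alpha_1) with the
  alpha of pixel i stored at colorBlock[i*4+3].
*/
static void EmitAlphaIndices_Scalar( const byte *colorBlock, const byte minAlpha, const byte maxAlpha, byte *&outData )
{
        byte palette[8];
        palette[0] = maxAlpha;
        palette[1] = minAlpha;
        for ( int i = 1; i <= 6; i++ ) {
                // alpha_2 .. alpha_7 are linear blends of the two endpoints
                palette[1+i] = (byte)( ( (7-i) * maxAlpha + i * minAlpha ) / 7 );
        }

        unsigned long long indices = 0;   // 16 pixels * 3 bits = 48 bits
        for ( int i = 0; i < 16; i++ ) {
                int alpha = colorBlock[i*4+3];
                int best = 0, bestDist = 256;
                for ( int j = 0; j < 8; j++ ) {
                        int d = alpha - palette[j];
                        if ( d < 0 ) d = -d;
                        if ( d < bestDist ) { bestDist = d; best = j; }
                }
                indices |= (unsigned long long)best << (3*i);
        }

        // store the 48-bit index field little-endian, 6 bytes
        for ( int i = 0; i < 6; i++ ) {
                *outData++ = (byte)( indices >> (8*i) );
        }
}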