source: trunk/src/testing/app/ishare/FastDXT/intrinsic.cpp @ 4

Revision 4, 17.7 KB checked in by ajaworski, 13 years ago

Added modified SAGE sources

/******************************************************************************
 * Fast DXT - a realtime DXT compression tool
 *
 * Author : Luc Renambot
 *
 * Copyright (C) 2007 Electronic Visualization Laboratory,
 * University of Illinois at Chicago
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above
 *    copyright notice, this list of conditions and the following disclaimer
 *    in the documentation and/or other materials provided with the distribution.
 *  * Neither the name of the University of Illinois at Chicago nor
 *    the names of its contributors may be used to endorse or promote
 *    products derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * Direct questions, comments etc about SAGE to http://www.evl.uic.edu/cavern/forum/
 *
 *****************************************************************************/

/*
        Code converted from asm to intrinsics, from:

                Copyright (C) 2006 Id Software, Inc.
                Written by J.M.P. van Waveren
                This code is free software; you can redistribute it and/or
                modify it under the terms of the GNU Lesser General Public
                License as published by the Free Software Foundation; either
                version 2.1 of the License, or (at your option) any later version.
                This code is distributed in the hope that it will be useful,
                but WITHOUT ANY WARRANTY; without even the implied warranty of
                MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
                Lesser General Public License for more details.
*/

#include "dxt.h"

#include <emmintrin.h>  // sse2

void ExtractBlock_Intrinsics( const byte *inPtr, int width, byte *colorBlock )
{
        __m128i t0, t1, t2, t3;
        register int w = width << 2;  // width*4 = row stride in bytes (RGBA)

        t0 = _mm_load_si128 ( (__m128i*) inPtr );
        _mm_store_si128 ( (__m128i*) &colorBlock[0], t0 );   // copy first row, 16 bytes

        t1 = _mm_load_si128 ( (__m128i*) (inPtr + w) );
        _mm_store_si128 ( (__m128i*) &colorBlock[16], t1 );   // copy second row

        t2 = _mm_load_si128 ( (__m128i*) (inPtr + 2*w) );
        _mm_store_si128 ( (__m128i*) &colorBlock[32], t2 );   // copy third row

        inPtr = inPtr + w;     // advance one row, instead of multiplying by 3

        t3 = _mm_load_si128 ( (__m128i*) (inPtr + 2*w) );
        _mm_store_si128 ( (__m128i*) &colorBlock[48], t3 );   // copy last row
}
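/*
  Illustrative sketch only (not part of the original FastDXT driver): a caller
  walks the RGBA image in 4x4 pixel tiles and passes each tile's top-left pixel
  to ExtractBlock_Intrinsics.  The names image, w, h and block are hypothetical,
  and the image base is assumed 16-byte aligned with w a multiple of 4, since
  _mm_load_si128 is an aligned load.

        ALIGN16( byte block[64] );
        for ( int y = 0; y < h; y += 4 ) {
                for ( int x = 0; x < w; x += 4 ) {
                        ExtractBlock_Intrinsics( image + (y*w + x)*4, w, block );
                        // ... compress the 64-byte block ...
                }
        }
*/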

#define R_SHUFFLE_D( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
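// For example, R_SHUFFLE_D( 2, 3, 2, 3 ) packs the four 2-bit selectors into
// the immediate 0xEE, so _mm_shuffle_epi32 produces elements { 2, 3, 2, 3 } of
// its source; the same encoding is used for _mm_shufflelo_epi16 below.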

ALIGN16( static byte SIMD_SSE2_byte_0[16] ) = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };

void GetMinMaxColors_Intrinsics( const byte *colorBlock, byte *minColor, byte *maxColor )
{
    __m128i t0, t1, t3, t4, t6, t7;

    // get bounding box
    // ----------------

    // load the first row
    t0 = _mm_load_si128 ( (__m128i*) colorBlock );
    t1 = _mm_load_si128 ( (__m128i*) colorBlock );

    __m128i t16 = _mm_load_si128 ( (__m128i*) (colorBlock+16) );
    // Minimum of Packed Unsigned Byte Integers
    t0 = _mm_min_epu8 ( t0, t16);
    // Maximum of Packed Unsigned Byte Integers
    t1 = _mm_max_epu8 ( t1, t16);

    __m128i t32 = _mm_load_si128 ( (__m128i*) (colorBlock+32) );
    t0 = _mm_min_epu8 ( t0, t32);
    t1 = _mm_max_epu8 ( t1, t32);

    __m128i t48 = _mm_load_si128 ( (__m128i*) (colorBlock+48) );
    t0 = _mm_min_epu8 ( t0, t48);
    t1 = _mm_max_epu8 ( t1, t48);

    // Shuffle Packed Doublewords
    t3 = _mm_shuffle_epi32( t0, R_SHUFFLE_D( 2, 3, 2, 3 ) );
    t4 = _mm_shuffle_epi32( t1, R_SHUFFLE_D( 2, 3, 2, 3 ) );

    t0 = _mm_min_epu8 ( t0, t3);
    t1 = _mm_max_epu8 ( t1, t4);

    // Shuffle Packed Low Words
    t6 = _mm_shufflelo_epi16( t0, R_SHUFFLE_D( 2, 3, 2, 3 ) );
    t7 = _mm_shufflelo_epi16( t1, R_SHUFFLE_D( 2, 3, 2, 3 ) );

    t0 = _mm_min_epu8 ( t0, t6);
    t1 = _mm_max_epu8 ( t1, t7);

    // inset the bounding box
    // ----------------------

    // Unpack Low Data
    //__m128i t66 = _mm_set1_epi8( 0 );
    __m128i t66 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_byte_0 );
    t0 = _mm_unpacklo_epi8(t0, t66);
    t1 = _mm_unpacklo_epi8(t1, t66);

    // copy (movdqa)
    //__m128i t2 = _mm_load_si128 ( &t1 );
    __m128i t2 = t1;

    // Subtract Packed Integers
    t2 = _mm_sub_epi16(t2, t0);

    // Shift Packed Data Right Logical
    t2 = _mm_srli_epi16(t2, INSET_SHIFT);

    // Add Packed Integers
    t0 = _mm_add_epi16(t0, t2);

    t1 = _mm_sub_epi16(t1, t2);

    // Pack with Unsigned Saturation
    t0 = _mm_packus_epi16(t0, t0);
    t1 = _mm_packus_epi16(t1, t1);

    // store bounding box extents
    // --------------------------
    _mm_store_si128 ( (__m128i*) minColor, t0 );
    _mm_store_si128 ( (__m128i*) maxColor, t1 );
}
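/*
  What the "inset the bounding box" step above computes, written out per
  channel in scalar form for reference (INSET_SHIFT comes from dxt.h):

      inset = ( max - min ) >> INSET_SHIFT;
      min  += inset;
      max  -= inset;

  i.e. the box is shrunk slightly so the endpoints sit inside the cluster of
  block colors, as in van Waveren's real-time DXT compression scheme.
*/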

ALIGN16( static word SIMD_SSE2_word_0[8] ) = { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 };
ALIGN16( static word SIMD_SSE2_word_1[8] ) = { 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001 };
ALIGN16( static word SIMD_SSE2_word_2[8] ) = { 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002 };
ALIGN16( static word SIMD_SSE2_word_div_by_3[8] ) = { (1<<16)/3+1, (1<<16)/3+1, (1<<16)/3+1, (1<<16)/3+1, (1<<16)/3+1, (1<<16)/3+1, (1<<16)/3+1, (1<<16)/3+1 };
ALIGN16( static byte SIMD_SSE2_byte_colorMask[16] ) = { C565_5_MASK, C565_6_MASK, C565_5_MASK, 0x00, 0x00, 0x00, 0x00, 0x00, C565_5_MASK, C565_6_MASK, C565_5_MASK, 0x00, 0x00, 0x00, 0x00, 0x00 };
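// Note: SIMD_SSE2_word_div_by_3 holds (1<<16)/3+1 in every 16-bit lane, so a
// pmulhw (_mm_mulhi_epi16) against it keeps the high 16 bits of the product,
// which amounts to an integer division by 3.  EmitColorIndices uses it below
// to build the two interpolated colors (2*c0+c1)/3 and (c0+2*c1)/3.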

void EmitColorIndices_Intrinsics( const byte *colorBlock, const byte *minColor, const byte *maxColor, byte *&outData )
{
        ALIGN16( byte color0[16] );
        ALIGN16( byte color1[16] );
        ALIGN16( byte color2[16] );
        ALIGN16( byte color3[16] );
        ALIGN16( byte result[16] );

        // mov esi, maxColor
        // mov edi, minColor

        __m128i t0, t1, t2, t3, t4, t5, t6, t7;

        t7 = _mm_setzero_si128();
        //t7 = _mm_xor_si128(t7, t7);
        _mm_store_si128 ( (__m128i*) &result, t7 );

        //t0 = _mm_load_si128 ( (__m128i*)  maxColor );
        t0 = _mm_cvtsi32_si128( *(int*)maxColor);

        // Bitwise AND
        __m128i tt = _mm_load_si128 ( (__m128i*) SIMD_SSE2_byte_colorMask );
        t0 = _mm_and_si128(t0, tt);

        t0 = _mm_unpacklo_epi8(t0, t7);

        t4 = _mm_shufflelo_epi16( t0, R_SHUFFLE_D( 0, 3, 2, 3 ));
        t5 = _mm_shufflelo_epi16( t0, R_SHUFFLE_D( 3, 1, 3, 3 ));

        t4 = _mm_srli_epi16(t4, 5);
        t5 = _mm_srli_epi16(t5, 6);

        // Bitwise Logical OR
        t0 = _mm_or_si128(t0, t4);
        t0 = _mm_or_si128(t0, t5);   // t0 contains color0 in 565

        //t1 = _mm_load_si128 ( (__m128i*)  minColor );
        t1 = _mm_cvtsi32_si128( *(int*)minColor);

        t1 = _mm_and_si128(t1, tt);

        t1 = _mm_unpacklo_epi8(t1, t7);

        t4 = _mm_shufflelo_epi16( t1, R_SHUFFLE_D( 0, 3, 2, 3 ));
        t5 = _mm_shufflelo_epi16( t1, R_SHUFFLE_D( 3, 1, 3, 3 ));

        t4 = _mm_srli_epi16(t4, 5);
        t5 = _mm_srli_epi16(t5, 6);

        t1 = _mm_or_si128(t1, t4);
        t1 = _mm_or_si128(t1, t5);  // t1 contains color1 in 565

        t2 = t0;

        t2 = _mm_packus_epi16(t2, t7);

        t2 = _mm_shuffle_epi32( t2, R_SHUFFLE_D( 0, 1, 0, 1 ));

        _mm_store_si128 ( (__m128i*) &color0, t2 );

        t6 = t0;
        t6 = _mm_add_epi16(t6, t0);
        t6 = _mm_add_epi16(t6, t1);

        // Multiply Packed Signed Integers and Store High Result
        __m128i tw3 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_div_by_3 );
        t6 = _mm_mulhi_epi16(t6, tw3);
        t6 = _mm_packus_epi16(t6, t7);

        t6 = _mm_shuffle_epi32( t6, R_SHUFFLE_D( 0, 1, 0, 1 ));

        _mm_store_si128 ( (__m128i*) &color2, t6 );

        t3 = t1;
        t3 = _mm_packus_epi16(t3, t7);
        t3 = _mm_shuffle_epi32( t3, R_SHUFFLE_D( 0, 1, 0, 1 ));

        _mm_store_si128 ( (__m128i*) &color1, t3 );

        t1 = _mm_add_epi16(t1, t1);
        t0 = _mm_add_epi16(t0, t1);

        t0 = _mm_mulhi_epi16(t0, tw3);
        t0 = _mm_packus_epi16(t0, t7);

        t0 = _mm_shuffle_epi32( t0, R_SHUFFLE_D( 0, 1, 0, 1 ));
        _mm_store_si128 ( (__m128i*) &color3, t0 );

        __m128i w0 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_0);
        __m128i w1 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_1);
        __m128i w2 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_2);

            // mov eax, 32
            // mov esi, colorBlock
        int x = 32;
        //const byte *c = colorBlock;
        while (x >= 0)
          {
            t3 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+0));
            t3 = _mm_shuffle_epi32( t3, R_SHUFFLE_D( 0, 2, 1, 3 ));

            t5 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+8));
            t5 = _mm_shuffle_epi32( t5, R_SHUFFLE_D( 0, 2, 1, 3 ));

            t0 = t3;
            t6 = t5;
            // Compute Sum of Absolute Difference
            __m128i c0 = _mm_load_si128 ( (__m128i*)  color0 );
            t0 = _mm_sad_epu8(t0, c0);
            t6 = _mm_sad_epu8(t6, c0);
            // Pack with Signed Saturation
            t0 = _mm_packs_epi32 (t0, t6);

            t1 = t3;
            t6 = t5;
            __m128i c1 = _mm_load_si128 ( (__m128i*)  color1 );
            t1 = _mm_sad_epu8(t1, c1);
            t6 = _mm_sad_epu8(t6, c1);
            t1 = _mm_packs_epi32 (t1, t6);

            t2 = t3;
            t6 = t5;
            __m128i c2 = _mm_load_si128 ( (__m128i*)  color2 );
            t2 = _mm_sad_epu8(t2, c2);
            t6 = _mm_sad_epu8(t6, c2);
            t2 = _mm_packs_epi32 (t2, t6);

            __m128i c3 = _mm_load_si128 ( (__m128i*)  color3 );
            t3 = _mm_sad_epu8(t3, c3);
            t5 = _mm_sad_epu8(t5, c3);
            t3 = _mm_packs_epi32 (t3, t5);

            t4 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+16));
            t4 = _mm_shuffle_epi32( t4, R_SHUFFLE_D( 0, 2, 1, 3 ));

            t5 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+24));
            t5 = _mm_shuffle_epi32( t5, R_SHUFFLE_D( 0, 2, 1, 3 ));

            t6 = t4;
            t7 = t5;
            t6 = _mm_sad_epu8(t6, c0);
            t7 = _mm_sad_epu8(t7, c0);
            t6 = _mm_packs_epi32 (t6, t7);
            t0 = _mm_packs_epi32 (t0, t6);  // d0

            t6 = t4;
            t7 = t5;
            t6 = _mm_sad_epu8(t6, c1);
            t7 = _mm_sad_epu8(t7, c1);
            t6 = _mm_packs_epi32 (t6, t7);
            t1 = _mm_packs_epi32 (t1, t6);  // d1

            t6 = t4;
            t7 = t5;
            t6 = _mm_sad_epu8(t6, c2);
            t7 = _mm_sad_epu8(t7, c2);
            t6 = _mm_packs_epi32 (t6, t7);
            t2 = _mm_packs_epi32 (t2, t6);  // d2

            t4 = _mm_sad_epu8(t4, c3);
            t5 = _mm_sad_epu8(t5, c3);
            t4 = _mm_packs_epi32 (t4, t5);
            t3 = _mm_packs_epi32 (t3, t4);  // d3

            t7 = _mm_load_si128 ( (__m128i*) result );

            t7 = _mm_slli_epi32( t7, 16);

            t4 = t0;
            t5 = t1;
            // Compare Packed Signed Integers for Greater Than
            t0 = _mm_cmpgt_epi16(t0, t3); // b0
            t1 = _mm_cmpgt_epi16(t1, t2); // b1
            t4 = _mm_cmpgt_epi16(t4, t2); // b2
            t5 = _mm_cmpgt_epi16(t5, t3); // b3
            t2 = _mm_cmpgt_epi16(t2, t3); // b4

            t4 = _mm_and_si128(t4, t1); // x0
            t5 = _mm_and_si128(t5, t0); // x1
            t2 = _mm_and_si128(t2, t0); // x2

            t4 = _mm_or_si128(t4, t5);
            t2 = _mm_and_si128(t2, w1);
            t4 = _mm_and_si128(t4, w2);
            t2 = _mm_or_si128(t2, t4);

            t5 = _mm_shuffle_epi32( t2, R_SHUFFLE_D( 2, 3, 0, 1 ));

            // Unpack Low Data
            t2 = _mm_unpacklo_epi16 ( t2, w0);
            t5 = _mm_unpacklo_epi16 ( t5, w0);

            //t5 = _mm_slli_si128 ( t5, 8);
            t5 = _mm_slli_epi32( t5, 8);

            t7 = _mm_or_si128(t7, t5);
            t7 = _mm_or_si128(t7, t2);

            _mm_store_si128 ( (__m128i*) &result, t7 );

            x -= 32;
          }

        t4 = _mm_shuffle_epi32( t7, R_SHUFFLE_D( 1, 2, 3, 0 ));
        t5 = _mm_shuffle_epi32( t7, R_SHUFFLE_D( 2, 3, 0, 1 ));
        t6 = _mm_shuffle_epi32( t7, R_SHUFFLE_D( 3, 0, 1, 2 ));

        t4 = _mm_slli_epi32 ( t4, 2);
        t5 = _mm_slli_epi32 ( t5, 4);
        t6 = _mm_slli_epi32 ( t6, 6);

        t7 = _mm_or_si128(t7, t4);
        t7 = _mm_or_si128(t7, t5);
        t7 = _mm_or_si128(t7, t6);

        //_mm_store_si128 ( (__m128i*) outData, t7 );

        int r = _mm_cvtsi128_si32 (t7);
        memcpy(outData, &r, 4);   // Anything better ?

        outData += 4;
}
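/*
  Illustrative sketch of how a caller might combine the routines in this file
  for one DXT1 block (hypothetical names; the actual driver lives elsewhere in
  FastDXT):

        ALIGN16( byte block[64] );
        ALIGN16( byte minColor[16] );
        ALIGN16( byte maxColor[16] );

        ExtractBlock_Intrinsics( inPtr, width, block );
        GetMinMaxColors_Intrinsics( block, minColor, maxColor );
        // write the two 565 endpoints (derived from maxColor and minColor) ...
        EmitColorIndices_Intrinsics( block, minColor, maxColor, outData );
*/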

ALIGN16( static byte SIMD_SSE2_byte_1[16] ) = { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 };
ALIGN16( static byte SIMD_SSE2_byte_2[16] ) = { 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02 };
ALIGN16( static byte SIMD_SSE2_byte_7[16] ) = { 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07 };
ALIGN16( static word SIMD_SSE2_word_div_by_7[8] ) = { (1<<16)/7+1, (1<<16)/7+1, (1<<16)/7+1, (1<<16)/7+1, (1<<16)/7+1, (1<<16)/7+1, (1<<16)/7+1, (1<<16)/7+1 };
ALIGN16( static word SIMD_SSE2_word_div_by_14[8] ) = { (1<<16)/14+1, (1<<16)/14+1, (1<<16)/14+1, (1<<16)/14+1, (1<<16)/14+1, (1<<16)/14+1, (1<<16)/14+1, (1<<16)/14+1 };
ALIGN16( static word SIMD_SSE2_word_scale66554400[8] ) = { 6, 6, 5, 5, 4, 4, 0, 0 };
ALIGN16( static word SIMD_SSE2_word_scale11223300[8] ) = { 1, 1, 2, 2, 3, 3, 0, 0 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask0[4] ) = { 7<<0, 0, 7<<0, 0 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask1[4] ) = { 7<<3, 0, 7<<3, 0 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask2[4] ) = { 7<<6, 0, 7<<6, 0 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask3[4] ) = { 7<<9, 0, 7<<9, 0 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask4[4] ) = { 7<<12, 0, 7<<12, 0 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask5[4] ) = { 7<<15, 0, 7<<15, 0 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask6[4] ) = { 7<<18, 0, 7<<18, 0 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask7[4] ) = { 7<<21, 0, 7<<21, 0 };


void EmitAlphaIndices_Intrinsics( const byte *colorBlock, const byte minAlpha, const byte maxAlpha, byte *&outData)
{
/*
  __asm {
    mov esi, colorBlock
      movdqa xmm0, [esi+ 0]
      movdqa xmm5, [esi+16]
      psrld xmm0, 24
      psrld xmm5, 24
      packuswb xmm0, xmm5

      movdqa xmm6, [esi+32]
      movdqa xmm4, [esi+48]
      psrld xmm6, 24
      psrld xmm4, 24
      packuswb xmm6, xmm4

      movzx ecx, maxAlpha
      movd xmm5, ecx
      pshuflw xmm5, xmm5, R_SHUFFLE_D( 0, 0, 0, 0 )
      pshufd xmm5, xmm5, R_SHUFFLE_D( 0, 0, 0, 0 )
      movdqa xmm7, xmm5

      movzx edx, minAlpha
      movd xmm2, edx
      pshuflw xmm2, xmm2, R_SHUFFLE_D( 0, 0, 0, 0 )
      pshufd xmm2, xmm2, R_SHUFFLE_D( 0, 0, 0, 0 )
      movdqa xmm3, xmm2

      movdqa xmm4, xmm5
      psubw xmm4, xmm2
      pmulhw xmm4, SIMD_SSE2_word_div_by_14    // * ( ( 1 << 16 ) / 14 + 1 ) ) >> 16
      movdqa xmm1, xmm2
      paddw xmm1, xmm4
      packuswb xmm1, xmm1                      // ab1

      pmullw xmm5, SIMD_SSE2_word_scale66554400
      pmullw xmm7, SIMD_SSE2_word_scale11223300
      pmullw xmm2, SIMD_SSE2_word_scale11223300
      pmullw xmm3, SIMD_SSE2_word_scale66554400
      paddw xmm5, xmm2
      paddw xmm7, xmm3
      pmulhw xmm5, SIMD_SSE2_word_div_by_7 // * ( ( 1 << 16 ) / 7 + 1 ) ) >> 16
      pmulhw xmm7, SIMD_SSE2_word_div_by_7 // * ( ( 1 << 16 ) / 7 + 1 ) ) >> 16
      paddw xmm5, xmm4
      paddw xmm7, xmm4

      pshufd xmm2, xmm5, R_SHUFFLE_D( 0, 0, 0, 0 )
      pshufd xmm3, xmm5, R_SHUFFLE_D( 1, 1, 1, 1 )
      pshufd xmm4, xmm5, R_SHUFFLE_D( 2, 2, 2, 2 )
      packuswb xmm2, xmm2 // ab2
      packuswb xmm3, xmm3 // ab3
      packuswb xmm4, xmm4 // ab4

      packuswb xmm0, xmm6 // alpha values

      pshufd xmm5, xmm7, R_SHUFFLE_D( 2, 2, 2, 2 )
      pshufd xmm6, xmm7, R_SHUFFLE_D( 1, 1, 1, 1 )
      pshufd xmm7, xmm7, R_SHUFFLE_D( 0, 0, 0, 0 )
      packuswb xmm5, xmm5 // ab5
      packuswb xmm6, xmm6 // ab6
      packuswb xmm7, xmm7 // ab7

      pminub xmm1, xmm0
      pminub xmm2, xmm0
      pminub xmm3, xmm0
      pcmpeqb xmm1, xmm0
      pcmpeqb xmm2, xmm0
      pcmpeqb xmm3, xmm0
      pminub xmm4, xmm0
      pminub xmm5, xmm0
      pminub xmm6, xmm0
      pminub xmm7, xmm0
      pcmpeqb xmm4, xmm0
      pcmpeqb xmm5, xmm0
      pcmpeqb xmm6, xmm0
      pcmpeqb xmm7, xmm0
      pand xmm1, SIMD_SSE2_byte_1
      pand xmm2, SIMD_SSE2_byte_1
      pand xmm3, SIMD_SSE2_byte_1
      pand xmm4, SIMD_SSE2_byte_1
      pand xmm5, SIMD_SSE2_byte_1
      pand xmm6, SIMD_SSE2_byte_1
      pand xmm7, SIMD_SSE2_byte_1
      movdqa xmm0, SIMD_SSE2_byte_1
      paddusb xmm0, xmm1
      paddusb xmm2, xmm3
      paddusb xmm4, xmm5
      paddusb xmm6, xmm7
      paddusb xmm0, xmm2
      paddusb xmm4, xmm6
      paddusb xmm0, xmm4
      pand xmm0, SIMD_SSE2_byte_7
      movdqa xmm1, SIMD_SSE2_byte_2
      pcmpgtb xmm1, xmm0
      pand xmm1, SIMD_SSE2_byte_1
      pxor xmm0, xmm1
      movdqa xmm1, xmm0
      movdqa xmm2, xmm0
      movdqa xmm3, xmm0
      movdqa xmm4, xmm0
      movdqa xmm5, xmm0
      movdqa xmm6, xmm0
      movdqa xmm7, xmm0
      psrlq xmm1, 8- 3
      psrlq xmm2, 16- 6
      psrlq xmm3, 24- 9

      psrlq xmm4, 32-12
      psrlq xmm5, 40-15
      psrlq xmm6, 48-18
      psrlq xmm7, 56-21
      pand xmm0, SIMD_SSE2_dword_alpha_bit_mask0
      pand xmm1, SIMD_SSE2_dword_alpha_bit_mask1
      pand xmm2, SIMD_SSE2_dword_alpha_bit_mask2
      pand xmm3, SIMD_SSE2_dword_alpha_bit_mask3
      pand xmm4, SIMD_SSE2_dword_alpha_bit_mask4
      pand xmm5, SIMD_SSE2_dword_alpha_bit_mask5
      pand xmm6, SIMD_SSE2_dword_alpha_bit_mask6
      pand xmm7, SIMD_SSE2_dword_alpha_bit_mask7
      por xmm0, xmm1
      por xmm2, xmm3
      por xmm4, xmm5
      por xmm6, xmm7
      por xmm0, xmm2
      por xmm4, xmm6
      por xmm0, xmm4
      mov esi, outData
      movd [esi+0], xmm0
      pshufd xmm1, xmm0, R_SHUFFLE_D( 2, 3, 0, 1 )
      movd [esi+3], xmm1
      }
  outData += 6;
*/
}
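/*
  The MSVC inline assembly above has not been converted to intrinsics yet, so
  EmitAlphaIndices_Intrinsics currently emits nothing.  Below is a minimal
  scalar sketch of the same DXT5 alpha-index step, kept only as a reference:
  the helper name EmitAlphaIndices_Scalar is hypothetical, and it assumes the
  usual 8-alpha palette (maxAlpha as alpha_0, minAlpha as alpha_1) with the
  alpha of pixel i stored at colorBlock[i*4+3].
*/
static void EmitAlphaIndices_Scalar( const byte *colorBlock, const byte minAlpha, const byte maxAlpha, byte *&outData )
{
        byte palette[8];
        palette[0] = maxAlpha;
        palette[1] = minAlpha;
        for ( int i = 1; i <= 6; i++ ) {
                // alpha_2 .. alpha_7 are linear blends of the two endpoints
                palette[1+i] = (byte)( ( (7-i) * maxAlpha + i * minAlpha ) / 7 );
        }

        unsigned long long indices = 0;   // 16 pixels * 3 bits = 48 bits
        for ( int i = 0; i < 16; i++ ) {
                int alpha = colorBlock[i*4+3];
                int best = 0, bestDist = 256;
                for ( int j = 0; j < 8; j++ ) {
                        int d = alpha - palette[j];
                        if ( d < 0 ) d = -d;
                        if ( d < bestDist ) { bestDist = d; best = j; }
                }
                indices |= (unsigned long long)best << (3*i);
        }

        // store the 48-bit index field little-endian, 6 bytes
        for ( int i = 0; i < 6; i++ ) {
                *outData++ = (byte)( indices >> (8*i) );
        }
}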