32 #include "api_swrender.h"
34 #if defined(__GNUC__) && !defined(__SSE2__)
40 #include <emmintrin.h>
// Static helper declarations for the SSE2 ARGB8 blitter (presumably members of
// BlitARGB8SSE; the class header is not part of this listing).

// Copies 64 bits from src to dest (two pixels if unsigned int is 32 bits).
52 static void copy_pixels(
unsigned int *dest,
const unsigned int *src);
// Expands one packed pixel into four 16-bit channels in the low half of xmm.
53 static void load_pixel(__m128i &xmm,
const unsigned int &pixel);
// Expands the two packed pixels at pixels[0..1] into eight 16-bit channels.
54 static void load_pixels(__m128i &xmm,
const unsigned int *pixels);
// As above, but the two pixels are passed individually (p1 in the low half).
// NOTE(review): p2 is a non-const reference while p1 is const; the body shown
// in this listing only reads p2 - confirm whether const was intended.
55 static void load_pixels(__m128i &xmm,
const unsigned int &p1,
unsigned int &p2);
// Bilinear blend of four packed pixels; ifracx/ifracy are fractions on a
// scale where 0x80 stands for 1.0.
56 static void load_pixel_linear(__m128i &xmm,
const unsigned int &p1,
const unsigned int &p2,
const unsigned int &p3,
const unsigned int &p4,
unsigned int ifracx,
unsigned int ifracy);
// Sets every 16-bit lane of xmm to 0x0100.
57 static void set_one(__m128i &xmm);
// Fills both halves of xmm with one colour (lanes low to high: blue, green,
// red, alpha).
59 static void set_color(__m128i &xmm,
unsigned short red,
unsigned short green,
unsigned short blue,
unsigned short alpha);
// Fills xmm with two colours: colour 1 in the low half, colour 2 in the high
// half, each in blue, green, red, alpha lane order.
60 static void set_color(__m128i &xmm,
unsigned short r1,
unsigned short g1,
unsigned short b1,
unsigned short a1,
unsigned short r2,
unsigned short g2,
unsigned short b2,
unsigned short a2);
// dest = (src * alpha + dest * (one - alpha) + half) >> 8 per 16-bit lane.
// src is overwritten with src * alpha as a side effect.
68 static void blend_normal(__m128i &dest, __m128i &src, __m128i &one, __m128i &half);
// Blends color into dest using src as an independent weight for every lane.
70 static void blend_lcd(__m128i &dest, __m128i &src, __m128i &one, __m128i &half, __m128i &color);
// Packs the 16-bit channels in xmm to bytes (unsigned saturation) and writes
// one pixel. xmm is overwritten with the packed value.
71 static void store_pixel(
unsigned int &pixel, __m128i &xmm);
// Same as store_pixel, but writes 64 bits (two pixels) to pixels.
72 static void store_pixels(
unsigned int *pixels, __m128i &xmm);
// Splits two registers of packed 32-bit pixels into one register per channel.
74 static void pixels_to_channels(__m128i &red, __m128i &green, __m128i &blue, __m128i &alpha,
const __m128i &src0,
const __m128i &src1);
// Inverse of pixels_to_channels: rebuilds two registers of packed pixels.
75 static void channels_to_pixels(__m128i &dest0, __m128i &dest1, __m128i &red, __m128i &green, __m128i &blue, __m128i &alpha);
// Body fragment of copy_pixels (going by the listing's line numbers); the
// signature, braces and the declaration of src0 are not shown.
// Read the low 64 bits at src ...
82 src0 = _mm_loadl_epi64((
const __m128i *) src);
// ... and write those 64 bits to dest.
83 _mm_storel_epi64((__m128i *) dest, src0);
// Body fragment of load_pixel (going by the listing's line numbers).
// Place the packed 32-bit pixel in the lowest lane, upper lanes zeroed.
88 xmm = _mm_cvtsi32_si128(pixel);
// Interleave with zero bytes so each 8-bit channel becomes a 16-bit lane.
89 xmm = _mm_unpacklo_epi8(xmm, _mm_setzero_si128());
// Body fragment of load_pixels(xmm, pixels) (going by the listing's line
// numbers).
// Load 64 bits (two packed pixels) from pixels; no alignment is required by
// this intrinsic.
94 xmm = _mm_loadl_epi64((
const __m128i *) pixels);
// Widen the eight channel bytes to eight 16-bit lanes.
95 xmm = _mm_unpacklo_epi8(xmm, _mm_setzero_si128());
// Body fragment of the two-pixel load_pixels overload (presumably; the
// signature line is not shown in this listing).
// _mm_set_epi32 lists lanes from highest to lowest, so p1 ends up in lane 0
// and p2 in lane 1.
100 xmm = _mm_set_epi32(0, 0, p2, p1);
// Widen the eight channel bytes of p1/p2 to eight 16-bit lanes.
101 xmm = _mm_unpacklo_epi8(xmm, _mm_setzero_si128());
// Bilinearly interpolates four packed pixels and leaves the result in xmm as
// 16-bit channels (low four lanes).
//   pixel1..pixel4 - weighted by (1-fx)(1-fy), fx(1-fy), (1-fx)fy and fx*fy
//                    respectively
//   ifracx, ifracy - fractional position; 0x80 represents 1.0 in the weight
//                    arithmetic below (the range is not checked here)
// NOTE(review): the function's braces are not present in this listing.
104 inline void BlitARGB8SSE::load_pixel_linear(__m128i &xmm,
const unsigned int &pixel1,
const unsigned int &pixel2,
const unsigned int &pixel3,
const unsigned int &pixel4,
unsigned int ifracx,
unsigned int ifracy)
106 __m128i src0, src1, src2, src3;
107 __m128i frac0, frac1, frac2, frac3;
108 __m128i fracx, inv_fracx, fracy, inv_fracy;
// Rounding term: half of the final divisor (1 << 7).
109 __m128i half = _mm_set1_epi16(64);
// Broadcast the fractions and their complements (0x80 - frac) to all lanes.
110 fracx = _mm_set1_epi16(ifracx);
111 fracy = _mm_set1_epi16(ifracy);
112 inv_fracx = _mm_set1_epi16(0x80-ifracx);
113 inv_fracy = _mm_set1_epi16(0x80-ifracy);
// Per-pixel weights, scaled back to the 0..0x80 range. Each product is
// truncated by the shift, so the four weights can sum to slightly less than
// 0x80.
114 frac0 = _mm_srli_epi16(_mm_mullo_epi16(inv_fracx, inv_fracy), 7);
115 frac1 = _mm_srli_epi16(_mm_mullo_epi16(fracx, inv_fracy), 7);
116 frac2 = _mm_srli_epi16(_mm_mullo_epi16(inv_fracx, fracy), 7);
117 frac3 = _mm_srli_epi16(_mm_mullo_epi16(fracx, fracy), 7);
// Widen each pixel's bytes to 16-bit lanes and scale by its weight.
118 src0 = _mm_mullo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(pixel1), _mm_setzero_si128()), frac0);
119 src1 = _mm_mullo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(pixel2), _mm_setzero_si128()), frac1);
120 src2 = _mm_mullo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(pixel3), _mm_setzero_si128()), frac2);
121 src3 = _mm_mullo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(pixel4), _mm_setzero_si128()), frac3);
// Sum the weighted pixels, add the rounding term and divide by 0x80.
122 xmm = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(_mm_add_epi16(_mm_add_epi16(src0, src1), src2), src3), half), 7);
// Body fragment of set_one (going by the listing's line numbers): every
// 16-bit lane becomes 0x0100 (256), the value the blend routines subtract
// alpha from.
127 xmm = _mm_set1_epi16(0x0100);
// Body fragment of set_half (going by the listing's line numbers): every
// 16-bit lane becomes 0x007f (127), used by the blend routines as the term
// added before their >> 8.
132 xmm = _mm_set1_epi16(0x007f);
// Fills xmm with the same colour in both 64-bit halves.
//   red, green, blue, alpha - channel values, stored one per 16-bit lane
// _mm_set_epi16 lists lanes from highest to lowest, so each half reads blue,
// green, red, alpha from low lane to high lane.
// NOTE(review): the function's braces are not present in this listing.
135 inline void BlitARGB8SSE::set_color(__m128i &xmm,
unsigned short red,
unsigned short green,
unsigned short blue,
unsigned short alpha)
137 xmm = _mm_set_epi16(alpha, red, green, blue, alpha, red, green, blue);
// Fills xmm with two different colours.
//   r1, g1, b1, a1 - first colour, stored in the low 64 bits
//   r2, g2, b2, a2 - second colour, stored in the high 64 bits
// _mm_set_epi16 lists lanes from highest to lowest, so each half reads blue,
// green, red, alpha from low lane to high lane.
// NOTE(review): the function's braces are not present in this listing.
140 inline void BlitARGB8SSE::set_color(__m128i &xmm,
unsigned short r1,
unsigned short g1,
unsigned short b1,
unsigned short a1,
unsigned short r2,
unsigned short g2,
unsigned short b2,
unsigned short a2)
142 xmm = _mm_set_epi16(a2, r2, g2, b2, a1, r1, g1, b1);
// Body fragment of a colour-multiply routine whose signature is not shown in
// this listing (presumably a variant of multiply_color - TODO confirm).
// Per 16-bit lane: keep the low 16 bits of src * primcolor ...
148 src = _mm_mullo_epi16(src, primcolor);
// ... then divide by 256 with a logical shift (no rounding term is added).
149 src = _mm_srli_epi16(src, 8);
// Body fragment of multiply_color (going by the listing's line numbers).
// Per 16-bit lane: keep the low 16 bits of src * primcolor ...
155 src = _mm_mullo_epi16(src, primcolor);
// ... then divide by 256 with a logical shift (no rounding term is added).
156 src = _mm_srli_epi16(src, 8);
// Macro form of the colour multiply: src = (src * primcolor) >> 8 per 16-bit
// lane. Arguments are substituted textually and src is both read and
// assigned, so it must be a modifiable __m128i expression.
// NOTE(review): some continuation lines of this macro are missing from the
// listing.
160 #define cl_blitargb8sse_multiply_color(src, primcolor) \
162 src = _mm_mullo_epi16(src, primcolor); \
163 src = _mm_srli_epi16(src, 8); \
// Body fragment of blend_normal (going by the listing's line numbers).
// NOTE(review): the statement that initialises src_alpha (presumably from
// src) is not present in this listing.
168 __m128i src_alpha, invsrc_alpha;
// 0xff selects word 3 for every position, i.e. the alpha lane of each pixel
// is broadcast across that pixel's four lanes.
171 src_alpha = _mm_shufflelo_epi16(src_alpha, 0xff);
172 src_alpha = _mm_shufflehi_epi16(src_alpha, 0xff);
// Complementary weight for the destination.
174 invsrc_alpha = _mm_sub_epi16(one, src_alpha);
// Weight the source by alpha and the destination by (one - alpha).
176 src = _mm_mullo_epi16(src, src_alpha);
177 dest = _mm_mullo_epi16(dest, invsrc_alpha);
// Sum both contributions, add the caller-supplied half term, divide by 256.
179 dest = _mm_add_epi16(dest, src);
180 dest = _mm_add_epi16(dest, half);
181 dest = _mm_srli_epi16(dest, 8);
// Macro form of the normal alpha blend:
//   dest = (src * alpha + dest * (one - alpha) + half) >> 8
// with alpha broadcast from word 3 of each 64-bit half. The expansion
// declares locals named src_alpha and invsrc_alpha, and assigns to both src
// and dest.
// NOTE(review): the lines that open/close the macro's scope and initialise
// src_alpha are missing from this listing.
184 #define cl_blitargb8sse_blend_normal(dest, src, one, half) \
186 __m128i src_alpha, invsrc_alpha; \
189 src_alpha = _mm_shufflelo_epi16(src_alpha, 0xff); \
190 src_alpha = _mm_shufflehi_epi16(src_alpha, 0xff); \
192 invsrc_alpha = _mm_sub_epi16(one, src_alpha); \
194 src = _mm_mullo_epi16(src, src_alpha); \
195 dest = _mm_mullo_epi16(dest, invsrc_alpha); \
197 dest = _mm_add_epi16(dest, src); \
198 dest = _mm_add_epi16(dest, half); \
199 dest = _mm_srli_epi16(dest, 8); \
// Body fragment of blend_premultiplied (going by the listing's line numbers).
// NOTE(review): the statement that initialises src_alpha (presumably from
// src) is not present in this listing.
204 __m128i src_alpha, invsrc_alpha;
// Broadcast word 3 (the alpha lane) of each 64-bit half across that half.
207 src_alpha = _mm_shufflelo_epi16(src_alpha, 0xff);
208 src_alpha = _mm_shufflehi_epi16(src_alpha, 0xff);
// Weight applied to the destination.
210 invsrc_alpha = _mm_sub_epi16(one, src_alpha);
// Scale only the destination: (dest * (one - alpha) + half) >> 8 ...
212 dest = _mm_mullo_epi16(dest, invsrc_alpha);
213 dest = _mm_add_epi16(dest, half);
214 dest = _mm_srli_epi16(dest, 8);
// ... then add the source unscaled (it is not multiplied by alpha here).
// The add wraps rather than saturates.
215 dest = _mm_add_epi16(dest, src);
// Body fragment of blend_lcd (going by the listing's line numbers); the
// declaration of invsrc is not shown.
// Destination weight per lane: one - (src + (src >> 7)). The extra
// (src >> 7) adds 1 to lane values in 128..255, so 255 maps to 256.
221 invsrc = _mm_sub_epi16(one, _mm_add_epi16(_mm_srli_epi16(src, 7), src));
// Each lane is blended with its own weight: src * color + dest * invsrc.
223 dest = _mm_add_epi16(_mm_mullo_epi16(src, color), _mm_mullo_epi16(dest, invsrc));
// Add the caller-supplied half term and divide by 256.
224 dest = _mm_add_epi16(dest, half);
225 dest = _mm_srli_epi16(dest, 8);
// Body fragment of store_pixel (going by the listing's line numbers).
// Narrow the 16-bit lanes to bytes with unsigned saturation; xmm itself is
// overwritten with the packed result.
230 xmm = _mm_packus_epi16(xmm, _mm_setzero_si128());
// Write the low 32 bits (one packed pixel).
231 pixel = _mm_cvtsi128_si32(xmm);
// Body fragment of store_pixels (going by the listing's line numbers).
// Narrow the 16-bit lanes to bytes with unsigned saturation; xmm itself is
// overwritten with the packed result.
236 xmm = _mm_packus_epi16(xmm, _mm_setzero_si128());
// Write the low 64 bits (two packed pixels) to pixels.
237 _mm_storel_epi64((__m128i *) pixels, xmm);
// Body fragment of pixels_to_channels (going by the listing's line numbers).
// Each output register ends up with eight 16-bit values: within every 32-bit
// lane the high word (byte 2) holds the channel from src0 and the low word
// (byte 0) holds the channel from src1.
// One byte-wide mask per channel, repeated for all four 32-bit pixels.
242 __m128i alpha_mask = _mm_set1_epi32(0xff000000);
243 __m128i red_mask = _mm_set1_epi32(0x00ff0000);
244 __m128i green_mask = _mm_set1_epi32(0x0000ff00);
245 __m128i blue_mask = _mm_set1_epi32(0x000000ff);
// Alpha lives in byte 3: move src0's to byte 2 and src1's to byte 0.
// The shifts are whole-register byte shifts; masking first keeps bytes from
// spilling into a neighbouring pixel.
247 alpha = _mm_srli_si128(_mm_and_si128(alpha_mask, src0), 1);
248 alpha = _mm_or_si128(alpha, _mm_srli_si128(_mm_and_si128(alpha_mask, src1), 3));
// Red lives in byte 2: src0's stays put, src1's moves to byte 0.
250 red = _mm_and_si128(red_mask, src0);
251 red = _mm_or_si128(red, _mm_srli_si128(_mm_and_si128(red_mask, src1), 2));
// Green lives in byte 1: src0's moves up to byte 2, src1's down to byte 0.
253 green = _mm_slli_si128(_mm_and_si128(green_mask, src0), 1);
254 green = _mm_or_si128(green, _mm_srli_si128(_mm_and_si128(green_mask, src1), 1));
// Blue lives in byte 0: src0's moves up to byte 2, src1's stays put.
256 blue = _mm_slli_si128(_mm_and_si128(blue_mask, src0), 2);
257 blue = _mm_or_si128(blue, _mm_and_si128(blue_mask, src1));
// Body fragment of channels_to_pixels (going by the listing's line numbers).
// Reverses the layout produced by pixels_to_channels: in every 32-bit lane of
// a channel register, byte 2 feeds dest0 and byte 0 feeds dest1.
// One byte-wide mask per channel, repeated for all four 32-bit pixels.
262 __m128i alpha_mask = _mm_set1_epi32(0xff000000);
263 __m128i red_mask = _mm_set1_epi32(0x00ff0000);
264 __m128i green_mask = _mm_set1_epi32(0x0000ff00);
265 __m128i blue_mask = _mm_set1_epi32(0x000000ff);
// Alpha: shift byte 2 (dest0) or byte 0 (dest1) up into byte 3, then mask.
// Masking after the whole-register byte shift discards anything that crossed
// into a neighbouring position.
267 dest0 = _mm_and_si128(alpha_mask, _mm_slli_si128(alpha, 1));
268 dest1 = _mm_and_si128(alpha_mask, _mm_slli_si128(alpha, 3));
// Red: byte 2 is already in place for dest0; byte 0 moves up two for dest1.
270 dest0 = _mm_or_si128(dest0, _mm_and_si128(red_mask, red));
271 dest1 = _mm_or_si128(dest1, _mm_and_si128(red_mask, _mm_slli_si128(red, 2)));
// Green: byte 2 moves down one for dest0; byte 0 moves up one for dest1.
273 dest0 = _mm_or_si128(dest0, _mm_and_si128(green_mask, _mm_srli_si128(green, 1)));
274 dest1 = _mm_or_si128(dest1, _mm_and_si128(green_mask, _mm_slli_si128(green, 1)));
// Blue: byte 2 moves down two for dest0; byte 0 is already in place for dest1.
276 dest0 = _mm_or_si128(dest0, _mm_and_si128(blue_mask, _mm_srli_si128(blue, 2)));
277 dest1 = _mm_or_si128(dest1, _mm_and_si128(blue_mask, blue));
// Nearest-neighbour fetch of four texels. The macro is defined twice with
// identical bodies apart from the 16-byte alignment syntax: __declspec(align)
// first, __attribute__((aligned)) second. Presumably a compiler conditional
// that is not shown in this listing selects one of them.
//   out0      - receives the four fetched 32-bit texels
//   tx, ty    - __m128i holding four coordinates each; the integer part is
//               taken with an arithmetic >> 16 (looks like 16.16 fixed point
//               - TODO confirm)
//   data      - indexable texel storage, read at x + y * width
//   width     - row pitch in texels
// NOTE(review): _mm_set_epi32 lists lanes from highest to lowest, so the
// texel for coordinate lane 0 lands in lane 3 of out0 and vice versa -
// confirm that callers expect this reversed order.
// NOTE(review): x and y are unsigned, so a negative coordinate becomes a very
// large index; callers presumably wrap or clamp tx/ty beforehand.
282 #define cl_blitargb8sse_sample_nearest(out0, tx, ty, data, width) \
284 __declspec(align(16)) unsigned int x[4], y[4]; \
285 _mm_store_si128((__m128i*) x, _mm_srai_epi32(tx, 16)); \
286 _mm_store_si128((__m128i*) y, _mm_srai_epi32(ty, 16)); \
287 out0 = _mm_set_epi32(data[x[0]+y[0]*width], data[x[1]+y[1]*width], data[x[2]+y[2]*width], data[x[3]+y[3]*width]); \
292 #define cl_blitargb8sse_sample_nearest(out0, tx, ty, data, width) \
294 __attribute__ ((aligned(16))) unsigned int x[4], y[4]; \
295 _mm_store_si128((__m128i*) x, _mm_srai_epi32(tx, 16)); \
296 _mm_store_si128((__m128i*) y, _mm_srai_epi32(ty, 16)); \
297 out0 = _mm_set_epi32(data[x[0]+y[0]*width], data[x[1]+y[1]*width], data[x[2]+y[2]*width], data[x[3]+y[3]*width]); \
// Wraps four texture coordinates per register back into range for repeat
// addressing. tx/ty are modified in place; width/height must be __m128i
// values since they are compared and added lane-wise.
//   - lanes of tx below zero get width added
//   - lanes of tx not below width get width subtracted
//   - ty is treated the same way with height
// Each step adds or subtracts at most once, so a coordinate more than one
// width/height out of range is still out of range afterwards.
// The _mm_movemask_epi8 tests only skip the add/subtract when no lane (or, for
// the upper bound, every lane) passes the compare.
// NOTE(review): the lines between the four compare_result declarations are
// missing from this listing; presumably each sits in its own scope.
305 #define cl_blitargb8sse_texture_repeat(tx, ty, width, height) \
309 __m128i compare_result = _mm_cmplt_epi32(tx, _mm_setzero_si128()); \
310 if (_mm_movemask_epi8(compare_result)) \
311 tx = _mm_add_epi32(tx, _mm_and_si128(compare_result, width)); \
317 __m128i compare_result = _mm_cmplt_epi32(tx, width); \
318 if (_mm_movemask_epi8(compare_result)!=0xffff) \
319 tx = _mm_sub_epi32(tx, _mm_andnot_si128(compare_result, width)); \
325 __m128i compare_result = _mm_cmplt_epi32(ty, _mm_setzero_si128()); \
326 if (_mm_movemask_epi8(compare_result)) \
327 ty = _mm_add_epi32(ty, _mm_and_si128(compare_result, height)); \
333 __m128i compare_result = _mm_cmplt_epi32(ty, height); \
334 if (_mm_movemask_epi8(compare_result)!=0xffff) \
335 ty = _mm_sub_epi32(ty, _mm_andnot_si128(compare_result, height)); \
static void channels_to_pixels(__m128i &dest0, __m128i &dest1, __m128i &red, __m128i &green, __m128i &blue, __m128i &alpha)
Definition: blit_argb8_sse.h:260
SSE accelerated rendering operations for ARGB8888.
Definition: blit_argb8_sse.h:48
static void store_pixel(unsigned int &pixel, __m128i &xmm)
Definition: blit_argb8_sse.h:228
static void multiply_color(__m128i &src, __m128i primcolor)
Definition: blit_argb8_sse.h:153
static void copy_pixels(unsigned int *dest, const unsigned int *src)
Operations.
Definition: blit_argb8_sse.h:79
static void set_color(__m128i &xmm, unsigned short red, unsigned short green, unsigned short blue, unsigned short alpha)
Definition: blit_argb8_sse.h:135
static void set_half(__m128i &xmm)
Definition: blit_argb8_sse.h:130
static void pixels_to_channels(__m128i &red, __m128i &green, __m128i &blue, __m128i &alpha, const __m128i &src0, const __m128i &src1)
Definition: blit_argb8_sse.h:240
static void blend_normal(__m128i &dest, __m128i &src, __m128i &one, __m128i &half)
Definition: blit_argb8_sse.h:166
static void blend_lcd(__m128i &dest, __m128i &src, __m128i &one, __m128i &half, __m128i &color)
Definition: blit_argb8_sse.h:218
static void set_one(__m128i &xmm)
Definition: blit_argb8_sse.h:125
static void load_pixels(__m128i &xmm, const unsigned int *pixels)
Definition: blit_argb8_sse.h:92
static void load_pixel_linear(__m128i &xmm, const unsigned int &p1, const unsigned int &p2, const unsigned int &p3, const unsigned int &p4, unsigned int ifracx, unsigned int ifracy)
Definition: blit_argb8_sse.h:104
static void store_pixels(unsigned int *pixels, __m128i &xmm)
Definition: blit_argb8_sse.h:234
static void load_pixel(__m128i &xmm, const unsigned int &pixel)
Definition: blit_argb8_sse.h:86
static void blend_premultiplied(__m128i &dest, __m128i &src, __m128i &one, __m128i &half)
Definition: blit_argb8_sse.h:202